From 20e079a381b777550ce7872151b17fb4a11cd7ae Mon Sep 17 00:00:00 2001
From: Peter Drmota <49479443+pmldrmota@users.noreply.github.com>
Date: Mon, 15 Nov 2021 05:09:16 +0100
Subject: [PATCH] AD9910 driver feature extension and SUServo IIR readability
 (#1500)

* coredevice.ad9910: Add set_cfr2 function and extend arguments of set_cfr1 and set_sync

* SUServo: Wrap CPLD and DDS devices in a list

* SUServo: Refactor [nfc]

Co-authored-by: drmota <peter.drmota@physics.ox.ac.uk>
Co-authored-by: David Nadlinger <code@klickverbot.at>
---
 RELEASE_NOTES.rst                         |   3 +
 artiq/coredevice/ad9910.py                |  72 +++++--
 artiq/coredevice/suservo.py               |  56 ++---
 artiq/examples/kasli_suservo/device_db.py |   6 +-
 artiq/frontend/artiq_ddb_template.py      |  15 +-
 artiq/gateware/suservo/__init__.py        |  10 +
 artiq/gateware/suservo/iir.py             | 246 +++++++++++++---------
 artiq/gateware/suservo/servo.py           |  66 +++++-
 artiq/gateware/targets/kasli.py           |   6 +-
 9 files changed, 301 insertions(+), 179 deletions(-)

diff --git a/RELEASE_NOTES.rst b/RELEASE_NOTES.rst
index c743f2005..467e4e9c4 100644
--- a/RELEASE_NOTES.rst
+++ b/RELEASE_NOTES.rst
@@ -104,6 +104,9 @@ Breaking changes:
 * ``quamash`` has been replaced with ``qasync``.
 * Protocols are updated to use device endian.
 * Analyzer dump format includes a byte for device endianness.
+* To support variable numbers of Urukul cards in the future, the
+  ``artiq.coredevice.suservo.SUServo`` constructor now accepts two device name lists,
+  ``cpld_devices`` and ``dds_devices``, rather than four individual arguments.
 * Experiment classes with underscore-prefixed names are now ignored when ``artiq_client``
   determines which experiment to submit (consistent with ``artiq_run``).
 
diff --git a/artiq/coredevice/ad9910.py b/artiq/coredevice/ad9910.py
index 95ad66896..49bfe9a90 100644
--- a/artiq/coredevice/ad9910.py
+++ b/artiq/coredevice/ad9910.py
@@ -374,18 +374,25 @@ class AD9910:
             data[(n - preload) + i] = self.bus.read()
 
     @kernel
-    def set_cfr1(self, power_down: TInt32 = 0b0000,
+    def set_cfr1(self,
+                 power_down: TInt32 = 0b0000,
                  phase_autoclear: TInt32 = 0,
-                 drg_load_lrr: TInt32 = 0, drg_autoclear: TInt32 = 0,
-                 internal_profile: TInt32 = 0, ram_destination: TInt32 = 0,
-                 ram_enable: TInt32 = 0, manual_osk_external: TInt32 = 0,
-                 osk_enable: TInt32 = 0, select_auto_osk: TInt32 = 0):
+                 drg_load_lrr: TInt32 = 0,
+                 drg_autoclear: TInt32 = 0,
+                 phase_clear: TInt32 = 0,
+                 internal_profile: TInt32 = 0,
+                 ram_destination: TInt32 = 0,
+                 ram_enable: TInt32 = 0,
+                 manual_osk_external: TInt32 = 0,
+                 osk_enable: TInt32 = 0,
+                 select_auto_osk: TInt32 = 0):
         """Set CFR1. See the AD9910 datasheet for parameter meanings.
 
         This method does not pulse IO_UPDATE.
 
         :param power_down: Power down bits.
         :param phase_autoclear: Autoclear phase accumulator.
+        :param phase_clear: Asynchronous, static reset of the phase accumulator.
         :param drg_load_lrr: Load digital ramp generator LRR.
         :param drg_autoclear: Autoclear digital ramp generator.
         :param internal_profile: Internal profile control.
@@ -405,11 +412,41 @@ class AD9910:
                      (drg_load_lrr << 15) |
                      (drg_autoclear << 14) |
                      (phase_autoclear << 13) |
+                     (phase_clear << 11) |
                      (osk_enable << 9) |
                      (select_auto_osk << 8) |
                      (power_down << 4) |
                      2)  # SDIO input only, MSB first
 
+    @kernel
+    def set_cfr2(self, 
+                 asf_profile_enable: TInt32 = 1, 
+                 drg_enable: TInt32 = 0, 
+                 effective_ftw: TInt32 = 1,
+                 sync_validation_disable: TInt32 = 0, 
+                 matched_latency_enable: TInt32 = 0):
+        """Set CFR2. See the AD9910 datasheet for parameter meanings.
+
+        This method does not pulse IO_UPDATE.
+
+        :param asf_profile_enable: Enable amplitude scale from single tone profiles.
+        :param drg_enable: Digital ramp enable.
+        :param effective_ftw: Read effective FTW.
+        :param sync_validation_disable: Disable the SYNC_SMP_ERR pin indicating
+            (active high) detection of a synchronization pulse sampling error.
+        :param matched_latency_enable: Controls when simultaneously applied
+            amplitude, phase, and frequency changes arrive at the DDS output:
+
+            * matched_latency_enable = 0: in the order listed
+            * matched_latency_enable = 1: simultaneously.
+        """
+        self.write32(_AD9910_REG_CFR2,
+                     (asf_profile_enable << 24) |
+                     (drg_enable << 19) |
+                     (effective_ftw << 16) |
+                     (matched_latency_enable << 7) |
+                     (sync_validation_disable << 5))
+
     @kernel
     def init(self, blind: TBool = False):
         """Initialize and configure the DDS.
@@ -442,7 +479,7 @@ class AD9910:
         # enable amplitude scale from profiles
         # read effective FTW
         # sync timing validation disable (enabled later)
-        self.write32(_AD9910_REG_CFR2, 0x01010020)
+        self.set_cfr2(sync_validation_disable=1)
         self.cpld.io_update.pulse(1 * us)
         cfr3 = (0x0807c000 | (self.pll_vco << 24) |
                 (self.pll_cp << 19) | (self.pll_en << 8) |
@@ -465,7 +502,7 @@ class AD9910:
                     if i >= 100 - 1:
                         raise ValueError("PLL lock timeout")
         delay(10 * us)  # slack
-        if self.sync_data.sync_delay_seed >= 0:
+        if self.sync_data.sync_delay_seed >= 0 and not blind:
             self.tune_sync_delay(self.sync_data.sync_delay_seed)
         delay(1 * ms)
 
@@ -875,20 +912,26 @@ class AD9910:
         self.cpld.cfg_sw(self.chip_select - 4, state)
 
     @kernel
-    def set_sync(self, in_delay: TInt32, window: TInt32):
+    def set_sync(self, 
+                 in_delay: TInt32, 
+                 window: TInt32, 
+                 en_sync_gen: TInt32 = 0):
         """Set the relevant parameters in the multi device synchronization
         register. See the AD9910 datasheet for details. The SYNC clock
         generator preset value is set to zero, and the SYNC_OUT generator is
-        disabled.
+        disabled by default.
 
         :param in_delay: SYNC_IN delay tap (0-31) in steps of ~75ps
         :param window: Symmetric SYNC_IN validation window (0-15) in
             steps of ~75ps for both hold and setup margin.
+        :param en_sync_gen: Whether to enable the DDS-internal sync generator
+            (SYNC_OUT, cf. sync_sel == 1). Should be left off for the normal
+            use case, where the SYNC clock is supplied by the core device.
         """
         self.write32(_AD9910_REG_SYNC,
                      (window << 28) |  # SYNC S/H validation delay
                      (1 << 27) |  # SYNC receiver enable
-                     (0 << 26) |  # SYNC generator disable
+                     (en_sync_gen << 26) |  # SYNC generator enable
                      (0 << 25) |  # SYNC generator SYS rising edge
                      (0 << 18) |  # SYNC preset
                      (0 << 11) |  # SYNC output delay
@@ -904,9 +947,10 @@ class AD9910:
 
         Also modifies CFR2.
         """
-        self.write32(_AD9910_REG_CFR2, 0x01010020)  # clear SMP_ERR
+        self.set_cfr2(sync_validation_disable=1)  # clear SMP_ERR
         self.cpld.io_update.pulse(1 * us)
-        self.write32(_AD9910_REG_CFR2, 0x01010000)  # enable SMP_ERR
+        delay(10 * us)  # slack
+        self.set_cfr2(sync_validation_disable=0)  # enable SMP_ERR
         self.cpld.io_update.pulse(1 * us)
 
     @kernel
@@ -984,7 +1028,7 @@ class AD9910:
         # set up DRG
         self.set_cfr1(drg_load_lrr=1, drg_autoclear=1)
         # DRG -> FTW, DRG enable
-        self.write32(_AD9910_REG_CFR2, 0x01090000)
+        self.set_cfr2(drg_enable=1)
         # no limits
         self.write64(_AD9910_REG_RAMP_LIMIT, -1, 0)
         # DRCTL=0, dt=1 t_SYNC_CLK
@@ -1005,7 +1049,7 @@ class AD9910:
         ftw = self.read32(_AD9910_REG_FTW)  # read out effective FTW
         delay(100 * us)  # slack
         # disable DRG
-        self.write32(_AD9910_REG_CFR2, 0x01010000)
+        self.set_cfr2(drg_enable=0)
         self.cpld.io_update.pulse_mu(8)
         return ftw & 1
 
diff --git a/artiq/coredevice/suservo.py b/artiq/coredevice/suservo.py
index 932adf35b..1d0a72dad 100644
--- a/artiq/coredevice/suservo.py
+++ b/artiq/coredevice/suservo.py
@@ -57,32 +57,26 @@ class SUServo:
 
     :param channel: RTIO channel number
     :param pgia_device: Name of the Sampler PGIA gain setting SPI bus
-    :param cpld0_device: Name of the first Urukul CPLD SPI bus
-    :param cpld1_device: Name of the second Urukul CPLD SPI bus
-    :param dds0_device: Name of the AD9910 device for the DDS on the first
-        Urukul
-    :param dds1_device: Name of the AD9910 device for the DDS on the second
-        Urukul
+    :param cpld_devices: Names of the Urukul CPLD SPI buses
+    :param dds_devices: Names of the AD9910 devices
     :param gains: Initial value for PGIA gains shift register
         (default: 0x0000). Knowledge of this state is not transferred
         between experiments.
     :param core_device: Core device name
     """
-    kernel_invariants = {"channel", "core", "pgia", "cpld0", "cpld1",
-                         "dds0", "dds1", "ref_period_mu"}
+    kernel_invariants = {"channel", "core", "pgia", "cplds", "ddses",
+                         "ref_period_mu"}
 
     def __init__(self, dmgr, channel, pgia_device,
-                 cpld0_device, cpld1_device,
-                 dds0_device, dds1_device,
+                 cpld_devices, dds_devices,
                  gains=0x0000, core_device="core"):
 
         self.core = dmgr.get(core_device)
         self.pgia = dmgr.get(pgia_device)
         self.pgia.update_xfer_duration_mu(div=4, length=16)
-        self.dds0 = dmgr.get(dds0_device)
-        self.dds1 = dmgr.get(dds1_device)
-        self.cpld0 = dmgr.get(cpld0_device)
-        self.cpld1 = dmgr.get(cpld1_device)
+        assert len(dds_devices) == len(cpld_devices)
+        self.ddses = [dmgr.get(dds) for dds in dds_devices]
+        self.cplds = [dmgr.get(cpld) for cpld in cpld_devices]
         self.channel = channel
         self.gains = gains
         self.ref_period_mu = self.core.seconds_to_mu(
@@ -109,17 +103,15 @@ class SUServo:
             sampler.SPI_CONFIG | spi.SPI_END,
             16, 4, sampler.SPI_CS_PGIA)
 
-        self.cpld0.init(blind=True)
-        cfg0 = self.cpld0.cfg_reg
-        self.cpld0.cfg_write(cfg0 | (0xf << urukul.CFG_MASK_NU))
-        self.dds0.init(blind=True)
-        self.cpld0.cfg_write(cfg0)
+        for i in range(len(self.cplds)):
+            cpld = self.cplds[i]
+            dds = self.ddses[i]
 
-        self.cpld1.init(blind=True)
-        cfg1 = self.cpld1.cfg_reg
-        self.cpld1.cfg_write(cfg1 | (0xf << urukul.CFG_MASK_NU))
-        self.dds1.init(blind=True)
-        self.cpld1.cfg_write(cfg1)
+            cpld.init(blind=True)
+            prev_cpld_cfg = cpld.cfg_reg
+            cpld.cfg_write(prev_cpld_cfg | (0xf << urukul.CFG_MASK_NU))
+            dds.init(blind=True)
+            cpld.cfg_write(prev_cpld_cfg)
 
     @kernel
     def write(self, addr, value):
@@ -257,9 +249,11 @@ class Channel:
         self.servo = dmgr.get(servo_device)
         self.core = self.servo.core
         self.channel = channel
-        # FIXME: this assumes the mem channel is right after the control
-        # channels
-        self.servo_channel = self.channel + 8 - self.servo.channel
+        # This assumes the mem channel is right after the control channels.
+        # Make sure this is always the case in eem.py.
+        self.servo_channel = (self.channel + 4 * len(self.servo.cplds) -
+                              self.servo.channel)
+        self.dds = self.servo.ddses[self.servo_channel // 4]
 
     @kernel
     def set(self, en_out, en_iir=0, profile=0):
@@ -311,12 +305,8 @@ class Channel:
             see :meth:`dds_offset_to_mu`
         :param phase: DDS phase in turns
         """
-        if self.servo_channel < 4:
-            dds = self.servo.dds0
-        else:
-            dds = self.servo.dds1
-        ftw = dds.frequency_to_ftw(frequency)
-        pow_ = dds.turns_to_pow(phase)
+        ftw = self.dds.frequency_to_ftw(frequency)
+        pow_ = self.dds.turns_to_pow(phase)
         offs = self.dds_offset_to_mu(offset)
         self.set_dds_mu(profile, ftw, offs, pow_)
 
diff --git a/artiq/examples/kasli_suservo/device_db.py b/artiq/examples/kasli_suservo/device_db.py
index d33bfb280..fdb85dc47 100644
--- a/artiq/examples/kasli_suservo/device_db.py
+++ b/artiq/examples/kasli_suservo/device_db.py
@@ -191,10 +191,8 @@ device_db = {
         "arguments": {
             "channel": 24,
             "pgia_device": "spi_sampler0_pgia",
-            "cpld0_device": "urukul0_cpld",
-            "cpld1_device": "urukul1_cpld",
-            "dds0_device": "urukul0_dds",
-            "dds1_device": "urukul1_dds"
+            "cpld_devices": ["urukul0_cpld", "urukul1_cpld"],
+            "dds_devices": ["urukul0_dds", "urukul1_dds"],
         }
     },
 
diff --git a/artiq/frontend/artiq_ddb_template.py b/artiq/frontend/artiq_ddb_template.py
index 52408a0d4..0a14a06be 100755
--- a/artiq/frontend/artiq_ddb_template.py
+++ b/artiq/frontend/artiq_ddb_template.py
@@ -364,8 +364,7 @@ class PeripheralManager:
     def process_suservo(self, rtio_offset, peripheral):
         suservo_name = self.get_name("suservo")
         sampler_name = self.get_name("sampler")
-        urukul0_name = self.get_name("urukul")
-        urukul1_name = self.get_name("urukul")
+        urukul_names = [self.get_name("urukul") for _ in range(2)]
         channel = count(0)
         for i in range(8):
             self.gen("""
@@ -386,16 +385,14 @@ class PeripheralManager:
                 "arguments": {{
                     "channel": 0x{suservo_channel:06x},
                     "pgia_device": "spi_{sampler_name}_pgia",
-                    "cpld0_device": "{urukul0_name}_cpld",
-                    "cpld1_device": "{urukul1_name}_cpld",
-                    "dds0_device": "{urukul0_name}_dds",
-                    "dds1_device": "{urukul1_name}_dds"
+                    "cpld_devices": {cpld_names_list},
+                    "dds_devices": {dds_names_list}
                 }}
             }}""",
             suservo_name=suservo_name,
             sampler_name=sampler_name,
-            urukul0_name=urukul0_name,
-            urukul1_name=urukul1_name,
+            cpld_names_list=[urukul_name + "_cpld" for urukul_name in urukul_names],
+            dds_names_list=[urukul_name + "_dds" for urukul_name in urukul_names],
             suservo_channel=rtio_offset+next(channel))
         self.gen("""
             device_db["spi_{sampler_name}_pgia"] = {{
@@ -407,7 +404,7 @@ class PeripheralManager:
             sampler_name=sampler_name,
             sampler_channel=rtio_offset+next(channel))
         pll_vco = peripheral.get("pll_vco")
-        for urukul_name in (urukul0_name, urukul1_name):
+        for urukul_name in urukul_names:
             self.gen("""
                 device_db["spi_{urukul_name}"] = {{
                     "type": "local",
diff --git a/artiq/gateware/suservo/__init__.py b/artiq/gateware/suservo/__init__.py
index e69de29bb..7a1df77ac 100644
--- a/artiq/gateware/suservo/__init__.py
+++ b/artiq/gateware/suservo/__init__.py
@@ -0,0 +1,10 @@
+"""Gateware implementation of the Sampler-Urukul (AD9910) DDS amplitude servo.
+
+General conventions:
+
+ - ``t_...`` signals and constants refer to time spans measured in the gateware
+   module's default clock (typically a 125 MHz RTIO clock).
+ - ``start`` signals cause modules to proceed with the next servo iteration iff
+   they are currently idle (i.e. their value is irrelevant while the module is
+   busy, so they are not necessarily one-clock-period strobes).
+"""
diff --git a/artiq/gateware/suservo/iir.py b/artiq/gateware/suservo/iir.py
index 0ebab3f13..0ec9bfa09 100644
--- a/artiq/gateware/suservo/iir.py
+++ b/artiq/gateware/suservo/iir.py
@@ -1,9 +1,7 @@
 from collections import namedtuple
 import logging
-
 from migen import *
 
-
 logger = logging.getLogger(__name__)
 
 
@@ -222,31 +220,30 @@ class IIR(Module):
         assert w.word <= w.coeff  # same memory
         assert w.state + w.coeff + 3 <= w.accu
 
-        # m_coeff of active profiles should only be accessed during
+        # m_coeff of active profiles should only be accessed externally during
         # ~processing
         self.specials.m_coeff = Memory(
                 width=2*w.coeff,  # Cat(pow/ftw/offset, cfg/a/b)
                 depth=4 << w.profile + w.channel)
-        # m_state[x] should only be read during ~(shifting |
-        # loading)
-        # m_state[y] of active profiles should only be read during
+        # m_state[x] should only be read externally during ~(shifting | loading)
+        # m_state[y] of active profiles should only be read externally during
         # ~processing
         self.specials.m_state = Memory(
                 width=w.state,  # y1,x0,x1
                 depth=(1 << w.profile + w.channel) + (2 << w.channel))
         # ctrl should only be updated synchronously
         self.ctrl = [Record([
-            ("profile", w.profile),
-            ("en_out", 1),
-            ("en_iir", 1),
-            ("clip", 1),
-            ("stb", 1)])
-            for i in range(1 << w.channel)]
+                ("profile", w.profile),
+                ("en_out", 1),
+                ("en_iir", 1),
+                ("clip", 1),
+                ("stb", 1)])
+                for i in range(1 << w.channel)]
         # only update during ~loading
         self.adc = [Signal((w.adc, True), reset_less=True)
                 for i in range(1 << w.channel)]
         # Cat(ftw0, ftw1, pow, asf)
-        # only read during ~processing
+        # only read externally during ~processing
         self.dds = [Signal(4*w.word, reset_less=True)
                 for i in range(1 << w.channel)]
         # perform one IIR iteration, start with loading,
@@ -270,100 +267,116 @@ class IIR(Module):
         en_iirs = Array([ch.en_iir for ch in self.ctrl])
         clips = Array([ch.clip for ch in self.ctrl])
 
-        # state counter
-        state = Signal(w.channel + 2)
-        # pipeline group activity flags (SR)
-        stage = Signal(3)
+        # Main state machine sequencing the steps of each servo iteration. The
+        # module IDLEs until self.start is asserted, and then runs through LOAD,
+        # PROCESS and SHIFT in order (see description of corresponding flags
+        # above). The steps share the same memory ports, and are executed
+        # strictly sequentially.
+        #
+        # LOAD/SHIFT just read/write one address per cycle; the duration needed
+        # to iterate over all channels is determined by counting cycles.
+        #
+        # The PROCESSing step is split across a three-stage pipeline, where each
+        # stage has up to four clock cycles of latency. We feed the first stage
+        # using the (MSBs of) t_current_step, and, after all channels have been
+        # covered, proceed once the pipeline has completely drained.
         self.submodules.fsm = fsm = FSM("IDLE")
-        state_clr = Signal()
-        stage_en = Signal()
+        t_current_step = Signal(w.channel + 2)
+        t_current_step_clr = Signal()
+
+        # pipeline group activity flags (SR)
+        #  0: load from memory
+        #  1: compute
+        #  2: write to output registers (DDS profiles, clip flags)
+        stages_active = Signal(3)
         fsm.act("IDLE",
                 self.done.eq(1),
-                state_clr.eq(1),
+                t_current_step_clr.eq(1),
                 If(self.start,
                     NextState("LOAD")
                 )
         )
         fsm.act("LOAD",
                 self.loading.eq(1),
-                If(state == (1 << w.channel) - 1,
-                    state_clr.eq(1),
-                    stage_en.eq(1),
+                If(t_current_step == (1 << w.channel) - 1,
+                    t_current_step_clr.eq(1),
+                    NextValue(stages_active[0], 1),
                     NextState("PROCESS")
                 )
         )
         fsm.act("PROCESS",
                 self.processing.eq(1),
                 # this is technically wasting three cycles
-                # (one for setting stage, and phase=2,3 with stage[2])
-                If(stage == 0,
-                    state_clr.eq(1),
-                    NextState("SHIFT")
+                # (one for setting stages_active, and phase=2,3 with stages_active[2])
+                If(stages_active == 0,
+                    t_current_step_clr.eq(1),
+                    NextState("SHIFT"),
                 )
         )
         fsm.act("SHIFT",
                 self.shifting.eq(1),
-                If(state == (2 << w.channel) - 1,
+                If(t_current_step == (2 << w.channel) - 1,
                     NextState("IDLE")
                 )
         )
 
         self.sync += [
-                state.eq(state + 1),
-                If(state_clr,
-                    state.eq(0),
-                ),
-                If(stage_en,
-                    stage[0].eq(1)
+                If(t_current_step_clr,
+                    t_current_step.eq(0)
+                ).Else(
+                    t_current_step.eq(t_current_step + 1)
                 )
         ]
 
-        # pipeline group channel pointer
+        # global pipeline phase (lower two bits of t_current_step)
+        pipeline_phase = Signal(2, reset_less=True)
+        # pipeline group channel pointer (SR)
         # for each pipeline stage, this is the channel currently being
         # processed
         channel = [Signal(w.channel, reset_less=True) for i in range(3)]
+        self.comb += Cat(pipeline_phase, channel[0]).eq(t_current_step)
+        self.sync += [
+            If(pipeline_phase == 3,
+                Cat(channel[1:]).eq(Cat(channel[:-1])),
+                stages_active[1:].eq(stages_active[:-1]),
+                If(channel[0] == (1 << w.channel) - 1,
+                    stages_active[0].eq(0)
+                )
+            )
+        ]
+
         # pipeline group profile pointer (SR)
         # for each pipeline stage, this is the profile currently being
         # processed
         profile = [Signal(w.profile, reset_less=True) for i in range(2)]
-        # pipeline phase (lower two bits of state)
-        phase = Signal(2, reset_less=True)
-
-        self.comb += Cat(phase, channel[0]).eq(state)
         self.sync += [
-                Case(phase, {
-                    0: [
-                        profile[0].eq(profiles[channel[0]]),
-                        profile[1].eq(profile[0])
-                    ],
-                    3: [
-                        Cat(channel[1:]).eq(Cat(channel[:-1])),
-                        stage[1:].eq(stage[:-1]),
-                        If(channel[0] == (1 << w.channel) - 1,
-                            stage[0].eq(0)
-                        )
-                    ]
-                })
+            If(pipeline_phase == 0,
+                profile[0].eq(profiles[channel[0]]),
+                profile[1].eq(profile[0]),
+            )
         ]
 
         m_coeff = self.m_coeff.get_port()
         m_state = self.m_state.get_port(write_capable=True)  # mode=READ_FIRST
         self.specials += m_state, m_coeff
 
+        #
+        # Hook up main IIR filter.
+        #
+
         dsp = DSP(w)
         self.submodules += dsp
 
         offset_clr = Signal()
-
         self.comb += [
-                m_coeff.adr.eq(Cat(phase, profile[0],
-                    Mux(phase==0, channel[1], channel[0]))),
+                m_coeff.adr.eq(Cat(pipeline_phase, profile[0],
+                    Mux(pipeline_phase == 0, channel[1], channel[0]))),
                 dsp.offset[-w.coeff - 1:].eq(Mux(offset_clr, 0,
                     Cat(m_coeff.dat_r[:w.coeff], m_coeff.dat_r[w.coeff - 1])
                 )),
                 dsp.coeff.eq(m_coeff.dat_r[w.coeff:]),
                 dsp.state.eq(m_state.dat_r),
-                Case(phase, {
+                Case(pipeline_phase, {
                     0: dsp.accu_clr.eq(1),
                     2: [
                         offset_clr.eq(1),
@@ -373,6 +386,11 @@ class IIR(Module):
                 })
         ]
 
+
+        #
+        # Arbitrate state memory access between steps.
+        #
+
         # selected adc and profile delay (combinatorial from dat_r)
         # both share the same coeff word (sel in the lower 8 bits)
         sel_profile = Signal(w.channel)
@@ -389,13 +407,13 @@ class IIR(Module):
                 sel_profile.eq(m_coeff.dat_r[w.coeff:]),
                 dly_profile.eq(m_coeff.dat_r[w.coeff + 8:]),
                 If(self.shifting,
-                    m_state.adr.eq(state | (1 << w.profile + w.channel)),
+                    m_state.adr.eq(t_current_step | (1 << w.profile + w.channel)),
                     m_state.dat_w.eq(m_state.dat_r),
-                    m_state.we.eq(state[0])
+                    m_state.we.eq(t_current_step[0])
                 ),
                 If(self.loading,
-                    m_state.adr.eq((state << 1) | (1 << w.profile + w.channel)),
-                    m_state.dat_w[-w.adc - 1:-1].eq(Array(self.adc)[state]),
+                    m_state.adr.eq((t_current_step << 1) | (1 << w.profile + w.channel)),
+                    m_state.dat_w[-w.adc - 1:-1].eq(Array(self.adc)[t_current_step]),
                     m_state.dat_w[-1].eq(m_state.dat_w[-2]),
                     m_state.we.eq(1)
                 ),
@@ -405,16 +423,20 @@ class IIR(Module):
                         Cat(profile[1], channel[2]),
                         # read old y
                         Cat(profile[0], channel[0]),
-                        # x0 (recent)
+                        # read x0 (recent)
                         0 | (sel_profile << 1) | (1 << w.profile + w.channel),
-                        # x1 (old)
+                        # read x1 (old)
                         1 | (sel << 1) | (1 << w.profile + w.channel),
-                    ])[phase]),
+                    ])[pipeline_phase]),
                     m_state.dat_w.eq(dsp.output),
-                    m_state.we.eq((phase == 0) & stage[2] & en[1]),
+                    m_state.we.eq((pipeline_phase == 0) & stages_active[2] & en[1]),
                 )
         ]
 
+        #
+        # Compute auxiliary signals (delayed servo enable, clip indicators, etc.).
+        #
+
         # internal channel delay counters
         dlys = Array([Signal(w.dly)
             for i in range(1 << w.channel)])
@@ -434,51 +456,65 @@ class IIR(Module):
         en_out = Signal(reset_less=True)
         # latched channel en_iir
         en_iir = Signal(reset_less=True)
+
+        self.sync += [
+            Case(pipeline_phase, {
+                0: [
+                    dly.eq(dlys[channel[0]]),
+                    en_out.eq(en_outs[channel[0]]),
+                    en_iir.eq(en_iirs[channel[0]]),
+                    If(stages_active[2] & en[1] & dsp.clip,
+                        clips[channel[2]].eq(1)
+                    )
+                ],
+                2: [
+                    en[0].eq(0),
+                    en[1].eq(en[0]),
+                    sel.eq(sel_profile),
+                    If(stages_active[0] & en_out,
+                        If(dly != dly_profile,
+                            dlys[channel[0]].eq(dly + 1)
+                        ).Elif(en_iir,
+                            en[0].eq(1)
+                        )
+                    )
+                ],
+            }),
+        ]
+
+        #
+        # Update DDS profile with FTW/POW/ASF.
+        # Stage 0 loads the POW, stage 1 the FTW, and stage 2 writes
+        # the ASF computed by the IIR filter.
+        #
+
         # muxing
         ddss = Array(self.dds)
 
         self.sync += [
-                Case(phase, {
-                    0: [
-                        dly.eq(dlys[channel[0]]),
-                        en_out.eq(en_outs[channel[0]]),
-                        en_iir.eq(en_iirs[channel[0]]),
-                        If(stage[1],
-                            ddss[channel[1]][:w.word].eq(m_coeff.dat_r)
-                        ),
-                        If(stage[2] & en[1] & dsp.clip,
-                            clips[channel[2]].eq(1)
-                        )
-                    ],
-                    1: [
-                        If(stage[1],
-                            ddss[channel[1]][w.word:2*w.word].eq(
-                                m_coeff.dat_r),
-                        ),
-                        If(stage[2],
-                            ddss[channel[2]][3*w.word:].eq(
-                                m_state.dat_r[w.state - w.asf - 1:w.state - 1])
-                        )
-                    ],
-                    2: [
-                        en[0].eq(0),
-                        en[1].eq(en[0]),
-                        sel.eq(sel_profile),
-                        If(stage[0],
-                            ddss[channel[0]][2*w.word:3*w.word].eq(
-                                m_coeff.dat_r),
-                            If(en_out,
-                                If(dly != dly_profile,
-                                    dlys[channel[0]].eq(dly + 1)
-                                ).Elif(en_iir,
-                                    en[0].eq(1)
-                                )
-                            )
-                        )
-                    ],
-                    3: [
-                    ],
-                }),
+            Case(pipeline_phase, {
+                0: [
+                    If(stages_active[1],
+                        ddss[channel[1]][:w.word].eq(m_coeff.dat_r),  # ftw0
+                    ),
+                ],
+                1: [
+                    If(stages_active[1],
+                        ddss[channel[1]][w.word:2 * w.word].eq(m_coeff.dat_r),  # ftw1
+                    ),
+                    If(stages_active[2],
+                        ddss[channel[2]][3*w.word:].eq(  # asf
+                            m_state.dat_r[w.state - w.asf - 1:w.state - 1])
+                    )
+                ],
+                2: [
+                    If(stages_active[0],
+                        ddss[channel[0]][2*w.word:3*w.word].eq(m_coeff.dat_r),  # pow
+                    ),
+                ],
+                3: [
+                ],
+            }),
         ]
 
     def _coeff(self, channel, profile, coeff):
diff --git a/artiq/gateware/suservo/servo.py b/artiq/gateware/suservo/servo.py
index 08b31a3bc..1aec95f02 100644
--- a/artiq/gateware/suservo/servo.py
+++ b/artiq/gateware/suservo/servo.py
@@ -5,32 +5,76 @@ from .iir import IIR, IIRWidths
 from .dds_ser import DDS, DDSParams
 
 
+def predict_timing(adc_p, iir_p, dds_p):
+    """
+    Sketch of the timing for 1 Sampler (8 ADCs) and N Urukuls. In the case
+    shown, the cycle duration is limited by the IIR loading+processing time.
+
+    ADC|CONVH|CONV|READ|RTT|IDLE|CONVH|CONV|READ|RTT|IDLE|CONVH|CONV|READ|RTT|...
+       |4    |57  |16  |8  | .. |4    |57  |16  |8  | .. |4    |57  |16  |8  |...
+    ---+-------------------+------------------------+------------------------+---
+    IIR|                   |LOAD|PROC         |SHIFT|LOAD|PROC         |SHIFT|...
+       |                   |8   |16*N+9       |16   |8   |16*N+9       |16   |...
+    ---+--------------------------------------+------------------------+---------
+    DDS|                                      |CMD|PROF|WAIT|IO_UP|IDLE|CMD|PR...
+       |                                      |16 |128 |1   |1    | .. |16 |  ...
+
+    IIR loading starts once the ADC presents its data; the DDSes are updated
+    once the IIR processing is over. These are the only blocking processes.
+    IIR shifting happens in parallel to writing to the DDSes, and ADC
+    conversions take place while the IIR filter is processing or the DDSes
+    are being written to, depending on the cycle duration (given by whichever
+    module takes the longest). Returns ``(t_adc, t_iir, t_dds, t_cycle)``.
+    """
+    t_adc = (adc_p.t_cnvh + adc_p.t_conv + adc_p.t_rtt +
+        adc_p.channels*adc_p.width//adc_p.lanes) + 1
+    # load adc_p.channels values, then process dds_p.channels channels
+    # (4 processing phases per channel, plus 2 additional stages of 4 phases
+    # each to complete the processing of the last channel)
+    t_iir = adc_p.channels + 4*dds_p.channels + 8 + 1
+    t_dds = (dds_p.width*2 + 1)*dds_p.clk + 1
+    t_cycle = max(t_adc, t_iir, t_dds)
+    return t_adc, t_iir, t_dds, t_cycle
+
 class Servo(Module):
     def __init__(self, adc_pads, dds_pads, adc_p, iir_p, dds_p):
+        t_adc, t_iir, t_dds, t_cycle = predict_timing(adc_p, iir_p, dds_p)
+        assert t_iir + 2*adc_p.channels < t_cycle, "need shifting time"
+
         self.submodules.adc = ADC(adc_pads, adc_p)
         self.submodules.iir = IIR(iir_p)
         self.submodules.dds = DDS(dds_pads, dds_p)
 
         # adc channels are reversed on Sampler
-        for i, j, k, l in zip(reversed(self.adc.data), self.iir.adc,
-                self.iir.dds, self.dds.profile):
-            self.comb += j.eq(i), l.eq(k)
-
-        t_adc = (adc_p.t_cnvh + adc_p.t_conv + adc_p.t_rtt +
-            adc_p.channels*adc_p.width//adc_p.lanes) + 1
-        t_iir = ((1 + 4 + 1) << iir_p.channel) + 1
-        t_dds = (dds_p.width*2 + 1)*dds_p.clk + 1
-
-        t_cycle = max(t_adc, t_iir, t_dds)
-        assert t_iir + (2 << iir_p.channel) < t_cycle, "need shifting time"
+        for iir, adc in zip(self.iir.adc, reversed(self.adc.data)):
+            self.comb += iir.eq(adc)
+        for dds, iir in zip(self.dds.profile, self.iir.dds):
+            self.comb += dds.eq(iir)
 
+        # If high, a new cycle is started if the current cycle (if any) is
+        # finished. Consequently, if low, servo iterations cease after the
+        # current cycle is finished. Don't care while the first step (ADC)
+        # is active.
         self.start = Signal()
+
+        # Counter for delay between end of ADC cycle and start of next one,
+        # depending on the duration of the other steps.
         t_restart = t_cycle - t_adc + 1
         assert t_restart > 1
         cnt = Signal(max=t_restart)
         cnt_done = Signal()
         active = Signal(3)
+
+        # `active` (above) indicates which steps (0: ADC, 1: IIR, 2: DDS) are
+        # currently active (exposed for simulation only), with each bit being
+        # reset once the successor step is launched. Depending on the
+        # timing details of the different steps, any number can be concurrently
+        # active (e.g. ADC read from iteration n, IIR computation from iteration
+        # n - 1, and DDS write from iteration n - 2).
+
+        # Asserted once per cycle when the DDS write has been completed.
         self.done = Signal()
+
         self.sync += [
                 If(self.dds.done,
                     active[2].eq(0)
diff --git a/artiq/gateware/targets/kasli.py b/artiq/gateware/targets/kasli.py
index 311028fcb..cf8b5760f 100755
--- a/artiq/gateware/targets/kasli.py
+++ b/artiq/gateware/targets/kasli.py
@@ -228,9 +228,9 @@ class SUServo(StandaloneBase):
             ttl_serdes_7series.Output_8X, ttl_serdes_7series.Output_8X)
 
         # EEM3/2: Sampler, EEM5/4: Urukul, EEM7/6: Urukul
-        eem.SUServo.add_std(
-            self, eems_sampler=(3, 2),
-            eems_urukul0=(5, 4), eems_urukul1=(7, 6))
+        eem.SUServo.add_std(self, 
+                            eems_sampler=(3, 2), 
+                            eems_urukul=[[5, 4], [7, 6]])
 
         for i in (1, 2):
             sfp_ctl = self.platform.request("sfp_ctl", i)