diff --git a/artiq/coredevice/ad9910.py b/artiq/coredevice/ad9910.py
index 801b689c..bc19afe2 100644
--- a/artiq/coredevice/ad9910.py
+++ b/artiq/coredevice/ad9910.py
@@ -277,6 +277,10 @@ class AD9910:
 
         :param addr: Register address
         """
+        return self.read32_impl(addr)
+
+    @kernel
+    def read32_impl(self, addr):
         self.bus.set_config_mu(urukul.SPI_CONFIG, 8,
                                urukul.SPIT_DDS_WR, self.chip_select)
         self.bus.write((addr | 0x80) << 24)
@@ -981,7 +985,8 @@ class AD9910:
 
     @kernel
     def tune_sync_delay(self,
-                        search_seed: TInt32 = 15) -> TTuple([TInt32, TInt32]):
+                        search_seed: TInt32 = 15,
+                        cpld_channel_idx: TInt32 = -1) -> TTuple([TInt32, TInt32]):
         """Find a stable SYNC_IN delay.
 
         This method first locates a valid SYNC_IN delay at zero validation
@@ -997,6 +1002,9 @@ class AD9910:
             Defaults to 15 (half range).
         :return: Tuple of optimal delay and window size.
         """
+        if cpld_channel_idx == -1:
+            cpld_channel_idx = self.chip_select - 4
+        assert 0 <= cpld_channel_idx < 4, "Invalid channel index"
         if not self.cpld.sync_div:
             raise ValueError("parent cpld does not drive SYNC")
         search_span = 31
@@ -1019,7 +1027,7 @@ class AD9910:
                 delay(100 * us)
                 err = urukul_sta_smp_err(self.cpld.sta_read())
                 delay(100 * us)  # slack
-                if not (err >> (self.chip_select - 4)) & 1:
+                if not (err >> cpld_channel_idx) & 1:
                     next_seed = in_delay
                     break
             if next_seed >= 0:  # valid delay found, scan next window
diff --git a/artiq/coredevice/suservo.py b/artiq/coredevice/suservo.py
index a89cdcca..f7b516a4 100644
--- a/artiq/coredevice/suservo.py
+++ b/artiq/coredevice/suservo.py
@@ -1,9 +1,11 @@
 from artiq.language.core import kernel, delay, delay_mu, portable
 from artiq.language.units import us, ns
+from artiq.language import *
 from artiq.coredevice.rtio import rtio_output, rtio_input_data
 from artiq.coredevice import spi2 as spi
-from artiq.coredevice import urukul, sampler
+from artiq.coredevice import urukul, sampler, ad9910
 from math import ceil, log2
+from numpy import int32, int64
 
 
 COEFF_WIDTH = 18  # Must match gateware IIRWidths.coeff
@@ -11,6 +13,7 @@ Y_FULL_SCALE_MU = (1 << (COEFF_WIDTH - 1)) - 1
 T_CYCLE = (2*(8 + 64) + 2)*8*ns  # Must match gateware Servo.t_cycle.
 COEFF_SHIFT = 11  # Must match gateware IIRWidths.shift
 PROFILE_WIDTH = 5  # Must match gateware IIRWidths.profile
+FINE_TS_WIDTH = 3  # Must match gateware IIRWidths.ioup_dly
 
 
 @portable
@@ -39,7 +42,7 @@ class SUServo:
     and a photodetector connected to Sampler.
 
     Additionally SU Servo supports multiple preconfigured profiles per channel
-    and features like automatic integrator hold.
+    and features like automatic integrator hold and coherent phase tracking.
 
     Notes:
 
@@ -63,7 +66,8 @@ class SUServo:
     """
     kernel_invariants = {"channel", "core", "pgia", "cplds", "ddses",
                          "ref_period_mu", "num_channels", "coeff_sel",
-                         "corrected_fs", "state_sel", "config_addr", "write_enable"}
+                         "state_sel", "io_dly_addr", "config_addr",
+                         "corrected_fs", "write_enable"}
 
     def __init__(self, dmgr, channel, pgia_device,
                  cpld_devices, dds_devices,
@@ -86,6 +90,7 @@ class SUServo:
         self.num_channels = 4 * len(dds_devices)
         channel_width = ceil(log2(self.num_channels))
         coeff_depth = PROFILE_WIDTH + channel_width + 3
+        self.io_dly_addr = 1 << (coeff_depth - 2)
         self.state_sel = 2 << (coeff_depth - 2)
         self.config_addr = 3 << (coeff_depth - 2)
         self.coeff_sel = 1 << coeff_depth
@@ -119,8 +124,20 @@ class SUServo:
             prev_cpld_cfg = cpld.cfg_reg
             cpld.cfg_write(prev_cpld_cfg | (0xf << urukul.CFG_MASK_NU))
             dds.init(blind=True)
+
+            if dds.sync_data.sync_delay_seed != -1:
+                for channel_idx in range(4):
+                    mask_nu_this = 1 << (urukul.CFG_MASK_NU + channel_idx)
+                    cpld.cfg_write(prev_cpld_cfg | mask_nu_this)
+                    delay(8 * us)
+                    dds.tune_sync_delay(dds.sync_data.sync_delay_seed,
+                                        cpld_channel_idx=channel_idx)
+                    delay(50 * us)
             cpld.cfg_write(prev_cpld_cfg)
 
+        self.set_io_update_delays(
+            [dds.sync_data.io_update_delay for dds in self.ddses])
+
     @kernel
     def write(self, addr, value):
         """Write to servo memory.
@@ -245,6 +262,18 @@ class SUServo:
         gain = (self.gains >> (channel*2)) & 0b11
         return adc_mu_to_volts(val, gain, self.corrected_fs)
 
+    @kernel
+    def set_io_update_delays(self, dlys):
+        """Set IO_UPDATE pulse alignment delays.
+
+        :param dlys: List of delays for each Urukul
+        """
+        bits = 0
+        mask_fine_ts = (1 << FINE_TS_WIDTH) - 1
+        for i in range(len(dlys)):
+            bits |= (dlys[i] & mask_fine_ts) << (FINE_TS_WIDTH * i)
+        self.write(self.io_dly_addr, bits)
+
 
 class Channel:
     """Sampler-Urukul Servo channel
@@ -265,7 +294,7 @@ class Channel:
         return [(channel, None)]
 
     @kernel
-    def set(self, en_out, en_iir=0, profile=0):
+    def set(self, en_out, en_iir=0, profile=0, en_pt=0):
         """Operate channel.
 
         This method does not advance the timeline. Output RF switch setting
@@ -279,9 +308,26 @@ class Channel:
         :param en_out: RF switch enable
         :param en_iir: IIR updates enable
         :param profile: Active profile (0-31)
+        :param en_pt: Coherent phase tracking enable
+            * en_pt=1: "coherent phase mode"
+            * en_pt=0: "continuous phase mode"
+            (see :func:`artiq.coredevice.ad9910.AD9910.set_phase_mode` for a
+            definition of the phase modes)
         """
         rtio_output(self.channel << 8,
-                    en_out | (en_iir << 1) | (profile << 2))
+                    en_out | (en_iir << 1) | (en_pt << 2) | (profile << 3))
+
+    @kernel
+    def set_reference_time(self):
+        """Set reference time for "coherent phase mode" (see :meth:`set`).
+
+        This method does not advance the timeline.
+        With en_pt=1 (see :meth:`set`), the tracked DDS output phase of
+        this channel will refer to the current timeline position.
+
+        """
+        fine_ts = now_mu() & ((1 << FINE_TS_WIDTH) - 1)
+        rtio_output(self.channel << 8 | 1, self.dds.sysclk_per_mu * fine_ts)
 
     @kernel
     def set_dds_mu(self, profile, ftw, offs, pow_=0):
@@ -592,3 +638,217 @@ class Channel:
             raise ValueError("Invalid SUServo y-value!")
         self.set_y_mu(profile, y_mu)
         return y_mu
+
+
+class CPLD(urukul.CPLD):
+    """
+    Subclass of the Urukul CPLD driver class in artiq.coredevice.urukul,
+    adapted to use CPLD read-back via half-duplex SPI. Only the 8 LSBs can be read
+    back as the read-back buffer on the CPLD is 8 bits wide.
+    """
+
+    def __init__(self, dmgr, spi_device, io_update_device=None,
+                 **kwargs):
+        # Separate IO_UPDATE TTL output device used by SUServo core,
+        # if active, else by artiq.coredevice.suservo.AD9910
+        # :meth:`measure_io_update_alignment`.
+        # The urukul.CPLD driver utilises the CPLD CFG register
+        # option instead for pulsing IO_UPDATE of masked DDSs.
+        self.io_update_ttl = dmgr.get(io_update_device)
+        urukul.CPLD.__init__(self, dmgr, spi_device, **kwargs)
+
+    @kernel
+    def enable_readback(self):
+        """
+        This method sets the RB_EN flag in the Urukul CPLD configuration
+        register. Once set, the CPLD expects an alternating sequence of
+        two SPI transactions:
+
+            * 1: Any transaction. If returning data, the 8 LSBs
+                of that will be stored in the CPLD.
+
+            * 2: One read transaction in half-duplex SPI mode shifting
+                out data from the CPLD over MOSI (use :meth:`readback`).
+
+        To end this protocol, call :meth:`disable_readback` during step 1.
+        """
+        self.cfg_write(self.cfg_reg | (1 << urukul.CFG_RB_EN))
+
+    @kernel
+    def disable_readback(self):
+        """
+        This method clears the RB_EN flag in the Urukul CPLD configuration
+        register. This marks the end of the readback protocol (see
+        :meth:`enable_readback`).
+        """
+        self.cfg_write(self.cfg_reg & ~(1 << urukul.CFG_RB_EN))
+
+    @kernel
+    def sta_read(self, full=False):
+        """
+        Read from status register
+
+        :param full: retrieve status register by concatenating data from
+            several readback transactions.
+        """
+        self.enable_readback()
+        self.sta_read_impl()
+        delay(16 * us)  # slack
+        r = self.readback() << urukul.STA_RF_SW
+        delay(16 * us)  # slack
+        if full:
+            self.enable_readback()  # dummy write
+            r |= self.readback(urukul.CS_RB_PLL_LOCK) << urukul.STA_PLL_LOCK
+            delay(16 * us)  # slack
+            self.enable_readback()  # dummy write
+            r |= self.readback(urukul.CS_RB_PROTO_REV) << urukul.STA_PROTO_REV
+            delay(16 * us)  # slack
+        self.disable_readback()
+        return r
+
+    @kernel
+    def proto_rev_read(self):
+        """Read 8 LSBs of proto_rev"""
+        self.enable_readback()
+        self.enable_readback()  # dummy write
+        r = self.readback(urukul.CS_RB_PROTO_REV)
+        self.disable_readback()
+        return r
+
+    @kernel
+    def pll_lock_read(self):
+        """Read PLL lock status"""
+        self.enable_readback()
+        self.enable_readback()  # dummy write
+        r = self.readback(urukul.CS_RB_PLL_LOCK)
+        self.disable_readback()
+        return r & 0xf
+
+    @kernel
+    def get_att_mu(self):
+        # Different behaviour to urukul.CPLD.get_att_mu: Here, the
+        # latch enable of the attenuators activates 31.5dB
+        # attenuation during the transactions.
+        att_reg = int32(0)
+        self.enable_readback()
+        for i in range(4):
+            self.core.break_realtime()
+            self.bus.set_config_mu(urukul.SPI_CONFIG | spi.SPI_END, 8,
+                                   urukul.SPIT_ATT_RD, urukul.CS_ATT)
+            self.bus.write(0)  # shift in zeros, shift out next 8 bits
+            r = self.readback() & 0xff
+            att_reg |= r << (8 * i)
+
+        delay(16 * us)  # slack
+        self.disable_readback()
+
+        self.att_reg = int32(att_reg)
+        delay(8 * us)  # slack
+        self.set_all_att_mu(self.att_reg)  # shift and latch current value again
+        return self.att_reg
+
+    @kernel
+    def readback(self, cs=urukul.CS_RB_LSBS):
+        """Read from the readback register in half-duplex SPI mode
+        See :meth:`enable_readback` for usage instructions.
+
+        :param cs: Select data to be returned from the readback register.
+             - urukul.CS_RB_LSBS does not modify the readback register upon readback
+             - urukul.CS_RB_PROTO_REV loads the 8 LSBs of proto_rev
+             - urukul.CS_RB_PLL_LOCK loads the PLL lock status bits concatenated with the
+               IFC mode bits
+        :return: CPLD readback register.
+        """
+        self.bus.set_config_mu(
+            urukul.SPI_CONFIG | spi.SPI_END | spi.SPI_INPUT | spi.SPI_HALF_DUPLEX,
+            8, urukul.SPIT_CFG_RD, cs)
+        self.bus.write(0)
+        return int32(self.bus.read())
+
+
+class AD9910(ad9910.AD9910):
+    """
+    Subclass of the AD9910 driver class in artiq.coredevice.ad9910,
+    using CPLD read-back via half-duplex SPI.
+    """
+
+    # Re-declare set of kernel invariants to avoid warning about non-existent
+    # `sw` attribute, as the AD9910 (instance) constructor writes to the
+    # class attributes.
+    kernel_invariants = {
+        "chip_select", "cpld", "core", "bus", "ftw_per_hz", "sysclk_per_mu"
+    }
+
+    @kernel
+    def read32(self, addr):
+        """ Read from a 32-bit register
+
+        This method returns only the 8 LSBs of the register value.
+        """
+        self.cpld.enable_readback()
+        self.read32_impl(addr)
+        delay(12 * us)  # slack
+        r = self.cpld.readback()
+        delay(12 * us)  # slack
+        self.cpld.disable_readback()
+        return r
+
+    @kernel
+    def read64(self, addr):
+        # 3-wire SPI transactions consisting of multiple transfers are not supported.
+        raise NotImplementedError
+
+    @kernel
+    def read_ram(self, data):
+        # 3-wire SPI transactions consisting of multiple transfers are not supported.
+        raise NotImplementedError
+
+    @kernel
+    def measure_io_update_alignment(self, delay_start, delay_stop):
+        """Use the digital ramp generator to locate the alignment between
+        IO_UPDATE and SYNC_CLK.
+
+        Refer to `artiq.coredevice.ad9910` :meth:`measure_io_update_alignment`.
+        This method drives the io_update_ttl that is also used by the SUServo
+        core, so deactivate the servo before calling it (see :meth:`set_config`).
+        """
+        # set up DRG
+        self.set_cfr1(drg_load_lrr=1, drg_autoclear=1)
+        # DRG -> FTW, DRG enable
+        self.set_cfr2(drg_enable=1)
+        # no limits
+        self.write64(ad9910._AD9910_REG_RAMP_LIMIT, -1, 0)
+        # DRCTL=0, dt=1 t_SYNC_CLK
+        self.write32(ad9910._AD9910_REG_RAMP_RATE, 0x00010000)
+        # dFTW = 1, (work around negative slope)
+        self.write64(ad9910._AD9910_REG_RAMP_STEP, -1, 0)
+        # un-mask DDS
+        cfg_masked = self.cpld.cfg_reg
+        self.cpld.cfg_write(cfg_masked & ~(0xf << urukul.CFG_MASK_NU))
+        delay(70 * us)  # slack
+        # delay io_update after RTIO edge
+        t = now_mu() + 8 & ~7
+        at_mu(t + delay_start)
+        # assumes a maximum t_SYNC_CLK period
+        self.cpld.io_update_ttl.pulse(self.core.mu_to_seconds(16 - delay_start))  # realign
+        # re-mask DDS
+        self.cpld.cfg_write(cfg_masked)
+        delay(10 * us)  # slack
+        # disable DRG autoclear and LRR on io_update
+        self.set_cfr1()
+        delay(10 * us)  # slack
+        # stop DRG
+        self.write64(ad9910._AD9910_REG_RAMP_STEP, 0, 0)
+        delay(10 * us)  # slack
+        # un-mask DDS
+        self.cpld.cfg_write(cfg_masked & ~(0xf << urukul.CFG_MASK_NU))
+        at_mu(t + 0x20000 + delay_stop)
+        self.cpld.io_update_ttl.pulse_mu(16 - delay_stop)  # realign
+        # re-mask DDS
+        self.cpld.cfg_write(cfg_masked)
+        ftw = self.read32(ad9910._AD9910_REG_FTW)  # read out effective FTW
+        delay(100 * us)  # slack
+        # disable DRG
+        self.set_cfr2(drg_enable=0)
+        self.cpld.io_update.pulse_mu(16)
+        return ftw & 1
diff --git a/artiq/coredevice/urukul.py b/artiq/coredevice/urukul.py
index 2fd66bd6..61fd4762 100644
--- a/artiq/coredevice/urukul.py
+++ b/artiq/coredevice/urukul.py
@@ -24,6 +24,7 @@ SPIT_DDS_RD = 16
 CFG_RF_SW = 0
 CFG_LED = 4
 CFG_PROFILE = 8
+CFG_RB_EN = 11
 CFG_IO_UPDATE = 12
 CFG_MASK_NU = 13
 CFG_CLK_SEL0 = 17
@@ -51,18 +52,23 @@ CS_DDS_CH0 = 4
 CS_DDS_CH1 = 5
 CS_DDS_CH2 = 6
 CS_DDS_CH3 = 7
+# chip selects for readback
+CS_RB_PROTO_REV = 1
+CS_RB_PLL_LOCK = 2
+CS_RB_LSBS = 3
 
 # Default profile
 DEFAULT_PROFILE = 7
 
 
 @portable
-def urukul_cfg(rf_sw, led, profile, io_update, mask_nu,
+def urukul_cfg(rf_sw, led, profile, rb_en, io_update, mask_nu,
                clk_sel, sync_sel, rst, io_rst, clk_div):
     """Build Urukul CPLD configuration register"""
     return ((rf_sw << CFG_RF_SW) |
             (led << CFG_LED) |
             (profile << CFG_PROFILE) |
+            (rb_en << CFG_RB_EN) |
             (io_update << CFG_IO_UPDATE) |
             (mask_nu << CFG_MASK_NU) |
             ((clk_sel & 0x01) << CFG_CLK_SEL0) |
@@ -191,7 +197,7 @@ class CPLD:
             assert sync_div is None
             sync_div = 0
 
-        self.cfg_reg = urukul_cfg(rf_sw=rf_sw, led=0, profile=DEFAULT_PROFILE,
+        self.cfg_reg = urukul_cfg(rf_sw=rf_sw, led=0, profile=DEFAULT_PROFILE, rb_en=0,
                                   io_update=0, mask_nu=0, clk_sel=clk_sel,
                                   sync_sel=sync_sel,
                                   rst=0, io_rst=0, clk_div=clk_div)
@@ -226,6 +232,10 @@ class CPLD:
 
         :return: The status register value.
         """
+        return self.sta_read_impl()
+
+    @kernel
+    def sta_read_impl(self):
         self.bus.set_config_mu(SPI_CONFIG | spi.SPI_END | spi.SPI_INPUT, 24,
                                SPIT_CFG_RD, CS_CFG)
         self.bus.write(self.cfg_reg << 8)
diff --git a/artiq/examples/kasli_suservo/device_db.py b/artiq/examples/kasli_suservo/device_db.py
index c52b82a9..8e9d8752 100644
--- a/artiq/examples/kasli_suservo/device_db.py
+++ b/artiq/examples/kasli_suservo/device_db.py
@@ -142,53 +142,66 @@ device_db = {
         "arguments": {"channel": 15},
     },
 
+    "ttl_urukul0_io_update": {
+        "type": "local",
+        "module": "artiq.coredevice.ttl",
+        "class": "TTLOut",
+        "arguments": {"channel": 16}
+    },
+    "ttl_urukul1_io_update": {
+        "type": "local",
+        "module": "artiq.coredevice.ttl",
+        "class": "TTLOut",
+        "arguments": {"channel": 17}
+    },
+
     "suservo0_ch0": {
         "type": "local",
         "module": "artiq.coredevice.suservo",
         "class": "Channel",
-        "arguments": {"channel": 16, "servo_device": "suservo0"}
+        "arguments": {"channel": 18, "servo_device": "suservo0"}
     },
     "suservo0_ch1": {
         "type": "local",
         "module": "artiq.coredevice.suservo",
         "class": "Channel",
-        "arguments": {"channel": 17, "servo_device": "suservo0"}
+        "arguments": {"channel": 19, "servo_device": "suservo0"}
     },
     "suservo0_ch2": {
         "type": "local",
         "module": "artiq.coredevice.suservo",
         "class": "Channel",
-        "arguments": {"channel": 18, "servo_device": "suservo0"}
+        "arguments": {"channel": 20, "servo_device": "suservo0"}
     },
     "suservo0_ch3": {
         "type": "local",
         "module": "artiq.coredevice.suservo",
         "class": "Channel",
-        "arguments": {"channel": 19, "servo_device": "suservo0"}
+        "arguments": {"channel": 21, "servo_device": "suservo0"}
     },
     "suservo0_ch4": {
         "type": "local",
         "module": "artiq.coredevice.suservo",
         "class": "Channel",
-        "arguments": {"channel": 20, "servo_device": "suservo0"}
+        "arguments": {"channel": 22, "servo_device": "suservo0"}
     },
     "suservo0_ch5": {
         "type": "local",
         "module": "artiq.coredevice.suservo",
         "class": "Channel",
-        "arguments": {"channel": 21, "servo_device": "suservo0"}
+        "arguments": {"channel": 23, "servo_device": "suservo0"}
     },
     "suservo0_ch6": {
         "type": "local",
         "module": "artiq.coredevice.suservo",
         "class": "Channel",
-        "arguments": {"channel": 22, "servo_device": "suservo0"}
+        "arguments": {"channel": 24, "servo_device": "suservo0"}
     },
     "suservo0_ch7": {
         "type": "local",
         "module": "artiq.coredevice.suservo",
         "class": "Channel",
-        "arguments": {"channel": 23, "servo_device": "suservo0"}
+        "arguments": {"channel": 25, "servo_device": "suservo0"}
     },
 
     "suservo0": {
@@ -196,7 +209,7 @@ device_db = {
         "module": "artiq.coredevice.suservo",
         "class": "SUServo",
         "arguments": {
-            "channel": 24,
+            "channel": 26,
             "pgia_device": "spi_sampler0_pgia",
             "cpld_devices": ["urukul0_cpld", "urukul1_cpld"],
             "dds_devices": ["urukul0_dds", "urukul1_dds"],
@@ -207,33 +220,37 @@ device_db = {
         "type": "local",
         "module": "artiq.coredevice.spi2",
         "class": "SPIMaster",
-        "arguments": {"channel": 25}
+        "arguments": {"channel": 27}
     },
 
     "spi_urukul0": {
         "type": "local",
         "module": "artiq.coredevice.spi2",
         "class": "SPIMaster",
-        "arguments": {"channel": 26}
+        "arguments": {"channel": 28}
     },
     "urukul0_cpld": {
         "type": "local",
-        "module": "artiq.coredevice.urukul",
+        "module": "artiq.coredevice.suservo",
         "class": "CPLD",
         "arguments": {
             "spi_device": "spi_urukul0",
+            "io_update_device": "ttl_urukul0_io_update",
+            "sync_device": "clkgen_dds_sync_in",
             "refclk": 100e6,
             "clk_sel": 0
         }
     },
     "urukul0_dds": {
         "type": "local",
-        "module": "artiq.coredevice.ad9910",
+        "module": "artiq.coredevice.suservo",
         "class": "AD9910",
         "arguments": {
             "pll_n": 40,
             "chip_select": 3,
             "cpld_device": "urukul0_cpld",
+            "io_update_delay": 0,
+            "sync_delay_seed": -1,
         }
     },
 
@@ -241,26 +258,40 @@ device_db = {
         "type": "local",
         "module": "artiq.coredevice.spi2",
         "class": "SPIMaster",
-        "arguments": {"channel": 27}
+        "arguments": {"channel": 29}
     },
     "urukul1_cpld": {
         "type": "local",
-        "module": "artiq.coredevice.urukul",
+        "module": "artiq.coredevice.suservo",
         "class": "CPLD",
         "arguments": {
             "spi_device": "spi_urukul1",
+            "io_update_device": "ttl_urukul1_io_update",
+            "sync_device": "clkgen_dds_sync_in",
             "refclk": 100e6,
             "clk_sel": 0
         }
     },
     "urukul1_dds": {
         "type": "local",
-        "module": "artiq.coredevice.ad9910",
+        "module": "artiq.coredevice.suservo",
         "class": "AD9910",
         "arguments": {
             "pll_n": 40,
             "chip_select": 3,
             "cpld_device": "urukul1_cpld",
+            "io_update_delay": 0,
+            "sync_delay_seed": -1,
+        }
+    },
+
+    "clkgen_dds_sync_in": {
+        "type": "local",
+        "module": "artiq.coredevice.ttl",
+        "class": "TTLClockGen",
+        "arguments": {
+            "channel": 30,
+            "acc_width": 4
         }
     },
 
diff --git a/artiq/frontend/artiq_ddb_template.py b/artiq/frontend/artiq_ddb_template.py
index 5459756f..75eaadcb 100755
--- a/artiq/frontend/artiq_ddb_template.py
+++ b/artiq/frontend/artiq_ddb_template.py
@@ -424,6 +424,16 @@ class PeripheralManager:
         sampler_name = self.get_name("sampler")
         urukul_names = [self.get_name("urukul") for _ in range(2)]
         channel = count(0)
+        for urukul_name in urukul_names:
+            self.gen("""
+                device_db["ttl_{urukul_name}_io_update"] = {{
+                    "type": "local",
+                    "module": "artiq.coredevice.ttl",
+                    "class": "TTLOut",
+                    "arguments": {{"channel": 0x{ttl_channel:06x}}}
+                }}""",
+                urukul_name=urukul_name,
+                ttl_channel=rtio_offset+next(channel))
         for i in range(8):
             self.gen("""
                 device_db["{suservo_name}_ch{suservo_chn}"] = {{
@@ -472,17 +482,19 @@ class PeripheralManager:
                 }}
                 device_db["{urukul_name}_cpld"] = {{
                     "type": "local",
-                    "module": "artiq.coredevice.urukul",
+                    "module": "artiq.coredevice.suservo",
                     "class": "CPLD",
                     "arguments": {{
                         "spi_device": "spi_{urukul_name}",
+                        "io_update_device": "ttl_{urukul_name}_io_update",
+                        "sync_device": "clkgen_{suservo_name}_dds_sync_in",
                         "refclk": {refclk},
                         "clk_sel": {clk_sel}
                     }}
                 }}
                 device_db["{urukul_name}_dds"] = {{
                     "type": "local",
-                    "module": "artiq.coredevice.ad9910",
+                    "module": "artiq.coredevice.suservo",
                     "class": "AD9910",
                     "arguments": {{
                         "pll_n": {pll_n},
@@ -490,12 +502,25 @@ class PeripheralManager:
                         "cpld_device": "{urukul_name}_cpld"{pll_vco}
                     }}
                 }}""",
+                suservo_name=suservo_name,
                 urukul_name=urukul_name,
                 urukul_channel=rtio_offset+next(channel),
                 refclk=peripheral.get("refclk", self.master_description["rtio_frequency"]),
                 clk_sel=peripheral["clk_sel"],
                 pll_vco=",\n        \"pll_vco\": {}".format(pll_vco) if pll_vco is not None else "",
                 pll_n=peripheral["pll_n"])
+        self.gen("""
+            device_db["clkgen_{suservo_name}_dds_sync_in"] = {{
+                "type": "local",
+                "module": "artiq.coredevice.ttl",
+                "class": "TTLClockGen",
+                "arguments": {{
+                    "channel": 0x{clkgen_channel:06x},
+                    "acc_width": 4
+                }}
+            }}""",
+            suservo_name=suservo_name,
+            clkgen_channel=rtio_offset+next(channel))
         return next(channel)
 
     def process_zotino(self, rtio_offset, peripheral):
diff --git a/artiq/gateware/eem.py b/artiq/gateware/eem.py
index ce00f94f..93d01c07 100644
--- a/artiq/gateware/eem.py
+++ b/artiq/gateware/eem.py
@@ -6,6 +6,7 @@ from artiq.gateware import rtio
 from artiq.gateware.rtio.phy import spi2, ad53xx_monitor, dds, grabber
 from artiq.gateware.suservo import servo, pads as servo_pads
 from artiq.gateware.rtio.phy import servo as rtservo, fastino, phaser
+from artiq.gateware.rtio.phy import ttl_simple
 
 
 def _eem_signal(i):
@@ -545,7 +546,8 @@ class SUServo(_EEM):
     @classmethod
     def add_std(cls, target, eems_sampler, eems_urukul,
                 t_rtt=4, clk=1, shift=11, profile=5,
-                iostandard=default_iostandard):
+                sync_gen_cls=ttl_simple.ClockGen,
+                iostandard=default_iostandard, sysclk_per_clk=8):
         """Add a 8-channel Sampler-Urukul Servo
 
         :param t_rtt: upper estimate for clock round-trip propagation time from
@@ -561,6 +563,8 @@ class SUServo(_EEM):
             (default: 11)
         :param profile: log2 of the number of profiles for each DDS channel
             (default: 5)
+        :param sysclk_per_clk: DDS "sysclk" (4*refclk = 1GHz typ.) cycles per
+            FPGA "sys" clock (125MHz typ.) cycle (default: 8)
         """
         cls.add_extension(
             target, *(eems_sampler + sum(eems_urukul, [])),
@@ -572,6 +576,8 @@ class SUServo(_EEM):
         urukul_pads = servo_pads.UrukulPads(
             target.platform, *eem_urukul)
         target.submodules += sampler_pads, urukul_pads
+        target.rtio_channels.extend(
+            rtio.Channel.from_phy(phy) for phy in urukul_pads.io_update_phys)
         # timings in units of RTIO coarse period
         adc_p = servo.ADCParams(width=16, channels=8, lanes=4, t_cnvh=4,
                                 # account for SCK DDR to CONV latency
@@ -579,19 +585,20 @@ class SUServo(_EEM):
                                 t_conv=57 - 4, t_rtt=t_rtt + 4)
         iir_p = servo.IIRWidths(state=25, coeff=18, adc=16, asf=14, word=16,
                                 accu=48, shift=shift, profile=profile, dly=8)
-        dds_p = servo.DDSParams(width=8 + 32 + 16 + 16,
-                                channels=4 * len(eem_urukul), clk=clk)
+        dds_p = servo.DDSParams(width=8 + 32 + 16 + 16, sysclk_per_clk=sysclk_per_clk,
+                                channels=4*len(eem_urukul), clk=clk)
         su = servo.Servo(sampler_pads, urukul_pads, adc_p, iir_p, dds_p)
         su = ClockDomainsRenamer("rio_phy")(su)
         # explicitly name the servo submodule to enable the migen namer to derive
         # a name for the adc return clock domain
         setattr(target.submodules, "suservo_eem{}".format(eems_sampler[0]), su)
 
-        ctrls = [rtservo.RTServoCtrl(ctrl) for ctrl in su.iir.ctrl]
+        ctrls = [rtservo.RTServoCtrl(ctrl, ctrl_reftime)
+                 for ctrl, ctrl_reftime in zip(su.iir.ctrl, su.iir.ctrl_reftime)]
         target.submodules += ctrls
         target.rtio_channels.extend(
             rtio.Channel.from_phy(ctrl) for ctrl in ctrls)
-        mem = rtservo.RTServoMem(iir_p, su)
+        mem = rtservo.RTServoMem(iir_p, su, urukul_pads.io_update_phys)
         target.submodules += mem
         target.rtio_channels.append(rtio.Channel.from_phy(mem, ififo_depth=4))
 
@@ -601,19 +608,20 @@ class SUServo(_EEM):
         target.submodules += phy
         target.rtio_channels.append(rtio.Channel.from_phy(phy, ififo_depth=4))
 
-        dds_sync = Signal(reset=0)
-        for j, eem_urukuli in enumerate(eem_urukul):
-            # connect quad-SPI
+        for eem_urukuli in eem_urukul:
             spi_p, spi_n = (
                 target.platform.request("{}_spi_p".format(eem_urukuli)),
                 target.platform.request("{}_spi_n".format(eem_urukuli)))
             phy = spi2.SPIMaster(spi_p, spi_n)
             target.submodules += phy
             target.rtio_channels.append(rtio.Channel.from_phy(phy, ififo_depth=4))
-            # connect `reset_sync_in`
-            pads = target.platform.request("{}_dds_reset_sync_in".format(eem_urukuli))
-            target.specials += DifferentialOutput(dds_sync, pads.p, pads.n)
-            # connect RF switches
+
+        if sync_gen_cls is not None:  # AD9910 variant and SYNC_IN from EEM
+            phy = sync_gen_cls(urukul_pads.dds_reset_sync_in, ftw_width=4)
+            target.submodules += phy
+            target.rtio_channels.append(rtio.Channel.from_phy(phy))
+
+        for j, eem_urukuli in enumerate(eem_urukul):
             for i, signal in enumerate("sw0 sw1 sw2 sw3".split()):
                 pads = target.platform.request("{}_{}".format(eem_urukuli, signal))
                 target.specials += DifferentialOutput(
diff --git a/artiq/gateware/rtio/phy/servo.py b/artiq/gateware/rtio/phy/servo.py
index 379e7ba3..246208c8 100644
--- a/artiq/gateware/rtio/phy/servo.py
+++ b/artiq/gateware/rtio/phy/servo.py
@@ -1,25 +1,32 @@
 from migen import *
-
 from artiq.gateware.rtio import rtlink
 
 
 class RTServoCtrl(Module):
     """Per channel RTIO control interface"""
-    def __init__(self, ctrl):
+    def __init__(self, ctrl, ctrl_reftime):
         self.rtlink = rtlink.Interface(
-            rtlink.OInterface(len(ctrl.profile) + 2))
+            rtlink.OInterface(
+                data_width=max(len(ctrl.profile) + 3,
+                               len(ctrl_reftime.sysclks_fine)),
+                address_width=1)
+            )
 
         # # #
 
+        sel_ref = self.rtlink.o.address[0]
         self.comb += [
-                ctrl.stb.eq(self.rtlink.o.stb),
-                self.rtlink.o.busy.eq(0)
+                ctrl.stb.eq(self.rtlink.o.stb & ~sel_ref),
+                self.rtlink.o.busy.eq(0),
+                ctrl_reftime.stb.eq(self.rtlink.o.stb & sel_ref),
         ]
+        ctrl_cases = {
+            0: Cat(ctrl.en_out, ctrl.en_iir, ctrl.en_pt, ctrl.profile).eq(
+                            self.rtlink.o.data),
+            1: ctrl_reftime.sysclks_fine.eq(self.rtlink.o.data),
+        }
         self.sync.rio_phy += [
-                If(self.rtlink.o.stb,
-                    Cat(ctrl.en_out, ctrl.en_iir, ctrl.profile).eq(
-                            self.rtlink.o.data)
-                )
+                If(self.rtlink.o.stb, Case(self.rtlink.o.address, ctrl_cases))
         ]
 
 
@@ -53,7 +60,7 @@ class RTServoMem(Module):
                  destination    |  sel  |  sel_coeff   |
                 ----------------|-------|--------------|
                  IIR coeff mem  |   -   |       1      |
-                 Reserved       |   1   |       0      |
+                 DDS delay mem  |   1   |       0      |
                  IIR state mem  |   2   |       0      |
                  config (write) |   3   |       0      |
                  status (read)  |   3   |       0      |
@@ -72,7 +79,7 @@ class RTServoMem(Module):
     (instead of having to decide whether to sign- or zero-extend per address), as
     all unsigned values are less wide than w.coeff.
     """
-    def __init__(self, w, servo):
+    def __init__(self, w, servo, io_update_phys):
         m_coeff = servo.iir.m_coeff.get_port(write_capable=True,
                 mode=READ_FIRST,
                 we_granularity=w.coeff, clock_domain="rio")
@@ -110,7 +117,7 @@ class RTServoMem(Module):
         # # #
 
         config = Signal(w.coeff, reset=0)
-        status = Signal(8 + len(servo.iir.ctrl))
+        status = Signal(len(self.rtlink.i.data))
         pad = Signal(6)
         assert len(status) <= len(self.rtlink.i.data)
         self.comb += [
@@ -124,7 +131,7 @@ class RTServoMem(Module):
                 1 +  # sel_coeff
                 1 +  # high_coeff
                 len(m_coeff.adr))
-        # ensure that we can fit config/status into the state address space
+        # ensure that we can fit config/io_dly/status into the state address space
         assert len(self.rtlink.o.address) + len(self.rtlink.o.data) - w.coeff >= (
                 1 +  # we
                 1 +  # sel_coeff
@@ -172,6 +179,11 @@ class RTServoMem(Module):
                     read_high.eq(high_coeff),
                 )
         ]
+
+        # I/O update alignment delays
+        ioup_dlys = Cat(*[phy.fine_ts for phy in io_update_phys])
+        assert w.coeff >= len(ioup_dlys)
+
         self.sync.rio_phy += [
                 If(self.rtlink.o.stb & we & (sel == 3),
                     config.eq(self.rtlink.o.data)
@@ -179,11 +191,15 @@ class RTServoMem(Module):
                 If(read & (read_sel == 3),
                     [_.clip.eq(0) for _ in servo.iir.ctrl]
                 ),
+                If(self.rtlink.o.stb & we & (sel == 1),
+                    ioup_dlys.eq(self.rtlink.o.data)
+                ),
         ]
+
         # read return value by destination
         read_acts = Array([
                 Mux(read_high, m_coeff.dat_r[w.coeff:], m_coeff.dat_r[:w.coeff]),
-                0,
+                ioup_dlys,
                 m_state.dat_r[w.state - w.coeff:],
                 status
         ])
diff --git a/artiq/gateware/suservo/dds_ser.py b/artiq/gateware/suservo/dds_ser.py
index 38d1f6d9..cdccfcc9 100644
--- a/artiq/gateware/suservo/dds_ser.py
+++ b/artiq/gateware/suservo/dds_ser.py
@@ -1,4 +1,5 @@
 import logging
+from collections import namedtuple
 
 from migen import *
 
@@ -6,11 +7,11 @@ from artiq.coredevice.urukul import DEFAULT_PROFILE
 
 from . import spi
 
-
 logger = logging.getLogger(__name__)
 
-
-DDSParams = spi.SPIParams
+DDSParams = namedtuple("DDSParams", spi.SPIParams._fields + (
+    "sysclk_per_clk",  # DDS_CLK per FPGA system clock
+))
 
 
 class DDS(spi.SPISimple):
diff --git a/artiq/gateware/suservo/iir.py b/artiq/gateware/suservo/iir.py
index 6b975b75..3fad77a6 100644
--- a/artiq/gateware/suservo/iir.py
+++ b/artiq/gateware/suservo/iir.py
@@ -1,6 +1,7 @@
 from collections import namedtuple
 import logging
 from migen import *
+from migen.genlib.coding import Encoder
 
 logger = logging.getLogger(__name__)
 
@@ -98,14 +99,14 @@ class IIR(Module):
     This module implements a multi-channel IIR (infinite impulse response)
     filter processor optimized for synthesis on FPGAs.
 
-    The module is parametrized by passing a ``IIRWidths()`` object which
-    will be abbreviated W here.
+    The module is parametrized by passing an ``IIRWidths()`` object (W) and
+    the input/output parameter objects, abbreviated W_I and W_O here.
 
-    It reads 1 << W.channels input channels (typically from an ADC)
+    It reads W_I.channels input channels (typically from an ADC)
     and on each iteration processes the data using a first-order IIR filter.
     At the end of the cycle each the output of the filter together with
     additional data (typically frequency tunning word and phase offset word
-    for a DDS) are presented at the 1 << W.channels outputs of the module.
+    for a DDS) are presented at the W_O.channels outputs of the module.
 
     Profile memory
     ==============
@@ -144,10 +145,10 @@ class IIR(Module):
     -------------
 
     The state memory holds all Y1 values (IIR processor outputs) for all
-    profiles of all channels in the lower half (1 << W.profile + W.channel
-    addresses) and the pairs of old and new ADC input values X1, and X0,
-    in the upper half (1 << W.channel addresses). Each memory location is
-    W.state bits wide.
+    profiles of all channels in the lower half
+    ((1 << W.profile)*W_O.channels addresses), and the pairs of old and new
+    ADC input values X1, and X0, in the upper half (W_I.channels addresses).
+    Each memory location is W.state bits wide.
 
     Real-time control
     =================
@@ -156,15 +157,16 @@ class IIR(Module):
 
         * The active profile, PROFILE
         * Whether to perform IIR filter iterations, EN_IIR
+        * Whether to track the DDS phase coherently, EN_PT
         * The RF switch state enabling output from the channel, EN_OUT
 
     Delayed IIR processing
     ======================
 
-    The IIR filter iterations on a given channel are only performed all of the
-    following are true:
+    The IIR filter iterations on a given channel are only performed if all of
+    the following are true:
 
-        * PROFILE, EN_IIR, EN_OUT have not been updated in the within the
+        * PROFILE, EN_IIR, EN_OUT have not been updated within the
           last DLY cycles
         * EN_IIR is asserted
         * EN_OUT is asserted
@@ -175,9 +177,8 @@ class IIR(Module):
     Typical design at the DSP level. This does not include the description of
     the pipelining or the overall latency involved.
 
-    IIRWidths(state=25, coeff=18, adc=16,
-        asf=14, word=16, accu=48, shift=11,
-        channel=3, profile=5, dly=8)
+    IIRWidths(state=25, coeff=18, adc=16, asf=14,
+        word=16, accu=48, shift=11, profile=5, dly=8)
 
     X0 = ADC * 2^(25 - 1 - 16)
     X1 = X0 delayed by one cycle
@@ -212,13 +213,23 @@ class IIR(Module):
     --/--: signal with a given bit width always includes a sign bit
     -->--: flow is to the right and down unless otherwise indicated
     """
-    def __init__(self, w, w_i, w_o):
+    def __init__(self, w, w_i, w_o, t_cycle):
         for v in (w, w_i, w_o):
             for i, j in enumerate(v):
                 assert j > 0, (i, j, v)
         assert w.word <= w.coeff  # same memory
         assert w.state + w.coeff + 3 <= w.accu
 
+        # Reference counter for coherent phase tracking (we assume this doesn't
+        # roll over – a good assumption, as the period is, for a typical clock
+        # frequency, 2^48 / 125 MHz = ~26 days).
+        self.t_running = Signal(48, reset_less=True)
+
+        # If true, internal DDS phase tracking state is reset, matching DDS
+        # chips with phase cleared (and zero FTW) before the start of the
+        # iteration. Automatically reset at the end of the iteration.
+        self.reset_dds_phase = Signal()
+
         # m_coeff of active profiles should only be accessed externally during
         # ~processing
         self.specials.m_coeff = Memory(
@@ -235,9 +246,24 @@ class IIR(Module):
                 ("profile", w.profile),
                 ("en_out", 1),
                 ("en_iir", 1),
+                ("en_pt", 1),
                 ("clip", 1),
                 ("stb", 1)])
                 for i in range(w_o.channels)]
+        # Running sum of the FTWs written so far for each output channel, used
+        # to reconstruct ("shadow") the phase accumulated inside the DDS.
+        self.specials.m_accum_ftw = Memory(
+                width=2 * w.word,
+                depth=w_o.channels)
+        # ctrl_reftime should only be updated synchronously
+        self.ctrl_reftime = [Record([
+                ("sysclks_fine", bits_for(w_o.sysclk_per_clk - 1)),
+                ("stb", 1)])
+                for i in range(w_o.channels)]
+        # Reference time for each output channel.
+        self.specials.m_t_ref = Memory(
+                width=len(self.t_running),
+                depth=w_o.channels)
         # only update during ~loading
         self.adc = [Signal((w.adc, True), reset_less=True)
                 for i in range(w_i.channels)]
@@ -264,8 +290,15 @@ class IIR(Module):
         profiles = Array([ch.profile for ch in self.ctrl])
         en_outs = Array([ch.en_out for ch in self.ctrl])
         en_iirs = Array([ch.en_iir for ch in self.ctrl])
+        en_pts = Array([ch.en_pt for ch in self.ctrl])
         clips = Array([ch.clip for ch in self.ctrl])
 
+        # Sample of the reference counter at the start of the current iteration,
+        # such that a common reference time is used for phase calculations
+        # across all channels, in DDS sysclk units.
+        sysclks_to_iter_start = Signal(
+            len(self.t_running) + bits_for(w_o.sysclk_per_clk - 1))
+
         # Main state machine sequencing the steps of each servo iteration. The
         # module IDLEs until self.start is asserted, and then runs through LOAD,
         # PROCESS and SHIFT in order (see description of corresponding flags
@@ -292,6 +325,7 @@ class IIR(Module):
                 self.done.eq(1),
                 t_current_step_clr.eq(1),
                 If(self.start,
+                    NextValue(sysclks_to_iter_start, self.t_running * w_o.sysclk_per_clk),
                     NextState("LOAD")
                 )
         )
@@ -310,6 +344,7 @@ class IIR(Module):
                 If(stages_active == 0,
                     t_current_step_clr.eq(1),
                     NextState("SHIFT"),
+                    NextValue(self.reset_dds_phase, 0)
                 )
         )
         fsm.act("SHIFT",
@@ -479,25 +514,81 @@ class IIR(Module):
             }),
         ]
 
+        # Update coarse reference time from t_running upon ctrl_reftime strobe
+        ref_stb_encoder = Encoder(w_o.channels)
+        m_t_ref_stb = self.m_t_ref.get_port(write_capable=True)
+        self.specials += m_t_ref_stb
+        self.submodules += ref_stb_encoder
+        self.comb += [
+                ref_stb_encoder.i.eq(Cat([ch.stb for ch in self.ctrl_reftime])),
+                m_t_ref_stb.adr.eq(ref_stb_encoder.o),
+                m_t_ref_stb.we.eq(~ref_stb_encoder.n),
+                m_t_ref_stb.dat_w.eq(self.t_running),
+        ]
+
         #
-        # Update DDS profile with FTW/POW/ASF
-        # Stage 0 loads the POW, stage 1 the FTW, and stage 2 writes
-        # the ASF computed by the IIR filter.
+        # Update DDS profile with FTW/POW/ASF (including phase tracking, if
+        # enabled). Stage 0 loads the POW, stage 1 the FTW, and stage 2 writes
+        # the ASF computed by the IIR filter (and adds any phase correction).
         #
 
         # muxing
         ddss = Array(self.dds)
+        sysclks_ref_fine = Array([ch.sysclks_fine for ch in self.ctrl_reftime])
+
+        # registered copy of FTW on channel[1]
+        current_ftw = Signal(2 * w.word, reset_less=True)
+        # target effective DDS phase (accumulator + POW) at the coming io_update
+        target_dds_phase = Signal.like(current_ftw)
+        # DDS-internal phase accumulated until the coming io_update
+        accum_dds_phase = Signal.like(current_ftw)
+        # correction to add to the bare POW to yield a phase-coherent DDS output
+        correcting_pow = Signal(w.word, reset_less=True)
+        # sum of all FTWs on channel[1], updated with current FTW during the
+        # calculation
+        accum_ftw = Signal.like(current_ftw)
+        # sum of previous FTWs on channel[1] (or 0 on phase coherence reference
+        # reset)
+        prev_accum_ftw = Signal.like(current_ftw)
+        # time since reference time at coming io_update in DDS sysclk units
+        sysclks_to_ref = Signal.like(sysclks_to_iter_start)
+        # t_ref in DDS sysclk units
+        sysclks_ref_to_iter_start = Signal.like(sysclks_to_iter_start)
+
+        m_t_ref = self.m_t_ref.get_port()
+        m_accum_ftw = self.m_accum_ftw.get_port(write_capable=True, mode=READ_FIRST)
+        self.specials += m_accum_ftw, m_t_ref
+        prev_accum_ftw = Signal.like(accum_ftw)
+        self.comb += [
+            prev_accum_ftw.eq(Mux(self.reset_dds_phase, 0, m_accum_ftw.dat_r)),
+            m_accum_ftw.adr.eq(channel[1]),
+            m_accum_ftw.we.eq((pipeline_phase == 3) & stages_active[1]),
+            m_accum_ftw.dat_w.eq(accum_ftw),
+            m_t_ref.adr.eq(channel[0]),
+        ]
 
+        sysclks_per_iter = t_cycle * w_o.sysclk_per_clk
         self.sync += [
             Case(pipeline_phase, {
                 0: [
                     If(stages_active[1],
                         ddss[channel[1]][:w.word].eq(m_coeff.dat_r),  # ftw0
+                        current_ftw[:w.word].eq(m_coeff.dat_r),
+                        sysclks_ref_to_iter_start.eq(m_t_ref.dat_r * w_o.sysclk_per_clk),
+                    ),
+                    If(stages_active[2] & en_pts[channel[2]],
+                        # add pow correction if phase tracking enabled
+                        ddss[channel[2]][2*w.word:3*w.word].eq(
+                            ddss[channel[2]][2*w.word:3*w.word] + correcting_pow),
                     ),
                 ],
                 1: [
                     If(stages_active[1],
                         ddss[channel[1]][w.word:2 * w.word].eq(m_coeff.dat_r),  # ftw1
+                        current_ftw[w.word:].eq(m_coeff.dat_r),
+                        sysclks_to_ref.eq(sysclks_to_iter_start - (
+                            sysclks_ref_to_iter_start + sysclks_ref_fine[channel[1]])),
+                        accum_dds_phase.eq(prev_accum_ftw * sysclks_per_iter),
                     ),
                     If(stages_active[2],
                         ddss[channel[2]][3*w.word:].eq(  # asf
@@ -506,10 +597,21 @@ class IIR(Module):
                 ],
                 2: [
                     If(stages_active[0],
-                        ddss[channel[0]][2*w.word:3*w.word].eq(m_coeff.dat_r),  # pow
+                        # Load bare POW from profile memory.
+                        ddss[channel[0]][2*w.word:3*w.word].eq(m_coeff.dat_r),
+                    ),
+                    If(stages_active[1],
+                        target_dds_phase.eq(current_ftw * sysclks_to_ref),
+                        accum_ftw.eq(prev_accum_ftw + current_ftw),
                     ),
                 ],
                 3: [
+                    If(stages_active[1],
+                        # Prepare most-significant word to add to POW from
+                        # profile for phase tracking.
+                        correcting_pow.eq(
+                            (target_dds_phase - accum_dds_phase)[w.word:]),
+                    ),
                 ],
             }),
         ]
@@ -518,6 +620,15 @@ class IIR(Module):
         self.widths = w
         self.widths_adc = w_i
         self.widths_dds = w_o
+        self.t_cycle = t_cycle
+        self._state = t_current_step
+        self._stages = stages_active
+        self._dt_start = sysclks_to_iter_start
+        self._sysclks_to_ref = sysclks_to_ref
+        self._sysclks_ref_to_iter_start = sysclks_ref_to_iter_start
+        self._sysclks_ref_fine = sysclks_ref_fine
+        self._ph_acc = accum_dds_phase
+        self._ph_coh = target_dds_phase
         self._dlys = dlys
 
     def _coeff(self, channel, profile, coeff):
@@ -598,6 +709,14 @@ class IIR(Module):
             raise ValueError("no such state", coeff)
         return signed(val, w.state)
 
+    def get_accum_ftw(self, channel):
+        """Simulation helper: read ``m_accum_ftw[channel]``."""
+        return (yield self.m_accum_ftw[channel])
+
+    def get_t_ref(self, channel):
+        """Simulation helper: read ``m_t_ref[channel]``."""
+        return (yield self.m_t_ref[channel])
+
     def fast_iter(self):
         """Perform a single processing iteration."""
         assert (yield self.done)
@@ -633,18 +752,26 @@ class IIR(Module):
             v_adc = signed((yield self.adc[i]), w.adc)
             x0 = yield from self.get_state(i, coeff="x0")
             x0s.append(x0)
-            assert v_adc << (w.state - w.adc - 1) == x0, (hex(v_adc), hex(x0))
             logger.debug("adc[%d] adc=%x x0=%x", i, v_adc, x0)
+            assert v_adc << (w.state - w.adc - 1) == x0, (hex(v_adc), hex(x0))
 
         data = []
         # predict output
         for i in range(w_o.channels):
+            t0 = yield self._dt_start
+            dds_ftw_accu = yield from self.get_accum_ftw(i)
+            sysclks_ref = (yield from self.get_t_ref(i)) * self.widths_dds.sysclk_per_clk\
+                           + (yield self.ctrl_reftime[i].sysclks_fine)
+            logger.debug("dt_start=%d dt_ref=%d t_cycle=%d ftw_accu=%#x",
+                         t0, sysclks_ref, self.t_cycle, dds_ftw_accu)
+
             j = yield self.ctrl[i].profile
             en_iir = yield self.ctrl[i].en_iir
             en_out = yield self.ctrl[i].en_out
+            en_pt = yield self.ctrl[i].en_pt
             dly_i = yield self._dlys[i]
-            logger.debug("ctrl[%d] profile=%d en_iir=%d en_out=%d dly=%d",
-                    i, j, en_iir, en_out, dly_i)
+            logger.debug("ctrl[%d] profile=%d en_iir=%d en_out=%d en_pt=%d dly=%d",
+                    i, j, en_iir, en_out, en_pt, dly_i)
 
             cfg = yield from self.get_coeff(i, j, "cfg")
             k_j = cfg & ((1 << bits_for(w_i.channels - 1)) - 1)
@@ -664,9 +791,13 @@ class IIR(Module):
 
             ftw0 = yield from self.get_coeff(i, j, "ftw0")
             ftw1 = yield from self.get_coeff(i, j, "ftw1")
-            pow = yield from self.get_coeff(i, j, "pow")
-            logger.debug("dds[%d,%d] ftw0=%#x ftw1=%#x pow=%#x",
-                    i, j, ftw0, ftw1, pow)
+            _pow = yield from self.get_coeff(i, j, "pow")
+            ph_coh = ((ftw0 | (ftw1 << w.word)) * (t0 - sysclks_ref))
+            ph_accu = dds_ftw_accu * self.t_cycle * self.widths_dds.sysclk_per_clk
+            ph = ph_coh - ph_accu
+            pow = (_pow + (ph >> w.word)) & 0xffff if en_pt else _pow
+            logger.debug("dds[%d,%d] ftw0=%#x ftw1=%#x ph_coh=%#x _pow=%#x pow=%#x",
+                    i, j, ftw0, ftw1, ph_coh, _pow, pow)
 
             y1 = yield from self.get_state(i, j, "y1")
             x1 = yield from self.get_state(k_j, coeff="x1")
@@ -688,6 +819,10 @@ class IIR(Module):
         # wait for output
         assert (yield self.processing)
         while (yield self.processing):
+            logger.debug("sysclks_to_ref=%d sysclks_ref_to_iter_start=%d",
+                         (yield self._sysclks_to_ref),
+                         (yield self._sysclks_ref_to_iter_start))
+            # logger.debug("%d %d %d %d", *[x for x in (yield self._sysclks_ref_fine)])
             yield
 
         assert (yield self.shifting)
diff --git a/artiq/gateware/suservo/pads.py b/artiq/gateware/suservo/pads.py
index 778f05d0..bdae8ee3 100644
--- a/artiq/gateware/suservo/pads.py
+++ b/artiq/gateware/suservo/pads.py
@@ -1,5 +1,7 @@
 from migen import *
 from migen.genlib.io import DifferentialOutput, DifferentialInput, DDROutput
+from artiq.gateware.rtio.phy import ttl_serdes_7series, ttl_serdes_generic
+from artiq.gateware.rtio import rtlink
 
 
 class SamplerPads(Module):
@@ -57,20 +59,79 @@ class SamplerPads(Module):
                 clk=dp.clkout, port=sdop)
 
 
+class OutIoUpdate_8X(Module):
+    def __init__(self, pad):
+        serdes = ttl_serdes_7series._OSERDESE2_8X()
+        self.submodules += serdes
+
+        self.passthrough = Signal()  # 1: SERDES driven by self.rtlink; 0: by data/fine_ts
+        self.data = Signal()  # output level used while passthrough is 0
+        self.fine_ts = Signal(3)  # fine timestamp used while passthrough is 0
+
+        self.rtlink = rtlink.Interface(
+            rtlink.OInterface(1, fine_ts_width=3))
+        self.probes = [serdes.o[-1]]
+        override_en = Signal()
+        override_o = Signal()
+        self.overrides = [override_en, override_o]
+
+        # # #
+
+        self.specials += Instance("IOBUFDS",
+                                  i_I=serdes.ser_out,
+                                  i_T=serdes.t_out,
+                                  io_IO=pad.p,
+                                  io_IOB=pad.n)
+
+        # Just strobe always in non-passthrough mode, as self.data is supposed
+        # to be always valid.
+        self.submodules += ttl_serdes_generic._SerdesDriver(
+            serdes.o,
+            Mux(self.passthrough, self.rtlink.o.stb, 1),
+            Mux(self.passthrough, self.rtlink.o.data, self.data),
+            Mux(self.passthrough, self.rtlink.o.fine_ts, self.fine_ts),
+            override_en, override_o)
+
+        self.comb += self.rtlink.o.busy.eq(~self.passthrough)
+
+
 class UrukulPads(Module):
     def __init__(self, platform, *eems):
         spip, spin = [[
                 platform.request("{}_qspi_{}".format(eem, pol), 0)
                 for eem in eems] for pol in "pn"]
-        ioup = [platform.request("{}_io_update".format(eem), 0)
-                for eem in eems]
+
         self.cs_n = Signal()
         self.clk = Signal()
         self.io_update = Signal()
+        self.passthrough = Signal()
+        self.dds_reset_sync_in = Signal(reset=0)  # sync_in phy (one for all)
+
+        # # #
+
+        self.io_update_phys = []
+        for eem in eems:
+            phy = OutIoUpdate_8X(platform.request("{}_io_update".format(eem), 0))
+            self.io_update_phys.append(phy)
+            setattr(self.submodules, "{}_io_update_phy".format(eem), phy)
+            self.comb += [
+                phy.data.eq(self.io_update),
+                phy.passthrough.eq(self.passthrough),
+            ]
+
+            sync_in_pads = platform.request("{}_dds_reset_sync_in".format(eem))
+            sync_in_r = Signal()
+            self.sync.rio_phy += sync_in_r.eq(self.dds_reset_sync_in)
+            sync_in_o = Signal()
+            self.specials += Instance("ODDR",
+                p_DDR_CLK_EDGE="SAME_EDGE",
+                i_C=ClockSignal("rio_phy"), i_CE=1, i_S=0, i_R=0,
+                i_D1=sync_in_r, i_D2=sync_in_r, o_Q=sync_in_o)
+            self.specials += DifferentialOutput(sync_in_o, sync_in_pads.p, sync_in_pads.n)
+
         self.specials += [(
                 DifferentialOutput(~self.cs_n, spip[i].cs, spin[i].cs),
-                DifferentialOutput(self.clk, spip[i].clk, spin[i].clk),
-                DifferentialOutput(self.io_update, ioup[i].p, ioup[i].n))
+                DifferentialOutput(self.clk, spip[i].clk, spin[i].clk))
                 for i in range(len(eems))]
         for i in range(4 * len(eems)):
             mosi = Signal()
diff --git a/artiq/gateware/suservo/servo.py b/artiq/gateware/suservo/servo.py
index 59529320..15d31027 100644
--- a/artiq/gateware/suservo/servo.py
+++ b/artiq/gateware/suservo/servo.py
@@ -42,7 +42,7 @@ class Servo(Module):
         assert t_iir + 2*adc_p.channels < t_cycle, "need shifting time"
 
         self.submodules.adc = ADC(adc_pads, adc_p)
-        self.submodules.iir = IIR(iir_p, adc_p, dds_p)
+        self.submodules.iir = IIR(iir_p, adc_p, dds_p, t_cycle)
         self.submodules.dds = DDS(dds_pads, dds_p)
 
         # adc channels are reversed on Sampler
@@ -63,7 +63,6 @@ class Servo(Module):
         assert t_restart > 1
         cnt = Signal(max=t_restart)
         cnt_done = Signal()
-        active = Signal(3)
 
         # Indicates whether different steps (0: ADC, 1: IIR, 2: DDS) are
         # currently active (exposed for simulation only), with each bit being
@@ -71,6 +70,8 @@ class Servo(Module):
         # timing details of the different steps, any number can be concurrently
         # active (e.g. ADC read from iteration n, IIR computation from iteration
         # n - 1, and DDS write from iteration n - 2).
+        active = Signal(3)
+        self._active = active  # Exposed for debugging only.
 
         # Asserted once per cycle when the DDS write has been completed.
         self.done = Signal()
@@ -95,6 +96,17 @@ class Servo(Module):
                     cnt.eq(t_restart - 1)
                 )
         ]
+
+        # Count number of cycles since the servo was last started from idle.
+        self.sync += If(active == 0,
+            self.iir.t_running.eq(0),
+            self.iir.reset_dds_phase.eq(1)
+        ).Else(
+            self.iir.t_running.eq(self.iir.t_running + 1)
+        )
+
+        self.sync += dds_pads.passthrough.eq(active == 0)
+
         self.comb += [
                 cnt_done.eq(cnt == 0),
                 self.adc.start.eq(self.start & cnt_done),
diff --git a/artiq/gateware/test/suservo/__init__.py b/artiq/gateware/test/suservo/__init__.py
index e69de29b..7a1df77a 100644
--- a/artiq/gateware/test/suservo/__init__.py
+++ b/artiq/gateware/test/suservo/__init__.py
@@ -0,0 +1,10 @@
+"""Gateware implementation of the Sampler-Urukul (AD9910) DDS amplitude servo.
+
+General conventions:
+
+ - ``t_...`` signals and constants refer to time spans measured in the gateware
+   module's default clock (typically a 125 MHz RTIO clock).
+ - ``start`` signals cause modules to proceed with the next servo iteration iff
+   they are currently idle (i.e. their value is irrelevant while the module is
+   busy, so they are not necessarily one-clock-period strobes).
+"""
diff --git a/artiq/gateware/test/suservo/test_dds.py b/artiq/gateware/test/suservo/test_dds.py
index a666f14c..d9a81675 100644
--- a/artiq/gateware/test/suservo/test_dds.py
+++ b/artiq/gateware/test/suservo/test_dds.py
@@ -5,6 +5,9 @@ from migen import *
 
 from artiq.gateware.suservo.dds_ser import DDSParams, DDS
 
+class OutIoUpdateTB(Module):  # testbench stand-in exposing only fine_ts
+    def __init__(self):
+        self.fine_ts = Signal(3)
 
 class TB(Module):
     def __init__(self, p):
@@ -15,6 +18,12 @@ class TB(Module):
             setattr(self, "mosi{}".format(i), m)
         self.miso = Signal()
         self.io_update = Signal()
+        self.passthrough = Signal()
+
+        self.io_update_phys = []
+        for i in range(p.channels//4):
+            phy = OutIoUpdateTB()
+            self.io_update_phys.append(phy)
 
         clk0 = Signal()
         self.sync += clk0.eq(self.clk)
@@ -23,16 +32,19 @@ class TB(Module):
 
         self.ddss = []
         for i in range(p.channels):
-            dds = Record([("ftw", 32), ("pow", 16), ("asf", 16), ("cmd", 8)])
-            sr = Signal(len(dds))
+            dds = Record([("ftw", 32), ("pow", 16), ("asf", 16),
+                          ("cmd", 8), ("accu", 32), ("phase", 19)])
+            sr = Signal(32 + 16 + 16 + 8)
             self.sync += [
+                    dds.accu.eq(dds.accu + p.sysclk_per_clk * dds.ftw),
                     If(~self.cs_n & sample,
                         sr.eq(Cat(self.mosi[i], sr))
                     ),
                     If(self.io_update,
-                        dds.raw_bits().eq(sr)
+                        dds.raw_bits()[:len(sr)].eq(sr)
                     )
             ]
+            self.comb += dds.phase.eq((dds.pow << 3) + (dds.accu >> 13))
             self.ddss.append(dds)
 
     @passive
@@ -55,7 +67,7 @@ class TB(Module):
 
 
 def main():
-    p = DDSParams(channels=4, width=8 + 32 + 16 + 16, clk=1)
+    p = DDSParams(channels=4, width=8 + 32 + 16 + 16, clk=1, sysclk_per_clk=8)
     tb = TB(p)
     dds = DDS(tb, p)
     tb.submodules += dds
diff --git a/artiq/gateware/test/suservo/test_iir.py b/artiq/gateware/test/suservo/test_iir.py
index 919e7a6b..ab8a9a4a 100644
--- a/artiq/gateware/test/suservo/test_iir.py
+++ b/artiq/gateware/test/suservo/test_iir.py
@@ -2,48 +2,67 @@ import logging
 import unittest
 
 from migen import *
-from artiq.gateware.suservo import iir
+from artiq.gateware.suservo import servo
+from collections import namedtuple
 
+logger = logging.getLogger(__name__)
+
+ADCParamsSim = namedtuple("ADCParams", ["channels"])
+DDSParamsSim = namedtuple("DDSParams", ["channels", "sysclk_per_clk"])
 
 def main():
-    w_kasli = iir.IIRWidths(state=25, coeff=18, adc=16,
-            asf=14, word=16, accu=48, shift=11,
-            channel=3, profile=5, dly=8)
-    w = iir.IIRWidths(state=17, coeff=16, adc=16,
-            asf=14, word=16, accu=48, shift=11,
-            channel=2, profile=1, dly=8)
+    w_kasli = servo.IIRWidths(state=25, coeff=18, adc=16, asf=14,
+            word=16, accu=48, shift=11, profile=5, dly=8)
+    p_adc = ADCParamsSim(channels=8)
+    p_dds = DDSParamsSim(channels=4, sysclk_per_clk=8)
+    w = servo.IIRWidths(state=17, coeff=16, adc=16, asf=14,
+            word=16, accu=48, shift=11, profile=2, dly=8)
 
+    t_iir = p_adc.channels + 4*p_dds.channels + 8 + 1  # passed to IIR as t_cycle
     def run(dut):
+        yield dut.t_running.eq(0)  # IIR is tested standalone: drive t_running by hand
         for i, ch in enumerate(dut.adc):
             yield ch.eq(i)
         for i, ch in enumerate(dut.ctrl):
             yield ch.en_iir.eq(1)
             yield ch.en_out.eq(1)
             yield ch.profile.eq(i)
-        for i in range(1 << w.channel):
+            yield ch.en_pt.eq(i)
+        for i, ch in enumerate(dut.ctrl_reftime):
+            yield ch.sysclks_fine.eq(i)
+            yield ch.stb.eq(1)  # strobe: IIR stores t_running as this channel's t_ref
+            yield
+            yield dut.t_running.eq(dut.t_running + 1)
+            yield ch.stb.eq(0)
+            yield
+            yield dut.t_running.eq(dut.t_running + 1)
+        for i in range(p_adc.channels):
             yield from dut.set_state(i, i << 8, coeff="x1")
             yield from dut.set_state(i, i << 8, coeff="x0")
+        for i in range(p_dds.channels):
             for j in range(1 << w.profile):
                 yield from dut.set_state(i,
                         (j << 1) | (i << 8), profile=j, coeff="y1")
                 for k, l in enumerate("pow offset ftw0 ftw1".split()):
                     yield from dut.set_coeff(i, profile=j, coeff=l,
-                            value=(i << 12) | (j << 8) | (k << 4))
+                            value=(i << 10) | (j << 8) | (k << 4))
         yield
-        for i in range(1 << w.channel):
+        for i in range(p_dds.channels):
             for j in range(1 << w.profile):
-                for k, l in enumerate("cfg a1 b0 b1".split()):
+                for k, l in enumerate("a1 b0 b1".split()):
                     yield from dut.set_coeff(i, profile=j, coeff=l,
-                            value=(i << 12) | (j << 8) | (k << 4))
+                            value=(i << 10) | (j << 8) | (k << 4))
                 yield from dut.set_coeff(i, profile=j, coeff="cfg",
-                        value=(i << 0) | (j << 8))  # sel, dly
+                        value=(i % p_adc.channels) | (j << 8))  # sel, dly
         yield
-        for i in range(10):
+        for i in range(4):
+            logger.debug("check_iter {}".format(i))
             yield from dut.check_iter()
+            yield dut.t_running.eq((yield dut.t_running) + t_iir)  # advance by t_cycle
             yield
 
-    dut = iir.IIR(w)
-    run_simulation(dut, [run(dut)], vcd_name="iir.vcd")
+    dut = servo.IIR(w, p_adc, p_dds, t_iir)
+    run_simulation(dut, [run(dut)], vcd_name="servo.vcd")
 
 
 class IIRTest(unittest.TestCase):
diff --git a/artiq/gateware/test/suservo/test_servo.py b/artiq/gateware/test/suservo/test_servo.py
index cc1a73a2..fe1708d0 100644
--- a/artiq/gateware/test/suservo/test_servo.py
+++ b/artiq/gateware/test/suservo/test_servo.py
@@ -1,5 +1,6 @@
 import logging
 import unittest
+import numpy as np
 
 from migen import *
 from migen.genlib import io
@@ -7,15 +8,17 @@ from migen.genlib import io
 from artiq.gateware.test.suservo import test_adc, test_dds
 from artiq.gateware.suservo import servo
 
+logger = logging.getLogger(__name__)
+
 
 class ServoSim(servo.Servo):
     def __init__(self):
         adc_p = servo.ADCParams(width=16, channels=8, lanes=4,
                 t_cnvh=4, t_conv=57 - 4, t_rtt=4 + 4)
         iir_p = servo.IIRWidths(state=25, coeff=18, adc=16, asf=14, word=16,
-                accu=48, shift=11, channel=3, profile=5, dly=8)
+                accu=48, shift=11, profile=5, dly=8)
         dds_p = servo.DDSParams(width=8 + 32 + 16 + 16,
-                channels=adc_p.channels, clk=1)
+                channels=4, clk=1, sysclk_per_clk=8)
 
         self.submodules.adc_tb = test_adc.TB(adc_p)
         self.submodules.dds_tb = test_dds.TB(dds_p)
@@ -23,37 +26,156 @@ class ServoSim(servo.Servo):
         servo.Servo.__init__(self, self.adc_tb, self.dds_tb,
                 adc_p, iir_p, dds_p)
 
+        self.dds_output = []
+
+    def log_flow(self, cycle):
+        """Emit one DEBUG record describing the servo pipeline state.
+
+        Generator meant to be driven with ``yield from``; every ``yield``
+        in it reads a signal value and there is no bare ``yield``.
+
+        :param cycle: Value printed in the ``cycle=`` field.
+        """
+        active = yield self._active
+        io_update = yield self.dds_tb.io_update
+        passthrough = yield self.dds_tb.passthrough
+        iir_loading = yield self.iir.loading
+        iir_processing = yield self.iir.processing
+        iir_shifting = yield self.iir.shifting
+        dt = yield self.iir.t_running
+        dt_iir = yield self.iir._dt_start
+        state = yield self.iir._state
+        stage0 = yield self.iir._stages[0]
+        stage1 = yield self.iir._stages[1]
+        stage2 = yield self.iir._stages[2]
+        logger.debug(
+            "cycle=%d "
+            "active=%s load_proc_shft=%d%d%d stages_active=%d%d%d "
+            "io_update=%d passthrough=%d "
+            "dt=%d dt_iir=%d state=%d",
+            cycle,
+            # `active` is printed as zero-padded binary, at least 3 digits
+            '{:03b}'.format(active),
+            iir_loading, iir_processing, iir_shifting,
+            stage0, stage1, stage2,
+            io_update, passthrough,
+            # NOTE(review): the //8 presumably rescales _dt_start by
+            # sysclk_per_clk (8 in ServoSim.__init__) -- confirm
+            dt, dt_iir//8, state
+        )
+
+    def log_state(self, channel, profile, calls=[0]):  # calls: call counter
+        calls[0] += 1  # default list is created once, so the count persists
+        # if not (yield self._active[1]):
+        #     return
+        yield from self.log_flow(calls[0] - 2)  # logged cycle = call count - 2
+        return  # NOTE: unconditional -- all code below is currently unreachable
+        cfg = yield from self.iir.get_coeff(channel, profile, "cfg")
+        sel = cfg & 0x7  # low three bits of cfg, used as x0/x1/adc index below
+        x0 = yield from self.iir.get_state(sel, coeff="x0")
+        x1 = yield from self.iir.get_state(sel, coeff="x1")
+        y1 = yield from self.iir.get_state(channel, profile, coeff="y1")
+        _pow = yield from self.iir.get_coeff(channel, profile, "pow")
+        pow_iir = yield self.iir.dds[channel][2*self.iir.widths.word:3*self.iir.widths.word]
+        pow_dds = yield self.dds_tb.ddss[channel].pow
+        asf_dds = yield self.dds_tb.ddss[channel].asf
+        ftw_dds = yield self.dds_tb.ddss[channel].ftw
+        accu_dds = yield self.dds_tb.ddss[channel].accu
+        phase_dds = (yield self.dds_tb.ddss[channel].phase)
+        dds_output = np.cos(2*np.pi*phase_dds/2**19)  # 2**19 counts per turn
+        ph_coh = yield self.iir._ph_coh
+        ph_acc = yield self.iir._ph_acc
+        offset = yield from self.iir.get_coeff(channel, profile, "offset")
+        ftw0 = yield from self.iir.get_coeff(channel, profile, "ftw0")
+        ftw1 = yield from self.iir.get_coeff(channel, profile, "ftw1")
+        m_phase = yield from self.iir.get_accum_ftw(channel)
+        iir_adc = yield self.iir.adc[sel]
+        logger.debug("\t"
+                     "ch=%d pr=%d "
+                     # "x0=%d x1=%d adc=%d y1=%d sel=%d "
+                     "ftw=%#x pow_coeff=%#x ftw_accu=%#x "
+                     "ph_coh=%#x ph_acc=%#x "
+                     "pow_iir=%#x pow_dds=%#x ftw_dds=%#x asf_dds=%#x accu_dds=%#x phase_dds=%#x dds_output=%04.3f",
+                     channel, profile,
+                     # x0, x1, iir_adc, y1, sel,
+                     ftw0 | (ftw1 << 16), _pow, m_phase,  # ftw1 = upper 16 bits
+                     ph_coh, ph_acc,
+                     pow_iir, pow_dds, ftw_dds, asf_dds, accu_dds, phase_dds >> 3, dds_output
+        )
+        self.dds_output.append(dds_output)  # collects the cosine samples
+        # yield from self.log_registers(profile)
+
+    def log_registers(self, profile):
+        """Dump x0/x1 per ADC channel and y1 of `profile` per DDS channel."""
+        n_adc = self.iir.widths_adc.channels
+        n_dds = self.iir.widths_dds.channels
+        x0s, x1s, y1s = [], [], []
+        for idx in range(n_adc):
+            x0s.append((yield from self.iir.get_state(idx, coeff="x0")))
+            x1s.append((yield from self.iir.get_state(idx, coeff="x1")))
+        for idx in range(n_dds):
+            y1 = yield from self.iir.get_state(idx, profile, coeff="y1")
+            y1s.append(y1)
+        rows = (("x0s = ", x0s), ("x1s = ", x1s), ("y1s = ", y1s))
+        for label, values in rows:
+            # one upper-case hex field (min. five digits) per collected value
+            logger.debug((label + '{:05X} ' * len(values)).format(*values))
+
     def test(self):
         assert (yield self.done)
 
-        adc = 1
+        adc = 7
         x0 = 0x0141
         yield self.adc_tb.data[-adc-1].eq(x0)
-        channel = 3
-        yield self.iir.adc[channel].eq(adc)
+        channel = 0
         yield self.iir.ctrl[channel].en_iir.eq(1)
         yield self.iir.ctrl[channel].en_out.eq(1)
-        profile = 5
+        yield self.iir.ctrl[channel].en_pt.eq(1)
+        profile = 31
         yield self.iir.ctrl[channel].profile.eq(profile)
         x1 = 0x0743
         yield from self.iir.set_state(adc, x1, coeff="x1")
         y1 = 0x1145
         yield from self.iir.set_state(channel, y1,
                 profile=profile, coeff="y1")
-        coeff = dict(pow=0x1333, offset=0x1531, ftw0=0x1727, ftw1=0x1929,
-                a1=0x0135, b0=0x0337, b1=0x0539, cfg=adc | (0 << 3))
+        coeff = dict(pow=0, offset=0x1531, ftw0=0xeb85, ftw1=0x51,
+                a1=0x0135, b0=0x0337, b1=0x0539, cfg=adc)
         for ks in "pow offset ftw0 ftw1", "a1 b0 b1 cfg":
             for k in ks.split():
                 yield from self.iir.set_coeff(channel, value=coeff[k],
                         profile=profile, coeff=k)
             yield
 
+        num_it = 1
+        num_proc_its = [0]*num_it # number of iterations while iir.processing
+        yield from self.log_state(channel, profile)
         yield self.start.eq(1)
         yield
-        yield self.start.eq(0)
-        while not (yield self.dds_tb.io_update):
-            yield
-        yield  # io_update
+        for i in range(num_it):
+            if i == 1:  # change ftw
+                yield from self.iir.set_coeff(channel,
+                    profile=profile, coeff='ftw0', value=coeff['ftw1'])
+                yield from self.iir.set_coeff(channel,
+                    profile=profile, coeff='ftw1', value=coeff['ftw0'])
+            if i == 2:  # change ftw back
+                yield from self.iir.set_coeff(channel,
+                    profile=profile, coeff='ftw0', value=coeff['ftw0'])
+                yield from self.iir.set_coeff(channel,
+                    profile=profile, coeff='ftw1', value=coeff['ftw1'])
+            logger.debug("iteration {}".format(i))
+            yield from self.log_state(channel, profile)
+            if i == num_it-1:
+                yield self.start.eq(0)
+            while not (yield self.dds_tb.io_update):
+                yield
+                if (yield self.iir.processing):
+                    num_proc_its[i] += 1
+                if (yield self.iir._stages) != 0:
+                    yield from self.log_state(channel, profile)
+            yield  # io_update
+        yield from self.log_state(channel, profile)
+        yield
+        yield from self.log_state(channel, profile)
 
         w = self.iir.widths
 
@@ -63,6 +185,8 @@ class ServoSim(servo.Servo):
 
         offset = coeff["offset"] << (w.state - w.coeff - 1)
         a1, b0, b1 = coeff["a1"], coeff["b0"], coeff["b1"]
+
+        # works only for 1 iteration
         out = (
                 0*(1 << w.shift - 1) +  # rounding
                 a1*(y1 + 0) + b0*(x0 + offset) + b1*(x1 + offset)
@@ -76,8 +200,15 @@ class ServoSim(servo.Servo):
         ftw = (coeff["ftw1"] << 16) | coeff["ftw0"]
         assert _ == ftw, (hex(_), hex(ftw))
 
+        t0 = yield self.iir._dt_start
+        # todo: include phase accumulator
+        ph = (ftw * t0) >> 16
+        if (yield self.iir.ctrl[channel].en_pt):
+            pow = (coeff["pow"] + ph) & 0xffff
+        else:
+            pow = coeff["pow"]
         _ = yield self.dds_tb.ddss[channel].pow
-        assert _ == coeff["pow"], (hex(_), hex(coeff["pow"]))
+        assert _ == pow, (hex(_), hex(pow))
 
         _ = yield self.dds_tb.ddss[channel].asf
         asf = y1 >> (w.state - w.asf - 1)
@@ -101,4 +232,5 @@ class ServoTest(unittest.TestCase):
 
 
 if __name__ == "__main__":
+    logging.basicConfig(level=logging.DEBUG)  # surface the logger.debug() traces
     main()