diff --git a/artiq/coredevice/__init__.py b/artiq/coredevice/__init__.py
index 70539315d..0a1a4de6c 100644
--- a/artiq/coredevice/__init__.py
+++ b/artiq/coredevice/__init__.py
@@ -1,9 +1,9 @@
 from artiq.coredevice import exceptions, dds, spi
-from artiq.coredevice.exceptions import (RTIOUnderflow, RTIOSequenceError, RTIOOverflow)
+from artiq.coredevice.exceptions import (RTIOUnderflow, RTIOOverflow)
 from artiq.coredevice.dds import (PHASE_MODE_CONTINUOUS, PHASE_MODE_ABSOLUTE,
                                   PHASE_MODE_TRACKING)
 
 __all__ = []
-__all__ += ["RTIOUnderflow", "RTIOSequenceError", "RTIOOverflow"]
+__all__ += ["RTIOUnderflow", "RTIOOverflow"]
 __all__ += ["PHASE_MODE_CONTINUOUS", "PHASE_MODE_ABSOLUTE",
             "PHASE_MODE_TRACKING"]
diff --git a/artiq/coredevice/comm_analyzer.py b/artiq/coredevice/comm_analyzer.py
index d8b6e0434..f4b2d56f0 100644
--- a/artiq/coredevice/comm_analyzer.py
+++ b/artiq/coredevice/comm_analyzer.py
@@ -27,9 +27,9 @@ class ExceptionType(Enum):
     legacy_o_sequence_error_reset = 0b010001
     legacy_o_collision_reset = 0b010010
     legacy_i_overflow_reset = 0b100000
+    legacy_o_sequence_error = 0b010101
 
     o_underflow = 0b010100
-    o_sequence_error = 0b010101
 
     i_overflow = 0b100001
 
diff --git a/artiq/coredevice/exceptions.py b/artiq/coredevice/exceptions.py
index 44d0af86e..de86baf25 100644
--- a/artiq/coredevice/exceptions.py
+++ b/artiq/coredevice/exceptions.py
@@ -78,13 +78,6 @@ class RTIOUnderflow(Exception):
     """
     artiq_builtin = True
 
-class RTIOSequenceError(Exception):
-    """Raised when an event is submitted on a given channel with a timestamp
-    not larger than the previous one.
-
-    The offending event is discarded and the RTIO core keeps operating.
-    """
-    artiq_builtin = True
 
 class RTIOOverflow(Exception):
     """Raised when at least one event could not be registered into the RTIO
@@ -96,26 +89,32 @@ class RTIOOverflow(Exception):
     """
     artiq_builtin = True
 
+
 class DMAError(Exception):
     """Raised when performing an invalid DMA operation."""
     artiq_builtin = True
 
+
 class DDSError(Exception):
     """Raised when attempting to start a DDS batch while already in a batch,
     when too many commands are batched, and when DDS channel settings are
     incorrect.
     """
 
+
 class WatchdogExpired(Exception):
     """Raised when a watchdog expires."""
 
+
 class ClockFailure(Exception):
     """Raised when RTIO PLL has lost lock."""
 
+
 class I2CError(Exception):
     """Raised when a I2C transaction fails."""
     pass
 
+
 class SPIError(Exception):
     """Raised when a SPI transaction fails."""
     pass
diff --git a/artiq/examples/master/repository/coredevice_examples/simple/dma_blink.py b/artiq/examples/master/repository/coredevice_examples/simple/dma_blink.py
index e2866483f..92c96327c 100644
--- a/artiq/examples/master/repository/coredevice_examples/simple/dma_blink.py
+++ b/artiq/examples/master/repository/coredevice_examples/simple/dma_blink.py
@@ -21,6 +21,7 @@ class DMABlink(EnvExperiment):
     def run(self):
         self.core.reset()
         self.record()
+        handle = self.core_dma.get_handle("blink")
         self.core.break_realtime()
         for i in range(5):
-            self.core_dma.playback("blink")
+            self.core_dma.playback_handle(handle)
diff --git a/artiq/firmware/ksupport/api.rs b/artiq/firmware/ksupport/api.rs
index d7c079243..53736a0fa 100644
--- a/artiq/firmware/ksupport/api.rs
+++ b/artiq/firmware/ksupport/api.rs
@@ -108,11 +108,8 @@ static mut API: &'static [(&'static str, *const ())] = &[
     api!(dma_retrieve = ::dma_retrieve),
     api!(dma_playback = ::dma_playback),
 
-    api!(drtio_get_channel_state = ::rtio::drtio_dbg::get_channel_state),
-    api!(drtio_reset_channel_state = ::rtio::drtio_dbg::reset_channel_state),
-    api!(drtio_get_fifo_space = ::rtio::drtio_dbg::get_fifo_space),
     api!(drtio_get_packet_counts = ::rtio::drtio_dbg::get_packet_counts),
-    api!(drtio_get_fifo_space_req_count = ::rtio::drtio_dbg::get_fifo_space_req_count),
+    api!(drtio_get_buffer_space_req_count = ::rtio::drtio_dbg::get_buffer_space_req_count),
 
     api!(i2c_start = ::nrt_bus::i2c::start),
     api!(i2c_restart = ::nrt_bus::i2c::restart),
diff --git a/artiq/firmware/ksupport/lib.rs b/artiq/firmware/ksupport/lib.rs
index d1fb0029f..517dac0e9 100644
--- a/artiq/firmware/ksupport/lib.rs
+++ b/artiq/firmware/ksupport/lib.rs
@@ -384,22 +384,13 @@ extern fn dma_playback(timestamp: i64, ptr: i32) {
         while csr::rtio_dma::enable_read() != 0 {}
         csr::cri_con::selected_write(0);
 
-        let status = csr::rtio_dma::error_status_read();
-        if status != 0 {
+        if csr::rtio_dma::underflow_read() != 0 {
             let timestamp = csr::rtio_dma::error_timestamp_read();
             let channel = csr::rtio_dma::error_channel_read();
-            if status & rtio::RTIO_O_STATUS_UNDERFLOW != 0 {
-                csr::rtio_dma::error_underflow_reset_write(1);
-                raise!("RTIOUnderflow",
-                    "RTIO underflow at {0} mu, channel {1}",
-                    timestamp as i64, channel as i64, 0)
-            }
-            if status & rtio::RTIO_O_STATUS_SEQUENCE_ERROR != 0 {
-                csr::rtio_dma::error_sequence_error_reset_write(1);
-                raise!("RTIOSequenceError",
-                    "RTIO sequence error at {0} mu, channel {1}",
-                    timestamp as i64, channel as i64, 0)
-            }
+            csr::rtio_dma::underflow_write(1);
+            raise!("RTIOUnderflow",
+                "RTIO underflow at {0} mu, channel {1}",
+                timestamp as i64, channel as i64, 0)
         }
     }
 }
diff --git a/artiq/firmware/ksupport/rtio.rs b/artiq/firmware/ksupport/rtio.rs
index 64fd1e4e4..be9106348 100644
--- a/artiq/firmware/ksupport/rtio.rs
+++ b/artiq/firmware/ksupport/rtio.rs
@@ -1,4 +1,3 @@
-
 #[cfg(has_rtio)]
 mod imp {
     use core::ptr::{read_volatile, write_volatile};
@@ -10,7 +9,6 @@ mod imp {
 
     pub const RTIO_O_STATUS_WAIT:           u8 = 1;
     pub const RTIO_O_STATUS_UNDERFLOW:      u8 = 2;
-    pub const RTIO_O_STATUS_SEQUENCE_ERROR: u8 = 4;
     pub const RTIO_I_STATUS_WAIT_EVENT:     u8 = 1;
     pub const RTIO_I_STATUS_OVERFLOW:       u8 = 2;
     pub const RTIO_I_STATUS_WAIT_STATUS:    u8 = 4;
@@ -49,11 +47,6 @@ mod imp {
                 "RTIO underflow at {0} mu, channel {1}, slack {2} mu",
                 timestamp, channel as i64, timestamp - get_counter())
         }
-        if status & RTIO_O_STATUS_SEQUENCE_ERROR != 0 {
-            raise!("RTIOSequenceError",
-                "RTIO sequence error at {0} mu, channel {1}",
-                timestamp, channel as i64, 0)
-        }
     }
 
     pub extern fn output(timestamp: i64, channel: i32, addr: i32, data: i32) {
@@ -200,39 +193,24 @@ mod imp {
 pub use self::imp::*;
 
 pub mod drtio_dbg {
     use ::send;
     use ::recv;
     use kernel_proto::*;
 
-    #[repr(C)]
-    pub struct ChannelState(i32, i64);
-
-    pub extern fn get_channel_state(channel: i32) -> ChannelState {
-        send(&DrtioChannelStateRequest { channel: channel as u32 });
-        recv!(&DrtioChannelStateReply { fifo_space, last_timestamp }
-              => ChannelState(fifo_space as i32, last_timestamp as i64))
-    }
-
-    pub extern fn reset_channel_state(channel: i32) {
-        send(&DrtioResetChannelStateRequest { channel: channel as u32 })
-    }
-
-    pub extern fn get_fifo_space(channel: i32) {
-        send(&DrtioGetFifoSpaceRequest { channel: channel as u32 })
-    }
-
     #[repr(C)]
     pub struct PacketCounts(i32, i32);
 
     pub extern fn get_packet_counts(linkno: i32) -> PacketCounts {
         send(&DrtioPacketCountRequest { linkno: linkno as u8 });
         recv!(&DrtioPacketCountReply { tx_cnt, rx_cnt }
               => PacketCounts(tx_cnt as i32, rx_cnt as i32))
     }
 
-    pub extern fn get_fifo_space_req_count(linkno: i32) -> i32 {
-        send(&DrtioFifoSpaceReqCountRequest { linkno: linkno as u8 });
-        recv!(&DrtioFifoSpaceReqCountReply { cnt }
+    /// Sends a `DrtioBufferSpaceReqCountRequest` for link `linkno` and
+    /// returns the `cnt` field of the reply, cast to `i32`.
+    pub extern fn get_buffer_space_req_count(linkno: i32) -> i32 {
+        send(&DrtioBufferSpaceReqCountRequest { linkno: linkno as u8 });
+        recv!(&DrtioBufferSpaceReqCountReply { cnt }
               => cnt as i32)
     }
 }
diff --git a/artiq/firmware/libdrtioaux/lib.rs b/artiq/firmware/libdrtioaux/lib.rs
index e1b4c282c..7074d3a09 100644
--- a/artiq/firmware/libdrtioaux/lib.rs
+++ b/artiq/firmware/libdrtioaux/lib.rs
@@ -20,8 +20,9 @@ pub enum Packet {
 
     RtioErrorRequest,
     RtioNoErrorReply,
-    RtioErrorCollisionReply,
-    RtioErrorBusyReply,
+    RtioErrorSequenceErrorReply { channel: u16 },
+    RtioErrorCollisionReply { channel: u16 },
+    RtioErrorBusyReply { channel: u16 },
 
     MonitorRequest { channel: u16, probe: u8 },
     MonitorReply { value: u32 },
@@ -54,8 +55,15 @@ impl Packet {
 
             0x20 => Packet::RtioErrorRequest,
             0x21 => Packet::RtioNoErrorReply,
-            0x22 => Packet::RtioErrorCollisionReply,
-            0x23 => Packet::RtioErrorBusyReply,
+            0x22 => Packet::RtioErrorSequenceErrorReply {
+                channel: read_u16(reader)?
+            },
+            0x23 => Packet::RtioErrorCollisionReply {
+                channel: read_u16(reader)?
+            },
+            0x24 => Packet::RtioErrorBusyReply {
+                channel: read_u16(reader)?
+            },
 
             0x40 => Packet::MonitorRequest {
                 channel: read_u16(reader)?,
@@ -144,8 +152,18 @@ impl Packet {
 
             Packet::RtioErrorRequest => write_u8(writer, 0x20)?,
             Packet::RtioNoErrorReply => write_u8(writer, 0x21)?,
-            Packet::RtioErrorCollisionReply => write_u8(writer, 0x22)?,
-            Packet::RtioErrorBusyReply => write_u8(writer, 0x23)?,
+            Packet::RtioErrorSequenceErrorReply { channel } => {
+                write_u8(writer, 0x22)?;
+                write_u16(writer, channel)?;
+            },
+            Packet::RtioErrorCollisionReply { channel } => {
+                write_u8(writer, 0x23)?;
+                write_u16(writer, channel)?;
+            },
+            Packet::RtioErrorBusyReply { channel } => {
+                write_u8(writer, 0x24)?;
+                write_u16(writer, channel)?;
+            },
 
             Packet::MonitorRequest { channel, probe } => {
                 write_u8(writer, 0x40)?;
diff --git a/artiq/firmware/libproto/kernel_proto.rs b/artiq/firmware/libproto/kernel_proto.rs
index c59306cb8..06e5473d0 100644
--- a/artiq/firmware/libproto/kernel_proto.rs
+++ b/artiq/firmware/libproto/kernel_proto.rs
@@ -46,14 +46,10 @@ pub enum Message<'a> {
         duration: u64
     },
 
-    DrtioChannelStateRequest { channel: u32 },
-    DrtioChannelStateReply { fifo_space: u16, last_timestamp: u64 },
-    DrtioResetChannelStateRequest { channel: u32 },
-    DrtioGetFifoSpaceRequest { channel: u32 },
     DrtioPacketCountRequest { linkno: u8 },
     DrtioPacketCountReply { tx_cnt: u32, rx_cnt: u32 },
-    DrtioFifoSpaceReqCountRequest { linkno: u8 },
-    DrtioFifoSpaceReqCountReply { cnt: u32 },
+    DrtioBufferSpaceReqCountRequest { linkno: u8 },
+    DrtioBufferSpaceReqCountReply { cnt: u32 },
 
     RunFinished,
     RunException {
diff --git a/artiq/firmware/runtime/kern_hwreq.rs b/artiq/firmware/runtime/kern_hwreq.rs
index aae60c3c8..8eea9cd17 100644
--- a/artiq/firmware/runtime/kern_hwreq.rs
+++ b/artiq/firmware/runtime/kern_hwreq.rs
@@ -327,31 +327,16 @@ pub fn process_kern_hwreq(io: &Io, request: &kern::Message) -> io::Result<bool>
             kern_acknowledge()
         }
 
-        #[cfg(has_rtio_core)]
-        &kern::DrtioChannelStateRequest { channel } => {
-            let (fifo_space, last_timestamp) = rtio_mgt::drtio_dbg::get_channel_state(channel);
-            kern_send(io, &kern::DrtioChannelStateReply { fifo_space: fifo_space,
-                                                          last_timestamp: last_timestamp })
-        }
-        #[cfg(has_rtio_core)]
-        &kern::DrtioResetChannelStateRequest { channel } => {
-            rtio_mgt::drtio_dbg::reset_channel_state(channel);
-            kern_acknowledge()
-        }
-        #[cfg(has_rtio_core)]
-        &kern::DrtioGetFifoSpaceRequest { channel } => {
-            rtio_mgt::drtio_dbg::get_fifo_space(channel);
-            kern_acknowledge()
-        }
+
         #[cfg(has_rtio_core)]
         &kern::DrtioPacketCountRequest { linkno } => {
             let (tx_cnt, rx_cnt) = rtio_mgt::drtio_dbg::get_packet_counts(linkno);
             kern_send(io, &kern::DrtioPacketCountReply { tx_cnt: tx_cnt, rx_cnt: rx_cnt })
         }
         #[cfg(has_rtio_core)]
-        &kern::DrtioFifoSpaceReqCountRequest { linkno } => {
-            let cnt = rtio_mgt::drtio_dbg::get_fifo_space_req_count(linkno);
-            kern_send(io, &kern::DrtioFifoSpaceReqCountReply { cnt: cnt })
+        &kern::DrtioBufferSpaceReqCountRequest { linkno } => {
+            let cnt = rtio_mgt::drtio_dbg::get_buffer_space_req_count(linkno);
+            kern_send(io, &kern::DrtioBufferSpaceReqCountReply { cnt: cnt })
         }
 
         &kern::I2cStartRequest { busno } => {
diff --git a/artiq/firmware/runtime/rtio_mgt.rs b/artiq/firmware/runtime/rtio_mgt.rs
index 6917dc771..cc63b86e2 100644
--- a/artiq/firmware/runtime/rtio_mgt.rs
+++ b/artiq/firmware/runtime/rtio_mgt.rs
@@ -72,21 +72,11 @@ pub mod drtio {
         unsafe {
             (csr::DRTIO[linkidx].reset_write)(1);
             while (csr::DRTIO[linkidx].o_wait_read)() == 1 {}
-        }
-        // TODO: determine actual number of remote FIFOs
-        for channel in 0..16 {
-            unsafe {
-                (csr::DRTIO[linkidx].chan_sel_override_write)(channel);
-                (csr::DRTIO[linkidx].chan_sel_override_en_write)(1);
 
-                (csr::DRTIO[linkidx].o_reset_channel_status_write)(1);
-                (csr::DRTIO[linkidx].o_get_fifo_space_write)(1);
-                while (csr::DRTIO[linkidx].o_wait_read)() == 1 {}
-                info!("[LINK#{}] FIFO space on channel {} is {}",
-                    linkno, channel, (csr::DRTIO[linkidx].o_dbg_fifo_space_read)());
-
-                (csr::DRTIO[linkidx].chan_sel_override_en_write)(0);
-            }
+            (csr::DRTIO[linkidx].o_get_buffer_space_write)(1);
+            while (csr::DRTIO[linkidx].o_wait_read)() == 1 {}
+            info!("[LINK#{}] buffer space is {}",
+                linkno, (csr::DRTIO[linkidx].o_dbg_buffer_space_read)());
         }
     }
 
@@ -129,7 +119,7 @@ pub mod drtio {
                 error!("[LINK#{}] received truncated packet", linkno);
             }
             if errors & 4 != 0 {
-                error!("[LINK#{}] timeout attempting to get remote FIFO space", linkno);
+                error!("[LINK#{}] timeout attempting to get remote buffer space", linkno);
             }
         }
     }
@@ -138,10 +128,12 @@ pub mod drtio {
         drtioaux::hw::send_link(linkno, &drtioaux::Packet::RtioErrorRequest).unwrap();
         match drtioaux::hw::recv_timeout_link(linkno, None) {
             Ok(drtioaux::Packet::RtioNoErrorReply) => (),
-            Ok(drtioaux::Packet::RtioErrorCollisionReply) =>
-                error!("[LINK#{}] RTIO collision", linkno),
-            Ok(drtioaux::Packet::RtioErrorBusyReply) =>
-                error!("[LINK#{}] RTIO busy", linkno),
+            Ok(drtioaux::Packet::RtioErrorSequenceErrorReply { channel }) =>
+                error!("[LINK#{}] RTIO sequence error involving channel {}", linkno, channel),
+            Ok(drtioaux::Packet::RtioErrorCollisionReply { channel }) =>
+                error!("[LINK#{}] RTIO collision involving channel {}", linkno, channel),
+            Ok(drtioaux::Packet::RtioErrorBusyReply { channel }) =>
+                error!("[LINK#{}] RTIO busy error involving channel {}", linkno, channel),
             Ok(_) => error!("[LINK#{}] received unexpected aux packet", linkno),
             Err(e) => error!("[LINK#{}] aux packet error ({})", linkno, e)
         }
@@ -197,10 +189,16 @@ fn async_error_thread(io: Io) {
             io.until(|| csr::rtio_core::async_error_read() != 0).unwrap();
             let errors = csr::rtio_core::async_error_read();
             if errors & 1 != 0 {
-                error!("RTIO collision");
+                error!("RTIO collision involving channel {}",
+                       csr::rtio_core::collision_channel_read());
             }
             if errors & 2 != 0 {
-                error!("RTIO busy");
+                error!("RTIO busy error involving channel {}",
+                       csr::rtio_core::busy_channel_read());
+            }
+            if errors & 4 != 0 {
+                error!("RTIO sequence error involving channel {}",
+                       csr::rtio_core::sequence_error_channel_read());
             }
             csr::rtio_core::async_error_write(errors);
         }
@@ -260,42 +258,6 @@ pub fn init_core() {
 pub mod drtio_dbg {
     use board::csr;
 
-    // TODO: routing
-    pub fn get_channel_state(channel: u32) -> (u16, u64) {
-        let linkno = ((channel >> 16) - 1) as usize;
-        let node_channel = channel as u16;
-        unsafe {
-            (csr::DRTIO[linkno].chan_sel_override_write)(node_channel as u16);
-            (csr::DRTIO[linkno].chan_sel_override_en_write)(1);
-            let fifo_space = (csr::DRTIO[linkno].o_dbg_fifo_space_read)();
-            let last_timestamp = (csr::DRTIO[linkno].o_dbg_last_timestamp_read)();
-            (csr::DRTIO[linkno].chan_sel_override_en_write)(0);
-            (fifo_space, last_timestamp)
-        }
-    }
-
-    pub fn reset_channel_state(channel: u32) {
-        let linkno = ((channel >> 16) - 1) as usize;
-        let node_channel = channel as u16;
-        unsafe {
-            (csr::DRTIO[linkno].chan_sel_override_write)(node_channel);
-            (csr::DRTIO[linkno].chan_sel_override_en_write)(1);
-            (csr::DRTIO[linkno].o_reset_channel_status_write)(1);
-            (csr::DRTIO[linkno].chan_sel_override_en_write)(0);
-        }
-    }
-
-    pub fn get_fifo_space(channel: u32) {
-        let linkno = ((channel >> 16) - 1) as usize;
-        let node_channel = channel as u16;
-        unsafe {
-            (csr::DRTIO[linkno].chan_sel_override_write)(node_channel);
-            (csr::DRTIO[linkno].chan_sel_override_en_write)(1);
-            (csr::DRTIO[linkno].o_get_fifo_space_write)(1);
-            (csr::DRTIO[linkno].chan_sel_override_en_write)(0);
-        }
-    }
-
     pub fn get_packet_counts(linkno: u8) -> (u32, u32) {
         let linkno = linkno as usize;
         unsafe {
@@ -305,23 +267,17 @@ pub mod drtio_dbg {
         }
     }
 
-    pub fn get_fifo_space_req_count(linkno: u8) -> u32 {
+    pub fn get_buffer_space_req_count(linkno: u8) -> u32 {
         let linkno = linkno as usize;
         unsafe {
-            (csr::DRTIO[linkno].o_dbg_fifo_space_req_cnt_read)()
+            (csr::DRTIO[linkno].o_dbg_buffer_space_req_cnt_read)()
         }
     }
 }
 
 #[cfg(not(has_drtio))]
 pub mod drtio_dbg {
-    pub fn get_channel_state(_channel: u32) -> (u16, u64) { (0, 0) }
-
-    pub fn reset_channel_state(_channel: u32) {}
-
-    pub fn get_fifo_space(_channel: u32) {}
-
     pub fn get_packet_counts(_linkno: u8) -> (u32, u32) { (0, 0) }
 
-    pub fn get_fifo_space_req_count(_linkno: u8) -> u32 { 0 }
+    pub fn get_buffer_space_req_count(_linkno: u8) -> u32 { 0 }
 }
diff --git a/artiq/firmware/satman/main.rs b/artiq/firmware/satman/main.rs
index 66893a915..1ff73542a 100644
--- a/artiq/firmware/satman/main.rs
+++ b/artiq/firmware/satman/main.rs
@@ -30,16 +30,27 @@ fn process_aux_packet(p: &drtioaux::Packet) {
                 errors = (csr::DRTIO[0].rtio_error_read)();
             }
             if errors & 1 != 0 {
+                let channel;
                 unsafe {
+                    channel = (csr::DRTIO[0].sequence_error_channel_read)();
                     (csr::DRTIO[0].rtio_error_write)(1);
                 }
-                drtioaux::hw::send_link(0, &drtioaux::Packet::RtioErrorCollisionReply).unwrap();
+                drtioaux::hw::send_link(0, &drtioaux::Packet::RtioErrorSequenceErrorReply { channel: channel }).unwrap();
             } else if errors & 2 != 0 {
+                let channel;
                 unsafe {
+                    channel = (csr::DRTIO[0].collision_channel_read)();
                     (csr::DRTIO[0].rtio_error_write)(2);
                 }
-                drtioaux::hw::send_link(0, &drtioaux::Packet::RtioErrorBusyReply).unwrap();
+                drtioaux::hw::send_link(0, &drtioaux::Packet::RtioErrorCollisionReply { channel: channel }).unwrap();
+            } else if errors & 4 != 0 {
+                let channel;
+                unsafe {
+                    channel = (csr::DRTIO[0].busy_channel_read)();
+                    (csr::DRTIO[0].rtio_error_write)(4);
+                }
+                drtioaux::hw::send_link(0, &drtioaux::Packet::RtioErrorBusyReply { channel: channel }).unwrap();
             } else {
                 drtioaux::hw::send_link(0, &drtioaux::Packet::RtioNoErrorReply).unwrap();
             }
         }
@@ -160,9 +171,6 @@ fn process_errors() {
     if errors & 8 != 0 {
         error!("write overflow");
     }
-    if errors & 16 != 0 {
-        error!("write sequence error");
-    }
 }
 
 
diff --git a/artiq/gateware/drtio/core.py b/artiq/gateware/drtio/core.py
index c3da1cb51..0bda7ad38 100644
--- a/artiq/gateware/drtio/core.py
+++ b/artiq/gateware/drtio/core.py
@@ -3,10 +3,11 @@ from types import SimpleNamespace
 from migen import *
 from migen.genlib.cdc import ElasticBuffer
 
+from artiq.gateware.rtio.sed.core import *
+from artiq.gateware.rtio.input_collector import *
 from artiq.gateware.drtio import (link_layer, aux_controller,
-                                  rt_packet_satellite, rt_ios_satellite,
-                                  rt_errors_satellite,
-                                  rt_packet_master, rt_controller_master) 
+                                  rt_packet_satellite, rt_errors_satellite,
+                                  rt_packet_master, rt_controller_master)
 
 
 class ChannelInterface:
@@ -49,7 +50,8 @@ class GenericRXSynchronizer(Module):
 
 
 class DRTIOSatellite(Module):
-    def __init__(self, chanif, channels, rx_synchronizer=None, fine_ts_width=3, full_ts_width=63):
+    def __init__(self, chanif, channels, rx_synchronizer=None, fine_ts_width=3,
+                 lane_count=8, fifo_depth=128):
         if rx_synchronizer is None:
             rx_synchronizer = GenericRXSynchronizer()
             self.submodules += rx_synchronizer
@@ -77,11 +79,32 @@ class DRTIOSatellite(Module):
         self.submodules.rt_packet = ClockDomainsRenamer("rtio")(
             rt_packet_satellite.RTPacketSatellite(link_layer_sync))
 
-        self.submodules.ios = rt_ios_satellite.IOS(
-            self.rt_packet, channels, fine_ts_width, full_ts_width)
+        coarse_ts = Signal(64 - fine_ts_width)
+        self.sync.rtio += \
+            If(self.rt_packet.tsc_load,
+                coarse_ts.eq(self.rt_packet.tsc_load_value)
+            ).Else(
+                coarse_ts.eq(coarse_ts + 1)
+            )
+        self.comb += self.rt_packet.cri.counter.eq(coarse_ts << fine_ts_width)
+
+        self.submodules.outputs = ClockDomainsRenamer("rio")(
+            SED(channels, fine_ts_width, "sync",
+                lane_count=lane_count, fifo_depth=fifo_depth,
+                enable_spread=False, report_buffer_space=True,
+                interface=self.rt_packet.cri))
+        self.comb += self.outputs.coarse_timestamp.eq(coarse_ts)
+        # coarse_ts is registered in the rtio domain above, so the value
+        # derived from it is registered in rtio as well, not in sys.
+        self.sync.rtio += self.outputs.minimum_coarse_timestamp.eq(coarse_ts + 16)
+
+        self.submodules.inputs = ClockDomainsRenamer("rio")(
+            InputCollector(channels, fine_ts_width, "sync",
+                           interface=self.rt_packet.cri))
+        self.comb += self.inputs.coarse_timestamp.eq(coarse_ts)
 
         self.submodules.rt_errors = rt_errors_satellite.RTErrorsSatellite(
-            self.rt_packet, self.ios)
+            self.rt_packet, self.outputs)
 
         self.clock_domains.cd_rio = ClockDomain()
         self.clock_domains.cd_rio_phy = ClockDomain()
diff --git a/artiq/gateware/drtio/rt_controller_master.py b/artiq/gateware/drtio/rt_controller_master.py
index d97f418c6..12683db01 100644
--- a/artiq/gateware/drtio/rt_controller_master.py
+++ b/artiq/gateware/drtio/rt_controller_master.py
@@ -7,7 +7,7 @@ from migen.genlib.resetsync import AsyncResetSynchronizer
 
 from misoc.interconnect.csr import *
 
-from artiq.gateware.rtio.cdc import RTIOCounter
+from artiq.gateware.rtio.cdc import GrayCodeTransfer
 from artiq.gateware.rtio import cri
 
 
@@ -15,9 +15,6 @@ class _CSRs(AutoCSR):
     def __init__(self):
         self.protocol_error = CSR(3)
 
-        self.chan_sel_override = CSRStorage(16)
-        self.chan_sel_override_en = CSRStorage()
-
         self.tsc_correction = CSRStorage(64)
         self.set_time = CSR()
         self.underflow_margin = CSRStorage(16, reset=200)
@@ -25,14 +22,30 @@ class _CSRs(AutoCSR):
         self.reset = CSR()
         self.reset_phy = CSR()
 
-        self.o_get_fifo_space = CSR()
-        self.o_dbg_fifo_space = CSRStatus(16)
-        self.o_dbg_last_timestamp = CSRStatus(64)
-        self.o_dbg_fifo_space_req_cnt = CSRStatus(32)
-        self.o_reset_channel_status = CSR()
+        self.o_get_buffer_space = CSR()
+        self.o_dbg_buffer_space = CSRStatus(16)
+        self.o_dbg_buffer_space_req_cnt = CSRStatus(32)
         self.o_wait = CSRStatus()
 
 
+class RTIOCounter(Module):
+    def __init__(self, width):
+        self.width = width
+        # Timestamp counter in RTIO domain
+        self.value_rtio = Signal(width)
+        # Timestamp counter resynchronized to sys domain
+        # Lags behind value_rtio, monotonic and glitch-free
+        self.value_sys = Signal(width)
+
+        # # #
+
+        # note: counter is in rtio domain and never affected by the reset CSRs
+        self.sync.rtio += self.value_rtio.eq(self.value_rtio + 1)
+        gt = GrayCodeTransfer(width)
+        self.submodules += gt
+        self.comb += gt.i.eq(self.value_rtio), self.value_sys.eq(gt.o)
+
+
 class RTController(Module):
     def __init__(self, rt_packet, channel_count, fine_ts_width):
         self.csrs = _CSRs()
@@ -41,27 +54,20 @@ class RTController(Module):
         # protocol errors
         err_unknown_packet_type = Signal()
         err_packet_truncated = Signal()
-        signal_fifo_space_timeout = Signal()
-        err_fifo_space_timeout = Signal()
+        signal_buffer_space_timeout = Signal()
+        err_buffer_space_timeout = Signal()
         self.sync.sys_with_rst += [
             If(self.csrs.protocol_error.re,
                 If(self.csrs.protocol_error.r[0], err_unknown_packet_type.eq(0)),
                 If(self.csrs.protocol_error.r[1], err_packet_truncated.eq(0)),
-                If(self.csrs.protocol_error.r[2], err_fifo_space_timeout.eq(0))
+                If(self.csrs.protocol_error.r[2], err_buffer_space_timeout.eq(0))
             ),
             If(rt_packet.err_unknown_packet_type, err_unknown_packet_type.eq(1)),
             If(rt_packet.err_packet_truncated, err_packet_truncated.eq(1)),
-            If(signal_fifo_space_timeout, err_fifo_space_timeout.eq(1))
+            If(signal_buffer_space_timeout, err_buffer_space_timeout.eq(1))
         ]
         self.comb += self.csrs.protocol_error.w.eq(
-            Cat(err_unknown_packet_type, err_packet_truncated, err_fifo_space_timeout))
-
-        # channel selection
-        chan_sel = Signal(16)
-        self.comb += chan_sel.eq(
-            Mux(self.csrs.chan_sel_override_en.storage,
-                self.csrs.chan_sel_override.storage,
-                self.cri.chan_sel[:16]))
+            Cat(err_unknown_packet_type, err_packet_truncated, err_buffer_space_timeout))
 
         # master RTIO counter and counter synchronization
         self.submodules.counter = RTIOCounter(64-fine_ts_width)
@@ -104,26 +110,16 @@ class RTController(Module):
         self.comb += self.cd_rtio_with_rst.clk.eq(ClockSignal("rtio"))
         self.specials += AsyncResetSynchronizer(self.cd_rtio_with_rst, local_reset)
 
-        # remote channel status cache
-        fifo_spaces_mem = Memory(16, channel_count)
-        fifo_spaces = fifo_spaces_mem.get_port(write_capable=True)
-        self.specials += fifo_spaces_mem, fifo_spaces
-        last_timestamps_mem = Memory(64, channel_count)
-        last_timestamps = last_timestamps_mem.get_port(write_capable=True)
-        self.specials += last_timestamps_mem, last_timestamps
-
         # common packet fields
-        rt_packet_fifo_request = Signal()
+        chan_sel = self.cri.chan_sel[:16]
+        rt_packet_buffer_request = Signal()
         rt_packet_read_request = Signal()
         self.comb += [
-            fifo_spaces.adr.eq(chan_sel),
-            last_timestamps.adr.eq(chan_sel),
-            last_timestamps.dat_w.eq(self.cri.timestamp),
             rt_packet.sr_channel.eq(chan_sel),
             rt_packet.sr_address.eq(self.cri.o_address),
             rt_packet.sr_data.eq(self.cri.o_data),
             rt_packet.sr_timestamp.eq(self.cri.timestamp),
-            If(rt_packet_fifo_request,
+            If(rt_packet_buffer_request,
                 rt_packet.sr_notwrite.eq(1),
                 rt_packet.sr_address.eq(0)
             ),
@@ -136,30 +132,28 @@ class RTController(Module):
         # output status
         o_status_wait = Signal()
         o_status_underflow = Signal()
-        o_status_sequence_error = Signal()
         self.comb += [
             self.cri.o_status.eq(Cat(
-                o_status_wait, o_status_underflow, o_status_sequence_error)),
+                o_status_wait, o_status_underflow)),
             self.csrs.o_wait.status.eq(o_status_wait)
         ]
-        o_sequence_error_set = Signal()
         o_underflow_set = Signal()
         self.sync.sys_with_rst += [
             If(self.cri.cmd == cri.commands["write"],
-                o_status_underflow.eq(0),
-                o_status_sequence_error.eq(0),
+                o_status_underflow.eq(0)
             ),
-            If(o_underflow_set, o_status_underflow.eq(1)),
-            If(o_sequence_error_set, o_status_sequence_error.eq(1))
+            If(o_underflow_set, o_status_underflow.eq(1))
         ]
 
         timeout_counter = WaitTimer(8191)
         self.submodules += timeout_counter
 
-        cond_sequence_error = self.cri.timestamp < last_timestamps.dat_r
-        cond_underflow = ((self.cri.timestamp[fine_ts_width:]
+        cond_underflow = Signal()
+        self.comb += cond_underflow.eq((self.cri.timestamp[fine_ts_width:]
                            - self.csrs.underflow_margin.storage[fine_ts_width:]) < self.counter.value_sys)
 
+        buffer_space = Signal(16)
+
         # input status
         i_status_wait_event = Signal()
         i_status_overflow = Signal()
@@ -190,56 +184,51 @@ class RTController(Module):
 
         fsm.act("IDLE",
             If(self.cri.cmd == cri.commands["write"],
-                If(cond_sequence_error,
-                    o_sequence_error_set.eq(1)
-                ).Elif(cond_underflow,
+                If(cond_underflow,
                     o_underflow_set.eq(1)
                 ).Else(
                     NextState("WRITE")
                 )
             ),
             If(self.cri.cmd == cri.commands["read"], NextState("READ")),
-            If(self.csrs.o_get_fifo_space.re, NextState("GET_FIFO_SPACE"))
+            If(self.csrs.o_get_buffer_space.re, NextState("GET_BUFFER_SPACE"))
         )
         fsm.act("WRITE",
             o_status_wait.eq(1),
             rt_packet.sr_stb.eq(1),
             If(rt_packet.sr_ack,
-                fifo_spaces.we.eq(1),
-                fifo_spaces.dat_w.eq(fifo_spaces.dat_r - 1),
-                last_timestamps.we.eq(1),
-                If(fifo_spaces.dat_r <= 1,
-                    NextState("GET_FIFO_SPACE")
+                NextValue(buffer_space, buffer_space - 1),
+                If(buffer_space <= 1,
+                    NextState("GET_BUFFER_SPACE")
                 ).Else(
                     NextState("IDLE")
                 )
             )
         )
-        fsm.act("GET_FIFO_SPACE",
+        fsm.act("GET_BUFFER_SPACE",
             o_status_wait.eq(1),
-            rt_packet.fifo_space_not_ack.eq(1),
-            rt_packet_fifo_request.eq(1),
+            rt_packet.buffer_space_not_ack.eq(1),
+            rt_packet_buffer_request.eq(1),
             rt_packet.sr_stb.eq(1),
             If(rt_packet.sr_ack,
-                NextState("GET_FIFO_SPACE_REPLY")
+                NextState("GET_BUFFER_SPACE_REPLY")
             )
         )
-        fsm.act("GET_FIFO_SPACE_REPLY",
+        fsm.act("GET_BUFFER_SPACE_REPLY",
             o_status_wait.eq(1),
-            fifo_spaces.dat_w.eq(rt_packet.fifo_space),
-            fifo_spaces.we.eq(1),
-            rt_packet.fifo_space_not_ack.eq(1),
-            If(rt_packet.fifo_space_not,
-                If(rt_packet.fifo_space != 0,
+            NextValue(buffer_space, rt_packet.buffer_space),
+            rt_packet.buffer_space_not_ack.eq(1),
+            If(rt_packet.buffer_space_not,
+               If(rt_packet.buffer_space != 0,
                     NextState("IDLE")
                 ).Else(
-                    NextState("GET_FIFO_SPACE")
+                    NextState("GET_BUFFER_SPACE")
                 )
             ),
             timeout_counter.wait.eq(1),
             If(timeout_counter.done,
-                signal_fifo_space_timeout.eq(1),
-                NextState("GET_FIFO_SPACE")
+                signal_buffer_space_timeout.eq(1),
+                NextState("GET_BUFFER_SPACE")
             )
         )
         fsm.act("READ",
@@ -260,21 +249,12 @@ class RTController(Module):
             )
         )
 
-        # channel state access
-        self.comb += [
-            self.csrs.o_dbg_fifo_space.status.eq(fifo_spaces.dat_r),
-            self.csrs.o_dbg_last_timestamp.status.eq(last_timestamps.dat_r),
-            If(self.csrs.o_reset_channel_status.re,
-                fifo_spaces.dat_w.eq(0),
-                fifo_spaces.we.eq(1),
-                last_timestamps.dat_w.eq(0),
-                last_timestamps.we.eq(1)
-            )
-        ]
+        # debug CSRs
+        self.comb += self.csrs.o_dbg_buffer_space.status.eq(buffer_space),
         self.sync += \
-            If((rt_packet.sr_stb & rt_packet.sr_ack & rt_packet_fifo_request),
-                self.csrs.o_dbg_fifo_space_req_cnt.status.eq(
-                    self.csrs.o_dbg_fifo_space_req_cnt.status + 1)
+            If((rt_packet.sr_stb & rt_packet.sr_ack & rt_packet_buffer_request),
+               self.csrs.o_dbg_buffer_space_req_cnt.status.eq(
+                   self.csrs.o_dbg_buffer_space_req_cnt.status + 1)
             )
 
     def get_csrs(self):
diff --git a/artiq/gateware/drtio/rt_errors_satellite.py b/artiq/gateware/drtio/rt_errors_satellite.py
index e93a7bc9c..bf6e9a3d9 100644
--- a/artiq/gateware/drtio/rt_errors_satellite.py
+++ b/artiq/gateware/drtio/rt_errors_satellite.py
@@ -7,31 +7,63 @@ from artiq.gateware.rtio.cdc import BlindTransfer
 
 
 class RTErrorsSatellite(Module, AutoCSR):
-    def __init__(self, rt_packet, ios):
-        self.protocol_error = CSR(5)
-        self.rtio_error = CSR(2)
+    def __init__(self, rt_packet, outputs):
+        self.protocol_error = CSR(4)
+        self.rtio_error = CSR(3)
+        self.sequence_error_channel = CSRStatus(16)
+        self.collision_channel = CSRStatus(16)
+        self.busy_channel = CSRStatus(16)
 
         def error_csr(csr, *sources):
-            for n, source in enumerate(sources):
-                pending = Signal(related=source)
-                xfer = BlindTransfer(odomain="sys")
+            for n, (source, detect_edges, din, dout) in enumerate(sources):
+                assert isinstance(source, Signal)
+
+                if din is not None:
+                    data_width = len(din)
+                else:
+                    data_width = 0
+                xfer = BlindTransfer(odomain="sys", data_width=data_width)
                 self.submodules += xfer
-                self.comb += xfer.i.eq(source)
+
+                if detect_edges:  # level source: forward only its rising edge
+                    source_r = Signal()
+                    self.sync.rio += source_r.eq(source)
+                    self.comb += xfer.i.eq(source & ~source_r)
+                else:
+                    self.comb += xfer.i.eq(source)
+
+                pending = Signal(related=source)
                 self.sync += [
                     If(csr.re & csr.r[n], pending.eq(0)),
                     If(xfer.o, pending.eq(1))
                 ]
                 self.comb += csr.w[n].eq(pending)
 
-        # The master is normally responsible for avoiding output overflows,
-        # output underflows, and sequence errors.
-        # Error reports here are only for diagnosing internal ARTIQ bugs.
-        error_csr(self.protocol_error, 
-                  rt_packet.unknown_packet_type,
-                  rt_packet.packet_truncated,
-                  ios.write_underflow,
-                  ios.write_overflow,
-                  ios.write_sequence_error)
+                if din is not None:
+                    self.comb += xfer.data_i.eq(din)
+                    self.sync += If(xfer.o & ~pending, dout.eq(xfer.data_o))
+
+
+        # The master is normally responsible for avoiding output overflows
+        # and output underflows. The error reports here are only for diagnosing
+        # internal ARTIQ bugs.
+        underflow = Signal()
+        overflow = Signal()
+        self.comb += [
+            underflow.eq(outputs.cri.o_status[1]),
+            overflow.eq(outputs.cri.o_status[0])
+        ]
+        error_csr(self.protocol_error,
+                  (rt_packet.unknown_packet_type, False, None, None),
+                  (rt_packet.packet_truncated, False, None, None),
+                  (underflow, True, None, None),
+                  (overflow, True, None, None)
+        )
         error_csr(self.rtio_error,
-                  ios.collision,
-                  ios.busy)
+                  (outputs.sequence_error, False,
+                   outputs.sequence_error_channel, self.sequence_error_channel.status),
+                  (outputs.collision, False,
+                   outputs.collision_channel, self.collision_channel.status),
+                  (outputs.busy, False,
+                   outputs.busy_channel, self.busy_channel.status)
+        )
diff --git a/artiq/gateware/drtio/rt_ios_satellite.py b/artiq/gateware/drtio/rt_ios_satellite.py
deleted file mode 100644
index f9fc9096e..000000000
--- a/artiq/gateware/drtio/rt_ios_satellite.py
+++ /dev/null
@@ -1,246 +0,0 @@
-"""Real-time I/O scheduler for satellites"""
-
-from migen import *
-from migen.genlib.fifo import SyncFIFOBuffered
-from migen.genlib.record import *
-
-from artiq.gateware.rtio import rtlink
-
-
-class IOS(Module):
-    def __init__(self, rt_packet, channels, max_fine_ts_width, full_ts_width):
-        self.write_underflow = Signal()
-        self.write_overflow = Signal()
-        self.write_sequence_error = Signal()
-        self.collision = Signal()
-        self.busy = Signal()
-
-        self.rt_packet = rt_packet
-        self.max_fine_ts_width = max_fine_ts_width
-
-        self.tsc = Signal(full_ts_width - max_fine_ts_width)
-        self.sync.rtio += \
-            If(rt_packet.tsc_load,
-                self.tsc.eq(rt_packet.tsc_load_value)
-            ).Else(
-                self.tsc.eq(self.tsc + 1)
-            )
-        self.comb += rt_packet.tsc_input.eq(self.tsc)
-
-        self.sync.rio += [
-            self.write_underflow.eq(0),
-            self.write_overflow.eq(0),
-            self.collision.eq(0),
-            self.busy.eq(0)
-        ]
-        for n, channel in enumerate(channels):
-            self.add_output(n, channel)
-            self.add_input(n, channel)
-
-    def add_output(self, n, channel):
-        rt_packet = self.rt_packet
-        max_fine_ts_width = self.max_fine_ts_width
-
-        interface = channel.interface.o
-        data_width = rtlink.get_data_width(interface)
-        address_width = rtlink.get_address_width(interface)
-        fine_ts_width = rtlink.get_fine_ts_width(interface)
-        assert fine_ts_width <= max_fine_ts_width
-
-        we = Signal()
-        self.comb += we.eq(rt_packet.write_stb
-                           & (rt_packet.write_channel == n))
-        write_timestamp = rt_packet.write_timestamp[max_fine_ts_width-fine_ts_width:]
-        write_timestamp_coarse = rt_packet.write_timestamp[max_fine_ts_width:]
-        write_timestamp_fine = rt_packet.write_timestamp[max_fine_ts_width-fine_ts_width:max_fine_ts_width]
-
-        # latency compensation
-        if interface.delay:
-            tsc_comp = Signal.like(self.tsc)
-            self.sync.rtio += tsc_comp.eq(self.tsc - interface.delay + 1)
-        else:
-            tsc_comp = self.tsc
-
-        # FIFO
-        ev_layout = []
-        if data_width:
-            ev_layout.append(("data", data_width))
-        if address_width:
-            ev_layout.append(("address", address_width))
-        ev_layout.append(("timestamp", len(self.tsc) + fine_ts_width))
-
-        fifo = ClockDomainsRenamer("rio")(
-            SyncFIFOBuffered(layout_len(ev_layout), channel.ofifo_depth))
-        self.submodules += fifo
-        fifo_in = Record(ev_layout)
-        fifo_out = Record(ev_layout)
-        self.comb += [
-            fifo.din.eq(fifo_in.raw_bits()),
-            fifo_out.raw_bits().eq(fifo.dout)
-        ]
-
-        # Buffer
-        buf_pending = Signal()
-        buf = Record(ev_layout)
-        buf_just_written = Signal()
-
-        # Special cases
-        replace = Signal()
-        sequence_error = Signal()
-        collision = Signal()
-        any_error = Signal()
-        if interface.enable_replace:
-            # Note: replace may be asserted at the same time as collision
-            # when addresses are different. In that case, it is a collision.
-            self.sync.rio += replace.eq(write_timestamp == buf.timestamp)
-        # Detect sequence errors on coarse timestamps only
-        # so that they are mutually exclusive with collision errors.
-        self.sync.rio += sequence_error.eq(write_timestamp_coarse < buf.timestamp[fine_ts_width:])
-        if interface.enable_replace:
-            if address_width:
-                different_addresses = rt_packet.write_address != buf.address
-            else:
-                different_addresses = 0
-            if fine_ts_width:
-                self.sync.rio += collision.eq(
-                    (write_timestamp_coarse == buf.timestamp[fine_ts_width:])
-                    & ((write_timestamp_fine != buf.timestamp[:fine_ts_width])
-                       |different_addresses))
-            else:
-                self.sync.rio += collision.eq(
-                    (write_timestamp == buf.timestamp) & different_addresses)
-        else:
-            self.sync.rio += collision.eq(
-                write_timestamp_coarse == buf.timestamp[fine_ts_width:])
-        self.comb += any_error.eq(sequence_error | collision)
-        self.sync.rio += [
-            If(we & sequence_error, self.write_sequence_error.eq(1)),
-            If(we & collision, self.collision.eq(1))
-        ]
-
-        # Buffer read and FIFO write
-        self.comb += fifo_in.eq(buf)
-        in_guard_time = Signal()
-        self.comb += in_guard_time.eq(
-            buf.timestamp[fine_ts_width:] < tsc_comp + 4)
-        self.sync.rio += If(in_guard_time, buf_pending.eq(0))
-        report_underflow = Signal()
-        self.comb += \
-            If(buf_pending,
-                If(in_guard_time,
-                    If(buf_just_written,
-                        report_underflow.eq(1)
-                    ).Else(
-                        fifo.we.eq(1)
-                    )
-                ),
-                If(we & ~replace & ~any_error,
-                   fifo.we.eq(1)
-                )
-            )
-        self.sync.rio += If(report_underflow, self.write_underflow.eq(1))
-
-        # Buffer write
-        # Must come after read to handle concurrent read+write properly
-        self.sync.rio += [
-            buf_just_written.eq(0),
-            If(we & ~any_error,
-                buf_just_written.eq(1),
-                buf_pending.eq(1),
-                buf.timestamp.eq(write_timestamp),
-                buf.data.eq(rt_packet.write_data) if data_width else [],
-                buf.address.eq(rt_packet.write_address) if address_width else [],
-            ),
-            If(we & ~fifo.writable, self.write_overflow.eq(1))
-        ]
-
-        # FIFO level
-        self.sync.rio += \
-            If(rt_packet.fifo_space_update &
-               (rt_packet.fifo_space_channel == n),
-                rt_packet.fifo_space.eq(channel.ofifo_depth - fifo.level))
-
-        # FIFO read
-        self.sync.rio += [
-            fifo.re.eq(0),
-            interface.stb.eq(0),
-            If(fifo.readable &
-               (fifo_out.timestamp[fine_ts_width:] == tsc_comp),
-                fifo.re.eq(1),
-                interface.stb.eq(1)
-            )
-        ]
-        if data_width:
-            self.sync.rio += interface.data.eq(fifo_out.data)
-        if address_width:
-            self.sync.rio += interface.address.eq(fifo_out.address)
-        if fine_ts_width:
-            self.sync.rio += interface.fine_ts.eq(fifo_out.timestamp[:fine_ts_width])
-
-        self.sync.rio += If(interface.stb & interface.busy, self.busy.eq(1))
-
-    def add_input(self, n, channel):
-        rt_packet = self.rt_packet
-
-        interface = channel.interface.i
-        if interface is None:
-            return
-        data_width = rtlink.get_data_width(interface)
-        fine_ts_width = rtlink.get_fine_ts_width(interface)
-
-        selected = Signal()
-        self.comb += selected.eq(rt_packet.read_channel == n)
-
-        # latency compensation
-        if interface.delay:
-            tsc_comp = Signal.like(self.tsc)
-            self.sync.rtio += tsc_comp.eq(self.tsc - interface.delay + 1)
-        else:
-            tsc_comp = self.tsc
-
-        # FIFO
-        ev_layout = []
-        if data_width:
-            ev_layout.append(("data", data_width))
-        if interface.timestamped:
-            ev_layout.append(("timestamp", len(self.tsc) + fine_ts_width))
-
-        fifo = ClockDomainsRenamer("rio")(
-            SyncFIFOBuffered(layout_len(ev_layout), channel.ififo_depth))
-        self.submodules += fifo
-        fifo_in = Record(ev_layout)
-        fifo_out = Record(ev_layout)
-        self.comb += [
-            fifo.din.eq(fifo_in.raw_bits()),
-            fifo_out.raw_bits().eq(fifo.dout)
-        ]
-
-        # FIFO write
-        if data_width:
-            self.comb += fifo_in.data.eq(interface.data)
-        if interface.timestamped:
-            if fine_ts_width:
-                full_ts = Cat(interface.fine_ts, tsc_comp)
-            else:
-                full_ts = tsc_comp
-            self.comb += fifo_in.timestamp.eq(full_ts)
-        self.comb += fifo.we.eq(interface.stb)
-
-        overflow = Signal()
-        self.comb += If(selected, rt_packet.read_overflow.eq(overflow))
-        self.sync.rio += [
-            If(selected & rt_packet.read_overflow_ack, overflow.eq(0)),
-            If(fifo.we & ~fifo.writable, overflow.eq(1))
-        ]
-
-        # FIFO read
-        if data_width:
-            self.comb += If(selected, rt_packet.read_data.eq(fifo_out.data))
-        if interface.timestamped:
-            self.comb += If(selected, rt_packet.read_timestamp.eq(fifo_out.timestamp))
-        self.comb += [
-            If(selected,
-                rt_packet.read_readable.eq(fifo.readable),
-                fifo.re.eq(rt_packet.read_consume)
-            )
-        ]
diff --git a/artiq/gateware/drtio/rt_packet_master.py b/artiq/gateware/drtio/rt_packet_master.py
index a32004ebd..94dea3944 100644
--- a/artiq/gateware/drtio/rt_packet_master.py
+++ b/artiq/gateware/drtio/rt_packet_master.py
@@ -66,12 +66,12 @@ class RTPacketMaster(Module):
 
         # standard request interface
         #
-        # notwrite=1 address=0  FIFO space request <channel>
+        # notwrite=1 address=0  buffer space request
         # notwrite=1 address=1  read request <channel, timestamp>
         #
         # optimized for write throughput
         # requests are performed on the DRTIO link preserving their order of issue
-        # this is important for FIFO space requests, which have to be ordered
+        # this is important for buffer space requests, which have to be ordered
         # wrt writes.
         self.sr_stb = Signal()
         self.sr_ack = Signal()
@@ -81,10 +81,10 @@ class RTPacketMaster(Module):
         self.sr_address = Signal(16)
         self.sr_data = Signal(512)
 
-        # fifo space reply interface
-        self.fifo_space_not = Signal()
-        self.fifo_space_not_ack = Signal()
-        self.fifo_space = Signal(16)
+        # buffer space reply interface
+        self.buffer_space_not = Signal()
+        self.buffer_space_not_ack = Signal()
+        self.buffer_space = Signal(16)
 
         # read reply interface
         self.read_not = Signal()
@@ -209,11 +209,11 @@ class RTPacketMaster(Module):
             )
 
         # CDC
-        fifo_space_not = Signal()
-        fifo_space = Signal(16)
+        buffer_space_not = Signal()
+        buffer_space = Signal(16)
         self.submodules += _CrossDomainNotification("rtio_rx",
-            fifo_space_not, fifo_space,
-            self.fifo_space_not, self.fifo_space_not_ack, self.fifo_space)
+            buffer_space_not, buffer_space,
+            self.buffer_space_not, self.buffer_space_not_ack, self.buffer_space)
 
         set_time_stb = Signal()
         set_time_ack = Signal()
@@ -274,7 +274,7 @@ class RTPacketMaster(Module):
             If(sr_buf_readable,
                 If(sr_notwrite,
                     Case(sr_address[0], {
-                        0: NextState("FIFO_SPACE"),
+                        0: NextState("BUFFER_SPACE"),
                         1: NextState("READ")
                     }),
                 ).Else(
@@ -316,8 +316,8 @@ class RTPacketMaster(Module):
                 NextState("IDLE")
             )
         )
-        tx_fsm.act("FIFO_SPACE",
-            tx_dp.send("fifo_space_request", channel=sr_channel),
+        tx_fsm.act("BUFFER_SPACE",
+            tx_dp.send("buffer_space_request"),
             If(tx_dp.packet_last,
                 sr_buf_re.eq(1),
                 NextState("IDLE")
@@ -369,7 +369,7 @@ class RTPacketMaster(Module):
                 If(rx_dp.packet_last,
                     Case(rx_dp.packet_type, {
                         rx_plm.types["echo_reply"]: echo_received_now.eq(1),
-                        rx_plm.types["fifo_space_reply"]: NextState("FIFO_SPACE"),
+                        rx_plm.types["buffer_space_reply"]: NextState("BUFFER_SPACE"),
                         rx_plm.types["read_reply"]: NextState("READ_REPLY"),
                         rx_plm.types["read_reply_noevent"]: NextState("READ_REPLY_NOEVENT"),
                         "default": err_unknown_packet_type.i.eq(1)
@@ -382,9 +382,9 @@ class RTPacketMaster(Module):
                 err_packet_truncated.i.eq(1)
             )
         )
-        rx_fsm.act("FIFO_SPACE",
-            fifo_space_not.eq(1),
-            fifo_space.eq(rx_dp.packet_as["fifo_space_reply"].space),
+        rx_fsm.act("BUFFER_SPACE",
+            buffer_space_not.eq(1),
+            buffer_space.eq(rx_dp.packet_as["buffer_space_reply"].space),
             NextState("INPUT")
         )
         rx_fsm.act("READ_REPLY",
diff --git a/artiq/gateware/drtio/rt_packet_satellite.py b/artiq/gateware/drtio/rt_packet_satellite.py
index 249f8ca27..47b9a1d55 100644
--- a/artiq/gateware/drtio/rt_packet_satellite.py
+++ b/artiq/gateware/drtio/rt_packet_satellite.py
@@ -3,6 +3,7 @@
 from migen import *
 from migen.genlib.fsm import *
 
+from artiq.gateware.rtio import cri
 from artiq.gateware.drtio.rt_serializer import *
 
 
@@ -13,30 +14,11 @@ class RTPacketSatellite(Module):
 
         self.tsc_load = Signal()
         self.tsc_load_value = Signal(64)
-        self.tsc_input = Signal(64)
 
         self.reset = Signal(reset=1)
         self.reset_phy = Signal(reset=1)
 
-        self.fifo_space_channel = Signal(16)
-        self.fifo_space_update = Signal()
-        self.fifo_space = Signal(16)
-
-        # write parameters are stable one cycle before stb is asserted,
-        # and when stb is asserted.
-        self.write_stb = Signal()
-        self.write_timestamp = Signal(64)
-        self.write_channel = Signal(16)
-        self.write_address = Signal(16)
-        self.write_data = Signal(512)
-
-        self.read_channel = Signal(16)
-        self.read_readable = Signal()
-        self.read_consume = Signal()
-        self.read_data = Signal(32)
-        self.read_timestamp = Signal(64)
-        self.read_overflow = Signal()
-        self.read_overflow_ack = Signal()
+        self.cri = cri.Interface()
 
         # # #
 
@@ -69,27 +51,49 @@ class RTPacketSatellite(Module):
 
         # RX->TX
         echo_req = Signal()
-        fifo_space_set = Signal()
-        fifo_space_req = Signal()
-        fifo_space_ack = Signal()
+        buffer_space_set = Signal()
+        buffer_space_req = Signal()
+        buffer_space_ack = Signal()
         self.sync += [
-            If(fifo_space_ack, fifo_space_req.eq(0)),
-            If(fifo_space_set, fifo_space_req.eq(1)),
+            If(buffer_space_ack, buffer_space_req.eq(0)),
+            If(buffer_space_set, buffer_space_req.eq(1)),
+        ]
+
+        buffer_space_update = Signal()
+        buffer_space = Signal(16)
+        self.sync += If(buffer_space_update, buffer_space.eq(self.cri.o_buffer_space))
+
+        load_read_request = Signal()
+        clear_read_request = Signal()
+        read_request_pending = Signal()
+        self.sync += [
+            If(clear_read_request | self.reset,
+                read_request_pending.eq(0)
+            ),
+            If(load_read_request,
+                read_request_pending.eq(1),
+            )
         ]
 
         # RX FSM
+        read = Signal()
         self.comb += [
             self.tsc_load_value.eq(
                 rx_dp.packet_as["set_time"].timestamp),
-            self.fifo_space_channel.eq(
-                rx_dp.packet_as["fifo_space_request"].channel),
-            self.write_timestamp.eq(
-                rx_dp.packet_as["write"].timestamp),
-            self.write_channel.eq(
-                rx_dp.packet_as["write"].channel),
-            self.write_address.eq(
+            If(load_read_request | read_request_pending,
+                self.cri.chan_sel.eq(
+                    rx_dp.packet_as["read_request"].channel),
+                self.cri.timestamp.eq(
+                    rx_dp.packet_as["read_request"].timeout)
+            ).Else(
+                self.cri.chan_sel.eq(
+                    rx_dp.packet_as["write"].channel),
+                self.cri.timestamp.eq(
+                    rx_dp.packet_as["write"].timestamp)
+            ),
+            self.cri.o_address.eq(
                 rx_dp.packet_as["write"].address),
-            self.write_data.eq(
+            self.cri.o_data.eq(
                 Cat(rx_dp.packet_as["write"].short_data, write_data_buffer)),
         ]
 
@@ -100,26 +104,6 @@ class RTPacketSatellite(Module):
             self.reset_phy.eq(reset_phy)
         ]
 
-        load_read_request = Signal()
-        clear_read_request = Signal()
-        read_request_pending = Signal()
-        read_request_time_limit = Signal(64)
-        read_request_timeout = Signal()
-        read_request_wait = Signal()  # 1 cycle latency channel→(data,overflow) and time_limit→timeout
-        self.sync += [
-            If(clear_read_request | self.reset,
-                read_request_pending.eq(0)
-            ),
-            read_request_wait.eq(0),
-            If(load_read_request,
-                read_request_pending.eq(1),
-                read_request_wait.eq(1),
-                self.read_channel.eq(rx_dp.packet_as["read_request"].channel),
-                read_request_time_limit.eq(rx_dp.packet_as["read_request"].timeout)
-            ),
-            read_request_timeout.eq(self.tsc_input >= read_request_time_limit),
-        ]
-
         rx_fsm = FSM(reset_state="INPUT")
         self.submodules += rx_fsm
 
@@ -138,7 +122,7 @@ class RTPacketSatellite(Module):
                         rx_plm.types["set_time"]: NextState("SET_TIME"),
                         rx_plm.types["reset"]: NextState("RESET"),
                         rx_plm.types["write"]: NextState("WRITE"),
-                        rx_plm.types["fifo_space_request"]: NextState("FIFO_SPACE"),
+                        rx_plm.types["buffer_space_request"]: NextState("BUFFER_SPACE"),
                         rx_plm.types["read_request"]: NextState("READ_REQUEST"),
                         "default": self.unknown_packet_type.eq(1)
                     })
@@ -165,7 +149,7 @@ class RTPacketSatellite(Module):
 
         rx_fsm.act("WRITE",
             If(write_data_buffer_cnt == rx_dp.packet_as["write"].extra_data_cnt,
-                self.write_stb.eq(1),
+                self.cri.cmd.eq(cri.commands["write"]),
                 NextState("INPUT")
             ).Else(
                 write_data_buffer_load.eq(1),
@@ -175,14 +159,15 @@ class RTPacketSatellite(Module):
                 )
             )
         )
-        rx_fsm.act("FIFO_SPACE",
-            fifo_space_set.eq(1),
-            self.fifo_space_update.eq(1),
+        rx_fsm.act("BUFFER_SPACE",
+            buffer_space_set.eq(1),
+            buffer_space_update.eq(1),
             NextState("INPUT")
         )
 
         rx_fsm.act("READ_REQUEST",
             load_read_request.eq(1),
+            self.cri.cmd.eq(cri.commands["read"]),
             NextState("INPUT")
         )
 
@@ -192,11 +177,11 @@ class RTPacketSatellite(Module):
 
         tx_fsm.act("IDLE",
             If(echo_req, NextState("ECHO")),
-            If(fifo_space_req, NextState("FIFO_SPACE")),
-            If(~read_request_wait & read_request_pending,
-                If(read_request_timeout, NextState("READ_TIMEOUT")),
-                If(self.read_overflow, NextState("READ_OVERFLOW")),
-                If(self.read_readable, NextState("READ"))
+            If(buffer_space_req, NextState("BUFFER_SPACE")),
+            If(read_request_pending,
+                If(~self.cri.i_status[2], NextState("READ")),
+                If(self.cri.i_status[0], NextState("READ_TIMEOUT")),
+                If(self.cri.i_status[1], NextState("READ_OVERFLOW"))
             )
         )
 
@@ -205,9 +190,9 @@ class RTPacketSatellite(Module):
             If(tx_dp.packet_last, NextState("IDLE"))
         )
 
-        tx_fsm.act("FIFO_SPACE",
-            fifo_space_ack.eq(1),
-            tx_dp.send("fifo_space_reply", space=self.fifo_space),
+        tx_fsm.act("BUFFER_SPACE",
+            buffer_space_ack.eq(1),
+            tx_dp.send("buffer_space_reply", space=buffer_space),
             If(tx_dp.packet_last, NextState("IDLE"))
         )
 
@@ -220,17 +205,15 @@ class RTPacketSatellite(Module):
             tx_dp.send("read_reply_noevent", overflow=1),
             clear_read_request.eq(1),
             If(tx_dp.packet_last,
-                self.read_overflow_ack.eq(1),
                 NextState("IDLE")
             )
         )
         tx_fsm.act("READ",
             tx_dp.send("read_reply",
-                       timestamp=self.read_timestamp,
-                       data=self.read_data),
+                       timestamp=self.cri.i_timestamp,
+                       data=self.cri.i_data),
             clear_read_request.eq(1),
             If(tx_dp.packet_last,
-                self.read_consume.eq(1),
                 NextState("IDLE")
             )
         )
diff --git a/artiq/gateware/drtio/rt_serializer.py b/artiq/gateware/drtio/rt_serializer.py
index df1e2e5c6..8ba5668fc 100644
--- a/artiq/gateware/drtio/rt_serializer.py
+++ b/artiq/gateware/drtio/rt_serializer.py
@@ -18,7 +18,7 @@ class PacketLayoutManager:
         self.layouts = dict()
         self.types = dict()
         self.type_names = dict()
-    
+
     def add_type(self, name, *fields, pad=True):
         type_n = len(self.types)
         self.types[name] = type_n
@@ -54,7 +54,7 @@ def get_m2s_layouts(alignment):
                           ("address", 16),
                           ("extra_data_cnt", 8),
                           ("short_data", short_data_len))
-    plm.add_type("fifo_space_request", ("channel", 16))
+    plm.add_type("buffer_space_request")
 
     plm.add_type("read_request", ("channel", 16), ("timeout", 64))
 
@@ -66,7 +66,7 @@ def get_s2m_layouts(alignment):
 
     plm.add_type("echo_reply")
 
-    plm.add_type("fifo_space_reply", ("space", 16))
+    plm.add_type("buffer_space_reply", ("space", 16))
 
     plm.add_type("read_reply", ("timestamp", 64), ("data", 32))
     plm.add_type("read_reply_noevent", ("overflow", 1))  # overflow=0→timeout
@@ -110,7 +110,7 @@ class ReceiveDatapath(Module):
         packet_buffer_count = Signal(max=w_in_packet+1)
         self.sync += \
             If(self.packet_buffer_load,
-                Case(packet_buffer_count, 
+                Case(packet_buffer_count,
                      {i: packet_buffer[i*ws:(i+1)*ws].eq(self.data_r)
                       for i in range(w_in_packet)}),
                 packet_buffer_count.eq(packet_buffer_count + 1)
diff --git a/artiq/gateware/rtio/__init__.py b/artiq/gateware/rtio/__init__.py
index 18feec299..718f2bc7f 100644
--- a/artiq/gateware/rtio/__init__.py
+++ b/artiq/gateware/rtio/__init__.py
@@ -1,5 +1,6 @@
 from artiq.gateware.rtio.cri import KernelInitiator, CRIInterconnectShared
-from artiq.gateware.rtio.core import Channel, LogChannel, Core
+from artiq.gateware.rtio.channel import Channel, LogChannel
+from artiq.gateware.rtio.core import Core
 from artiq.gateware.rtio.analyzer import Analyzer
 from artiq.gateware.rtio.moninj import MonInj
 from artiq.gateware.rtio.dma import DMA
diff --git a/artiq/gateware/rtio/analyzer.py b/artiq/gateware/rtio/analyzer.py
index 2461f80c1..f958ca8d6 100644
--- a/artiq/gateware/rtio/analyzer.py
+++ b/artiq/gateware/rtio/analyzer.py
@@ -94,10 +94,6 @@ class MessageEncoder(Module, AutoCSR):
                 exception_stb.eq(1),
                 exception.exception_type.eq(ExceptionType.o_underflow.value)
             ),
-            If(just_written & cri.o_status[2],
-                exception_stb.eq(1),
-                exception.exception_type.eq(ExceptionType.o_sequence_error.value)
-            ),
             If(read_overflow,
                 exception_stb.eq(1),
                 exception.exception_type.eq(ExceptionType.i_overflow.value)
diff --git a/artiq/gateware/rtio/cdc.py b/artiq/gateware/rtio/cdc.py
index af93b4105..e5f75fba3 100644
--- a/artiq/gateware/rtio/cdc.py
+++ b/artiq/gateware/rtio/cdc.py
@@ -2,7 +2,7 @@ from migen import *
 from migen.genlib.cdc import *
 
 
-__all__ = ["GrayCodeTransfer", "RTIOCounter", "BlindTransfer"]
+__all__ = ["GrayCodeTransfer", "BlindTransfer"]
 
 
 # note: transfer is in rtio/sys domains and not affected by the reset CSRs
@@ -28,28 +28,15 @@ class GrayCodeTransfer(Module):
         self.sync += self.o.eq(value_sys)
 
 
-class RTIOCounter(Module):
-    def __init__(self, width):
-        self.width = width
-        # Timestamp counter in RTIO domain
-        self.value_rtio = Signal(width)
-        # Timestamp counter resynchronized to sys domain
-        # Lags behind value_rtio, monotonic and glitch-free
-        self.value_sys = Signal(width)
-
-        # # #
-
-        # note: counter is in rtio domain and never affected by the reset CSRs
-        self.sync.rtio += self.value_rtio.eq(self.value_rtio + 1)
-        gt = GrayCodeTransfer(width)
-        self.submodules += gt
-        self.comb += gt.i.eq(self.value_rtio), self.value_sys.eq(gt.o)
-
-
 class BlindTransfer(Module):
-    def __init__(self, idomain="rio", odomain="rsys"):
+    def __init__(self, idomain="rio", odomain="rsys", data_width=0):
         self.i = Signal()
         self.o = Signal()
+        if data_width:
+            self.data_i = Signal(data_width)
+            self.data_o = Signal(data_width)
+
+        # # #
 
         ps = PulseSynchronizer(idomain, odomain)
         ps_ack = PulseSynchronizer(odomain, idomain)
@@ -65,3 +52,10 @@ class BlindTransfer(Module):
             ps_ack.i.eq(ps.o),
             self.o.eq(ps.o)
         ]
+
+        if data_width:
+            bxfer_data = Signal(data_width)
+            isync += If(ps.i, bxfer_data.eq(self.data_i))
+            bxfer_data.attr.add("no_retiming")
+            self.specials += MultiReg(bxfer_data, self.data_o,
+                                      odomain=odomain)
diff --git a/artiq/gateware/rtio/channel.py b/artiq/gateware/rtio/channel.py
new file mode 100644
index 000000000..5e85a1add
--- /dev/null
+++ b/artiq/gateware/rtio/channel.py
@@ -0,0 +1,36 @@
+import warnings
+
+from artiq.gateware.rtio import rtlink
+
+
+class Channel:
+    def __init__(self, interface, probes=None, overrides=None,
+                 ofifo_depth=None, ififo_depth=64):
+        if probes is None:
+            probes = []
+        if overrides is None:
+            overrides = []
+
+        self.interface = interface
+        self.probes = probes
+        self.overrides = overrides
+        if ofifo_depth is None:
+            ofifo_depth = 64
+        else:
+            warnings.warn("ofifo_depth is deprecated", DeprecationWarning)
+        self.ofifo_depth = ofifo_depth
+        self.ififo_depth = ififo_depth
+
+    @classmethod
+    def from_phy(cls, phy, **kwargs):
+        probes = getattr(phy, "probes", [])
+        overrides = getattr(phy, "overrides", [])
+        return cls(phy.rtlink, probes, overrides, **kwargs)
+
+
+class LogChannel:
+    """A degenerate channel used to log messages into the analyzer."""
+    def __init__(self):
+        self.interface = rtlink.Interface(rtlink.OInterface(32))
+        self.probes = []
+        self.overrides = []
diff --git a/artiq/gateware/rtio/core.py b/artiq/gateware/rtio/core.py
index 76bcd09ee..c2bee2947 100644
--- a/artiq/gateware/rtio/core.py
+++ b/artiq/gateware/rtio/core.py
@@ -2,302 +2,31 @@ from functools import reduce
 from operator import and_
 
 from migen import *
-from migen.genlib.record import Record
-from migen.genlib.fifo import AsyncFIFO
 from migen.genlib.resetsync import AsyncResetSynchronizer
 from misoc.interconnect.csr import *
 
-from artiq.gateware.rtio import cri, rtlink
+from artiq.gateware.rtio import cri
+from artiq.gateware.rtio import rtlink
+from artiq.gateware.rtio.channel import *
 from artiq.gateware.rtio.cdc import *
-
-
-# CHOOSING A GUARD TIME
-#
-# The buffer must be transferred to the FIFO soon enough to account for:
-#    * transfer of counter to sys domain: Tio + 2*Tsys + Tsys
-#    * FIFO latency: Tsys + 2*Tio
-#    * FIFO buffer latency: Tio
-# Therefore we must choose:
-#    guard_io_cycles > (4*Tio + 4*Tsys)/Tio
-#
-# We are writing to the FIFO from the buffer when the guard time has been
-# reached. This can fill the FIFO and deassert the writable flag. A race
-# condition occurs that causes problems if the deassertion happens between
-# the CPU checking the writable flag (and reading 1) and writing a new event.
-#
-# When the FIFO is about to be full, it contains fifo_depth-1 events of
-# strictly increasing timestamps.
-#
-# Thus the FIFO-filling event's timestamp must satisfy:
-#    timestamp*Tio > (fifo_depth-1)*Tio + time
-# We also have (guard time reached):
-#    timestamp*Tio < time + guard_io_cycles*Tio
-# [NB: time > counter.value_sys*Tio]
-# Thus we must have:
-#    guard_io_cycles > fifo_depth-1
-#
-# We can prevent overflows by choosing instead:
-#    guard_io_cycles < fifo_depth-1
-
-class _OutputManager(Module):
-    def __init__(self, interface, counter, fifo_depth, guard_io_cycles):
-        data_width = rtlink.get_data_width(interface)
-        address_width = rtlink.get_address_width(interface)
-        fine_ts_width = rtlink.get_fine_ts_width(interface)
-
-        ev_layout = []
-        if data_width:
-            ev_layout.append(("data", data_width))
-        if address_width:
-            ev_layout.append(("address", address_width))
-        ev_layout.append(("timestamp", counter.width + fine_ts_width))
-        # ev must be valid 1 cycle before we to account for the latency in
-        # generating replace, sequence_error and collision
-        self.ev = Record(ev_layout)
-
-        self.writable = Signal()
-        self.we = Signal()  # maximum throughput 1/2
-
-        self.underflow = Signal()  # valid 1 cycle after we, pulsed
-        self.sequence_error = Signal()
-        self.collision = Signal()
-        self.busy = Signal()  # pulsed
-
-        # # #
-
-        # FIFO
-        fifo = ClockDomainsRenamer({"write": "rsys", "read": "rio"})(
-            AsyncFIFO(layout_len(ev_layout), fifo_depth))
-        self.submodules += fifo
-        fifo_in = Record(ev_layout)
-        fifo_out = Record(ev_layout)
-        self.comb += [
-            fifo.din.eq(fifo_in.raw_bits()),
-            fifo_out.raw_bits().eq(fifo.dout)
-        ]
-
-        # Buffer
-        buf_pending = Signal()
-        buf = Record(ev_layout)
-        buf_just_written = Signal()
-
-        # Special cases
-        replace = Signal(reset_less=True)
-        sequence_error = Signal(reset_less=True)
-        collision = Signal(reset_less=True)
-        any_error = Signal()
-        if interface.enable_replace:
-            # Note: replace may be asserted at the same time as collision
-            # when addresses are different. In that case, it is a collision.
-            self.sync.rsys += replace.eq(self.ev.timestamp == buf.timestamp)
-        # Detect sequence errors on coarse timestamps only
-        # so that they are mutually exclusive with collision errors.
-        self.sync.rsys += sequence_error.eq(self.ev.timestamp[fine_ts_width:] <
-                                            buf.timestamp[fine_ts_width:])
-        if interface.enable_replace:
-            if address_width:
-                different_addresses = self.ev.address != buf.address
-            else:
-                different_addresses = 0
-            if fine_ts_width:
-                self.sync.rsys += collision.eq(
-                    (self.ev.timestamp[fine_ts_width:] == buf.timestamp[fine_ts_width:])
-                    & ((self.ev.timestamp[:fine_ts_width] != buf.timestamp[:fine_ts_width])
-                       |different_addresses))
-            else:
-                self.sync.rsys += collision.eq(
-                    (self.ev.timestamp == buf.timestamp) & different_addresses)
-        else:
-            self.sync.rsys += collision.eq(
-                self.ev.timestamp[fine_ts_width:] == buf.timestamp[fine_ts_width:])
-        self.comb += [
-            any_error.eq(sequence_error | collision),
-            self.sequence_error.eq(self.we & sequence_error),
-            self.collision.eq(self.we & collision)
-        ]
-
-        # Buffer read and FIFO write
-        self.comb += fifo_in.eq(buf)
-        in_guard_time = Signal()
-        self.comb += in_guard_time.eq(
-            buf.timestamp[fine_ts_width:]
-                < counter.value_sys + guard_io_cycles)
-        self.sync.rsys += If(in_guard_time, buf_pending.eq(0))
-        self.comb += \
-            If(buf_pending,
-                If(in_guard_time,
-                    If(buf_just_written,
-                        self.underflow.eq(1)
-                    ).Else(
-                        fifo.we.eq(1)
-                    )
-                ),
-                If(self.we & ~replace & ~any_error,
-                   fifo.we.eq(1)
-                )
-            )
-
-        # Buffer write
-        # Must come after read to handle concurrent read+write properly
-        self.sync.rsys += [
-            buf_just_written.eq(0),
-            If(self.we & ~any_error,
-                buf_just_written.eq(1),
-                buf_pending.eq(1),
-                buf.eq(self.ev)
-            )
-        ]
-        self.comb += self.writable.eq(fifo.writable)
-
-        # Buffer output of FIFO to improve timing
-        dout_stb = Signal()
-        dout_ack = Signal()
-        dout = Record(ev_layout)
-        self.sync.rio += \
-            If(fifo.re,
-                dout_stb.eq(1),
-                dout.eq(fifo_out)
-            ).Elif(dout_ack,
-                dout_stb.eq(0)
-            )
-        self.comb += fifo.re.eq(fifo.readable & (~dout_stb | dout_ack))
-
-        # latency compensation
-        if interface.delay:
-            counter_rtio = Signal.like(counter.value_rtio, reset_less=True)
-            self.sync.rtio += counter_rtio.eq(counter.value_rtio -
-                                              (interface.delay + 1))
-        else:
-            counter_rtio = counter.value_rtio
-
-        # FIFO read through buffer
-        self.comb += [
-            dout_ack.eq(
-                dout.timestamp[fine_ts_width:] == counter_rtio),
-            interface.stb.eq(dout_stb & dout_ack)
-        ]
-
-        busy_transfer = BlindTransfer()
-        self.submodules += busy_transfer
-        self.comb += [
-            busy_transfer.i.eq(interface.stb & interface.busy),
-            self.busy.eq(busy_transfer.o),
-        ]
-
-        if data_width:
-            self.comb += interface.data.eq(dout.data)
-        if address_width:
-            self.comb += interface.address.eq(dout.address)
-        if fine_ts_width:
-            self.comb += interface.fine_ts.eq(dout.timestamp[:fine_ts_width])
-
-
-class _InputManager(Module):
-    def __init__(self, interface, counter, fifo_depth):
-        data_width = rtlink.get_data_width(interface)
-        fine_ts_width = rtlink.get_fine_ts_width(interface)
-
-        ev_layout = []
-        if data_width:
-            ev_layout.append(("data", data_width))
-        if interface.timestamped:
-            ev_layout.append(("timestamp", counter.width + fine_ts_width))
-        self.ev = Record(ev_layout)
-
-        self.readable = Signal()
-        self.re = Signal()
-
-        self.overflow = Signal()  # pulsed
-
-        # # #
-
-        fifo = ClockDomainsRenamer({"read": "rsys", "write": "rio"})(
-            AsyncFIFO(layout_len(ev_layout), fifo_depth))
-        self.submodules += fifo
-        fifo_in = Record(ev_layout)
-        fifo_out = Record(ev_layout)
-        self.comb += [
-            fifo.din.eq(fifo_in.raw_bits()),
-            fifo_out.raw_bits().eq(fifo.dout)
-        ]
-
-        # latency compensation
-        if interface.delay:
-            counter_rtio = Signal.like(counter.value_rtio, reset_less=True)
-            self.sync.rtio += counter_rtio.eq(counter.value_rtio -
-                                              (interface.delay + 1))
-        else:
-            counter_rtio = counter.value_rtio
-
-        # FIFO write
-        if data_width:
-            self.comb += fifo_in.data.eq(interface.data)
-        if interface.timestamped:
-            if fine_ts_width:
-                full_ts = Cat(interface.fine_ts, counter_rtio)
-            else:
-                full_ts = counter_rtio
-            self.comb += fifo_in.timestamp.eq(full_ts)
-        self.comb += fifo.we.eq(interface.stb)
-
-        # FIFO read
-        self.comb += [
-            self.ev.eq(fifo_out),
-            self.readable.eq(fifo.readable),
-            fifo.re.eq(self.re)
-        ]
-
-        overflow_transfer = BlindTransfer()
-        self.submodules += overflow_transfer
-        self.comb += [
-            overflow_transfer.i.eq(fifo.we & ~fifo.writable),
-            self.overflow.eq(overflow_transfer.o),
-        ]
-
-
-class Channel:
-    def __init__(self, interface, probes=None, overrides=None,
-                 ofifo_depth=64, ififo_depth=64):
-        if probes is None:
-            probes = []
-        if overrides is None:
-            overrides = []
-
-        self.interface = interface
-        self.probes = probes
-        self.overrides = overrides
-        self.ofifo_depth = ofifo_depth
-        self.ififo_depth = ififo_depth
-
-    @classmethod
-    def from_phy(cls, phy, **kwargs):
-        probes = getattr(phy, "probes", [])
-        overrides = getattr(phy, "overrides", [])
-        return cls(phy.rtlink, probes, overrides, **kwargs)
-
-
-class LogChannel:
-    """A degenerate channel used to log messages into the analyzer."""
-    def __init__(self):
-        self.interface = rtlink.Interface(rtlink.OInterface(32))
-        self.probes = []
-        self.overrides = []
+from artiq.gateware.rtio.sed.core import *
+from artiq.gateware.rtio.input_collector import *
 
 
 class Core(Module, AutoCSR):
-    def __init__(self, channels, fine_ts_width=None, guard_io_cycles=20):
-        if fine_ts_width is None:
-            fine_ts_width = max(rtlink.get_fine_ts_width(c.interface)
-                                for c in channels)
-
+    def __init__(self, channels, lane_count=8, fifo_depth=128,
+                 glbl_fine_ts_width=None):
         self.cri = cri.Interface()
         self.reset = CSR()
         self.reset_phy = CSR()
-        self.async_error = CSR(2)
+        self.async_error = CSR(3)
+        self.collision_channel = CSRStatus(16)
+        self.busy_channel = CSRStatus(16)
+        self.sequence_error_channel = CSRStatus(16)
 
         # Clocking/Reset
         # Create rsys, rio and rio_phy domains based on sys and rtio
-        # with reset controlled by CRI.
+        # with reset controlled by CSR.
         #
         # The `rio` CD contains logic that is reset with `core.reset()`.
         # That's state that could unduly affect subsequent experiments,
@@ -327,125 +56,78 @@ class Core(Module, AutoCSR):
         self.specials += AsyncResetSynchronizer(self.cd_rio, cmd_reset)
         self.specials += AsyncResetSynchronizer(self.cd_rio_phy, cmd_reset_phy)
 
-        # Managers
-        self.submodules.counter = RTIOCounter(len(self.cri.timestamp) - fine_ts_width)
+        # TSC
+        chan_fine_ts_width = max(max(rtlink.get_fine_ts_width(channel.interface.o)
+                                     for channel in channels),
+                                 max(rtlink.get_fine_ts_width(channel.interface.i)
+                                     for channel in channels))
+        if glbl_fine_ts_width is None:
+            glbl_fine_ts_width = chan_fine_ts_width
+        assert glbl_fine_ts_width >= chan_fine_ts_width
 
-        # Collision is not an asynchronous error with local RTIO, but
-        # we treat it as such for consistency with DRTIO, where collisions
-        # are reported by the satellites.
-        o_underflow = Signal()
-        o_sequence_error = Signal()
+        coarse_ts = Signal(64-glbl_fine_ts_width)
+        self.sync.rtio += coarse_ts.eq(coarse_ts + 1)
+        coarse_ts_cdc = GrayCodeTransfer(len(coarse_ts))
+        self.submodules += coarse_ts_cdc
+        self.comb += [
+            coarse_ts_cdc.i.eq(coarse_ts),
+            self.cri.counter.eq(coarse_ts_cdc.o << glbl_fine_ts_width)
+        ]
+
+        # Outputs/Inputs
+        quash_channels = [n for n, c in enumerate(channels) if isinstance(c, LogChannel)]
+
+        outputs = SED(channels, glbl_fine_ts_width, "async",
+            quash_channels=quash_channels,
+            lane_count=lane_count, fifo_depth=fifo_depth,
+            interface=self.cri)
+        self.submodules += outputs
+        self.comb += outputs.coarse_timestamp.eq(coarse_ts)
+        self.sync += outputs.minimum_coarse_timestamp.eq(coarse_ts + 16)
+
+        inputs = InputCollector(channels, glbl_fine_ts_width, "async",
+            quash_channels=quash_channels,
+            interface=self.cri)
+        self.submodules += inputs
+        self.comb += inputs.coarse_timestamp.eq(coarse_ts)
+
+        # Asynchronous output errors
+        o_collision_sync = BlindTransfer(data_width=16)
+        o_busy_sync = BlindTransfer(data_width=16)
+        self.submodules += o_collision_sync, o_busy_sync
         o_collision = Signal()
         o_busy = Signal()
-        self.sync.rsys += [
-            If(self.cri.cmd == cri.commands["write"],
-                o_underflow.eq(0),
-                o_sequence_error.eq(0),
-            )
-        ]
+        o_sequence_error = Signal()
         self.sync += [
             If(self.async_error.re,
                 If(self.async_error.r[0], o_collision.eq(0)),
                 If(self.async_error.r[1], o_busy.eq(0)),
+                If(self.async_error.r[2], o_sequence_error.eq(0)),
+            ),
+            If(o_collision_sync.o, 
+                o_collision.eq(1),
+                If(~o_collision,
+                    self.collision_channel.status.eq(o_collision_sync.data_o)
+                )
+            ),
+            If(o_busy_sync.o, 
+                o_busy.eq(1),
+                If(~o_busy,
+                    self.busy_channel.status.eq(o_busy_sync.data_o)
+                )
+            ),
+            If(outputs.sequence_error, 
+                o_sequence_error.eq(1),
+                If(~o_sequence_error,
+                    self.sequence_error_channel.status.eq(outputs.sequence_error_channel)
+                )
             )
         ]
+        self.comb += self.async_error.w.eq(Cat(o_collision, o_busy, o_sequence_error))
 
-        o_statuses, i_statuses = [], []
-        i_datas, i_timestamps = [], []
-        i_ack = Signal()
-        sel = self.cri.chan_sel[:16]
-        for n, channel in enumerate(channels):
-            if isinstance(channel, LogChannel):
-                o_statuses.append(1)
-                i_datas.append(0)
-                i_timestamps.append(0)
-                i_statuses.append(0)
-                continue
-
-            selected = Signal()
-            self.comb += selected.eq(sel == n)
-
-            o_manager = _OutputManager(channel.interface.o, self.counter,
-                                       channel.ofifo_depth, guard_io_cycles)
-            self.submodules += o_manager
-
-            if hasattr(o_manager.ev, "data"):
-                self.comb += o_manager.ev.data.eq(self.cri.o_data)
-            if hasattr(o_manager.ev, "address"):
-                self.comb += o_manager.ev.address.eq(self.cri.o_address)
-            ts_shift = len(self.cri.timestamp) - len(o_manager.ev.timestamp)
-            self.comb += o_manager.ev.timestamp.eq(self.cri.timestamp[ts_shift:])
-
-            self.comb += o_manager.we.eq(selected & (self.cri.cmd == cri.commands["write"]))
-
-            self.sync.rsys += [
-                If(o_manager.underflow, o_underflow.eq(1)),
-                If(o_manager.sequence_error, o_sequence_error.eq(1))
-            ]
-            self.sync += [
-                If(o_manager.collision, o_collision.eq(1)),
-                If(o_manager.busy, o_busy.eq(1))
-            ]
-            o_statuses.append(o_manager.writable)
-
-            if channel.interface.i is not None:
-                i_manager = _InputManager(channel.interface.i, self.counter,
-                                          channel.ififo_depth)
-                self.submodules += i_manager
-
-                if hasattr(i_manager.ev, "data"):
-                    i_datas.append(i_manager.ev.data)
-                else:
-                    i_datas.append(0)
-                if channel.interface.i.timestamped:
-                    ts_shift = (len(self.cri.i_timestamp) - len(i_manager.ev.timestamp))
-                    i_timestamps.append(i_manager.ev.timestamp << ts_shift)
-                else:
-                    i_timestamps.append(0)
-
-                overflow = Signal()
-                self.sync.rsys += [
-                    If(selected & i_ack,
-                       overflow.eq(0)),
-                    If(i_manager.overflow,
-                       overflow.eq(1))
-                ]
-                self.comb += i_manager.re.eq(selected & i_ack & ~overflow)
-                i_statuses.append(Cat(i_manager.readable & ~overflow, overflow))
-
-            else:
-                i_datas.append(0)
-                i_timestamps.append(0)
-                i_statuses.append(0)
-
-        o_status_raw = Signal()
         self.comb += [
-            o_status_raw.eq(Array(o_statuses)[sel]),
-            self.cri.o_status.eq(Cat(
-                ~o_status_raw, o_underflow, o_sequence_error)),
-            self.async_error.w.eq(Cat(o_collision, o_busy))
+            o_collision_sync.i.eq(outputs.collision),
+            o_collision_sync.data_i.eq(outputs.collision_channel),
+            o_busy_sync.i.eq(outputs.busy),
+            o_busy_sync.data_i.eq(outputs.busy_channel)
         ]
-
-        i_status_raw = Signal(2)
-        self.comb += i_status_raw.eq(Array(i_statuses)[sel])
-        input_timeout = Signal.like(self.cri.timestamp)
-        input_pending = Signal()
-        self.sync.rsys += [
-            i_ack.eq(0),
-            If(i_ack,
-                self.cri.i_status.eq(Cat(~i_status_raw[0], i_status_raw[1], 0)),
-                self.cri.i_data.eq(Array(i_datas)[sel]),
-                self.cri.i_timestamp.eq(Array(i_timestamps)[sel]),
-            ),
-            If((self.cri.counter >= input_timeout) | (i_status_raw != 0),
-                If(input_pending, i_ack.eq(1)),
-                input_pending.eq(0)
-            ),
-            If(self.cri.cmd == cri.commands["read"],
-                input_timeout.eq(self.cri.timestamp),
-                input_pending.eq(1),
-                self.cri.i_status.eq(0b100)
-            )
-        ]
-
-        self.comb += self.cri.counter.eq(self.counter.value_sys << fine_ts_width)
diff --git a/artiq/gateware/rtio/cri.py b/artiq/gateware/rtio/cri.py
index f282ae307..4885c0219 100644
--- a/artiq/gateware/rtio/cri.py
+++ b/artiq/gateware/rtio/cri.py
@@ -25,8 +25,11 @@ layout = [
     ("o_data", 512, DIR_M_TO_S),
     ("o_address", 16, DIR_M_TO_S),
     # o_status bits:
-    # <0:wait> <1:underflow> <2:sequence_error>
-    ("o_status", 3, DIR_S_TO_M),
+    # <0:wait> <1:underflow>
+    ("o_status", 2, DIR_S_TO_M),
+    # targets may optionally report a pessimistic estimate of the number
+    # of output events that can be written without waiting.
+    ("o_buffer_space", 16, DIR_S_TO_M),
 
     ("i_data", 32, DIR_S_TO_M),
     ("i_timestamp", 64, DIR_S_TO_M),
@@ -35,6 +38,9 @@ layout = [
     # <0> and <1> are mutually exclusive. <1> has higher priority.
     ("i_status", 3, DIR_S_TO_M),
 
+    # value of the timestamp counter transferred into the CRI clock domain.
+    # monotonic, may lag behind the counter in the IO clock domain, but
+    # is never ahead of it.
     ("counter", 64, DIR_S_TO_M)
 ]
 
@@ -55,7 +61,7 @@ class KernelInitiator(Module, AutoCSR):
         self.o_data = CSRStorage(512, write_from_dev=True)
         self.o_address = CSRStorage(16)
         self.o_we = CSR()
-        self.o_status = CSRStatus(3)
+        self.o_status = CSRStatus(2)
 
         self.i_data = CSRStatus(32)
         self.i_timestamp = CSRStatus(64)
diff --git a/artiq/gateware/rtio/dma.py b/artiq/gateware/rtio/dma.py
index ecfb3b5f5..a8b7c9195 100644
--- a/artiq/gateware/rtio/dma.py
+++ b/artiq/gateware/rtio/dma.py
@@ -242,9 +242,7 @@ class TimeOffset(Module, AutoCSR):
 
 class CRIMaster(Module, AutoCSR):
     def __init__(self):
-        self.error_status = CSRStatus(3)  # same encoding as RTIO status
-        self.error_underflow_reset = CSR()
-        self.error_sequence_error_reset = CSR()
+        self.underflow = CSR()
 
         self.error_channel = CSRStatus(24)
         self.error_timestamp = CSRStatus(64)
@@ -256,19 +254,16 @@ class CRIMaster(Module, AutoCSR):
 
         # # #
 
-        error_set = Signal(2)
-        for i, rcsr in enumerate([self.error_underflow_reset, self.error_sequence_error_reset]):
-            # bit 0 is RTIO wait and always 0 here
-            bit = i + 1
-            self.sync += [
-                If(error_set[i],
-                    self.error_status.status[bit].eq(1),
-                    self.error_channel.status.eq(self.sink.channel),
-                    self.error_timestamp.status.eq(self.sink.timestamp),
-                    self.error_address.status.eq(self.sink.address)
-                ),
-                If(rcsr.re, self.error_status.status[bit].eq(0))
-            ]
+        underflow_trigger = Signal()
+        self.sync += [
+            If(underflow_trigger,
+                self.underflow.w.eq(1),
+                self.error_channel.status.eq(self.sink.channel),
+                self.error_timestamp.status.eq(self.sink.timestamp),
+                self.error_address.status.eq(self.sink.address)
+            ),
+            If(self.underflow.re, self.underflow.w.eq(0))
+        ]
 
         self.comb += [
             self.cri.chan_sel.eq(self.sink.channel),
@@ -281,7 +276,7 @@ class CRIMaster(Module, AutoCSR):
         self.submodules += fsm
 
         fsm.act("IDLE",
-            If(self.error_status.status == 0,
+            If(~self.underflow.w,
                 If(self.sink.stb,
                     If(self.sink.eop,
                         # last packet contains dummy data, discard it
@@ -306,16 +301,14 @@ class CRIMaster(Module, AutoCSR):
                 self.sink.ack.eq(1),
                 NextState("IDLE")
             ),
-            If(self.cri.o_status[1], NextState("UNDERFLOW")),
-            If(self.cri.o_status[2], NextState("SEQUENCE_ERROR"))
+            If(self.cri.o_status[1], NextState("UNDERFLOW"))
+        )
+        fsm.act("UNDERFLOW",
+            self.busy.eq(1),
+            underflow_trigger.eq(1),
+            self.sink.ack.eq(1),
+            NextState("IDLE")
         )
-        for n, name in enumerate(["UNDERFLOW", "SEQUENCE_ERROR"]):
-            fsm.act(name,
-                self.busy.eq(1),
-                error_set.eq(1 << n),
-                self.sink.ack.eq(1),
-                NextState("IDLE")
-            )
 
 
 class DMA(Module):
diff --git a/artiq/gateware/rtio/input_collector.py b/artiq/gateware/rtio/input_collector.py
new file mode 100644
index 000000000..d0cba4f50
--- /dev/null
+++ b/artiq/gateware/rtio/input_collector.py
@@ -0,0 +1,140 @@
+from migen import *
+from migen.genlib.record import Record
+from migen.genlib.fifo import *
+
+from artiq.gateware.rtio import cri
+from artiq.gateware.rtio import rtlink
+from artiq.gateware.rtio.cdc import *
+
+
+__all__ = ["InputCollector"]
+
+
+def get_channel_layout(coarse_ts_width, interface):
+    data_width = rtlink.get_data_width(interface)
+    fine_ts_width = rtlink.get_fine_ts_width(interface)
+
+    layout = []
+    if data_width:
+        layout.append(("data", data_width))
+    if interface.timestamped:
+        layout.append(("timestamp", coarse_ts_width + fine_ts_width))
+
+    return layout
+
+
+class InputCollector(Module):
+    def __init__(self, channels, glbl_fine_ts_width, mode, quash_channels=[], interface=None):
+        if interface is None:
+            interface = cri.Interface()
+        self.cri = interface
+        self.coarse_timestamp = Signal(64 - glbl_fine_ts_width)
+
+        # # #
+
+        if mode == "sync":
+            fifo_factory = SyncFIFOBuffered
+            sync_io = self.sync
+            sync_cri = self.sync
+        elif mode == "async":
+            fifo_factory = lambda *args: ClockDomainsRenamer({"write": "rio", "read": "rsys"})(AsyncFIFO(*args))
+            sync_io = self.sync.rio
+            sync_cri = self.sync.rsys
+        else:
+            raise ValueError
+
+        i_statuses, i_datas, i_timestamps = [], [], []
+        i_ack = Signal()
+        sel = self.cri.chan_sel[:16]
+        for n, channel in enumerate(channels):
+            iif = channel.interface.i
+            if iif is None or n in quash_channels:
+                i_datas.append(0)
+                i_timestamps.append(0)
+                i_statuses.append(0)
+                continue
+
+            # FIFO
+            layout = get_channel_layout(len(self.coarse_timestamp), iif)
+            fifo = fifo_factory(layout_len(layout), channel.ififo_depth)
+            self.submodules += fifo
+            fifo_in = Record(layout)
+            fifo_out = Record(layout)
+            self.comb += [
+                fifo.din.eq(fifo_in.raw_bits()),
+                fifo_out.raw_bits().eq(fifo.dout)
+            ]
+
+            # FIFO write
+            if iif.delay:
+                counter_rtio = Signal.like(self.coarse_timestamp, reset_less=True)
+                sync_io += counter_rtio.eq(self.coarse_timestamp - (iif.delay + 1))
+            else:
+                counter_rtio = self.coarse_timestamp
+            if hasattr(fifo_in, "data"):
+                self.comb += fifo_in.data.eq(iif.data)
+            if hasattr(fifo_in, "timestamp"):
+                if hasattr(iif, "fine_ts"):
+                    full_ts = Cat(iif.fine_ts, counter_rtio)
+                else:
+                    full_ts = counter_rtio
+                self.comb += fifo_in.timestamp.eq(full_ts)
+            self.comb += fifo.we.eq(iif.stb)
+
+            overflow_io = Signal()
+            self.comb += overflow_io.eq(fifo.we & ~fifo.writable)
+            if mode == "sync":
+                overflow_trigger = overflow_io
+            elif mode == "async":
+                overflow_transfer = BlindTransfer()
+                self.submodules += overflow_transfer
+                self.comb += overflow_transfer.i.eq(overflow_io)
+                overflow_trigger = overflow_transfer.o
+            else:
+                raise ValueError
+
+            # FIFO read, CRI connection
+            if hasattr(fifo_out, "data"):
+                i_datas.append(fifo_out.data)
+            else:
+                i_datas.append(0)
+            if hasattr(fifo_out, "timestamp"):
+                ts_shift = 64 - len(fifo_out.timestamp)
+                i_timestamps.append(fifo_out.timestamp << ts_shift)
+            else:
+                i_timestamps.append(0)
+
+            selected = Signal()
+            self.comb += selected.eq(sel == n)
+
+            overflow = Signal()
+            sync_cri += [
+                If(selected & i_ack,
+                    overflow.eq(0)),
+                If(overflow_trigger,
+                    overflow.eq(1))
+            ]
+            self.comb += fifo.re.eq(selected & i_ack & ~overflow)
+            i_statuses.append(Cat(fifo.readable & ~overflow, overflow))
+
+        i_status_raw = Signal(2)
+        self.comb += i_status_raw.eq(Array(i_statuses)[sel])
+        input_timeout = Signal.like(self.cri.timestamp)
+        input_pending = Signal()
+        sync_cri += [
+            i_ack.eq(0),
+            If(i_ack,
+                self.cri.i_status.eq(Cat(~i_status_raw[0], i_status_raw[1], 0)),
+                self.cri.i_data.eq(Array(i_datas)[sel]),
+                self.cri.i_timestamp.eq(Array(i_timestamps)[sel]),
+            ),
+            If((self.cri.counter >= input_timeout) | (i_status_raw != 0),
+                If(input_pending, i_ack.eq(1)),
+                input_pending.eq(0)
+            ),
+            If(self.cri.cmd == cri.commands["read"],
+                input_timeout.eq(self.cri.timestamp),
+                input_pending.eq(1),
+                self.cri.i_status.eq(0b100)
+            )
+        ]
diff --git a/artiq/gateware/rtio/rtlink.py b/artiq/gateware/rtio/rtlink.py
index a4fb3ebf9..06cc0ebf2 100644
--- a/artiq/gateware/rtio/rtlink.py
+++ b/artiq/gateware/rtio/rtlink.py
@@ -69,14 +69,13 @@ class Interface:
 
 
 def _get_or_zero(interface, attr):
-    if isinstance(interface, Interface):
-        return max(_get_or_zero(interface.i, attr),
-                   _get_or_zero(interface.o, attr))
+    if interface is None:
+        return 0
+    assert isinstance(interface, (OInterface, IInterface))
+    if hasattr(interface, attr):
+        return len(getattr(interface, attr))
     else:
-        if hasattr(interface, attr):
-            return len(getattr(interface, attr))
-        else:
-            return 0
+        return 0
 
 
 def get_data_width(interface):
diff --git a/artiq/gateware/rtio/sed/__init__.py b/artiq/gateware/rtio/sed/__init__.py
new file mode 100644
index 000000000..d0055b52a
--- /dev/null
+++ b/artiq/gateware/rtio/sed/__init__.py
@@ -0,0 +1,57 @@
+"""
+The traditional RTIO system used one dedicated FIFO per output channel. While this architecture
+is simple and appropriate for ARTIQ systems that were rather small and simple, it shows limitations
+on more complex ones. By decreasing importance:
+* with DRTIO, the master needed to keep track, for each FIFO in each satellite, of a lower bound on
+the number of available entries plus the last timestamp written. The timestamp is stored in order
+to detect sequence errors rapidly (and allow precise exceptions without compromising performance).
+When many satellites are involved, especially with DRTIO switches, the storage requirements become
+prohibitive.
+* with many channels in one device, the large muxes and the error detection logic that
+can handle all the FIFOs make timing closure problematic.
+* with many channels in one device, the FIFOs waste FPGA space, as they are never all filled at the
+same time.
+
+The scalable event dispatcher (SED) addresses those issues:
+* only one lower bound on the available entries needs to be stored per satellite device for flow
+control purposes (called "buffer space"). Most sequence errors no longer exist (non-increasing
+timestamps into one channel are permitted to an extent) so rapid detection of them is no longer
+required.
+* the events can be demultiplexed to the different channels using pipeline stages that ease timing.
+* only a few FIFOs are required and they are shared between the channels.
+
+The SED core contains a configurable number of FIFOs that hold the usual information about RTIO
+events (timestamp, address, data), the channel number, and a sequence number. The sequence number is
+increased for each event submitted.
+
+When an event is submitted, it is written into the current FIFO if its timestamp is strictly
+increasing. Otherwise, the current FIFO number is incremented by one (and wraps around, if the
+current FIFO was the last) and the event is written there, unless that FIFO already contains an
+event with a greater timestamp. In that case, an asynchronous error is reported. If the destination
+FIFO is full, the submitter is blocked.
+
+In order to help spread events among FIFOs and maximize buffering, the SED core may optionally
+also switch to the next FIFO after the current FIFO has been full.
+
+At the output of the FIFOs, the events are distributed to the channels and simultaneous events on
+the same channel are handled using a structure similar to an odd-even merge-sort network that sorts
+by channel. When there are simultaneous events on the same channel, the event with the highest
+sequence number is kept and a flag is raised to indicate that a replacement occurred on that
+channel. If a replacement was made on a channel that has replacements disabled, the final
+event is dropped and a collision error is reported asynchronously.
+
+Underflow errors are detected as before by comparing the event timestamp with the current value of
+the counter, and dropping events that do not have enough time to make it through the system.
+
+The sequence number is sized to be able to represent the combined capacity of all FIFOs, plus
+2 bits that allow the detection of wrap-arounds.
+
+The maximum number of simultaneous events (on different channels), and the maximum number of active
+timeline "rewinds", are equal to the number of FIFOs.
+
+The SED logic supports both synchronous and asynchronous FIFOs, which are used respectively for local
+RTIO and DRTIO.
+
+To implement flow control in DRTIO, the master queries the satellite for buffer space. The satellite
+uses as buffer space the space available in its fullest FIFO.
+"""
diff --git a/artiq/gateware/rtio/sed/core.py b/artiq/gateware/rtio/sed/core.py
new file mode 100644
index 000000000..ca757e976
--- /dev/null
+++ b/artiq/gateware/rtio/sed/core.py
@@ -0,0 +1,102 @@
+from migen import *
+
+from artiq.gateware.rtio.sed import layouts
+from artiq.gateware.rtio.sed.lane_distributor import *
+from artiq.gateware.rtio.sed.fifos import *
+from artiq.gateware.rtio.sed.gates import *
+from artiq.gateware.rtio.sed.output_driver import *
+
+
+__all__ = ["SED"]
+
+
+class SED(Module):
+    def __init__(self, channels, glbl_fine_ts_width, mode,
+                 lane_count=8, fifo_depth=128, enable_spread=True,
+                 quash_channels=[], report_buffer_space=False, interface=None):
+        if mode == "sync":
+            lane_dist_cdr = lambda x: x
+            fifos_cdr = lambda x: x
+            gates_cdr = lambda x: x
+            output_driver_cdr = lambda x: x
+        elif mode == "async":
+            lane_dist_cdr = ClockDomainsRenamer("rsys")
+            fifos_cdr = ClockDomainsRenamer({"write": "rsys", "read": "rio"})
+            gates_cdr = ClockDomainsRenamer("rio")
+            output_driver_cdr = ClockDomainsRenamer("rio")
+        else:
+            raise ValueError
+
+        seqn_width = layouts.seqn_width(lane_count, fifo_depth)
+
+        self.submodules.lane_dist = lane_dist_cdr(
+            LaneDistributor(lane_count, seqn_width,
+                            layouts.fifo_payload(channels),
+                            [channel.interface.o.delay for channel in channels],
+                            glbl_fine_ts_width,
+                            enable_spread=enable_spread,
+                            quash_channels=quash_channels,
+                            interface=interface))
+        self.submodules.fifos = fifos_cdr(
+            FIFOs(lane_count, fifo_depth,
+                  layouts.fifo_payload(channels), mode, report_buffer_space))
+        self.submodules.gates = gates_cdr(
+            Gates(lane_count, seqn_width,
+                  layouts.fifo_payload(channels),
+                  layouts.output_network_payload(channels, glbl_fine_ts_width)))
+        self.submodules.output_driver = output_driver_cdr(
+            OutputDriver(channels, glbl_fine_ts_width, lane_count, seqn_width))
+
+        for o, i in zip(self.lane_dist.output, self.fifos.input):
+            self.comb += o.connect(i)
+        for o, i in zip(self.fifos.output, self.gates.input):
+            self.comb += o.connect(i)
+        for o, i in zip(self.gates.output, self.output_driver.input):
+            self.comb += i.eq(o)
+
+        if report_buffer_space:
+            self.comb += self.cri.o_buffer_space.eq(self.fifos.buffer_space)
+
+    @property
+    def cri(self):
+        return self.lane_dist.cri
+
+    # in CRI clock domain
+    @property
+    def minimum_coarse_timestamp(self):
+        return self.lane_dist.minimum_coarse_timestamp
+
+    # in I/O clock domain
+    @property
+    def coarse_timestamp(self):
+        return self.gates.coarse_timestamp
+
+    # in CRI clock domain
+    @property
+    def sequence_error(self):
+        return self.lane_dist.sequence_error
+
+    # in CRI clock domain
+    @property
+    def sequence_error_channel(self):
+        return self.lane_dist.sequence_error_channel
+
+    # in I/O clock domain
+    @property
+    def collision(self):
+        return self.output_driver.collision
+
+    # in I/O clock domain
+    @property
+    def collision_channel(self):
+        return self.output_driver.collision_channel
+
+    # in I/O clock domain
+    @property
+    def busy(self):
+        return self.output_driver.busy
+
+    # in I/O clock domain
+    @property
+    def busy_channel(self):
+        return self.output_driver.busy_channel
diff --git a/artiq/gateware/rtio/sed/fifos.py b/artiq/gateware/rtio/sed/fifos.py
new file mode 100644
index 000000000..f056e1a69
--- /dev/null
+++ b/artiq/gateware/rtio/sed/fifos.py
@@ -0,0 +1,84 @@
+from operator import or_
+from functools import reduce
+
+from migen import *
+from migen.genlib.fifo import *
+
+from artiq.gateware.rtio.sed import layouts
+
+
+__all__ = ["FIFOs"]
+
+
+class FIFOs(Module):  # one FIFO per lane; every entry carries the event's seqn together with its payload
+    def __init__(self, lane_count, fifo_depth, layout_payload, mode, report_buffer_space=False):
+        seqn_width = layouts.seqn_width(lane_count, fifo_depth)
+        self.input = [Record(layouts.fifo_ingress(seqn_width, layout_payload))
+                      for _ in range(lane_count)]
+        self.output = [Record(layouts.fifo_egress(seqn_width, layout_payload))
+                       for _ in range(lane_count)]
+
+        if report_buffer_space:
+            self.buffer_space = Signal(max=fifo_depth+1)
+
+        # # #
+
+        if mode == "sync":
+            fifo_cls = SyncFIFOBuffered
+        elif mode == "async":
+            fifo_cls = AsyncFIFO
+        else:
+            raise ValueError
+
+        fifos = []
+        for input, output in zip(self.input, self.output):
+            fifo = fifo_cls(seqn_width + layout_len(layout_payload), fifo_depth)  # seqn is stored next to the payload
+            self.submodules += fifo
+            fifos.append(fifo)
+
+            self.comb += [
+                fifo.din.eq(Cat(input.seqn, input.payload.raw_bits())),  # seqn must survive the FIFO, else output.seqn stays 0
+                fifo.we.eq(input.we),
+                input.writable.eq(fifo.writable),
+
+                Cat(output.seqn, output.payload.raw_bits()).eq(fifo.dout),  # same field order as on the write side
+                output.readable.eq(fifo.readable),
+                fifo.re.eq(output.re)
+            ]
+
+        if report_buffer_space:
+            if mode != "sync":
+                raise NotImplementedError
+
+            def compute_max(elts):  # registered binary tree; returns (max signal, latency in cycles)
+                l = len(elts)
+                if l == 1:
+                    return elts[0], 0
+                else:
+                    maximum1, latency1 = compute_max(elts[:l//2])
+                    maximum2, latency2 = compute_max(elts[l//2:])
+                    maximum = Signal(max(len(maximum1), len(maximum2)))
+                    self.sync += [
+                        If(maximum1 > maximum2,
+                            maximum.eq(maximum1)
+                        ).Else(
+                            maximum.eq(maximum2)
+                        )
+                    ]
+                    latency = max(latency1, latency2) + 1
+                    return maximum, latency
+
+            max_level, latency = compute_max([fifo.level for fifo in fifos])
+            max_level_valid = Signal()
+            max_level_valid_counter = Signal(max=latency)
+            self.sync += [
+                If(reduce(or_, [fifo.we for fifo in fifos]),  # any write invalidates the pipelined maximum
+                    max_level_valid.eq(0),
+                    max_level_valid_counter.eq(latency - 1)
+                ).Elif(max_level_valid_counter == 0,
+                    max_level_valid.eq(1)
+                ).Else(
+                    max_level_valid_counter.eq(max_level_valid_counter - 1)
+                )
+            ]
+            self.comb += If(max_level_valid, self.buffer_space.eq(fifo_depth - max_level))  # reads 0 while not valid
diff --git a/artiq/gateware/rtio/sed/gates.py b/artiq/gateware/rtio/sed/gates.py
new file mode 100644
index 000000000..9b1c27d23
--- /dev/null
+++ b/artiq/gateware/rtio/sed/gates.py
@@ -0,0 +1,38 @@
+from migen import *
+
+from artiq.gateware.rtio.sed import layouts
+
+
+__all__ = ["Gates"]
+
+
+class Gates(Module):
+    def __init__(self, lane_count, seqn_width, layout_fifo_payload, layout_output_network_payload):
+        self.input = [Record(layouts.fifo_egress(seqn_width, layout_fifo_payload))
+                      for _ in range(lane_count)]
+        self.output = [Record(layouts.output_network_node(seqn_width, layout_output_network_payload))
+                       for _ in range(lane_count)]
+
+        if hasattr(self.output[0].payload, "fine_ts"):
+            glbl_fine_ts_width = len(self.output[0].payload.fine_ts)
+        else:
+            glbl_fine_ts_width = 0
+
+        self.coarse_timestamp = Signal(64-glbl_fine_ts_width)
+
+        # # #
+
+        for input, output in zip(self.input, self.output):
+            for field, _ in output.payload.layout:
+                if field == "fine_ts":
+                    self.sync += output.payload.fine_ts.eq(input.payload.timestamp[:glbl_fine_ts_width])
+                else:
+                    self.sync += getattr(output.payload, field).eq(getattr(input.payload, field))
+            self.sync += output.seqn.eq(input.seqn)
+            self.comb += [
+                output.replace_occured.eq(0),
+                output.nondata_replace_occured.eq(0)
+            ]
+
+            self.comb += input.re.eq(input.payload.timestamp[glbl_fine_ts_width:] == self.coarse_timestamp)
+            self.sync += output.valid.eq(input.re & input.readable)
diff --git a/artiq/gateware/rtio/sed/lane_distributor.py b/artiq/gateware/rtio/sed/lane_distributor.py
new file mode 100644
index 000000000..d6e346c9a
--- /dev/null
+++ b/artiq/gateware/rtio/sed/lane_distributor.py
@@ -0,0 +1,158 @@
+from migen import *
+
+from artiq.gateware.rtio import cri
+from artiq.gateware.rtio.sed import layouts
+
+
+__all__ = ["LaneDistributor"]
+
+
+# CRI write happens in 3 cycles:
+# 1. set timestamp and channel
+# 2. set other payload elements and issue write command
+# 3. check status
+
+class LaneDistributor(Module):
+    def __init__(self, lane_count, seqn_width, layout_payload,
+                 compensation, glbl_fine_ts_width,
+                 enable_spread=True, quash_channels=[], interface=None):
+        if lane_count & (lane_count - 1):
+            raise NotImplementedError("lane count must be a power of 2")
+
+        if interface is None:
+            interface = cri.Interface()
+        self.cri = interface
+        self.sequence_error = Signal()
+        self.sequence_error_channel = Signal(16)
+        self.minimum_coarse_timestamp = Signal(64-glbl_fine_ts_width)
+        self.output = [Record(layouts.fifo_ingress(seqn_width, layout_payload))
+                       for _ in range(lane_count)]
+
+        # # #
+
+        o_status_wait = Signal()
+        o_status_underflow = Signal()
+        self.comb += self.cri.o_status.eq(Cat(o_status_wait, o_status_underflow))
+
+        # internal state
+        current_lane = Signal(max=lane_count)
+        last_coarse_timestamp = Signal(64-glbl_fine_ts_width)
+        last_lane_coarse_timestamps = Array(Signal(64-glbl_fine_ts_width)
+                                            for _ in range(lane_count))
+        seqn = Signal(seqn_width)
+
+        # distribute data to lanes
+        for lio in self.output:
+            self.comb += [
+                lio.seqn.eq(seqn),
+                lio.payload.channel.eq(self.cri.chan_sel[:16]),
+                lio.payload.timestamp.eq(self.cri.timestamp),
+            ]
+            if hasattr(lio.payload, "address"):
+                self.comb += lio.payload.address.eq(self.cri.o_address)
+            if hasattr(lio.payload, "data"):
+                self.comb += lio.payload.data.eq(self.cri.o_data)
+
+        # when timestamp and channel arrive in cycle #1, prepare computations
+        us_timestamp_width = 64 - glbl_fine_ts_width
+        coarse_timestamp = Signal(us_timestamp_width)
+        self.comb += coarse_timestamp.eq(self.cri.timestamp[glbl_fine_ts_width:])
+        min_minus_timestamp = Signal((us_timestamp_width + 1, True))
+        laneAmin_minus_timestamp = Signal((us_timestamp_width + 1, True))
+        laneBmin_minus_timestamp = Signal((us_timestamp_width + 1, True))
+        last_minus_timestamp = Signal((us_timestamp_width + 1, True))
+        current_lane_plus_one = Signal(max=lane_count)
+        self.comb += current_lane_plus_one.eq(current_lane + 1)
+        self.sync += [
+            min_minus_timestamp.eq(self.minimum_coarse_timestamp - coarse_timestamp),
+            laneAmin_minus_timestamp.eq(last_lane_coarse_timestamps[current_lane] - coarse_timestamp),
+            laneBmin_minus_timestamp.eq(last_lane_coarse_timestamps[current_lane_plus_one] - coarse_timestamp),
+            last_minus_timestamp.eq(last_coarse_timestamp - coarse_timestamp)
+        ]
+
+        quash = Signal()
+        self.sync += quash.eq(0)
+        for channel in quash_channels:
+            self.sync += If(self.cri.chan_sel[:16] == channel, quash.eq(1))
+
+        latency_compensation = Memory(14, len(compensation), init=compensation)
+        latency_compensation_port = latency_compensation.get_port()
+        self.specials += latency_compensation, latency_compensation_port 
+        self.comb += latency_compensation_port.adr.eq(self.cri.chan_sel[:16]) 
+
+        # cycle #2, write
+        compensation = Signal((14, True))
+        self.comb += compensation.eq(latency_compensation_port.dat_r)
+        timestamp_above_min = Signal()
+        timestamp_above_laneA_min = Signal()
+        timestamp_above_laneB_min = Signal()
+        timestamp_above_lane_min = Signal()
+        force_laneB = Signal()
+        use_laneB = Signal()
+        use_lanen = Signal(max=lane_count)
+
+        do_write = Signal()
+        do_underflow = Signal()
+        do_sequence_error = Signal()
+        self.comb += [
+            timestamp_above_min.eq(min_minus_timestamp - compensation < 0),
+            timestamp_above_laneA_min.eq(laneAmin_minus_timestamp - compensation < 0),
+            timestamp_above_laneB_min.eq(laneBmin_minus_timestamp - compensation < 0),
+            If(force_laneB | (last_minus_timestamp - compensation >= 0),
+                use_lanen.eq(current_lane + 1),
+                use_laneB.eq(1)
+            ).Else(
+                use_lanen.eq(current_lane),
+                use_laneB.eq(0)
+            ),
+
+            timestamp_above_lane_min.eq(Mux(use_laneB, timestamp_above_laneB_min, timestamp_above_laneA_min)),
+            If(~quash,
+                do_write.eq((self.cri.cmd == cri.commands["write"]) & timestamp_above_min & timestamp_above_lane_min),
+                do_underflow.eq((self.cri.cmd == cri.commands["write"]) & ~timestamp_above_min),
+                do_sequence_error.eq((self.cri.cmd == cri.commands["write"]) & timestamp_above_min & ~timestamp_above_lane_min),
+            ),
+            Array(lio.we for lio in self.output)[use_lanen].eq(do_write)
+        ]
+        compensated_timestamp = Signal(64)
+        self.comb += compensated_timestamp.eq(self.cri.timestamp + (compensation << glbl_fine_ts_width))
+        self.sync += [
+            If(do_write,
+                If(use_laneB, current_lane.eq(current_lane + 1)),
+                last_coarse_timestamp.eq(compensated_timestamp[glbl_fine_ts_width:]),
+                last_lane_coarse_timestamps[use_lanen].eq(compensated_timestamp[glbl_fine_ts_width:]),
+                seqn.eq(seqn + 1),
+            )
+        ]
+        for lio in self.output:
+            self.comb += lio.payload.timestamp.eq(compensated_timestamp)
+
+        # cycle #3, read status
+        current_lane_writable = Signal()
+        self.comb += [
+            current_lane_writable.eq(Array(lio.writable for lio in self.output)[current_lane]),
+            o_status_wait.eq(~current_lane_writable)
+        ]
+        self.sync += [
+            If(self.cri.cmd == cri.commands["write"],
+                o_status_underflow.eq(0)
+            ),
+            If(do_underflow,
+                o_status_underflow.eq(1)
+            ),
+            self.sequence_error.eq(do_sequence_error),
+            self.sequence_error_channel.eq(self.cri.chan_sel[:16])
+        ]
+
+        # current lane has been full, spread events by switching to the next.
+        if enable_spread:
+            current_lane_writable_r = Signal(reset=1)
+            self.sync += [
+                current_lane_writable_r.eq(current_lane_writable),
+                If(~current_lane_writable_r & current_lane_writable,
+                    force_laneB.eq(1)
+                ),
+                If(do_write,
+                    force_laneB.eq(0)
+                )
+            ]
diff --git a/artiq/gateware/rtio/sed/layouts.py b/artiq/gateware/rtio/sed/layouts.py
new file mode 100644
index 000000000..1fbb8f6ec
--- /dev/null
+++ b/artiq/gateware/rtio/sed/layouts.py
@@ -0,0 +1,77 @@
+from migen import *
+
+from artiq.gateware.rtio import rtlink
+
+
+def fifo_payload(channels):
+    address_width = max(rtlink.get_address_width(channel.interface.o)
+                        for channel in channels)
+    data_width = max(rtlink.get_data_width(channel.interface.o)
+                     for channel in channels)
+
+    layout = [
+        ("channel", bits_for(len(channels)-1)),
+        ("timestamp", 64)
+    ]
+    if address_width:
+        layout.append(("address", address_width))
+    if data_width:
+        layout.append(("data", data_width))
+
+    return layout
+
+
+def seqn_width(lane_count, fifo_depth):
+    # There must be a unique sequence number for every possible event in every FIFO.
+    # Plus 2 bits to detect and handle wraparounds.
+    return bits_for(lane_count*fifo_depth-1) + 2
+
+
+def fifo_ingress(seqn_width, layout_payload):
+    return [
+        ("we", 1, DIR_M_TO_S),
+        ("writable", 1, DIR_S_TO_M),
+        ("seqn", seqn_width, DIR_M_TO_S),
+        ("payload", [(a, b, DIR_M_TO_S) for a, b in layout_payload])
+    ]
+
+
+def fifo_egress(seqn_width, layout_payload):
+    return [
+        ("re", 1, DIR_S_TO_M),
+        ("readable", 1, DIR_M_TO_S),
+        ("seqn", seqn_width, DIR_M_TO_S),
+        ("payload", [(a, b, DIR_M_TO_S) for a, b in layout_payload])
+    ]
+
+
+# We use glbl_fine_ts_width in the output network so that collisions due
+# to insufficiently increasing timestamps are always reliably detected.
+# We can still have undetected collisions on the address by making it wrap
+# around, but those are more rare and easier to debug, and addresses are
+# not normally exposed directly to the ARTIQ user.
+def output_network_payload(channels, glbl_fine_ts_width):
+    address_width = max(rtlink.get_address_width(channel.interface.o)
+                        for channel in channels)
+    data_width = max(rtlink.get_data_width(channel.interface.o)
+                     for channel in channels)
+
+    layout = [("channel", bits_for(len(channels)-1))]
+    if glbl_fine_ts_width:
+        layout.append(("fine_ts", glbl_fine_ts_width))
+    if address_width:
+        layout.append(("address", address_width))
+    if data_width:
+        layout.append(("data", data_width))
+
+    return layout
+
+
+def output_network_node(seqn_width, layout_payload):
+    return [
+        ("valid", 1),
+        ("seqn", seqn_width),
+        ("replace_occured", 1),
+        ("nondata_replace_occured", 1),
+        ("payload", layout_payload)
+    ]
diff --git a/artiq/gateware/rtio/sed/output_driver.py b/artiq/gateware/rtio/sed/output_driver.py
new file mode 100644
index 000000000..3150e98b6
--- /dev/null
+++ b/artiq/gateware/rtio/sed/output_driver.py
@@ -0,0 +1,108 @@
+from functools import reduce
+from operator import or_
+
+from migen import *
+
+from artiq.gateware.rtio.sed import layouts
+from artiq.gateware.rtio.sed.output_network import OutputNetwork
+
+
+__all__ = ["OutputDriver"]
+
+
+class OutputDriver(Module):
+    def __init__(self, channels, glbl_fine_ts_width, lane_count, seqn_width):
+        self.collision = Signal()
+        self.collision_channel = Signal(max=len(channels))
+        self.busy = Signal()
+        self.busy_channel = Signal(max=len(channels))
+
+        # output network
+        layout_on_payload = layouts.output_network_payload(channels, glbl_fine_ts_width)
+        output_network = OutputNetwork(lane_count, seqn_width, layout_on_payload)
+        self.submodules += output_network
+        self.input = output_network.input
+
+        # detect collisions (adds one pipeline stage)
+        layout_lane_data = [
+            ("valid", 1),
+            ("collision", 1),
+            ("payload", layout_on_payload)
+        ]
+        lane_datas = [Record(layout_lane_data) for _ in range(lane_count)]
+        en_replaces = [channel.interface.o.enable_replace for channel in channels]
+        for lane_data, on_output in zip(lane_datas, output_network.output):
+            replace_occured_r = Signal()
+            nondata_replace_occured_r = Signal()
+            self.sync += [
+                lane_data.valid.eq(on_output.valid),
+                lane_data.payload.eq(on_output.payload),
+                replace_occured_r.eq(on_output.replace_occured),
+                nondata_replace_occured_r.eq(on_output.nondata_replace_occured)
+            ]
+
+            en_replaces_rom = Memory(1, len(en_replaces), init=en_replaces)
+            en_replaces_rom_port = en_replaces_rom.get_port()
+            self.specials += en_replaces_rom, en_replaces_rom_port
+            self.comb += [
+                en_replaces_rom_port.adr.eq(on_output.payload.channel),
+                lane_data.collision.eq(replace_occured_r & (~en_replaces_rom_port.dat_r | nondata_replace_occured_r))
+            ]
+
+        self.sync += [
+            self.collision.eq(0),
+            self.collision_channel.eq(0)
+        ]
+        for lane_data in lane_datas:
+            self.sync += [
+                If(lane_data.valid & lane_data.collision,
+                    self.collision.eq(1),
+                    self.collision_channel.eq(lane_data.payload.channel)
+                )
+            ]
+
+        # demultiplex channels (adds one pipeline stage)
+        for n, channel in enumerate(channels):
+            oif = channel.interface.o
+
+            onehot_stb = []
+            onehot_fine_ts = []
+            onehot_address = []
+            onehot_data = []
+            for lane_data in lane_datas:
+                selected = Signal()
+                self.comb += selected.eq(lane_data.valid & ~lane_data.collision & (lane_data.payload.channel == n))
+                onehot_stb.append(selected)
+                if hasattr(lane_data.payload, "fine_ts") and hasattr(oif, "fine_ts"):
+                    ts_shift = len(lane_data.payload.fine_ts) - len(oif.fine_ts)
+                    onehot_fine_ts.append(Mux(selected, lane_data.payload.fine_ts[ts_shift:], 0))
+                if hasattr(lane_data.payload, "address"):
+                    onehot_address.append(Mux(selected, lane_data.payload.address, 0))
+                if hasattr(lane_data.payload, "data"):
+                    onehot_data.append(Mux(selected, lane_data.payload.data, 0))
+
+            self.sync += oif.stb.eq(reduce(or_, onehot_stb))
+            if hasattr(oif, "fine_ts"):
+                self.sync += oif.fine_ts.eq(reduce(or_, onehot_fine_ts))
+            if hasattr(oif, "address"):
+                self.sync += oif.address.eq(reduce(or_, onehot_address))
+            if hasattr(oif, "data"):
+                self.sync += oif.data.eq(reduce(or_, onehot_data))
+
+        # detect busy errors, at lane level to reduce muxing
+        self.sync += [
+            self.busy.eq(0),
+            self.busy_channel.eq(0)
+        ]
+        for lane_data in lane_datas:
+            stb_r = Signal()
+            channel_r = Signal(max=len(channels))
+            self.sync += [
+                stb_r.eq(lane_data.valid & ~lane_data.collision),
+                channel_r.eq(lane_data.payload.channel),
+
+                If(stb_r & Array(channel.interface.o.busy for channel in channels)[channel_r],
+                    self.busy.eq(1),
+                    self.busy_channel.eq(channel_r)
+                )
+            ]
diff --git a/artiq/gateware/rtio/sed/output_network.py b/artiq/gateware/rtio/sed/output_network.py
new file mode 100644
index 000000000..1b57baf9e
--- /dev/null
+++ b/artiq/gateware/rtio/sed/output_network.py
@@ -0,0 +1,101 @@
+from migen import *
+
+from artiq.gateware.rtio.sed import layouts
+
+
+__all__ = ["latency", "OutputNetwork"]
+
+
+# Based on: https://github.com/Bekbolatov/SortingNetworks/blob/master/src/main/js/gr.js
+def boms_get_partner(n, l, p):
+    if p == 1:
+        return n ^ (1 << (l - 1))
+    scale = 1 << (l - p)
+    box = 1 << p
+    sn = n//scale - n//scale//box*box
+    if sn == 0 or sn == (box - 1):
+        return n
+    if (sn % 2) == 0:
+        return n - scale
+    return n + scale
+
+
+def boms_steps_pairs(lane_count):
+    d = log2_int(lane_count)
+    steps = []
+    for l in range(1, d+1):
+        for p in range(1, l+1):
+            pairs = []
+            for n in range(2**d):
+                partner = boms_get_partner(n, l, p)
+                if partner != n:
+                    if partner > n:
+                        pair = (n, partner)
+                    else:
+                        pair = (partner, n)
+                    if pair not in pairs:
+                        pairs.append(pair)
+            steps.append(pairs)
+    return steps
+
+
+def latency(lane_count):
+    d = log2_int(lane_count)
+    return sum(l for l in range(1, d+1))
+
+
+def cmp_wrap(a, b):  # wraparound-aware "a < b" for sequence numbers; the top 2 bits are the wrap-detection bits
+    return Mux((a[-2] == a[-1]) & (b[-2] == b[-1]) & (a[-1] != b[-1]), a[-1], a < b)  # wrap only for 11.. vs 00..
+
+
+class OutputNetwork(Module):
+    def __init__(self, lane_count, seqn_width, layout_payload):
+        self.input = [Record(layouts.output_network_node(seqn_width, layout_payload))
+                      for _ in range(lane_count)]
+        self.output = None
+
+        step_input = self.input
+        for step in boms_steps_pairs(lane_count):
+            step_output = [Record(layouts.output_network_node(seqn_width, layout_payload))
+                           for _ in range(lane_count)]
+
+            for node1, node2 in step:
+                nondata_difference = Signal()
+                for field, _ in layout_payload:
+                    if field != "data":
+                        f1 = getattr(step_input[node1].payload, field)
+                        f2 = getattr(step_input[node2].payload, field)
+                        self.comb += If(f1 != f2, nondata_difference.eq(1))
+
+                k1 = Cat(step_input[node1].payload.channel, ~step_input[node1].valid)
+                k2 = Cat(step_input[node2].payload.channel, ~step_input[node2].valid)
+                self.sync += [
+                    If(k1 == k2,
+                        If(cmp_wrap(step_input[node1].seqn, step_input[node2].seqn),
+                            step_output[node1].eq(step_input[node2]),
+                            step_output[node2].eq(step_input[node1])
+                        ).Else(
+                            step_output[node1].eq(step_input[node1]),
+                            step_output[node2].eq(step_input[node2])
+                        ),
+                        step_output[node1].replace_occured.eq(1),
+                        step_output[node1].nondata_replace_occured.eq(nondata_difference),
+                        step_output[node2].valid.eq(0),
+                    ).Elif(k1 < k2,
+                        step_output[node1].eq(step_input[node1]),
+                        step_output[node2].eq(step_input[node2])
+                    ).Else(
+                        step_output[node1].eq(step_input[node2]),
+                        step_output[node2].eq(step_input[node1])
+                    )
+                ]
+
+            unchanged = list(range(lane_count))
+            for node1, node2 in step:
+                unchanged.remove(node1)
+                unchanged.remove(node2)
+            for node in unchanged:
+                self.sync += step_output[node].eq(step_input[node])
+
+            self.output = step_output
+            step_input = step_output
diff --git a/artiq/gateware/targets/kc705_dds.py b/artiq/gateware/targets/kc705_dds.py
index 62fab2427..bb6a662eb 100755
--- a/artiq/gateware/targets/kc705_dds.py
+++ b/artiq/gateware/targets/kc705_dds.py
@@ -319,18 +319,18 @@ class NIST_CLOCK(_NIST_Ions):
         phy = spi.SPIMaster(ams101_dac)
         self.submodules += phy
         rtio_channels.append(rtio.Channel.from_phy(
-            phy, ofifo_depth=4, ififo_depth=4))
+            phy, ififo_depth=4))
 
         for i in range(3):
             phy = spi.SPIMaster(self.platform.request("spi", i))
             self.submodules += phy
             rtio_channels.append(rtio.Channel.from_phy(
-                phy, ofifo_depth=128, ififo_depth=128))
+                phy, ififo_depth=128))
             
         phy = spi.SPIMaster(platform.request("sdcard_spi_33"))
         self.submodules += phy
         rtio_channels.append(rtio.Channel.from_phy(
-            phy, ofifo_depth=4, ififo_depth=4))
+            phy, ififo_depth=4))
 
         fmcdio_dirctl = self.platform.request("fmcdio_dirctl")
         for s in fmcdio_dirctl.clk, fmcdio_dirctl.ser, fmcdio_dirctl.latch:
@@ -365,9 +365,7 @@ class NIST_CLOCK(_NIST_Ions):
 
         phy = dds.AD9914(platform.request("dds"), 11, onehot=True)
         self.submodules += phy
-        rtio_channels.append(rtio.Channel.from_phy(phy,
-                                                   ofifo_depth=512,
-                                                   ififo_depth=4))
+        rtio_channels.append(rtio.Channel.from_phy(phy, ififo_depth=4))
 
         self.config["HAS_RTIO_LOG"] = None
         self.config["RTIO_LOG_CHANNEL"] = len(rtio_channels)
@@ -425,21 +423,19 @@ class NIST_QC2(_NIST_Ions):
         phy = spi.SPIMaster(ams101_dac)
         self.submodules += phy
         rtio_channels.append(rtio.Channel.from_phy(
-            phy, ofifo_depth=4, ififo_depth=4))
+            phy, ififo_depth=4))
 
         for i in range(4):
             phy = spi.SPIMaster(self.platform.request("spi", i))
             self.submodules += phy
             rtio_channels.append(rtio.Channel.from_phy(
-                phy, ofifo_depth=128, ififo_depth=128))
+                phy, ififo_depth=128))
 
         for backplane_offset in range(2):
             phy = dds.AD9914(
                 platform.request("dds", backplane_offset), 12, onehot=True)
             self.submodules += phy
-            rtio_channels.append(rtio.Channel.from_phy(phy,
-                                                       ofifo_depth=512,
-                                                       ififo_depth=4))
+            rtio_channels.append(rtio.Channel.from_phy(phy, ififo_depth=4))
 
         self.config["HAS_RTIO_LOG"] = None
         self.config["RTIO_LOG_CHANNEL"] = len(rtio_channels)
diff --git a/artiq/gateware/targets/kc705_sma_spi.py b/artiq/gateware/targets/kc705_sma_spi.py
index cea74c574..63f91a3a5 100755
--- a/artiq/gateware/targets/kc705_sma_spi.py
+++ b/artiq/gateware/targets/kc705_sma_spi.py
@@ -101,12 +101,12 @@ class SMA_SPI(_NIST_Ions):
         phy = spi.SPIMaster(ams101_dac)
         self.submodules += phy
         rtio_channels.append(rtio.Channel.from_phy(
-            phy, ofifo_depth=4, ififo_depth=4))
+            phy, ififo_depth=4))
 
         phy = spi.SPIMaster(self.platform.request("sma_spi"))
         self.submodules += phy
         rtio_channels.append(rtio.Channel.from_phy(
-            phy, ofifo_depth=128, ififo_depth=128))
+            phy, ififo_depth=128))
 
         self.config["HAS_RTIO_LOG"] = None
         self.config["RTIO_LOG_CHANNEL"] = len(rtio_channels)
diff --git a/artiq/gateware/targets/sayma_amc_drtio_master.py b/artiq/gateware/targets/sayma_amc_drtio_master.py
index a14c1446f..28a70303e 100755
--- a/artiq/gateware/targets/sayma_amc_drtio_master.py
+++ b/artiq/gateware/targets/sayma_amc_drtio_master.py
@@ -103,7 +103,7 @@ class Master(MiniSoC, AMPSoC):
         self.submodules.rtio_moninj = rtio.MonInj(rtio_channels)
         self.csr_devices.append("rtio_moninj")
 
-        self.submodules.rtio_core = rtio.Core(rtio_channels, 3)
+        self.submodules.rtio_core = rtio.Core(rtio_channels, glbl_fine_ts_width=3)
         self.csr_devices.append("rtio_core")
 
         self.submodules.rtio = rtio.KernelInitiator()
diff --git a/artiq/gateware/test/drtio/test_full_stack.py b/artiq/gateware/test/drtio/test_full_stack.py
index 59e5b2897..23810321c 100644
--- a/artiq/gateware/test/drtio/test_full_stack.py
+++ b/artiq/gateware/test/drtio/test_full_stack.py
@@ -52,7 +52,8 @@ class DUT(Module):
         self.ttl1 = Signal()
         self.transceivers = DummyTransceiverPair(nwords)
 
-        self.submodules.master = DRTIOMaster(self.transceivers.alice)
+        self.submodules.master = DRTIOMaster(self.transceivers.alice,
+                                             fine_ts_width=0)
         self.submodules.master_ki = rtio.KernelInitiator(self.master.cri)
 
         rx_synchronizer = DummyRXSynchronizer()
@@ -60,132 +61,164 @@ class DUT(Module):
         self.submodules.phy1 = ttl_simple.Output(self.ttl1)
         self.submodules.phy2 = SimpleIOPHY(512, 32)  # test wide output data
         rtio_channels = [
-            rtio.Channel.from_phy(self.phy0, ofifo_depth=4),
-            rtio.Channel.from_phy(self.phy1, ofifo_depth=4),
-            rtio.Channel.from_phy(self.phy2, ofifo_depth=4),
+            rtio.Channel.from_phy(self.phy0),
+            rtio.Channel.from_phy(self.phy1),
+            rtio.Channel.from_phy(self.phy2),
         ]
         self.submodules.satellite = DRTIOSatellite(
-            self.transceivers.bob, rtio_channels, rx_synchronizer)
-        
+            self.transceivers.bob, rtio_channels, rx_synchronizer,
+            lane_count=4, fifo_depth=8, fine_ts_width=0)
+
+
+class OutputsTestbench:  # per-test fixture: owns a fresh DUT plus a software time cursor (self.now)
+    def __init__(self):
+        self.dut = DUT(2)
+        self.now = 0  # timestamp that write() submits; only delay() moves it
+
+    def init(self):  # generator: set the underflow margin, wait for link status, do one buffer space request
+        yield from self.dut.master.rt_controller.csrs.underflow_margin.write(100)
+        while not (yield from self.dut.master.link_layer.link_status.read()):
+            yield
+        yield from self.get_buffer_space()
+
+    def get_buffer_space(self):  # generator: trigger o_get_buffer_space, wait for o_wait to clear, return o_dbg_buffer_space
+        csrs = self.dut.master.rt_controller.csrs
+        yield from csrs.o_get_buffer_space.write(1)
+        yield
+        while (yield from csrs.o_wait.read()):
+            yield
+        r = (yield from csrs.o_dbg_buffer_space.read())
+        return r
+
+    def delay(self, dt):  # plain method (not a generator): only advances self.now, never yields
+        self.now += dt
+
+    def sync(self):  # generator: keep yielding until master.cri.counter reaches self.now + 15
+        t = self.now + 15  # NOTE(review): the reason for the 15-count margin is not visible here; confirm
+        while (yield self.dut.master.cri.counter) < t:
+            yield
+
+    def write(self, channel, data):  # generator: submit one event at self.now; returns the number of o_status polls
+        kcsrs = self.dut.master_ki
+        yield from kcsrs.chan_sel.write(channel)
+        yield from kcsrs.timestamp.write(self.now)
+        yield from kcsrs.o_data.write(data)
+        yield from kcsrs.o_we.write(1)
+        yield
+        status = 1  # non-zero so the polling loop below runs at least once
+        wlen = 0
+        while status:
+            status = yield from kcsrs.o_status.read()
+            if status & 2:  # bit 1 of o_status is reported to the caller as RTIOUnderflow
+                raise RTIOUnderflow
+            yield
+            wlen += 1
+        return wlen
+
+    @passive
+    def check_ttls(self, ttl_changes):  # monitor: appends (cycle, ttl_index) whenever ttl0 or ttl1 changes value
+        cycle = 0
+        old_ttls = [0, 0]  # comparison baseline: both outputs are taken to start at 0
+        while True:
+            ttls = [(yield self.dut.ttl0), (yield self.dut.ttl1)]
+            for n, (old_ttl, ttl) in enumerate(zip(old_ttls, ttls)):
+                if ttl != old_ttl:
+                    ttl_changes.append((cycle, n))
+            old_ttls = ttls
+            yield
+            cycle += 1  # one bare yield per loop iteration, so this counts monitor steps
+
 
 class TestFullStack(unittest.TestCase):
     clocks = {"sys": 8, "rtio": 5, "rtio_rx": 5,
               "rio": 5, "rio_phy": 5,
               "sys_with_rst": 8, "rtio_with_rst": 5}
 
-    def test_outputs(self):
-        dut = DUT(2)
-        kcsrs = dut.master_ki
-        csrs = dut.master.rt_controller.csrs
-        mgr = dut.master.rt_manager
-        saterr = dut.satellite.rt_errors
-
+    def test_pulses(self):
+        tb = OutputsTestbench()
         ttl_changes = []
         correct_ttl_changes = [
-            # from test_pulses
-            (203, 0),
             (208, 0),
-            (208, 1),
-            (214, 1),
-
-            # from test_fifo_space
-            (414, 0),
-            (454, 0),
-            (494, 0),
-            (534, 0),
-            (574, 0),
-            (614, 0)
+            (213, 0),
+            (213, 1),
+            (219, 1),
         ]
 
-        now = 0
-        def delay(dt):
-            nonlocal now
-            now += dt
+        def test():
+            yield from tb.init()
+            tb.delay(200)
+            yield from tb.write(0, 1)
+            tb.delay(5)
+            yield from tb.write(0, 0)
+            yield from tb.write(1, 1)
+            tb.delay(6)
+            yield from tb.write(1, 0)
+            yield from tb.sync()
 
-        def get_fifo_space(channel):
-            yield from csrs.chan_sel_override_en.write(1)
-            yield from csrs.chan_sel_override.write(channel)
-            yield from csrs.o_get_fifo_space.write(1)
-            yield
-            while (yield from csrs.o_wait.read()):
-                yield
-            r = (yield from csrs.o_dbg_fifo_space.read())
-            yield from csrs.chan_sel_override_en.write(0)
-            return r
+        run_simulation(tb.dut,
+            {"sys": test(), "rtio": tb.check_ttls(ttl_changes)}, self.clocks)
+        self.assertEqual(ttl_changes, correct_ttl_changes)
 
-        def write(channel, data):
-            yield from kcsrs.chan_sel.write(channel)
-            yield from kcsrs.timestamp.write(now)
-            yield from kcsrs.o_data.write(data)
-            yield from kcsrs.o_we.write(1)
-            yield
-            status = 1
-            wlen = 0
-            while status:
-                status = yield from kcsrs.o_status.read()
-                if status & 2:
-                    raise RTIOUnderflow
-                if status & 4:
-                    raise RTIOSequenceError
-                yield
-                wlen += 1
-            return wlen
+    def test_underflow(self):
+        tb = OutputsTestbench()
 
-        def test_init():
-            yield from get_fifo_space(0)
-            yield from get_fifo_space(1)
-
-        def test_underflow():
+        def test():
+            yield from tb.init()
             with self.assertRaises(RTIOUnderflow):
-                yield from write(0, 0)
+                yield from tb.write(0, 0)
 
-        def test_pulses():
-            delay(200*8)
-            yield from write(0, 1)
-            delay(5*8)
-            yield from write(0, 1)
-            yield from write(0, 0)  # replace
-            yield from write(1, 1)
-            delay(6*8)
-            yield from write(1, 0)
+        run_simulation(tb.dut, {"sys": test()}, self.clocks)
 
-        def test_sequence_error():
-            delay(-200*8)
-            with self.assertRaises(RTIOSequenceError):
-                yield from write(0, 1)
-            delay(200*8)
+    def test_large_data(self):
+        tb = OutputsTestbench()
 
-        def test_large_data():
+        def test():
+            yield from tb.init()
             correct_large_data = random.Random(0).randrange(2**512-1)
-            self.assertNotEqual((yield dut.phy2.received_data), correct_large_data)
-            delay(10*8)
-            yield from write(2, correct_large_data)
-            for i in range(45):
-                yield
-            self.assertEqual((yield dut.phy2.received_data), correct_large_data)
+            self.assertNotEqual((yield tb.dut.phy2.received_data), correct_large_data)
+            tb.delay(200)
+            yield from tb.write(2, correct_large_data)
+            yield from tb.sync()
+            self.assertEqual((yield tb.dut.phy2.received_data), correct_large_data)
 
-        def test_fifo_space():
-            delay(200*8)
+        run_simulation(tb.dut, {"sys": test()}, self.clocks)
+
+    def test_buffer_space(self):
+        tb = OutputsTestbench()
+        ttl_changes = []
+        correct_ttl_changes = [(258 + 40*i, 0) for i in range(10)]
+
+        def test():
+            yield from tb.init()
+            tb.delay(250)
             max_wlen = 0
-            for _ in range(3):
-                wlen = yield from write(0, 1)
+            for i in range(10):
+                wlen = yield from tb.write(0, (i + 1) % 2)
                 max_wlen = max(max_wlen, wlen)
-                delay(40*8)
-                wlen = yield from write(0, 0)
-                max_wlen = max(max_wlen, wlen)
-                delay(40*8)
-            # check that some writes caused FIFO space requests
+                tb.delay(40)
+            # check that some writes caused buffer space requests
             self.assertGreater(max_wlen, 5)
+            yield from tb.sync()
 
-        def test_tsc_error():
+        run_simulation(tb.dut,
+            {"sys": test(), "rtio": tb.check_ttls(ttl_changes)}, self.clocks)
+        self.assertEqual(ttl_changes, correct_ttl_changes)
+
+    def test_tsc_error(self):
+        tb = OutputsTestbench()
+
+        def test():
+            saterr = tb.dut.satellite.rt_errors
+            csrs = tb.dut.master.rt_controller.csrs
+            yield from tb.init()
             errors = yield from saterr.protocol_error.read()
             self.assertEqual(errors, 0)
             yield from csrs.tsc_correction.write(100000000)
             yield from csrs.set_time.write(1)
             for i in range(15):
                yield
-            delay(10000*8)
-            yield from write(0, 1)
+            tb.delay(10000)
+            yield from tb.write(0, 1)
             for i in range(12):
                yield
             errors = yield from saterr.protocol_error.read()
@@ -195,39 +228,7 @@ class TestFullStack(unittest.TestCase):
             errors = yield from saterr.protocol_error.read()
             self.assertEqual(errors, 0)
 
-        def wait_ttl_events():
-            while len(ttl_changes) < len(correct_ttl_changes):
-                yield
-
-        def test():
-            while not (yield from dut.master.link_layer.link_status.read()):
-                yield
-
-            yield from test_init()
-            yield from test_underflow()
-            yield from test_pulses()
-            yield from test_sequence_error()
-            yield from test_fifo_space()
-            yield from test_large_data()
-            yield from test_tsc_error()
-            yield from wait_ttl_events()
-
-        @passive
-        def check_ttls():
-            cycle = 0
-            old_ttls = [0, 0]
-            while True:
-                ttls = [(yield dut.ttl0), (yield dut.ttl1)]
-                for n, (old_ttl, ttl) in enumerate(zip(old_ttls, ttls)):
-                    if ttl != old_ttl:
-                        ttl_changes.append((cycle, n))
-                old_ttls = ttls
-                yield
-                cycle += 1
-
-        run_simulation(dut,
-            {"sys": test(), "rtio": check_ttls()}, self.clocks)
-        self.assertEqual(ttl_changes, correct_ttl_changes)
+        run_simulation(tb.dut, {"sys": test()}, self.clocks)
 
     def test_inputs(self):
         dut = DUT(2)
@@ -250,8 +251,7 @@ class TestFullStack(unittest.TestCase):
                     (yield from kcsrs.i_timestamp.read()))
 
         def test():
-            # wait for link layer ready
-            for i in range(5):
+            while not (yield from dut.master.link_layer.link_status.read()):
                 yield
 
             i1 = yield from get_input(10)
@@ -269,7 +269,7 @@ class TestFullStack(unittest.TestCase):
             yield dut.phy2.rtlink.i.stb.eq(0)
 
         run_simulation(dut,
-            {"sys": test(), "rtio": generate_input()}, self.clocks, vcd_name="foo.vcd")
+            {"sys": test(), "rtio": generate_input()}, self.clocks)
 
     def test_echo(self):
         dut = DUT(2)
diff --git a/artiq/gateware/test/rtio/test_dma.py b/artiq/gateware/test/rtio/test_dma.py
index 759fe60b0..d0e74b5ea 100644
--- a/artiq/gateware/test/rtio/test_dma.py
+++ b/artiq/gateware/test/rtio/test_dma.py
@@ -1,10 +1,14 @@
 import unittest
 import random
+import itertools
 
 from migen import *
 from misoc.interconnect import wishbone
 
+from artiq.coredevice.exceptions import RTIOUnderflow
+from artiq.gateware import rtio
 from artiq.gateware.rtio import dma, cri
+from artiq.gateware.rtio.phy import ttl_simple
 
 
 def encode_n(n, min_length, max_length):
@@ -47,6 +51,16 @@ def encode_sequence(writes, ws):
     return pack(sequence, ws)
 
 
+def do_dma(dut, address):  # generator: start a DMA run at `address`, wait for it to end, raise on underflow
+    yield from dut.dma.base_address.write(address)
+    yield from dut.enable.write(1)
+    yield
+    while ((yield from dut.enable.read())):  # poll until enable reads back 0 (presumably cleared by the DMA core when done; confirm)
+        yield
+    if (yield from dut.cri_master.underflow.read()):  # non-zero underflow register after the run becomes an exception
+        raise RTIOUnderflow
+
+
 test_writes1 = [
     (0x01, 0x23, 0x12, 0x33),
     (0x901, 0x902, 0x911, 0xeeeeeeeeeeeeeefffffffffffffffffffffffffffffff28888177772736646717738388488),
@@ -83,21 +97,44 @@ class TB(Module):
         self.submodules.dut = dma.DMA(bus)
 
 
+test_writes_full_stack = [
+    (0, 32, 0, 1),
+    (1, 40, 0, 1),
+    (0, 48, 0, 0),
+    (1, 50, 0, 0),
+]
+
+
+class FullStackTB(Module):  # testbench: DMA engine reading from SRAM, connected to an rtio.Core with two TTL outputs
+    def __init__(self, ws):  # ws feeds encode_sequence and sets the bus width to ws*8 (presumably a word size in bytes; confirm)
+        self.ttl0 = Signal()
+        self.ttl1 = Signal()
+
+        self.submodules.phy0 = ttl_simple.Output(self.ttl0)
+        self.submodules.phy1 = ttl_simple.Output(self.ttl1)
+
+        rtio_channels = [  # list order gives phy0 index 0 and phy1 index 1
+            rtio.Channel.from_phy(self.phy0),
+            rtio.Channel.from_phy(self.phy1)
+        ]
+
+        sequence = encode_sequence(test_writes_full_stack, ws)
+
+        bus = wishbone.Interface(ws*8)
+        self.submodules.memory = wishbone.SRAM(  # memory is preloaded with the encoded write sequence
+            256, init=sequence, bus=bus)
+        self.submodules.dut = dma.DMA(bus)
+        self.submodules.rtio = rtio.Core(rtio_channels)
+        self.comb += self.dut.cri.connect(self.rtio.cri)  # the DMA's CRI is connected to the RTIO core's CRI
+
+
 class TestDMA(unittest.TestCase):
     def test_dma_noerror(self):
-        ws = 64
-        tb = TB(ws)
-
-        def do_dma(address):
-            yield from tb.dut.dma.base_address.write(address)
-            yield from tb.dut.enable.write(1)
-            yield
-            while ((yield from tb.dut.enable.read())):
-                yield
+        tb = TB(64)
 
         def do_writes():
-            yield from do_dma(0)
-            yield from do_dma(512)
+            yield from do_dma(tb.dut, 0)
+            yield from do_dma(tb.dut, 512)
 
         received = []
         @passive
@@ -124,3 +161,30 @@ class TestDMA(unittest.TestCase):
 
         run_simulation(tb, [do_writes(), rtio_sim()])
         self.assertEqual(received, test_writes1 + test_writes2)
+
+    def test_full_stack(self):  # DMA playback through a real rtio.Core must toggle the TTLs at the expected steps
+        tb = FullStackTB(64)
+
+        ttl_changes = []
+        @passive
+        def monitor():  # records (step, ttl_index) for every change seen on tb.ttl0 / tb.ttl1
+            old_ttl_states = [0, 0]
+            for time in itertools.count():  # `time` counts monitor steps (one bare yield per iteration)
+                ttl_states = [
+                    (yield tb.ttl0),
+                    (yield tb.ttl1)
+                ]
+                for i, (old, new) in enumerate(zip(old_ttl_states, ttl_states)):
+                    if new != old:
+                        ttl_changes.append((time, i))
+                old_ttl_states = ttl_states
+                yield
+
+        run_simulation(tb, {"sys": [
+            do_dma(tb.dut, 0), monitor(),
+            (None for _ in range(70)),  # 70-step idle generator; presumably keeps the run alive after do_dma returns (confirm)
+        ]}, {"sys": 8, "rsys": 8, "rtio": 8, "rio": 8, "rio_phy": 8})
+
+        correct_changes = [(timestamp + 11, channel)  # NOTE(review): the +11 offset is presumably pipeline latency; confirm
+                           for channel, timestamp, _, _ in test_writes_full_stack]
+        self.assertEqual(ttl_changes, correct_changes)
diff --git a/artiq/gateware/test/rtio/test_input_collector.py b/artiq/gateware/test/rtio/test_input_collector.py
new file mode 100644
index 000000000..c67f2aa53
--- /dev/null
+++ b/artiq/gateware/test/rtio/test_input_collector.py
@@ -0,0 +1,90 @@
+import unittest
+
+from migen import *
+
+from artiq.gateware import rtio
+from artiq.gateware.rtio import rtlink
+from artiq.gateware.rtio import cri
+from artiq.gateware.rtio.input_collector import *
+
+
+class OscInput(Module):  # stimulus PHY: periodically strobes an input event whose 1-bit data toggles each time
+    def __init__(self):
+        self.rtlink = rtlink.Interface(
+            rtlink.OInterface(1),
+            rtlink.IInterface(1))
+        self.overrides = []  # left empty; presumably attributes that Channel.from_phy expects on a PHY (confirm)
+        self.probes = []
+
+        # # #
+
+        counter = Signal(2)
+        trigger = Signal()
+        self.sync += [
+            Cat(counter, trigger).eq(counter + 1),  # trigger receives the bit above the 2-bit counter, i.e. the carry of counter + 1
+            self.rtlink.i.stb.eq(0),  # strobe stays low unless the If below fires
+            If(trigger,
+                self.rtlink.i.stb.eq(1),
+                self.rtlink.i.data.eq(~self.rtlink.i.data)  # invert the data bit on every generated event
+            )
+        ]
+
+
+class DUT(Module):  # InputCollector fed by two OscInput channels, with a timestamp that increments every sync step
+    def __init__(self):
+        self.submodules.phy0 = OscInput()
+        self.submodules.phy1 = OscInput()
+        rtio_channels = [  # both channels are given an input FIFO depth of 4
+            rtio.Channel.from_phy(self.phy0, ififo_depth=4),
+            rtio.Channel.from_phy(self.phy1, ififo_depth=4)
+        ]
+        self.submodules.input_collector = InputCollector(rtio_channels, 0, "sync")  # NOTE(review): meaning of 0 and "sync" is not visible here; confirm
+        self.sync += self.input_collector.coarse_timestamp.eq(self.input_collector.coarse_timestamp + 1)
+        self.comb += self.input_collector.cri.counter.eq(self.input_collector.coarse_timestamp)
+
+    @property
+    def cri(self):  # shorthand for the collector's CRI, used by simulate()
+        return self.input_collector.cri
+
+
+def simulate(wait_cycles, ts_timeouts):  # idle for wait_cycles, then issue one CRI read per entry; returns the list of outcomes
+    result = []
+    dut = DUT()
+    def gen():
+        for _ in range(wait_cycles):
+            yield
+
+        for ts_timeout in ts_timeouts:
+            yield dut.cri.timestamp.eq(ts_timeout)  # for a read, the timestamp field carries the timeout value
+            yield dut.cri.cmd.eq(cri.commands["read"])
+            yield
+            yield dut.cri.cmd.eq(cri.commands["nop"])  # the read command is asserted for a single step only
+            yield
+            while (yield dut.cri.i_status) & 4:  # bit 2 set: keep waiting (presumably "read still pending"; confirm)
+                yield
+            status = yield dut.cri.i_status
+            if status & 2:  # bit 1 is reported as overflow and checked before timeout
+                result.append("overflow")
+            elif status & 1:  # bit 0 is reported as timeout
+                result.append("timeout")
+            else:  # no status bit set: record the event as (i_timestamp, i_data)
+                i_timestamp = yield dut.cri.i_timestamp
+                i_data = yield dut.cri.i_data
+                result.append((i_timestamp, i_data))
+
+    run_simulation(dut, gen())
+    return result
+
+
+class TestInput(unittest.TestCase):  # exercises the three outcomes simulate() can report: data, timeout, overflow
+    def test_get_data(self):  # eight reads with timeout 256 must each return one event
+        result = simulate(0, [256]*8)
+        self.assertEqual(result, [(n*4+1, n % 2) for n in range(1, 9)])  # timestamps 4 apart, data alternating
+
+    def test_timeout(self):  # timeout 3 is earlier than the first event (timestamp 5); timeout 16 then returns it
+        result = simulate(0, [3, 16])
+        self.assertEqual(result, ["timeout", (5, 1)])
+
+    def test_overflow(self):  # 32 idle steps without reading; presumably overfills the depth-4 input FIFO (confirm)
+        result = simulate(32, [256])
+        self.assertEqual(result, ["overflow"])
diff --git a/artiq/gateware/test/rtio/test_sed_lane_distributor.py b/artiq/gateware/test/rtio/test_sed_lane_distributor.py
new file mode 100644
index 000000000..c02b6c4bb
--- /dev/null
+++ b/artiq/gateware/test/rtio/test_sed_lane_distributor.py
@@ -0,0 +1,153 @@
+import unittest
+
+from migen import *
+
+from artiq.gateware.rtio import cri
+from artiq.gateware.rtio.sed import lane_distributor
+
+
+LANE_COUNT = 8
+
+
+def simulate(input_events, compensation=None, wait=True):  # feed (channel, timestamp) events to a LaneDistributor; returns (lane writes, access results)
+    layout = [("channel", 8), ("timestamp", 32)]
+    if compensation is None:
+        compensation = [0]*256  # default: zero for each of the 256 values of the 8-bit channel field
+    dut = lane_distributor.LaneDistributor(LANE_COUNT, 8, layout, compensation, 3)  # NOTE(review): meaning of positional 8 and 3 is not visible here; confirm
+
+    output = []  # (lane, seqn, channel, timestamp) tuples appended by the lane monitors
+    access_results = []  # one (status string, access_time) tuple per input event
+
+    def gen():  # drives the CRI with one write command per input event
+        for channel, timestamp in input_events:
+            yield dut.cri.chan_sel.eq(channel)
+            yield dut.cri.timestamp.eq(timestamp)
+            yield
+
+            yield dut.cri.cmd.eq(cri.commands["write"])
+            yield
+            yield dut.cri.cmd.eq(cri.commands["nop"])  # the write command is asserted for a single step only
+
+            access_time = 0
+            yield
+            while (yield dut.cri.o_status) & 0x01:  # bit 0 set: still waiting; count the extra steps
+                yield
+                access_time += 1
+
+            status = (yield dut.cri.o_status)
+            access_status = "ok"
+            if status & 0x02:  # bit 1 of o_status is reported as underflow
+                access_status = "underflow"
+            if (yield dut.sequence_error):  # checked last, so sequence_error overrides "underflow" in the result
+                access_status = "sequence_error"
+
+            access_results.append((access_status, access_time))
+
+    @passive
+    def monitor_lane(n, lio, wait_time):  # consumer for lane n: logs each write, then holds writable low for wait_time steps
+        yield lio.writable.eq(1)
+        while True:
+            while not (yield lio.we):
+                yield
+            seqn = (yield lio.seqn)
+            channel = (yield lio.payload.channel)
+            timestamp = (yield lio.payload.timestamp)
+            output.append((n, seqn, channel, timestamp))
+
+            yield lio.writable.eq(0)
+            for i in range(wait_time):
+                yield
+            yield lio.writable.eq(1)
+            yield
+
+    generators = [gen()]
+    for n, lio in enumerate(dut.output):
+        lio.writable.reset = 1  # every lane starts out writable
+        wait_time = 0
+        if wait:  # with wait=True only lanes 6 and 7 are slowed down (1 and 4 steps)
+            if n == 6:
+                wait_time = 1
+            elif n == 7:
+                wait_time = 4
+        generators.append(monitor_lane(n, lio, wait_time))
+    run_simulation(dut, generators)
+
+    return output, access_results
+
+
+class TestLaneDistributor(unittest.TestCase):  # checks lane choice, sequence numbers and status reporting via simulate()
+    def test_regular(self):  # timestamps increasing by 8: every event is expected on lane 0 with zero access time
+        N = 16
+        output, access_results = simulate([(42+n, (n+1)*8) for n in range(N)], wait=False)
+        self.assertEqual(output, [(0, n, 42+n, (n+1)*8) for n in range(N)])
+        self.assertEqual(access_results, [("ok", 0)]*N)
+
+    def test_wait_time(self):  # identical timestamps: event n is expected on lane n; slow lanes 6/7 add 1/4 steps
+        output, access_results = simulate([(42+n, 8) for n in range(LANE_COUNT)])
+        self.assertEqual(output, [(n, n, 42+n, 8) for n in range(LANE_COUNT)])
+        expected_access_results = [("ok", 0)]*LANE_COUNT
+        expected_access_results[6] = ("ok", 1)
+        expected_access_results[7] = ("ok", 4)
+        self.assertEqual(access_results, expected_access_results)
+
+    def test_lane_switch(self):  # timestamps increasing by 1: expected lane is (n - n//8) % LANE_COUNT
+        N = 32
+        output, access_results = simulate([(42+n, n+8) for n in range(N)], wait=False)
+        self.assertEqual(output, [((n-n//8) % LANE_COUNT, n, 42+n, n+8) for n in range(N)])
+        self.assertEqual([ar[0] for ar in access_results], ["ok"]*N)
+
+    def test_sequence_error(self):  # one more same-timestamp event than there are lanes must be rejected
+        input_events = [(42+n, 8) for n in range(LANE_COUNT+1)]
+        input_events.append((42+LANE_COUNT+1, 16))
+        output, access_results = simulate(input_events)
+        self.assertEqual(len(output), len(input_events)-1)  # event with sequence error must get discarded
+        self.assertEqual([ar[0] for ar in access_results[:LANE_COUNT]], ["ok"]*LANE_COUNT)
+        self.assertEqual(access_results[LANE_COUNT][0], "sequence_error")
+
+    def test_underflow(self):  # an event at timestamp 0 is dropped; the following event still goes through
+        N = 16
+        input_events = [(42+n, (n+1)*8) for n in range(N-2)]
+        input_events.append((0, 0))  # timestamp < 8 underflows
+        input_events.append((42+N-2, N*8))
+        output, access_results = simulate(input_events)
+        self.assertEqual(len(output), len(input_events)-1)  # event with underflow must get discarded
+        self.assertEqual([ar[0] for ar in access_results[:N-2]], ["ok"]*(N-2))
+        self.assertEqual(access_results[N-2][0], "underflow")
+        self.assertEqual(output[N-2], (0, N-2, 42+N-2, N*8))
+        self.assertEqual(access_results[N-1][0], "ok")
+
+    def test_spread(self):  # expected lanes are 0..7 then 0: the two later-timestamp events continue the rotation
+        # get to lane 6
+        input_events = [(42+n, 8) for n in range(7)]
+        input_events.append((100, 16))
+        input_events.append((100, 32))
+        output, access_results = simulate(input_events)
+        self.assertEqual([o[0] for o in output], [x % LANE_COUNT for x in range(9)])
+        self.assertEqual([ar[0] for ar in access_results], ["ok"]*9)
+
+    def test_regular_lc(self):  # "lc" variants pass a per-channel compensation table to simulate()
+        N = 16
+        output, access_results = simulate([(n, 8) for n in range(N)],
+                                          compensation=range(N), wait=False)
+        self.assertEqual(output, [(0, n, n, (n+1)*8) for n in range(N)])  # compensation n turns timestamp 8 into (n+1)*8
+        self.assertEqual(access_results, [("ok", 0)]*N)
+
+    def test_lane_switch_lc(self):  # pairs of channels share a compensation value, so output timestamps repeat in pairs
+        N = 32
+        compensation = [n//2 for n in range(N)]
+        output, access_results = simulate([(n, 8) for n in range(N)],
+                                          compensation=compensation, wait=False)
+        self.assertEqual(output, [((n-n//2) % LANE_COUNT, n, n, 8*(1+n//2)) for n in range(N)])
+        self.assertEqual([ar[0] for ar in access_results], ["ok"]*N)
+
+    def test_underflow_lc(self):  # a negative compensation entry must make that channel's event underflow
+        N = 16
+        compensation = [0]*N
+        input_events = [(n, (n+1)*8) for n in range(N)]
+        compensation[N-2] = -input_events[N-2][1]//8  # negated timestamp of event N-2, divided by 8
+        output, access_results = simulate(input_events, compensation=compensation)
+        self.assertEqual(len(output), len(input_events)-1)  # event with underflow must get discarded
+        self.assertEqual([ar[0] for ar in access_results[:N-2]], ["ok"]*(N-2))
+        self.assertEqual(access_results[N-2][0], "underflow")
+        self.assertEqual(output[N-2], (0, N-2, N-1, N*8))
+        self.assertEqual(access_results[N-1][0], "ok")
diff --git a/artiq/gateware/test/rtio/test_sed_output_driver.py b/artiq/gateware/test/rtio/test_sed_output_driver.py
new file mode 100644
index 000000000..4da3bd463
--- /dev/null
+++ b/artiq/gateware/test/rtio/test_sed_output_driver.py
@@ -0,0 +1,125 @@
+import unittest
+
+from migen import *
+
+from artiq.gateware import rtio
+from artiq.gateware.rtio.sed import output_network, output_driver
+from artiq.gateware.rtio.phy import ttl_simple
+from artiq.gateware.rtio import rtlink
+
+
+LANE_COUNT = 8
+
+
+class BusyPHY(Module):  # stub PHY with a 1-bit output interface that permanently drives busy high
+    def __init__(self):
+        self.rtlink = rtlink.Interface(rtlink.OInterface(1))
+        self.comb += self.rtlink.o.busy.eq(1)  # constant 1: used by test_busy via channel 3
+
+
+class DUT(Module):  # OutputDriver with three TTL output channels (0-2) and one always-busy channel (3)
+    def __init__(self):
+        self.ttl0 = Signal()
+        self.ttl1 = Signal()
+        self.ttl2 = Signal()
+
+        self.submodules.phy0 = ttl_simple.Output(self.ttl0)
+        self.submodules.phy1 = ttl_simple.Output(self.ttl1)
+        self.submodules.phy2 = ttl_simple.Output(self.ttl2)
+        self.phy2.rtlink.o.enable_replace = False  # channel 2 has replacement disabled; test_collision targets it
+        self.submodules.phy3 = BusyPHY()
+
+        rtio_channels = [  # list order defines the channel numbers 0..3 used by the tests
+            rtio.Channel.from_phy(self.phy0),
+            rtio.Channel.from_phy(self.phy1),
+            rtio.Channel.from_phy(self.phy2),
+            rtio.Channel.from_phy(self.phy3),
+        ]
+
+        self.submodules.output_driver = output_driver.OutputDriver(  # NOTE(review): meaning of 0 and 4*LANE_COUNT is not visible here; confirm
+            rtio_channels, 0, LANE_COUNT, 4*LANE_COUNT)
+
+
+def simulate(input_events):  # present event n on driver input n for one step; returns a text log of what was observed
+    dut = DUT()
+
+    def gen():
+        for n, input_event in enumerate(input_events):  # event n goes to input n with seqn n
+            yield dut.output_driver.input[n].valid.eq(1)
+            yield dut.output_driver.input[n].seqn.eq(n)
+            for k, v in input_event.items():  # dict keys name payload fields; fields not listed are left untouched
+                yield getattr(dut.output_driver.input[n].payload, k).eq(v)
+        yield
+        for n in range(len(input_events)):  # deassert valid after a single step
+            yield dut.output_driver.input[n].valid.eq(0)
+        for i in range(output_network.latency(LANE_COUNT) + 2):  # wait the network latency plus 2 steps
+            yield
+        for i in range(3):  # three further steps before the active generator ends
+            yield
+
+    output = ""
+
+    @passive
+    def monitor():  # appends one line per TTL change, collision flag or busy flag seen on each step
+        nonlocal output
+
+        ttls = [dut.ttl0, dut.ttl1, dut.ttl2]
+        prev_ttl_values = [0, 0, 0]  # comparison baseline: all TTLs are taken to start at 0
+        while True:
+            ttl_values = []
+            for ttl in ttls:
+                ttl_values.append((yield ttl))
+            for n, (old, new) in enumerate(zip(prev_ttl_values, ttl_values)):
+                if old != new:
+                    output += "TTL{} {}->{}\n".format(n, old, new)
+            prev_ttl_values = ttl_values
+
+            if (yield dut.output_driver.collision):  # logged on every step where the flag reads non-zero
+                output += "collision ch{}\n".format((yield dut.output_driver.collision_channel))
+            if (yield dut.output_driver.busy):
+                output += "busy ch{}\n".format((yield dut.output_driver.busy_channel))
+
+            yield
+
+    run_simulation(dut, {"sys": [gen(), monitor()]},
+                   {"sys": 5, "rio": 5, "rio_phy": 5})
+    return output
+
+
+class TestOutputNetwork(unittest.TestCase):  # NOTE(review): tests OutputDriver, yet shares its name with the class in test_sed_output_network.py
+    def test_one_ttl(self):  # a single event with data=1 must raise TTL0
+        self.assertEqual(
+            simulate([{"data": 1}]),
+            "TTL0 0->1\n")
+
+    def test_simultaneous_ttl(self):  # events on three different channels in the same step must all take effect
+        self.assertEqual(
+            simulate([{"channel": 0, "data": 1},
+                      {"channel": 1, "data": 1},
+                      {"channel": 2, "data": 1}]),
+            "TTL0 0->1\n"
+            "TTL1 0->1\n"
+            "TTL2 0->1\n")
+
+    def test_replace(self):  # several events without a channel key: only the last data value shows on TTL0
+        self.assertEqual(
+            simulate([{"data": 0},
+                      {"data": 1},
+                      {"data": 0}]),
+            "")  # last value is 0, so no edge is logged
+        self.assertEqual(
+            simulate([{"data": 1},
+                      {"data": 0},
+                      {"data": 1}]),
+            "TTL0 0->1\n")
+
+    def test_collision(self):  # two events on channel 2 (enable_replace = False in DUT) must be reported as a collision
+        self.assertEqual(
+            simulate([{"channel": 2},
+                      {"channel": 2}]),
+            "collision ch2\n")
+
+    def test_busy(self):  # an event on channel 3 (BusyPHY) must be reported as busy
+        self.assertEqual(
+            simulate([{"channel": 3}]),
+            "busy ch3\n")
diff --git a/artiq/gateware/test/rtio/test_sed_output_network.py b/artiq/gateware/test/rtio/test_sed_output_network.py
new file mode 100644
index 000000000..b9c10526e
--- /dev/null
+++ b/artiq/gateware/test/rtio/test_sed_output_network.py
@@ -0,0 +1,61 @@
+import unittest
+
+from migen import *
+
+from artiq.gateware.rtio.sed import output_network
+
+
+LANE_COUNT = 8
+
+
+def simulate(input_events):
+    layout_payload = [
+        ("channel", 8),
+        ("fine_ts", 3),
+        ("address", 16),
+        ("data", 512),
+    ]
+    dut = output_network.OutputNetwork(LANE_COUNT, LANE_COUNT*4, layout_payload)
+    output = []
+    def gen():
+        for n, input_event in enumerate(input_events):
+            yield dut.input[n].valid.eq(1)
+            yield dut.input[n].seqn.eq(n)
+            for k, v in input_event.items():
+                yield getattr(dut.input[n].payload, k).eq(v)
+        yield
+        for n in range(len(input_events)):
+            yield dut.input[n].valid.eq(0)
+        for i in range(output_network.latency(LANE_COUNT)):
+            yield
+            for x in range(LANE_COUNT):
+                if (yield dut.output[x].valid):
+                    d = {
+                        "replace_occured": (yield dut.output[x].replace_occured),
+                        "channel": (yield dut.output[x].payload.channel),
+                        "fine_ts": (yield dut.output[x].payload.fine_ts),
+                        "address": (yield dut.output[x].payload.address),
+                        "data": (yield dut.output[x].payload.data),
+                    }
+                    output.append(d)
+    run_simulation(dut, gen())
+    return output
+
+
+class TestOutputNetwork(unittest.TestCase):
+    def test_replace(self):
+        for n_events in range(2, LANE_COUNT+1):
+            with self.subTest(n_events=n_events):
+                input = [{"channel": 1, "address": i} for i in range(n_events)]
+                output = simulate(input)
+                expect = [{'replace_occured': 1, 'channel': 1, 'fine_ts': 0, 'address': n_events-1, 'data': 0}]
+                self.assertEqual(output, expect)
+
+    def test_no_replace(self):
+        for n_events in range(1, LANE_COUNT+1):
+            with self.subTest(n_events=n_events):
+                input = [{"channel": i, "address": i} for i in range(n_events)]
+                output = simulate(input)
+                expect = [{'replace_occured': 0, 'channel': i, 'fine_ts': 0, 'address': i, 'data': 0}
+                          for i in range(n_events)]
+                self.assertEqual(output, expect)
diff --git a/artiq/gateware/test/rtio/test_sed_top.py b/artiq/gateware/test/rtio/test_sed_top.py
new file mode 100644
index 000000000..d5de88979
--- /dev/null
+++ b/artiq/gateware/test/rtio/test_sed_top.py
@@ -0,0 +1,88 @@
+import unittest
+import itertools
+
+from migen import *
+
+from artiq.gateware import rtio
+from artiq.gateware.rtio import cri
+from artiq.gateware.rtio.sed.core import *
+from artiq.gateware.rtio.phy import ttl_simple
+
+
+class DUT(Module):
+    def __init__(self):
+        self.ttl0 = Signal()
+        self.ttl1 = Signal()
+
+        self.submodules.phy0 = ttl_simple.Output(self.ttl0)
+        self.submodules.phy1 = ttl_simple.Output(self.ttl1)
+
+        rtio_channels = [
+            rtio.Channel.from_phy(self.phy0),
+            rtio.Channel.from_phy(self.phy1)
+        ]
+
+        self.submodules.sed = SED(rtio_channels, 0, "sync")
+        self.sync += [
+            self.sed.coarse_timestamp.eq(self.sed.coarse_timestamp + 1),
+            self.sed.minimum_coarse_timestamp.eq(self.sed.coarse_timestamp + 16)
+        ]
+
+
+def simulate(input_events):
+    dut = DUT()
+
+    ttl_changes = []
+    access_results = []
+
+    def gen():
+        yield dut.sed.cri.chan_sel.eq(0)
+        for timestamp, data in input_events:
+            yield dut.sed.cri.timestamp.eq(timestamp)
+            yield dut.sed.cri.o_data.eq(data)
+            yield
+
+            yield dut.sed.cri.cmd.eq(cri.commands["write"])
+            yield
+            yield dut.sed.cri.cmd.eq(cri.commands["nop"])
+
+            access_time = 0
+            yield
+            while (yield dut.sed.cri.o_status) & 0x01:
+                yield
+                access_time += 1
+
+            status = (yield dut.sed.cri.o_status)
+            access_status = "ok"
+            if status & 0x02:
+                access_status = "underflow"
+            if (yield dut.sed.sequence_error):
+                access_status = "sequence_error"
+
+            access_results.append((access_status, access_time))
+
+    @passive
+    def monitor():
+        old_ttl_state = 0
+        for time in itertools.count():
+            ttl_state = yield dut.ttl0
+            if ttl_state != old_ttl_state:
+                ttl_changes.append(time)
+            old_ttl_state = ttl_state
+            yield
+
+    run_simulation(dut, {"sys": [
+        gen(), monitor(),
+        (None for _ in range(45))
+    ]}, {"sys": 5, "rio": 5, "rio_phy": 5})
+
+    return ttl_changes, access_results
+
+
+class TestSED(unittest.TestCase):
+    def test_sed(self):
+        input_events = [(18, 1), (20, 0), (25, 1), (30, 0)]
+        latency = 11
+        ttl_changes, access_results = simulate(input_events)
+        self.assertEqual(ttl_changes, [e[0] + latency for e in input_events])
+        self.assertEqual(access_results, [("ok", 0)]*len(input_events))
diff --git a/artiq/test/coredevice/test_rtio.py b/artiq/test/coredevice/test_rtio.py
index 92a0d1abc..13517084a 100644
--- a/artiq/test/coredevice/test_rtio.py
+++ b/artiq/test/coredevice/test_rtio.py
@@ -258,10 +258,10 @@ class SequenceError(EnvExperiment):
     @kernel
     def run(self):
         self.core.reset()
-        t = now_mu()
-        self.ttl_out.pulse(25*us)
-        at_mu(t)
-        self.ttl_out.pulse(25*us)
+        delay(55*256*us)
+        for _ in range(256):
+            self.ttl_out.pulse(25*us)
+            delay(-75*us)
 
 
 class Collision(EnvExperiment):
@@ -276,6 +276,8 @@ class Collision(EnvExperiment):
         for i in range(16):
             self.ttl_out_serdes.pulse_mu(1)
             delay_mu(1)
+        while self.core.get_rtio_counter_mu() < now_mu():
+            pass
 
 
 class AddressCollision(EnvExperiment):
@@ -288,6 +290,8 @@ class AddressCollision(EnvExperiment):
         self.core.reset()
         self.loop_in.input()
         self.loop_in.pulse(10*us)
+        while self.core.get_rtio_counter_mu() < now_mu():
+            pass
 
 
 class TimeKeepsRunning(EnvExperiment):
@@ -358,7 +362,7 @@ class CoredeviceTest(ExperimentCase):
         rtt = self.dataset_mgr.get("rtt")
         print(rtt)
         self.assertGreater(rtt, 0*ns)
-        self.assertLess(rtt, 60*ns)
+        self.assertLess(rtt, 140*ns)
 
     def test_clock_generator_loopback(self):
         self.execute(ClockGeneratorLoopback)
@@ -397,27 +401,23 @@ class CoredeviceTest(ExperimentCase):
         with self.assertRaises(RTIOUnderflow):
             self.execute(Underflow)
 
+    def execute_and_test_in_log(self, experiment, string):
+        core_addr = self.device_mgr.get_desc("core")["arguments"]["host"]
+        mgmt = CommMgmt(core_addr)
+        mgmt.clear_log()
+        self.execute(experiment)
+        log = mgmt.get_log()
+        self.assertIn(string, log)
+        mgmt.close()
+
     def test_sequence_error(self):
-        with self.assertRaises(RTIOSequenceError):
-            self.execute(SequenceError)
+        self.execute_and_test_in_log(SequenceError, "RTIO sequence error")
 
     def test_collision(self):
-        core_addr = self.device_mgr.get_desc("core")["arguments"]["host"]
-        mgmt = CommMgmt(core_addr)
-        mgmt.clear_log()
-        self.execute(Collision)
-        log = mgmt.get_log()
-        self.assertIn("RTIO collision", log)
-        mgmt.close()
+        self.execute_and_test_in_log(Collision, "RTIO collision")
 
     def test_address_collision(self):
-        core_addr = self.device_mgr.get_desc("core")["arguments"]["host"]
-        mgmt = CommMgmt(core_addr)
-        mgmt.clear_log()
-        self.execute(AddressCollision)
-        log = mgmt.get_log()
-        self.assertIn("RTIO collision", log)
-        mgmt.close()
+        self.execute_and_test_in_log(AddressCollision, "RTIO collision")
 
     def test_watchdog(self):
         # watchdog only works on the device
@@ -491,7 +491,7 @@ class RPCTest(ExperimentCase):
 
 
 class _DMA(EnvExperiment):
-    def build(self, trace_name="foobar"):
+    def build(self, trace_name="test_rtio"):
         self.setattr_device("core")
         self.setattr_device("core_dma")
         self.setattr_device("ttl1")
@@ -499,8 +499,12 @@ class _DMA(EnvExperiment):
         self.delta = np.int64(0)
 
     @kernel
-    def record(self):
+    def record(self, for_handle=True):
         with self.core_dma.record(self.trace_name):
+            # When not using the handle, retrieving the DMA trace
+            # in dma.playback() can be slow. Allow some time.
+            if not for_handle:
+                delay(1*ms)
             delay(100*ns)
             self.ttl1.on()
             delay(100*ns)
@@ -519,20 +523,22 @@ class _DMA(EnvExperiment):
         self.set_dataset("dma_record_time", self.core.mu_to_seconds(t2 - t1))
 
     @kernel
-    def playback(self, use_handle=False):
-        self.core.break_realtime()
-        start = now_mu()
+    def playback(self, use_handle=True):
         if use_handle:
             handle = self.core_dma.get_handle(self.trace_name)
+            self.core.break_realtime()
+            start = now_mu()
             self.core_dma.playback_handle(handle)
         else:
+            self.core.break_realtime()
+            start = now_mu()
             self.core_dma.playback(self.trace_name)
         self.delta = now_mu() - start
 
     @kernel
     def playback_many(self, n):
-        self.core.break_realtime()
         handle = self.core_dma.get_handle(self.trace_name)
+        self.core.break_realtime()
         t1 = self.core.get_rtio_counter_mu()
         for i in range(n):
             self.core_dma.playback_handle(handle)
@@ -579,9 +585,9 @@ class DMATest(ExperimentCase):
         core_host = self.device_mgr.get_desc("core")["arguments"]["host"]
 
         exp = self.create(_DMA)
-        exp.record()
 
         for use_handle in [False, True]:
+            exp.record(use_handle)
             get_analyzer_dump(core_host)  # clear analyzer buffer
             exp.playback(use_handle)
 
@@ -603,9 +609,13 @@ class DMATest(ExperimentCase):
         exp = self.create(_DMA)
         exp.record()
 
-        for use_handle in [False, True]:
-            exp.playback(use_handle)
-            self.assertEqual(exp.delta, 200)
+        exp.record(False)
+        exp.playback(False)
+        self.assertEqual(exp.delta, 1000200)
+
+        exp.record(True)
+        exp.playback(True)
+        self.assertEqual(exp.delta, 200)
 
     def test_dma_record_time(self):
         exp = self.create(_DMA)
@@ -618,11 +628,17 @@ class DMATest(ExperimentCase):
     def test_dma_playback_time(self):
         exp = self.create(_DMA)
         count = 20000
-        exp.record()
+        exp.record_many(40)
         exp.playback_many(count)
         dt = self.dataset_mgr.get("dma_playback_time")
         print("dt={}, dt/count={}".format(dt, dt/count))
-        self.assertLess(dt/count, 3*us)
+        self.assertLess(dt/count, 4.5*us)
+
+    def test_dma_underflow(self):
+        exp = self.create(_DMA)
+        exp.record()
+        with self.assertRaises(RTIOUnderflow):
+            exp.playback_many(20000)
 
     def test_handle_invalidation(self):
         exp = self.create(_DMA)
diff --git a/doc/manual/rtio.rst b/doc/manual/rtio.rst
index a08edad32..f62c3b57d 100644
--- a/doc/manual/rtio.rst
+++ b/doc/manual/rtio.rst
@@ -117,6 +117,22 @@ To track down ``RTIOUnderflows`` in an experiment there are a few approaches:
     code.
   * The :any:`integrated logic analyzer <core-device-rtio-analyzer-tool>` shows the timeline context that lead to the exception. The analyzer is always active and supports plotting of RTIO slack. RTIO slack is the difference between timeline cursor and wall clock time (``now - rtio_counter``).
 
+Sequence errors
+---------------
+A sequence error happens when the sequence of coarse timestamps cannot be supported by the gateware. For example, there may have been too many timeline rewinds.
+
+Internally, the gateware stores output events in an array of FIFO buffers (the "lanes") and the timestamps in each lane must be strictly increasing. The gateware selects a different lane when an event with a decreasing or equal timestamp is submitted. A sequence error occurs when no appropriate lane can be found.
+
+Notes:
+
+* Strictly increasing timestamps never cause sequence errors. 
+* Configuring the gateware with more lanes for the RTIO core reduces the frequency of sequence errors. 
+* Whether a particular sequence of timestamps causes a sequence error or not is fully deterministic (starting from a known RTIO state, e.g. after a reset). Adding a constant offset to the whole sequence does not affect the result.
+
+The offending event is discarded and the RTIO core keeps operating.
+
+This error is reported asynchronously via the core device log: for performance reasons with DRTIO, the CPU does not wait for an error report from the satellite after writing an event. Therefore, it is not possible to raise an exception precisely.
+
 Collisions
 ----------
 A collision happens when more than one event is submitted on a given channel with the same coarse timestamp, and that channel does not implement replacement behavior or the fine timestamps are different.