DMA: improve recording performance.

This commit moves DMA serialization code to the kernel CPU (to cope with the existence of rtio_output_wide) and batches the resulting sequences. This results in less data being transferred between kernel and comms CPUs (24 octets with one pointer before, 18 octets with no pointers now, for the common case of rtio_output), but most importantly reduces cache flushes, which now happen once per 64k octets. On average, it now takes about 15us to record a single RTIO event in a DMA trace. Fixes #712.
2017-04-15 07:26:18 +00:00 · 2017-04-15 07:26:18 +00:00 · 9dfe9c1248
parent ea753bed17
commit 9dfe9c1248
5 changed files with 116 additions and 59 deletions
--- a/artiq/firmware/ksupport/lib.rs
+++ b/artiq/firmware/ksupport/lib.rs
@ -238,11 +238,33 @@ extern fn i2c_read(busno: i32, ack: bool) -> i32 {
    recv!(&I2cReadReply { data } => data) as i32
 }

-static mut DMA_RECORDING: bool = false;
+const DMA_BUFFER_SIZE: usize = 64 * 1024;
+
+struct DmaRecorder {
+    active:   bool,
+    #[allow(dead_code)]
+    padding:  [u8; 3], //https://github.com/rust-lang/rust/issues/41315
+    data_len: usize,
+    buffer:   [u8; DMA_BUFFER_SIZE],
+}
+
+static mut DMA_RECORDER: DmaRecorder = DmaRecorder {
+    active:   false,
+    padding:  [0; 3],
+    data_len: 0,
+    buffer:   [0; DMA_BUFFER_SIZE],
+};
+
+fn dma_record_flush() {
+    unsafe {
+        send(&DmaRecordAppend(&DMA_RECORDER.buffer[..DMA_RECORDER.data_len]));
+        DMA_RECORDER.data_len = 0;
+    }
+}

 extern fn dma_record_start() {
    unsafe {
-        if DMA_RECORDING {
+        if DMA_RECORDER.active {
            raise!("DMAError", "DMA is already recording")
        }

@ -252,7 +274,7 @@ extern fn dma_record_start() {
        library.rebind(b"rtio_output_wide",
                       dma_record_output_wide as *const () as u32).unwrap();

-        DMA_RECORDING = true;
+        DMA_RECORDER.active = true;
        send(&DmaRecordStart);
    }
 }
@ -261,7 +283,9 @@ extern fn dma_record_stop(name: CSlice<u8>, duration: i64) {
    let name = str::from_utf8(name.as_ref()).unwrap();

    unsafe {
-        if !DMA_RECORDING {
+        dma_record_flush();
+
+        if !DMA_RECORDER.active {
            raise!("DMAError", "DMA is not recording")
        }

@ -271,7 +295,7 @@ extern fn dma_record_stop(name: CSlice<u8>, duration: i64) {
        library.rebind(b"rtio_output_wide",
                       rtio::output_wide as *const () as u32).unwrap();

-        DMA_RECORDING = false;
+        DMA_RECORDER.active = false;
        send(&DmaRecordStop {
            name:     name,
            duration: duration as u64
@ -279,23 +303,56 @@ extern fn dma_record_stop(name: CSlice<u8>, duration: i64) {
    }
 }

-extern fn dma_record_output(timestamp: i64, channel: i32, address: i32, data: i32) {
-    send(&DmaRecordAppend {
-        timestamp: timestamp as u64,
-        channel:   channel as u32,
-        address:   address as u32,
-        data:      &[data as u32]
-    })
+extern fn dma_record_output(timestamp: i64, channel: i32, address: i32, word: i32) {
+    dma_record_output_wide(timestamp, channel, address, [word].as_c_slice())
 }

-extern fn dma_record_output_wide(timestamp: i64, channel: i32, address: i32, data: CSlice<i32>) {
-    assert!(data.len() <= 16); // enforce the hardware limit
-    send(&DmaRecordAppend {
-        timestamp: timestamp as u64,
-        channel:   channel as u32,
-        address:   address as u32,
-        data:      unsafe { mem::transmute::<&[i32], &[u32]>(data.as_ref()) }
-    })
+extern fn dma_record_output_wide(timestamp: i64, channel: i32, address: i32, words: CSlice<i32>) {
+    assert!(words.len() <= 16); // enforce the hardware limit
+
+    // See gateware/rtio/dma.py.
+    let header_length = /*length*/1 + /*channel*/3 + /*timestamp*/8 + /*address*/2;
+    let length = header_length + /*data*/words.len() * 4;
+
+    let header = [
+        (length    >>  0) as u8,
+        (channel   >>  0) as u8,
+        (channel   >>  8) as u8,
+        (channel   >> 16) as u8,
+        (timestamp >>  0) as u8,
+        (timestamp >>  8) as u8,
+        (timestamp >> 16) as u8,
+        (timestamp >> 24) as u8,
+        (timestamp >> 32) as u8,
+        (timestamp >> 40) as u8,
+        (timestamp >> 48) as u8,
+        (timestamp >> 56) as u8,
+        (address   >>  0) as u8,
+        (address   >>  8) as u8,
+    ];
+
+    let mut data = [0; 16 * 4];
+    for (i, &word) in words.as_ref().iter().enumerate() {
+        let part = [
+            (word >>  0) as u8,
+            (word >>  8) as u8,
+            (word >> 16) as u8,
+            (word >> 24) as u8,
+        ];
+        data[i * 4..(i + 1) * 4].copy_from_slice(&part[..]);
+    }
+    let data = &data[..words.len() * 4];
+
+    unsafe {
+        if DMA_RECORDER.buffer.len() - DMA_RECORDER.data_len < length {
+            dma_record_flush()
+        }
+        let mut dst = &mut DMA_RECORDER.buffer[DMA_RECORDER.data_len..
+                                               DMA_RECORDER.data_len + length];
+        dst[..header_length].copy_from_slice(&header[..]);
+        dst[header_length..].copy_from_slice(&data[..]);
+        DMA_RECORDER.data_len += length;
+    }
 }

 extern fn dma_erase(name: CSlice<u8>) {
--- a/artiq/firmware/libproto/kernel_proto.rs
+++ b/artiq/firmware/libproto/kernel_proto.rs
@ -6,7 +6,6 @@ pub const KERNELCPU_PAYLOAD_ADDRESS: usize = 0x40840000;
 pub const KERNELCPU_LAST_ADDRESS:    usize = 0x4fffffff;
 pub const KSUPPORT_HEADER_SIZE:      usize = 0x80;

-#[repr(C)]
 #[derive(Debug, Clone)]
 pub struct Exception<'a> {
    pub name:     &'a str,
@ -30,12 +29,7 @@ pub enum Message<'a> {
    RtioInitRequest,

    DmaRecordStart,
-    DmaRecordAppend {
-        timestamp: u64,
-        channel:   u32,
-        address:   u32,
-        data:      &'a [u32]
-    },
+    DmaRecordAppend(&'a [u8]),
    DmaRecordStop {
        name:      &'a str,
        duration:  u64
--- a/artiq/firmware/runtime/rtio_dma.rs
+++ b/artiq/firmware/runtime/rtio_dma.rs
@ -31,36 +31,8 @@ impl Manager {
        self.recording = Vec::new();
    }

-    pub fn record_append(&mut self, timestamp: u64, channel: u32,
-                         address: u32, data: &[u32]) {
-        let writer = &mut self.recording;
-        // See gateware/rtio/dma.py.
-        let length = /*length*/1 + /*channel*/3 + /*timestamp*/8 + /*address*/2 +
-                     /*data*/data.len() * 4;
-        writer.write_all(&[
-            (length    >>  0) as u8,
-            (channel   >>  0) as u8,
-            (channel   >>  8) as u8,
-            (channel   >> 16) as u8,
-            (timestamp >>  0) as u8,
-            (timestamp >>  8) as u8,
-            (timestamp >> 16) as u8,
-            (timestamp >> 24) as u8,
-            (timestamp >> 32) as u8,
-            (timestamp >> 40) as u8,
-            (timestamp >> 48) as u8,
-            (timestamp >> 56) as u8,
-            (address   >>  0) as u8,
-            (address   >>  8) as u8,
-        ]).unwrap();
-        for &word in data {
-            writer.write_all(&[
-                (word >>  0) as u8,
-                (word >>  8) as u8,
-                (word >> 16) as u8,
-                (word >> 24) as u8,
-            ]).unwrap();
-        }
+    pub fn record_append(&mut self, data: &[u8]) {
+        self.recording.write_all(data).unwrap();
    }

    pub fn record_stop(&mut self, name: &str, duration: u64) {
--- a/artiq/firmware/runtime/session.rs
+++ b/artiq/firmware/runtime/session.rs
@ -129,6 +129,13 @@ fn host_write(stream: &mut Write, reply: host::Reply) -> io::Result<()> {
 fn kern_send(io: &Io, request: &kern::Message) -> io::Result<()> {
    match request {
        &kern::LoadRequest(_) => debug!("comm->kern LoadRequest(...)"),
+        &kern::DmaPlaybackReply { trace, duration } => {
+            if trace.map(|data| data.len() > 100).unwrap_or(false) {
+                debug!("comm->kern DmaPlaybackReply {{ trace: ..., duration: {:?} }}", duration)
+            } else {
+                debug!("comm->kern {:?}", request)
+            }
+        }
        _ => debug!("comm->kern {:?}", request)
    }
    unsafe { mailbox::send(request as *const _ as usize) }
@ -150,6 +157,13 @@ fn kern_recv_dotrace(reply: &kern::Message) {
    match reply {
        &kern::Log(_) => debug!("comm<-kern Log(...)"),
        &kern::LogSlice(_) => debug!("comm<-kern LogSlice(...)"),
+        &kern::DmaRecordAppend(data) => {
+            if data.len() > 100 {
+                debug!("comm<-kern DmaRecordAppend(...)")
+            } else {
+                debug!("comm<-kern {:?}", reply)
+            }
+        }
        _ => debug!("comm<-kern {:?}", reply)
    }
 }
@ -384,8 +398,8 @@ fn process_kern_message(io: &Io, mut stream: Option<&mut TcpStream>,
                session.congress.dma_manager.record_start();
                kern_acknowledge()
            }
-            &kern::DmaRecordAppend { timestamp, channel, address, data } => {
-                session.congress.dma_manager.record_append(timestamp, channel, address, data);
+            &kern::DmaRecordAppend(data) => {
+                session.congress.dma_manager.record_append(data);
                kern_acknowledge()
            }
            &kern::DmaRecordStop { name, duration } => {
--- a/artiq/test/coredevice/test_rtio.py
+++ b/artiq/test/coredevice/test_rtio.py
@ -488,6 +488,18 @@ class _DMA(EnvExperiment):
            delay(100*ns)
            self.ttl1.off()

+    @kernel
+    def record_many(self, n):
+        t1 = self.core.get_rtio_counter_mu()
+        with self.core_dma.record(self.trace_name):
+            for i in range(n//2):
+                delay(100*ns)
+                self.ttl1.on()
+                delay(100*ns)
+                self.ttl1.off()
+        t2 = self.core.get_rtio_counter_mu()
+        self.set_dataset("dma_record_time", self.core.mu_to_seconds(t2 - t1))
+
    @kernel
    def replay(self):
        self.core.break_realtime()
@ -555,3 +567,11 @@ class DMATest(ExperimentCase):
        exp.record()
        exp.replay_delta()
        self.assertEqual(exp.delta, 200)
+
+    def test_dma_record_time(self):
+        exp = self.create(_DMA)
+        count = 20000
+        exp.record_many(count)
+        dt = self.dataset_mgr.get("dma_record_time")
+        print("dt={}, dt/count={}".format(dt, dt/count))
+        self.assertLess(dt/count, 15*us)