From 9dfe9c124820da8bf4061c33271e191b1be77923 Mon Sep 17 00:00:00 2001 From: whitequark Date: Sat, 15 Apr 2017 07:26:18 +0000 Subject: [PATCH] DMA: improve recording performance. This commit moves DMA serialization code to the kernel CPU (to cope with the existence of rtio_output_wide) and batches the resulting sequences. This results in less data being transferred between kernel and comms CPUs (24 octets with one pointer before, 18 octets with no pointers now, for the common case of rtio_output), but most importantly reduces cache flushes, which now happen once per 64k octets. On average, it now takes about 15us to record a single RTIO event in a DMA trace. Fixes #712. --- artiq/firmware/ksupport/lib.rs | 97 ++++++++++++++++++++----- artiq/firmware/libproto/kernel_proto.rs | 8 +- artiq/firmware/runtime/rtio_dma.rs | 32 +------- artiq/firmware/runtime/session.rs | 18 ++++- artiq/test/coredevice/test_rtio.py | 20 +++++ 5 files changed, 116 insertions(+), 59 deletions(-) diff --git a/artiq/firmware/ksupport/lib.rs b/artiq/firmware/ksupport/lib.rs index bddbe3651..fd2027185 100644 --- a/artiq/firmware/ksupport/lib.rs +++ b/artiq/firmware/ksupport/lib.rs @@ -238,11 +238,33 @@ extern fn i2c_read(busno: i32, ack: bool) -> i32 { recv!(&I2cReadReply { data } => data) as i32 } -static mut DMA_RECORDING: bool = false; +const DMA_BUFFER_SIZE: usize = 64 * 1024; + +struct DmaRecorder { + active: bool, + #[allow(dead_code)] + padding: [u8; 3], //https://github.com/rust-lang/rust/issues/41315 + data_len: usize, + buffer: [u8; DMA_BUFFER_SIZE], +} + +static mut DMA_RECORDER: DmaRecorder = DmaRecorder { + active: false, + padding: [0; 3], + data_len: 0, + buffer: [0; DMA_BUFFER_SIZE], +}; + +fn dma_record_flush() { + unsafe { + send(&DmaRecordAppend(&DMA_RECORDER.buffer[..DMA_RECORDER.data_len])); + DMA_RECORDER.data_len = 0; + } +} extern fn dma_record_start() { unsafe { - if DMA_RECORDING { + if DMA_RECORDER.active { raise!("DMAError", "DMA is already recording") } @@ -252,7 +274,7 @@ extern fn dma_record_start() { library.rebind(b"rtio_output_wide", dma_record_output_wide as *const () as u32).unwrap(); - DMA_RECORDING = true; + DMA_RECORDER.active = true; send(&DmaRecordStart); } } @@ -261,7 +283,9 @@ extern fn dma_record_stop(name: CSlice, duration: i64) { let name = str::from_utf8(name.as_ref()).unwrap(); unsafe { - if !DMA_RECORDING { + dma_record_flush(); + + if !DMA_RECORDER.active { raise!("DMAError", "DMA is not recording") } @@ -271,7 +295,7 @@ extern fn dma_record_stop(name: CSlice, duration: i64) { library.rebind(b"rtio_output_wide", rtio::output_wide as *const () as u32).unwrap(); - DMA_RECORDING = false; + DMA_RECORDER.active = false; send(&DmaRecordStop { name: name, duration: duration as u64 @@ -279,23 +303,56 @@ extern fn dma_record_stop(name: CSlice, duration: i64) { } } -extern fn dma_record_output(timestamp: i64, channel: i32, address: i32, data: i32) { - send(&DmaRecordAppend { - timestamp: timestamp as u64, - channel: channel as u32, - address: address as u32, - data: &[data as u32] - }) +extern fn dma_record_output(timestamp: i64, channel: i32, address: i32, word: i32) { + dma_record_output_wide(timestamp, channel, address, [word].as_c_slice()) } -extern fn dma_record_output_wide(timestamp: i64, channel: i32, address: i32, data: CSlice) { - assert!(data.len() <= 16); // enforce the hardware limit - send(&DmaRecordAppend { - timestamp: timestamp as u64, - channel: channel as u32, - address: address as u32, - data: unsafe { mem::transmute::<&[i32], &[u32]>(data.as_ref()) } - }) +extern fn dma_record_output_wide(timestamp: i64, channel: i32, address: i32, words: CSlice) { + assert!(words.len() <= 16); // enforce the hardware limit + + // See gateware/rtio/dma.py. + let header_length = /*length*/1 + /*channel*/3 + /*timestamp*/8 + /*address*/2; + let length = header_length + /*data*/words.len() * 4; + + let header = [ + (length >> 0) as u8, + (channel >> 0) as u8, + (channel >> 8) as u8, + (channel >> 16) as u8, + (timestamp >> 0) as u8, + (timestamp >> 8) as u8, + (timestamp >> 16) as u8, + (timestamp >> 24) as u8, + (timestamp >> 32) as u8, + (timestamp >> 40) as u8, + (timestamp >> 48) as u8, + (timestamp >> 56) as u8, + (address >> 0) as u8, + (address >> 8) as u8, + ]; + + let mut data = [0; 16 * 4]; + for (i, &word) in words.as_ref().iter().enumerate() { + let part = [ + (word >> 0) as u8, + (word >> 8) as u8, + (word >> 16) as u8, + (word >> 24) as u8, + ]; + data[i * 4..(i + 1) * 4].copy_from_slice(&part[..]); + } + let data = &data[..words.len() * 4]; + + unsafe { + if DMA_RECORDER.buffer.len() - DMA_RECORDER.data_len < length { + dma_record_flush() + } + let mut dst = &mut DMA_RECORDER.buffer[DMA_RECORDER.data_len.. + DMA_RECORDER.data_len + length]; + dst[..header_length].copy_from_slice(&header[..]); + dst[header_length..].copy_from_slice(&data[..]); + DMA_RECORDER.data_len += length; + } } extern fn dma_erase(name: CSlice) { diff --git a/artiq/firmware/libproto/kernel_proto.rs b/artiq/firmware/libproto/kernel_proto.rs index 36adc462d..cc8ca12ba 100644 --- a/artiq/firmware/libproto/kernel_proto.rs +++ b/artiq/firmware/libproto/kernel_proto.rs @@ -6,7 +6,6 @@ pub const KERNELCPU_PAYLOAD_ADDRESS: usize = 0x40840000; pub const KERNELCPU_LAST_ADDRESS: usize = 0x4fffffff; pub const KSUPPORT_HEADER_SIZE: usize = 0x80; -#[repr(C)] #[derive(Debug, Clone)] pub struct Exception<'a> { pub name: &'a str, @@ -30,12 +29,7 @@ pub enum Message<'a> { RtioInitRequest, DmaRecordStart, - DmaRecordAppend { - timestamp: u64, - channel: u32, - address: u32, - data: &'a [u32] - }, + DmaRecordAppend(&'a [u8]), DmaRecordStop { name: &'a str, duration: u64 diff --git a/artiq/firmware/runtime/rtio_dma.rs b/artiq/firmware/runtime/rtio_dma.rs index 118048ec2..e0dc86e41 100644 --- a/artiq/firmware/runtime/rtio_dma.rs +++ b/artiq/firmware/runtime/rtio_dma.rs @@ -31,36 +31,8 @@ impl Manager { self.recording = Vec::new(); } - pub fn record_append(&mut self, timestamp: u64, channel: u32, - address: u32, data: &[u32]) { - let writer = &mut self.recording; - // See gateware/rtio/dma.py. - let length = /*length*/1 + /*channel*/3 + /*timestamp*/8 + /*address*/2 + - /*data*/data.len() * 4; - writer.write_all(&[ - (length >> 0) as u8, - (channel >> 0) as u8, - (channel >> 8) as u8, - (channel >> 16) as u8, - (timestamp >> 0) as u8, - (timestamp >> 8) as u8, - (timestamp >> 16) as u8, - (timestamp >> 24) as u8, - (timestamp >> 32) as u8, - (timestamp >> 40) as u8, - (timestamp >> 48) as u8, - (timestamp >> 56) as u8, - (address >> 0) as u8, - (address >> 8) as u8, - ]).unwrap(); - for &word in data { - writer.write_all(&[ - (word >> 0) as u8, - (word >> 8) as u8, - (word >> 16) as u8, - (word >> 24) as u8, - ]).unwrap(); - } + pub fn record_append(&mut self, data: &[u8]) { + self.recording.write_all(data).unwrap(); } pub fn record_stop(&mut self, name: &str, duration: u64) { diff --git a/artiq/firmware/runtime/session.rs b/artiq/firmware/runtime/session.rs index 871cd15e4..db58ed655 100644 --- a/artiq/firmware/runtime/session.rs +++ b/artiq/firmware/runtime/session.rs @@ -129,6 +129,13 @@ fn host_write(stream: &mut Write, reply: host::Reply) -> io::Result<()> { fn kern_send(io: &Io, request: &kern::Message) -> io::Result<()> { match request { &kern::LoadRequest(_) => debug!("comm->kern LoadRequest(...)"), + &kern::DmaPlaybackReply { trace, duration } => { + if trace.map(|data| data.len() > 100).unwrap_or(false) { + debug!("comm->kern DmaPlaybackReply {{ trace: ..., duration: {:?} }}", duration) + } else { + debug!("comm->kern {:?}", request) + } + } _ => debug!("comm->kern {:?}", request) } unsafe { mailbox::send(request as *const _ as usize) } @@ -150,6 +157,13 @@ fn kern_recv_dotrace(reply: &kern::Message) { match reply { &kern::Log(_) => debug!("comm<-kern Log(...)"), &kern::LogSlice(_) => debug!("comm<-kern LogSlice(...)"), + &kern::DmaRecordAppend(data) => { + if data.len() > 100 { + debug!("comm<-kern DmaRecordAppend(...)") + } else { + debug!("comm<-kern {:?}", reply) + } + } _ => debug!("comm<-kern {:?}", reply) } } @@ -384,8 +398,8 @@ fn process_kern_message(io: &Io, mut stream: Option<&mut TcpStream>, session.congress.dma_manager.record_start(); kern_acknowledge() } - &kern::DmaRecordAppend { timestamp, channel, address, data } => { - session.congress.dma_manager.record_append(timestamp, channel, address, data); + &kern::DmaRecordAppend(data) => { + session.congress.dma_manager.record_append(data); kern_acknowledge() } &kern::DmaRecordStop { name, duration } => { diff --git a/artiq/test/coredevice/test_rtio.py b/artiq/test/coredevice/test_rtio.py index 8f954e575..d0a1bb09b 100644 --- a/artiq/test/coredevice/test_rtio.py +++ b/artiq/test/coredevice/test_rtio.py @@ -488,6 +488,18 @@ class _DMA(EnvExperiment): delay(100*ns) self.ttl1.off() + @kernel + def record_many(self, n): + t1 = self.core.get_rtio_counter_mu() + with self.core_dma.record(self.trace_name): + for i in range(n//2): + delay(100*ns) + self.ttl1.on() + delay(100*ns) + self.ttl1.off() + t2 = self.core.get_rtio_counter_mu() + self.set_dataset("dma_record_time", self.core.mu_to_seconds(t2 - t1)) + @kernel def replay(self): self.core.break_realtime() @@ -555,3 +567,11 @@ class DMATest(ExperimentCase): exp.record() exp.replay_delta() self.assertEqual(exp.delta, 200) + + def test_dma_record_time(self): + exp = self.create(_DMA) + count = 20000 + exp.record_many(count) + dt = self.dataset_mgr.get("dma_record_time") + print("dt={}, dt/count={}".format(dt, dt/count)) + self.assertLess(dt/count, 15*us)