DMA: improve recording performance.

This commit moves DMA serialization code to the kernel CPU
(to cope with the existence of rtio_output_wide) and batches
the resulting sequences. This results in less data being transferred
between kernel and comms CPUs (24 octets with one pointer before,
18 octets with no pointers now, for the common case of rtio_output),
but most importantly reduces cache flushes, which now happen
once per 64k octets.

On average, it now takes about 15us to record a single RTIO event
in a DMA trace.

Fixes #712.
This commit is contained in:
whitequark 2017-04-15 07:26:18 +00:00
parent ea753bed17
commit 9dfe9c1248
5 changed files with 116 additions and 59 deletions

View File

@ -238,11 +238,33 @@ extern fn i2c_read(busno: i32, ack: bool) -> i32 {
recv!(&I2cReadReply { data } => data) as i32
}
static mut DMA_RECORDING: bool = false;
const DMA_BUFFER_SIZE: usize = 64 * 1024;
struct DmaRecorder {
active: bool,
#[allow(dead_code)]
padding: [u8; 3], //https://github.com/rust-lang/rust/issues/41315
data_len: usize,
buffer: [u8; DMA_BUFFER_SIZE],
}
static mut DMA_RECORDER: DmaRecorder = DmaRecorder {
active: false,
padding: [0; 3],
data_len: 0,
buffer: [0; DMA_BUFFER_SIZE],
};
fn dma_record_flush() {
unsafe {
send(&DmaRecordAppend(&DMA_RECORDER.buffer[..DMA_RECORDER.data_len]));
DMA_RECORDER.data_len = 0;
}
}
extern fn dma_record_start() {
unsafe {
if DMA_RECORDING {
if DMA_RECORDER.active {
raise!("DMAError", "DMA is already recording")
}
@ -252,7 +274,7 @@ extern fn dma_record_start() {
library.rebind(b"rtio_output_wide",
dma_record_output_wide as *const () as u32).unwrap();
DMA_RECORDING = true;
DMA_RECORDER.active = true;
send(&DmaRecordStart);
}
}
@ -261,7 +283,9 @@ extern fn dma_record_stop(name: CSlice<u8>, duration: i64) {
let name = str::from_utf8(name.as_ref()).unwrap();
unsafe {
if !DMA_RECORDING {
dma_record_flush();
if !DMA_RECORDER.active {
raise!("DMAError", "DMA is not recording")
}
@ -271,7 +295,7 @@ extern fn dma_record_stop(name: CSlice<u8>, duration: i64) {
library.rebind(b"rtio_output_wide",
rtio::output_wide as *const () as u32).unwrap();
DMA_RECORDING = false;
DMA_RECORDER.active = false;
send(&DmaRecordStop {
name: name,
duration: duration as u64
@ -279,23 +303,56 @@ extern fn dma_record_stop(name: CSlice<u8>, duration: i64) {
}
}
extern fn dma_record_output(timestamp: i64, channel: i32, address: i32, data: i32) {
send(&DmaRecordAppend {
timestamp: timestamp as u64,
channel: channel as u32,
address: address as u32,
data: &[data as u32]
})
extern fn dma_record_output(timestamp: i64, channel: i32, address: i32, word: i32) {
dma_record_output_wide(timestamp, channel, address, [word].as_c_slice())
}
extern fn dma_record_output_wide(timestamp: i64, channel: i32, address: i32, data: CSlice<i32>) {
assert!(data.len() <= 16); // enforce the hardware limit
send(&DmaRecordAppend {
timestamp: timestamp as u64,
channel: channel as u32,
address: address as u32,
data: unsafe { mem::transmute::<&[i32], &[u32]>(data.as_ref()) }
})
extern fn dma_record_output_wide(timestamp: i64, channel: i32, address: i32, words: CSlice<i32>) {
assert!(words.len() <= 16); // enforce the hardware limit
// See gateware/rtio/dma.py.
let header_length = /*length*/1 + /*channel*/3 + /*timestamp*/8 + /*address*/2;
let length = header_length + /*data*/words.len() * 4;
let header = [
(length >> 0) as u8,
(channel >> 0) as u8,
(channel >> 8) as u8,
(channel >> 16) as u8,
(timestamp >> 0) as u8,
(timestamp >> 8) as u8,
(timestamp >> 16) as u8,
(timestamp >> 24) as u8,
(timestamp >> 32) as u8,
(timestamp >> 40) as u8,
(timestamp >> 48) as u8,
(timestamp >> 56) as u8,
(address >> 0) as u8,
(address >> 8) as u8,
];
let mut data = [0; 16 * 4];
for (i, &word) in words.as_ref().iter().enumerate() {
let part = [
(word >> 0) as u8,
(word >> 8) as u8,
(word >> 16) as u8,
(word >> 24) as u8,
];
data[i * 4..(i + 1) * 4].copy_from_slice(&part[..]);
}
let data = &data[..words.len() * 4];
unsafe {
if DMA_RECORDER.buffer.len() - DMA_RECORDER.data_len < length {
dma_record_flush()
}
let mut dst = &mut DMA_RECORDER.buffer[DMA_RECORDER.data_len..
DMA_RECORDER.data_len + length];
dst[..header_length].copy_from_slice(&header[..]);
dst[header_length..].copy_from_slice(&data[..]);
DMA_RECORDER.data_len += length;
}
}
extern fn dma_erase(name: CSlice<u8>) {

View File

@ -6,7 +6,6 @@ pub const KERNELCPU_PAYLOAD_ADDRESS: usize = 0x40840000;
pub const KERNELCPU_LAST_ADDRESS: usize = 0x4fffffff;
pub const KSUPPORT_HEADER_SIZE: usize = 0x80;
#[repr(C)]
#[derive(Debug, Clone)]
pub struct Exception<'a> {
pub name: &'a str,
@ -30,12 +29,7 @@ pub enum Message<'a> {
RtioInitRequest,
DmaRecordStart,
DmaRecordAppend {
timestamp: u64,
channel: u32,
address: u32,
data: &'a [u32]
},
DmaRecordAppend(&'a [u8]),
DmaRecordStop {
name: &'a str,
duration: u64

View File

@ -31,36 +31,8 @@ impl Manager {
self.recording = Vec::new();
}
pub fn record_append(&mut self, timestamp: u64, channel: u32,
address: u32, data: &[u32]) {
let writer = &mut self.recording;
// See gateware/rtio/dma.py.
let length = /*length*/1 + /*channel*/3 + /*timestamp*/8 + /*address*/2 +
/*data*/data.len() * 4;
writer.write_all(&[
(length >> 0) as u8,
(channel >> 0) as u8,
(channel >> 8) as u8,
(channel >> 16) as u8,
(timestamp >> 0) as u8,
(timestamp >> 8) as u8,
(timestamp >> 16) as u8,
(timestamp >> 24) as u8,
(timestamp >> 32) as u8,
(timestamp >> 40) as u8,
(timestamp >> 48) as u8,
(timestamp >> 56) as u8,
(address >> 0) as u8,
(address >> 8) as u8,
]).unwrap();
for &word in data {
writer.write_all(&[
(word >> 0) as u8,
(word >> 8) as u8,
(word >> 16) as u8,
(word >> 24) as u8,
]).unwrap();
}
pub fn record_append(&mut self, data: &[u8]) {
self.recording.write_all(data).unwrap();
}
pub fn record_stop(&mut self, name: &str, duration: u64) {

View File

@ -129,6 +129,13 @@ fn host_write(stream: &mut Write, reply: host::Reply) -> io::Result<()> {
fn kern_send(io: &Io, request: &kern::Message) -> io::Result<()> {
match request {
&kern::LoadRequest(_) => debug!("comm->kern LoadRequest(...)"),
&kern::DmaPlaybackReply { trace, duration } => {
if trace.map(|data| data.len() > 100).unwrap_or(false) {
debug!("comm->kern DmaPlaybackReply {{ trace: ..., duration: {:?} }}", duration)
} else {
debug!("comm->kern {:?}", request)
}
}
_ => debug!("comm->kern {:?}", request)
}
unsafe { mailbox::send(request as *const _ as usize) }
@ -150,6 +157,13 @@ fn kern_recv_dotrace(reply: &kern::Message) {
match reply {
&kern::Log(_) => debug!("comm<-kern Log(...)"),
&kern::LogSlice(_) => debug!("comm<-kern LogSlice(...)"),
&kern::DmaRecordAppend(data) => {
if data.len() > 100 {
debug!("comm<-kern DmaRecordAppend(...)")
} else {
debug!("comm<-kern {:?}", reply)
}
}
_ => debug!("comm<-kern {:?}", reply)
}
}
@ -384,8 +398,8 @@ fn process_kern_message(io: &Io, mut stream: Option<&mut TcpStream>,
session.congress.dma_manager.record_start();
kern_acknowledge()
}
&kern::DmaRecordAppend { timestamp, channel, address, data } => {
session.congress.dma_manager.record_append(timestamp, channel, address, data);
&kern::DmaRecordAppend(data) => {
session.congress.dma_manager.record_append(data);
kern_acknowledge()
}
&kern::DmaRecordStop { name, duration } => {

View File

@ -488,6 +488,18 @@ class _DMA(EnvExperiment):
delay(100*ns)
self.ttl1.off()
@kernel
def record_many(self, n):
t1 = self.core.get_rtio_counter_mu()
with self.core_dma.record(self.trace_name):
for i in range(n//2):
delay(100*ns)
self.ttl1.on()
delay(100*ns)
self.ttl1.off()
t2 = self.core.get_rtio_counter_mu()
self.set_dataset("dma_record_time", self.core.mu_to_seconds(t2 - t1))
@kernel
def replay(self):
self.core.break_realtime()
@ -555,3 +567,11 @@ class DMATest(ExperimentCase):
exp.record()
exp.replay_delta()
self.assertEqual(exp.delta, 200)
def test_dma_record_time(self):
exp = self.create(_DMA)
count = 20000
exp.record_many(count)
dt = self.dataset_mgr.get("dma_record_time")
print("dt={}, dt/count={}".format(dt, dt/count))
self.assertLess(dt/count, 15*us)