firmware: optimize dma_record_output.

This removes a number of bounds checks and adds a fast path for
outputting exactly one word to DMA, which is the most common
operation.
This commit is contained in:
whitequark 2018-08-12 19:12:36 +00:00
parent bdd18de2c1
commit 38d60100ff
2 changed files with 42 additions and 27 deletions

View File

@ -1,5 +1,5 @@
#![feature(lang_items, asm, panic_unwind, libc, unwind_attributes, #![feature(lang_items, asm, panic_unwind, libc, unwind_attributes,
panic_implementation, panic_info_message)] panic_implementation, panic_info_message, nll)]
#![no_std] #![no_std]
extern crate libc; extern crate libc;
@ -300,19 +300,24 @@ extern fn dma_record_stop(duration: i64) {
} }
#[unwind(aborts)] #[unwind(aborts)]
extern fn dma_record_output(timestamp: i64, channel: i32, address: i32, word: i32) { #[inline(always)]
dma_record_output_wide(timestamp, channel, address, [word].as_c_slice()) unsafe fn dma_record_output_prepare(timestamp: i64, channel: i32, address: i32,
} words: usize) -> &'static mut [u8] {
#[unwind(aborts)]
extern fn dma_record_output_wide(timestamp: i64, channel: i32, address: i32, words: CSlice<i32>) {
assert!(words.len() <= 16); // enforce the hardware limit
// See gateware/rtio/dma.py. // See gateware/rtio/dma.py.
let header_length = /*length*/1 + /*channel*/3 + /*timestamp*/8 + /*address*/2; const HEADER_LENGTH: usize = /*length*/1 + /*channel*/3 + /*timestamp*/8 + /*address*/2;
let length = header_length + /*data*/words.len() * 4; let length = HEADER_LENGTH + /*data*/words * 4;
let header = [ if DMA_RECORDER.buffer.len() - DMA_RECORDER.data_len < length {
dma_record_flush()
}
let record = &mut DMA_RECORDER.buffer[DMA_RECORDER.data_len..
DMA_RECORDER.data_len + length];
DMA_RECORDER.data_len += length;
let (header, data) = record.split_at_mut(HEADER_LENGTH);
header.copy_from_slice(&[
(length >> 0) as u8, (length >> 0) as u8,
(channel >> 0) as u8, (channel >> 0) as u8,
(channel >> 8) as u8, (channel >> 8) as u8,
@ -327,29 +332,39 @@ extern fn dma_record_output_wide(timestamp: i64, channel: i32, address: i32, wor
(timestamp >> 56) as u8, (timestamp >> 56) as u8,
(address >> 0) as u8, (address >> 0) as u8,
(address >> 8) as u8, (address >> 8) as u8,
]; ]);
let mut data = [0; 16 * 4]; data
for (i, &word) in words.as_ref().iter().enumerate() { }
let part = [
#[unwind(aborts)]
extern fn dma_record_output(timestamp: i64, channel: i32, address: i32, word: i32) {
unsafe {
let data = dma_record_output_prepare(timestamp, channel, address, 1);
data.copy_from_slice(&[
(word >> 0) as u8, (word >> 0) as u8,
(word >> 8) as u8, (word >> 8) as u8,
(word >> 16) as u8, (word >> 16) as u8,
(word >> 24) as u8, (word >> 24) as u8,
]; ]);
data[i * 4..(i + 1) * 4].copy_from_slice(&part[..]);
} }
let data = &data[..words.len() * 4]; }
#[unwind(aborts)]
extern fn dma_record_output_wide(timestamp: i64, channel: i32, address: i32, words: CSlice<i32>) {
assert!(words.len() <= 16); // enforce the hardware limit
unsafe { unsafe {
if DMA_RECORDER.buffer.len() - DMA_RECORDER.data_len < length { let mut data = dma_record_output_prepare(timestamp, channel, address, words.len());
dma_record_flush() for word in words.as_ref().iter() {
data[..4].copy_from_slice(&[
(word >> 0) as u8,
(word >> 8) as u8,
(word >> 16) as u8,
(word >> 24) as u8,
]);
data = &mut data[4..];
} }
let dst = &mut DMA_RECORDER.buffer[DMA_RECORDER.data_len..
DMA_RECORDER.data_len + length];
dst[..header_length].copy_from_slice(&header[..]);
dst[header_length..].copy_from_slice(&data[..]);
DMA_RECORDER.data_len += length;
} }
} }

View File

@ -641,7 +641,7 @@ class DMATest(ExperimentCase):
exp.record_many(count) exp.record_many(count)
dt = self.dataset_mgr.get("dma_record_time") dt = self.dataset_mgr.get("dma_record_time")
print("dt={}, dt/count={}".format(dt, dt/count)) print("dt={}, dt/count={}".format(dt, dt/count))
self.assertLess(dt/count, 20*us) self.assertLess(dt/count, 11*us)
def test_dma_playback_time(self): def test_dma_playback_time(self):
# Skip on Kasli until #946 is resolved. # Skip on Kasli until #946 is resolved.