firmware: optimize dma_record_output.

This removes a number of bounds checks and adds a fast path for
outputting exactly one word to DMA, which is the most common
operation.
This commit is contained in:
whitequark 2018-08-12 19:12:36 +00:00
parent bdd18de2c1
commit 38d60100ff
2 changed files with 42 additions and 27 deletions

View File

@ -1,5 +1,5 @@
#![feature(lang_items, asm, panic_unwind, libc, unwind_attributes,
panic_implementation, panic_info_message)]
panic_implementation, panic_info_message, nll)]
#![no_std]
extern crate libc;
@ -300,19 +300,24 @@ extern fn dma_record_stop(duration: i64) {
}
#[unwind(aborts)]
extern fn dma_record_output(timestamp: i64, channel: i32, address: i32, word: i32) {
dma_record_output_wide(timestamp, channel, address, [word].as_c_slice())
#[inline(always)]
unsafe fn dma_record_output_prepare(timestamp: i64, channel: i32, address: i32,
words: usize) -> &'static mut [u8] {
// See gateware/rtio/dma.py.
const HEADER_LENGTH: usize = /*length*/1 + /*channel*/3 + /*timestamp*/8 + /*address*/2;
let length = HEADER_LENGTH + /*data*/words * 4;
if DMA_RECORDER.buffer.len() - DMA_RECORDER.data_len < length {
dma_record_flush()
}
#[unwind(aborts)]
extern fn dma_record_output_wide(timestamp: i64, channel: i32, address: i32, words: CSlice<i32>) {
assert!(words.len() <= 16); // enforce the hardware limit
let record = &mut DMA_RECORDER.buffer[DMA_RECORDER.data_len..
DMA_RECORDER.data_len + length];
DMA_RECORDER.data_len += length;
// See gateware/rtio/dma.py.
let header_length = /*length*/1 + /*channel*/3 + /*timestamp*/8 + /*address*/2;
let length = header_length + /*data*/words.len() * 4;
let (header, data) = record.split_at_mut(HEADER_LENGTH);
let header = [
header.copy_from_slice(&[
(length >> 0) as u8,
(channel >> 0) as u8,
(channel >> 8) as u8,
@ -327,29 +332,39 @@ extern fn dma_record_output_wide(timestamp: i64, channel: i32, address: i32, wor
(timestamp >> 56) as u8,
(address >> 0) as u8,
(address >> 8) as u8,
];
]);
let mut data = [0; 16 * 4];
for (i, &word) in words.as_ref().iter().enumerate() {
let part = [
data
}
#[unwind(aborts)]
extern fn dma_record_output(timestamp: i64, channel: i32, address: i32, word: i32) {
unsafe {
let data = dma_record_output_prepare(timestamp, channel, address, 1);
data.copy_from_slice(&[
(word >> 0) as u8,
(word >> 8) as u8,
(word >> 16) as u8,
(word >> 24) as u8,
];
data[i * 4..(i + 1) * 4].copy_from_slice(&part[..]);
]);
}
let data = &data[..words.len() * 4];
}
#[unwind(aborts)]
extern fn dma_record_output_wide(timestamp: i64, channel: i32, address: i32, words: CSlice<i32>) {
assert!(words.len() <= 16); // enforce the hardware limit
unsafe {
if DMA_RECORDER.buffer.len() - DMA_RECORDER.data_len < length {
dma_record_flush()
let mut data = dma_record_output_prepare(timestamp, channel, address, 1);
for word in words.as_ref().iter() {
data[..4].copy_from_slice(&[
(word >> 0) as u8,
(word >> 8) as u8,
(word >> 16) as u8,
(word >> 24) as u8,
]);
data = &mut data[4..];
}
let dst = &mut DMA_RECORDER.buffer[DMA_RECORDER.data_len..
DMA_RECORDER.data_len + length];
dst[..header_length].copy_from_slice(&header[..]);
dst[header_length..].copy_from_slice(&data[..]);
DMA_RECORDER.data_len += length;
}
}

View File

@ -641,7 +641,7 @@ class DMATest(ExperimentCase):
exp.record_many(count)
dt = self.dataset_mgr.get("dma_record_time")
print("dt={}, dt/count={}".format(dt, dt/count))
self.assertLess(dt/count, 20*us)
self.assertLess(dt/count, 11*us)
def test_dma_playback_time(self):
# Skip on Kasli until #946 is resolved.