From 03104210855e0006d3a7f0c9cfb332c0d0c2674a Mon Sep 17 00:00:00 2001
From: pca006132
Date: Tue, 21 Jul 2020 16:57:14 +0800
Subject: [PATCH] RTIO DMA: Compiled but not working.

* Cache flush should be done before playback instead of when getting the
  handle.
* `csr::rtio_dma::enable_read()` would loop forever, probably a bug in the
  gateware.
---
 examples/device_db.py           |   6 +-
 examples/dma.py                 |  26 ++
 src/Cargo.lock                  |   1 +
 src/libdyld/Cargo.toml          |   2 +
 src/libdyld/src/lib.rs          |   1 +
 src/libdyld/src/reloc.rs        |  11 +
 src/runtime/src/kernel/api.rs   |   8 +
 src/runtime/src/kernel/core1.rs |  14 +-
 src/runtime/src/kernel/dma.rs   | 355 ++++++++++++++++++++++++++++++++
 src/runtime/src/kernel/mod.rs   |   4 +
 10 files changed, 426 insertions(+), 2 deletions(-)
 create mode 100644 examples/dma.py
 create mode 100644 src/runtime/src/kernel/dma.rs

diff --git a/examples/device_db.py b/examples/device_db.py
index fbac672..9110390 100644
--- a/examples/device_db.py
+++ b/examples/device_db.py
@@ -12,7 +12,11 @@ device_db = {
             "target": "cortexa9"
         }
     },
-
+    "core_dma": {
+        "type": "local",
+        "module": "artiq.coredevice.dma",
+        "class": "CoreDMA"
+    },
     # led? are common to all variants
     "led0": {
         "type": "local",
diff --git a/examples/dma.py b/examples/dma.py
new file mode 100644
index 0000000..ce93580
--- /dev/null
+++ b/examples/dma.py
@@ -0,0 +1,26 @@
+from artiq.experiment import *
+
+class DMAPulses(EnvExperiment):
+    def build(self):
+        self.setattr_device("core")
+        self.setattr_device("core_dma")
+        self.setattr_device("led0")
+
+    @kernel
+    def record(self):
+        with self.core_dma.record("pulses"):
+            # all RTIO operations now go to the "pulses"
+            # DMA buffer, instead of being executed immediately.
+            self.led0.pulse(100*ns)
+            delay(100*ns)
+
+    @kernel
+    def run(self):
+        self.core.reset()
+        self.record()
+        # prefetch the address of the DMA buffer
+        # for faster playback trigger
+        pulses_handle = self.core_dma.get_handle("pulses")
+        self.core.break_realtime()
+        self.core_dma.playback_handle(pulses_handle)
+
diff --git a/src/Cargo.lock b/src/Cargo.lock
index c642b69..414b176 100644
--- a/src/Cargo.lock
+++ b/src/Cargo.lock
@@ -96,6 +96,7 @@ dependencies = [
 name = "dyld"
 version = "0.1.0"
 dependencies = [
+ "libcortex_a9",
  "log",
 ]
 
diff --git a/src/libdyld/Cargo.toml b/src/libdyld/Cargo.toml
index 3138f50..0ffc81f 100644
--- a/src/libdyld/Cargo.toml
+++ b/src/libdyld/Cargo.toml
@@ -8,3 +8,5 @@ name = "dyld"
 
 [dependencies]
 log = "0.4"
+libcortex_a9 = { git = "https://git.m-labs.hk/M-Labs/zc706.git" }
+
diff --git a/src/libdyld/src/lib.rs b/src/libdyld/src/lib.rs
index 54a07dd..15f6911 100644
--- a/src/libdyld/src/lib.rs
+++ b/src/libdyld/src/lib.rs
@@ -2,6 +2,7 @@
 
 extern crate alloc;
 extern crate log;
+extern crate libcortex_a9;
 
 use core::{convert, fmt, str};
 use alloc::string::String;
diff --git a/src/libdyld/src/reloc.rs b/src/libdyld/src/reloc.rs
index 9200703..85667d0 100644
--- a/src/libdyld/src/reloc.rs
+++ b/src/libdyld/src/reloc.rs
@@ -7,6 +7,10 @@ use super::{
     image::Image,
     Library,
 };
+use libcortex_a9::{
+    cache::{dcci_slice, iciallu, bpiall},
+    asm::{dsb, isb},
+};
 
 pub trait Relocatable {
     fn offset(&self) -> usize;
@@ -154,6 +158,13 @@ pub fn rebind(
             _ => {}
         }
     }
+    // FIXME: the cache maintenance operations may be more than enough,
+    // may cause performance degradation.
+    dcci_slice(lib.image.data);
+    iciallu();
+    bpiall();
+    dsb();
+    isb();
 
     Ok(())
 }
diff --git a/src/runtime/src/kernel/api.rs b/src/runtime/src/kernel/api.rs
index c12b152..ae68fc5 100644
--- a/src/runtime/src/kernel/api.rs
+++ b/src/runtime/src/kernel/api.rs
@@ -3,6 +3,7 @@ use libm;
 use crate::eh_artiq;
 use crate::rtio;
 use super::rpc::{rpc_send, rpc_send_async, rpc_recv};
+use super::dma;
 
 macro_rules! api {
     ($i:ident) => ({
@@ -50,6 +51,13 @@ pub fn resolve(required: &[u8]) -> Option<u32> {
         api!(rtio_input_timestamped_data = rtio::input_timestamped_data),
         api!(rtio_log = rtio::log),
 
+        // rtio dma
+        api!(dma_record_start = dma::dma_record_start),
+        api!(dma_record_stop = dma::dma_record_stop),
+        api!(dma_erase = dma::dma_erase),
+        api!(dma_retrieve = dma::dma_retrieve),
+        api!(dma_playback = dma::dma_playback),
+
         // Double-precision floating-point arithmetic helper functions
         // RTABI chapter 4.1.2, Table 2
         api!(__aeabi_dadd),
diff --git a/src/runtime/src/kernel/core1.rs b/src/runtime/src/kernel/core1.rs
index 41f18f0..75c56c3 100644
--- a/src/runtime/src/kernel/core1.rs
+++ b/src/runtime/src/kernel/core1.rs
@@ -16,8 +16,10 @@ use crate::eh_artiq;
 use super::{
     api::resolve,
     rpc::rpc_send_async,
+    dma::init_dma,
     CHANNEL_0TO1, CHANNEL_1TO0,
     KERNEL_CHANNEL_0TO1, KERNEL_CHANNEL_1TO0,
+    KERNEL_LIBRARY,
     Message,
 };
 
@@ -93,6 +95,10 @@ impl KernelImage {
         })
     }
 
+    pub fn get_library_ptr(&mut self) -> *mut Library {
+        &mut self.library as *mut Library
+    }
+
     pub unsafe fn exec(&mut self) {
         // Flush data cache entries for the image in DDR, including
         // Memory/Instruction Synchronization Barriers
@@ -118,6 +124,9 @@ pub fn main_core1() {
     enable_fpu();
     debug!("FPU enabled on Core1");
 
+    init_dma();
+    debug!("Init DMA!");
+
     let mut core1_tx = None;
     while core1_tx.is_none() {
         core1_tx = CHANNEL_1TO0.lock().take();
@@ -139,9 +148,12 @@ pub fn main_core1() {
                 let result = dyld::load(&data, &resolve)
                     .and_then(KernelImage::new);
                 match result {
-                    Ok(kernel) => {
+                    Ok(mut kernel) => {
                         unsafe {
                             KERNEL_LOAD_ADDR = kernel.library.image.as_ptr() as usize;
+                            // FIXME: `kernel` is moved into `loaded_kernel` just below,
+                            // so this pointer into the local binding is left dangling.
+                            KERNEL_LIBRARY = kernel.get_library_ptr();
                         }
                         loaded_kernel = Some(kernel);
                         debug!("kernel loaded");
diff --git a/src/runtime/src/kernel/dma.rs b/src/runtime/src/kernel/dma.rs
new file mode 100644
index 0000000..ba55835
--- /dev/null
+++ b/src/runtime/src/kernel/dma.rs
@@ -0,0 +1,355 @@
+//! RTIO DMA: recording of RTIO output events into named traces and
+//! playback of those traces through the `rtio_dma` CSRs.
+
+use crate::{
+    pl::csr,
+    artiq_raise,
+    rtio,
+};
+use alloc::{vec::Vec, string::String, collections::BTreeMap, str};
+use cslice::CSlice;
+use super::KERNEL_LIBRARY;
+use core::mem;
+use log::debug;
+
+use libcortex_a9::{
+    cache::dcci_slice,
+    asm::dsb,
+};
+
+/// Required alignment, in bytes, of the first byte of a stored trace.
+const ALIGNMENT: usize = 16 * 8;
+/// Size, in bytes, of the staging buffer events are written to before
+/// being appended to the trace under construction.
+const DMA_BUFFER_SIZE: usize = 16 * 8 * 1024;
+
+/// Staging state for the recording in progress.
+struct DmaRecorder {
+    /// True between `dma_record_start` and `dma_record_stop`.
+    active: bool,
+    /// Number of bytes of `buffer` currently holding event data.
+    data_len: usize,
+    buffer: [u8; DMA_BUFFER_SIZE],
+}
+
+static mut DMA_RECORDER: DmaRecorder = DmaRecorder {
+    active: false,
+    data_len: 0,
+    buffer: [0; DMA_BUFFER_SIZE],
+};
+
+/// A finished trace as stored by `Manager`.
+#[derive(Debug)]
+struct Entry {
+    /// `padding_len` filler bytes followed by the trace data.
+    trace: Vec<u8>,
+    padding_len: usize,
+    duration: u64
+}
+
+/// Stores finished traces by name, plus the trace currently being recorded.
+#[derive(Debug)]
+pub struct Manager {
+    entries: BTreeMap<String, Entry>,
+    recording_name: String,
+    recording_trace: Vec<u8>
+}
+
+// Copied from https://github.com/m-labs/artiq/blob/master/artiq/firmware/runtime/rtio_dma.rs
+// basically without modification except removing some warnings.
+impl Manager {
+    pub fn new() -> Manager {
+        Manager {
+            entries: BTreeMap::new(),
+            recording_name: String::new(),
+            recording_trace: Vec::new(),
+        }
+    }
+
+    /// Begins recording a trace called `name`, dropping any stored trace of that name.
+    pub fn record_start(&mut self, name: &str) {
+        self.recording_name = String::from(name);
+        self.recording_trace = Vec::new();
+
+        // or we could needlessly OOM replacing a large trace
+        self.entries.remove(name);
+    }
+
+    /// Appends raw event bytes to the trace being recorded.
+    pub fn record_append(&mut self, data: &[u8]) {
+        self.recording_trace.extend_from_slice(data);
+    }
+
+    /// Finishes the current recording and stores it with `duration`.
+    ///
+    /// A zero byte is appended to the data, which is then shifted inside its
+    /// allocation so that it starts at an `ALIGNMENT`-aligned address.
+    pub fn record_stop(&mut self, duration: u64) {
+        let mut trace = Vec::new();
+        mem::swap(&mut self.recording_trace, &mut trace);
+        trace.push(0);
+        let data_len = trace.len();
+
+        // Realign.
+        trace.reserve(ALIGNMENT - 1);
+        let padding = ALIGNMENT - trace.as_ptr() as usize % ALIGNMENT;
+        let padding = if padding == ALIGNMENT { 0 } else { padding };
+        for _ in 0..padding {
+            // Vec guarantees that this will not reallocate
+            trace.push(0)
+        }
+        // Move the data up by `padding` bytes, back to front so that no
+        // byte is overwritten before it has been copied.
+        for i in 1..data_len + 1 {
+            trace[data_len + padding - i] = trace[data_len - i]
+        }
+
+        let mut name = String::new();
+        mem::swap(&mut self.recording_name, &mut name);
+        self.entries.insert(name, Entry {
+            trace, duration,
+            padding_len: padding,
+        });
+    }
+
+    /// Removes the stored trace called `name`, if there is one.
+    pub fn erase(&mut self, name: &str) {
+        self.entries.remove(name);
+    }
+
+    /// Calls `f` with the aligned data and duration of trace `name`, or
+    /// with `(None, 0)` when no such trace is stored.
+    pub fn with_trace<F, R>(&self, name: &str, f: F) -> R
+            where F: FnOnce(Option<&[u8]>, u64) -> R {
+        match self.entries.get(name) {
+            Some(entry) => f(Some(&entry.trace[entry.padding_len..]), entry.duration),
+            None => f(None, 0)
+        }
+    }
+}
+
+
+static mut DMA_MANAGER: Option<Manager> = None;
+
+/// Value returned to the kernel by `dma_retrieve`.
+#[repr(C)]
+pub struct DmaTrace {
+    duration: i64,
+    address: i32,
+}
+
+/// Creates the global trace manager; the `dma_*` functions unwrap it, so
+/// this has to run before any of them is called.
+pub fn init_dma() {
+    unsafe {
+        DMA_MANAGER = Some(Manager::new());
+    }
+}
+
+/// Appends the staged bytes to the trace being recorded and empties the
+/// staging buffer.
+fn dma_record_flush() {
+    unsafe {
+        let manager = DMA_MANAGER.as_mut().unwrap();
+        manager.record_append(&DMA_RECORDER.buffer[..DMA_RECORDER.data_len]);
+        DMA_RECORDER.data_len = 0;
+    }
+}
+
+/// Starts recording trace `name`: rebinds the kernel library's `rtio_output`
+/// and `rtio_output_wide` symbols to the recording functions below.
+///
+/// Raises `DMAError` if a recording is already in progress.
+pub extern fn dma_record_start(name: CSlice<u8>) {
+    let name = str::from_utf8(name.as_ref()).unwrap();
+
+    unsafe {
+        if DMA_RECORDER.active {
+            artiq_raise!("DMAError", "DMA is already recording")
+        }
+
+        let library = KERNEL_LIBRARY.as_mut().unwrap();
+        library.rebind(b"rtio_output",
+                       dma_record_output as *const ()).unwrap();
+        library.rebind(b"rtio_output_wide",
+                       dma_record_output_wide as *const ()).unwrap();
+
+        DMA_RECORDER.active = true;
+        let manager = DMA_MANAGER.as_mut().unwrap();
+        manager.record_start(name);
+    }
+}
+
+/// Stops the recording in progress, rebinds `rtio_output` and
+/// `rtio_output_wide` to the `rtio` implementations and stores the trace.
+///
+/// Raises `DMAError` if no recording is in progress.
+pub extern fn dma_record_stop(duration: i64) {
+    unsafe {
+        // Check the state before touching the trace under construction.
+        if !DMA_RECORDER.active {
+            artiq_raise!("DMAError", "DMA is not recording")
+        }
+
+        dma_record_flush();
+
+        let library = KERNEL_LIBRARY.as_mut().unwrap();
+        library.rebind(b"rtio_output",
+                       rtio::output as *const ()).unwrap();
+        library.rebind(b"rtio_output_wide",
+                       rtio::output_wide as *const ()).unwrap();
+
+        DMA_RECORDER.active = false;
+        let manager = DMA_MANAGER.as_mut().unwrap();
+        manager.record_stop(duration as u64);
+    }
+}
+
+/// Reserves room in the staging buffer for one event carrying `words` data
+/// words, writes the event header and returns the slice for the data bytes.
+///
+/// The staging buffer is flushed first when the event would not fit.
+#[inline(always)]
+unsafe fn dma_record_output_prepare(timestamp: i64, target: i32,
+                                    words: usize) -> &'static mut [u8] {
+    // See gateware/rtio/dma.py.
+    const HEADER_LENGTH: usize = /*length*/1 + /*channel*/3 + /*timestamp*/8 + /*address*/1;
+    let length = HEADER_LENGTH + /*data*/words * 4;
+
+    if DMA_RECORDER.buffer.len() - DMA_RECORDER.data_len < length {
+        dma_record_flush()
+    }
+
+    let record = &mut DMA_RECORDER.buffer[DMA_RECORDER.data_len..
+                                          DMA_RECORDER.data_len + length];
+    DMA_RECORDER.data_len += length;
+
+    let (header, data) = record.split_at_mut(HEADER_LENGTH);
+
+    header.copy_from_slice(&[
+        (length >> 0) as u8,
+        (target >> 8) as u8,
+        (target >> 16) as u8,
+        (target >> 24) as u8,
+        (timestamp >> 0) as u8,
+        (timestamp >> 8) as u8,
+        (timestamp >> 16) as u8,
+        (timestamp >> 24) as u8,
+        (timestamp >> 32) as u8,
+        (timestamp >> 40) as u8,
+        (timestamp >> 48) as u8,
+        (timestamp >> 56) as u8,
+        (target >> 0) as u8,
+    ]);
+
+    data
+}
+
+/// Recording replacement for `rtio_output`: stages a one-word event stamped
+/// with the value read from the `now` CSR.
+pub extern fn dma_record_output(target: i32, word: i32) {
+    unsafe {
+        let timestamp = csr::rtio::now_read() as i64;
+        let data = dma_record_output_prepare(timestamp, target, 1);
+        data.copy_from_slice(&[
+            (word >> 0) as u8,
+            (word >> 8) as u8,
+            (word >> 16) as u8,
+            (word >> 24) as u8,
+        ]);
+    }
+}
+
+/// Recording replacement for `rtio_output_wide`: stages an event carrying
+/// all of `words` (at most 16).
+pub extern fn dma_record_output_wide(target: i32, words: CSlice<i32>) {
+    assert!(words.len() <= 16); // enforce the hardware limit
+
+    unsafe {
+        let timestamp = csr::rtio::now_read() as i64;
+        let mut data = dma_record_output_prepare(timestamp, target, words.len());
+        for word in words.as_ref().iter() {
+            data[..4].copy_from_slice(&[
+                (word >> 0) as u8,
+                (word >> 8) as u8,
+                (word >> 16) as u8,
+                (word >> 24) as u8,
+            ]);
+            data = &mut data[4..];
+        }
+    }
+}
+
+/// Deletes the stored trace called `name`, if there is one.
+pub extern fn dma_erase(name: CSlice<u8>) {
+    let name = str::from_utf8(name.as_ref()).unwrap();
+
+    let manager = unsafe {
+        DMA_MANAGER.as_mut().unwrap()
+    };
+    manager.erase(name);
+}
+
+/// Looks up trace `name` and returns its address and duration.
+///
+/// Raises `DMAError` if no trace of that name is stored.
+pub extern fn dma_retrieve(name: CSlice<u8>) -> DmaTrace {
+    let name = str::from_utf8(name.as_ref()).unwrap();
+
+    let manager = unsafe {
+        DMA_MANAGER.as_mut().unwrap()
+    };
+    // FIXME: the cache flush belongs right before playback, not here.
+    let (trace, duration) = manager.with_trace(name, |trace, duration| (trace.map(|v| {
+        dcci_slice(v);
+        dsb();
+        v.as_ptr()
+    }), duration));
+    match trace {
+        Some(ptr) => Ok(DmaTrace {
+            address: ptr as i32,
+            duration: duration as i64,
+        }),
+        None => Err(())
+    }.unwrap_or_else(|_| {
+        artiq_raise!("DMAError", "DMA trace not found");
+    })
+}
+
+/// Plays back the trace at address `ptr` with time offset `timestamp`.
+///
+/// Raises `RTIOUnderflow` or `RTIODestinationUnreachable` according to the
+/// bits returned by `rtio_dma::error_read`.
+pub extern fn dma_playback(timestamp: i64, ptr: i32) {
+    assert!(ptr % ALIGNMENT as i32 == 0);
+
+    debug!("DMA Playback");
+    unsafe {
+        csr::rtio_dma::base_address_write(ptr as u32);
+        csr::rtio_dma::time_offset_write(timestamp as u64);
+
+        csr::cri_con::selected_write(1);
+        csr::rtio_dma::enable_write(1);
+        // FIXME: currently never terminates, probably a gateware bug.
+        while csr::rtio_dma::enable_read() != 0 {}
+        csr::cri_con::selected_write(0);
+
+        let error = csr::rtio_dma::error_read();
+        if error != 0 {
+            let timestamp = csr::rtio_dma::error_timestamp_read();
+            let channel = csr::rtio_dma::error_channel_read();
+            csr::rtio_dma::error_write(1);
+            if error & 1 != 0 {
+                artiq_raise!("RTIOUnderflow",
+                    "RTIO underflow at {0} mu, channel {1}",
+                    timestamp as i64, channel as i64, 0);
+            }
+            if error & 2 != 0 {
+                artiq_raise!("RTIODestinationUnreachable",
+                    "RTIO destination unreachable, output, at {0} mu, channel {1}",
+                    timestamp as i64, channel as i64, 0);
+            }
+        }
+    }
+}
+
diff --git a/src/runtime/src/kernel/mod.rs b/src/runtime/src/kernel/mod.rs
index c07cc37..7c5171f 100644
--- a/src/runtime/src/kernel/mod.rs
+++ b/src/runtime/src/kernel/mod.rs
@@ -1,6 +1,7 @@
 use core::ptr;
 use alloc::{vec::Vec, sync::Arc, string::String};
 
+use dyld::Library;
 use libcortex_a9::{mutex::Mutex, sync_channel};
 use crate::eh_artiq;
 
@@ -9,6 +10,7 @@ pub use control::Control;
 pub mod core1;
 mod api;
 mod rpc;
+mod dma;
 
 #[derive(Debug)]
 pub struct RPCException {
@@ -39,3 +41,5 @@ static CHANNEL_1TO0: Mutex<Option<sync_channel::Sender<Message>>> = Mutex::new(N
 
 static mut KERNEL_CHANNEL_0TO1: *mut () = ptr::null_mut();
 static mut KERNEL_CHANNEL_1TO0: *mut () = ptr::null_mut();
+static mut KERNEL_LIBRARY: *mut Library = ptr::null_mut();
+