dma: don't swap buffers

* This uses a new closure-based method to the DMA HAL implementation which gives access to the inactive buffer directly. * It removes changing addresses, the third buffer for DBM, the inactive address poisoning, and allows the cancellation of the redundant repeat memory barriers and compiler fences. * This is now around 20 instructions per buffer down from about 100 cycles before. * Also introduces a new `SampleBuffer` type alias. * The required unpacking of the resources structure is a bit annoying but could probably abstraced away. TODO: * Test * Adapt `lockin`
2021-05-24 20:06:31 +02:00 · 2021-05-24 20:06:31 +02:00 · 3165c680d6
commit 3165c680d6
parent d8cc3c74d9
6 changed files with 107 additions and 110 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -802,8 +802,7 @@ dependencies = [
 [[package]]
 name = "stm32h7xx-hal"
 version = "0.9.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "67034b80041bc33a48df1c1c435b6ae3bb18c35e42aa7e702ce8363b96793398"
+source = "git+https://github.com/quartiq/stm32h7xx-hal.git?rev=cca4ecc#cca4ecc3e0cc8cb2f7a9652c4099d50b44977493"
 dependencies = [
 "bare-metal 1.0.0",
 "cast",
--- a/Cargo.toml
+++ b/Cargo.toml
@ -56,7 +56,9 @@ rev = "523d71d"

 [dependencies.stm32h7xx-hal]
 features = ["stm32h743v", "rt", "unproven", "ethernet", "quadspi"]
-version = "0.9.0"
+# version = "0.9.0"
+git = "https://github.com/quartiq/stm32h7xx-hal.git"
+rev = "cca4ecc"

 [patch.crates-io.miniconf]
 git = "https://github.com/quartiq/miniconf.git"
--- a/src/bin/dual-iir.rs
+++ b/src/bin/dual-iir.rs
@ -2,6 +2,8 @@
 #![no_std]
 #![no_main]

+use core::sync::atomic::{fence, Ordering};
+
 use stabilizer::{hardware, net};

 use miniconf::Miniconf;
@ -120,51 +122,73 @@ const APP: () = {
    /// Because the ADC and DAC operate at the same rate, these two constraints actually implement
    /// the same time bounds, meeting one also means the other is also met.
    #[task(binds=DMA1_STR4, resources=[adcs, digital_inputs, dacs, iir_state, settings, telemetry], priority=2)]
-    fn process(c: process::Context) {
-        let adc_samples = [
-            c.resources.adcs.0.acquire_buffer(),
-            c.resources.adcs.1.acquire_buffer(),
-        ];
+    fn process(mut c: process::Context) {
+        let adc0 = &mut c.resources.adcs.0;
+        let adc1 = &mut c.resources.adcs.1;
+        let dac0 = &mut c.resources.dacs.0;
+        let dac1 = &mut c.resources.dacs.1;
+        let di = &c.resources.digital_inputs;
+        let settings = &c.resources.settings;
+        let iir_state = &mut c.resources.iir_state;
+        let telemetry = &mut c.resources.telemetry;
+        adc0.with_buffer(|a0| {
+            adc1.with_buffer(|a1| {
+                dac0.with_buffer(|d0| {
+                    dac1.with_buffer(|d1| {
+                        let adc_samples = [a0, a1];
+                        let dac_samples = [d0, d1];

-        let dac_samples = [
-            c.resources.dacs.0.acquire_buffer(),
-            c.resources.dacs.1.acquire_buffer(),
-        ];
+                        let digital_inputs =
+                            [di.0.is_high().unwrap(), di.1.is_high().unwrap()];

-        let digital_inputs = [
-            c.resources.digital_inputs.0.is_high().unwrap(),
-            c.resources.digital_inputs.1.is_high().unwrap(),
-        ];
+                        let hold = settings.force_hold
+                            || (digital_inputs[1] && settings.allow_hold);

-        let hold = c.resources.settings.force_hold
-            || (digital_inputs[1] && c.resources.settings.allow_hold);
+                        // Preserve instruction and data ordering w.r.t. DMA flag access.
+                        fence(Ordering::SeqCst);

                        for channel in 0..adc_samples.len() {
-            for sample in 0..adc_samples[0].len() {
-                let mut y = f32::from(adc_samples[channel][sample] as i16);
-                for i in 0..c.resources.iir_state[channel].len() {
-                    y = c.resources.settings.iir_ch[channel][i].update(
-                        &mut c.resources.iir_state[channel][i],
-                        y,
-                        hold,
-                    );
-                }
-                // Note(unsafe): The filter limits ensure that the value is in range.
+                            adc_samples[channel]
+                                .iter()
+                                .zip(dac_samples[channel].iter_mut())
+                                .map(|(ai, di)| {
+                                    let x = f32::from(*ai as i16);
+                                    let y = settings.iir_ch[channel]
+                                        .iter()
+                                        .zip(iir_state[channel].iter_mut())
+                                        .fold(x, |yi, (ch, state)| {
+                                            ch.update(state, yi, hold)
+                                        });
+                                    // Note(unsafe): The filter limits must ensure that
+                                    // the value is in range.
                                    // The truncation introduces 1/2 LSB distortion.
-                let y = unsafe { y.to_int_unchecked::<i16>() };
+                                    let y: i16 =
+                                        unsafe { y.to_int_unchecked() };
                                    // Convert to DAC code
-                dac_samples[channel][sample] = DacCode::from(y).0;
-            }
+                                    *di = DacCode::from(y).0;
+                                })
+                                .last();
                        }

                        // Update telemetry measurements.
-        c.resources.telemetry.adcs =
-            [AdcCode(adc_samples[0][0]), AdcCode(adc_samples[1][0])];
+                        telemetry.adcs = [
+                            AdcCode(adc_samples[0][0]),
+                            AdcCode(adc_samples[1][0]),
+                        ];

-        c.resources.telemetry.dacs =
-            [DacCode(dac_samples[0][0]), DacCode(dac_samples[1][0])];
+                        telemetry.dacs = [
+                            DacCode(dac_samples[0][0]),
+                            DacCode(dac_samples[1][0]),
+                        ];

-        c.resources.telemetry.digital_inputs = digital_inputs;
+                        telemetry.digital_inputs = digital_inputs;
+
+                        // Preserve instruction and data ordering w.r.t. DMA flag access.
+                        fence(Ordering::SeqCst);
+                    })
+                })
+            })
+        });
    }

    #[idle(resources=[network], spawn=[settings_update])]
--- a/src/hardware/adc.rs
+++ b/src/hardware/adc.rs
@ -29,15 +29,9 @@
 ///! available. When enough samples have been collected, a transfer-complete interrupt is generated
 ///! and the ADC samples are available for processing.
 ///!
-///! The SPI peripheral internally has an 8- or 16-byte TX and RX FIFO, which corresponds to a 4- or
-///! 8-sample buffer for incoming ADC samples. During the handling of the DMA transfer completion,
-///! there is a small window where buffers are swapped over where it's possible that a sample could
-///! be lost. In order to avoid this, the SPI RX FIFO is effectively used as a "sample overflow"
-///! region and can buffer a number of samples until the next DMA transfer is configured. If a DMA
-///! transfer is still not set in time, the SPI peripheral will generate an input-overrun interrupt.
-///! This interrupt then serves as a means of detecting if samples have been lost, which will occur
-///! whenever data processing takes longer than the collection period.
-///!
+///! After a complete transfer of a batch of samples, the inactive buffer is available to the
+///! user for processing. The processing must complete before the DMA transfer of the next batch
+///! completes.
 ///!
 ///! ## Starting Data Collection
 ///!
@ -68,13 +62,12 @@
 ///! sample DMA requests, which can be completed by setting e.g. ADC0's comparison to a counter
 ///! value of 0 and ADC1's comparison to a counter value of 1.
 ///!
-///! In this implementation, single buffer mode DMA transfers are used because the SPI RX FIFO can
-///! be used as a means to both detect and buffer ADC samples during the buffer swap-over. Because
-///! of this, double-buffered mode does not offer any advantages over single-buffered mode (unless
-///! double-buffered mode offers less overhead due to the DMA disable/enable procedure).
+///! In this implementation, double buffer mode DMA transfers are used because the SPI RX FIFOs
+///! have finite depth, FIFO access is slower than AXISRAM access, and because the single
+///! buffer mode DMA disable/enable and buffer update sequence is slow.
 use stm32h7xx_hal as hal;

-use super::design_parameters::SAMPLE_BUFFER_SIZE;
+use super::design_parameters::{SampleBuffer, SAMPLE_BUFFER_SIZE};
 use super::timers;

 use hal::dma::{
@ -119,8 +112,7 @@ static mut SPI_EOT_CLEAR: [u32; 1] = [0x00];
 // processed). Note that the contents of AXI SRAM is uninitialized, so the buffer contents on
 // startup are undefined. The dimensions are `ADC_BUF[adc_index][ping_pong_index][sample_index]`.
 #[link_section = ".axisram.buffers"]
-static mut ADC_BUF: [[[u16; SAMPLE_BUFFER_SIZE]; 2]; 2] =
-    [[[0; SAMPLE_BUFFER_SIZE]; 2]; 2];
+static mut ADC_BUF: [[SampleBuffer; 2]; 2] = [[[0; SAMPLE_BUFFER_SIZE]; 2]; 2];

 macro_rules! adc_input {
    ($name:ident, $index:literal, $trigger_stream:ident, $data_stream:ident, $clear_stream:ident,
@ -192,12 +184,11 @@ macro_rules! adc_input {

            /// Represents data associated with ADC.
            pub struct $name {
-                next_buffer: Option<&'static mut [u16; SAMPLE_BUFFER_SIZE]>,
                transfer: Transfer<
                    hal::dma::dma::$data_stream<hal::stm32::DMA1>,
                    hal::spi::Spi<hal::stm32::$spi, hal::spi::Disabled, u16>,
                    PeripheralToMemory,
-                    &'static mut [u16; SAMPLE_BUFFER_SIZE],
+                    &'static mut SampleBuffer,
                    hal::dma::DBTransfer,
                >,
                trigger_transfer: Transfer<
@ -316,6 +307,7 @@ macro_rules! adc_input {
                    // data stream is used to trigger a transfer completion interrupt.
                    let data_config = DmaConfig::default()
                        .memory_increment(true)
+                        .double_buffer(true)
                        .transfer_complete_interrupt($index == 1)
                        .priority(Priority::VeryHigh);

@ -333,17 +325,14 @@ macro_rules! adc_input {
                        Transfer::init(
                            data_stream,
                            spi,
-                            // Note(unsafe): The ADC_BUF[$index][0] is "owned" by this peripheral.
+                            // Note(unsafe): The ADC_BUF[$index] is "owned" by this peripheral.
                            // It shall not be used anywhere else in the module.
                            unsafe { &mut ADC_BUF[$index][0] },
-                            None,
+                            unsafe { Some(&mut ADC_BUF[$index][1]) },
                            data_config,
                        );

                    Self {
-                        // Note(unsafe): The ADC_BUF[$index][1] is "owned" by this peripheral. It
-                        // shall not be used anywhere else in the module.
-                        next_buffer: unsafe { Some(&mut ADC_BUF[$index][1]) },
                        transfer: data_transfer,
                        trigger_transfer,
                        clear_transfer,
@ -364,27 +353,17 @@ macro_rules! adc_input {

                }

-                /// Obtain a buffer filled with ADC samples.
+                /// Wait for the transfer of the currently active buffer to complete,
+                /// then call a function on the now inactive buffer and acknowledge the
+                /// transfer complete flag.
                ///
-                /// # Returns
-                /// A reference to the underlying buffer that has been filled with ADC samples.
-                pub fn acquire_buffer(&mut self) -> &[u16; SAMPLE_BUFFER_SIZE] {
-                    // Wait for the transfer to fully complete before continuing.  Note: If a device
-                    // hangs up, check that this conditional is passing correctly, as there is no
-                    // time-out checks here in the interest of execution speed.
-                    while !self.transfer.get_transfer_complete_flag() {}
-
-                    let next_buffer = self.next_buffer.take().unwrap();
-
-                    // Start the next transfer.
-                    self.transfer.clear_interrupts();
-                    let (prev_buffer, _, _) =
-                        self.transfer.next_transfer(next_buffer).unwrap();
-
-                     // .unwrap_none() https://github.com/rust-lang/rust/issues/62633
-                    self.next_buffer.replace(prev_buffer);
-
-                    self.next_buffer.as_ref().unwrap()
+                /// NOTE(unsafe): Memory safety and access ordering is not guaranteed
+                /// (see the HAL DMA docs).
+                pub fn with_buffer<F, R>(&mut self, f: F) -> R
+                    where
+                F: FnOnce(&mut SampleBuffer) -> R,
+                {
+                    unsafe { self.transfer.next_dbm_transfer_with(|buf, _current| f(buf)) }
                }
            }
        }
--- a/src/hardware/dac.rs
+++ b/src/hardware/dac.rs
@ -52,7 +52,7 @@
 ///! served promptly after the transfer completes.
 use stm32h7xx_hal as hal;

-use super::design_parameters::SAMPLE_BUFFER_SIZE;
+use super::design_parameters::{SampleBuffer, SAMPLE_BUFFER_SIZE};
 use super::timers;

 use hal::dma::{
@ -66,8 +66,7 @@ use hal::dma::{
 // processed). Note that the contents of AXI SRAM is uninitialized, so the buffer contents on
 // startup are undefined. The dimensions are `ADC_BUF[adc_index][ping_pong_index][sample_index]`.
 #[link_section = ".axisram.buffers"]
-static mut DAC_BUF: [[[u16; SAMPLE_BUFFER_SIZE]; 3]; 2] =
-    [[[0; SAMPLE_BUFFER_SIZE]; 3]; 2];
+static mut DAC_BUF: [[SampleBuffer; 2]; 2] = [[[0; SAMPLE_BUFFER_SIZE]; 2]; 2];

 /// Custom type for referencing DAC output codes.
 /// The internal integer is the raw code written to the DAC output register.
@ -137,13 +136,12 @@ macro_rules! dac_output {

        /// Represents data associated with DAC.
        pub struct $name {
-            next_buffer: Option<&'static mut [u16; SAMPLE_BUFFER_SIZE]>,
            // Note: SPI TX functionality may not be used from this structure to ensure safety with DMA.
            transfer: Transfer<
                hal::dma::dma::$data_stream<hal::stm32::DMA1>,
                $spi,
                MemoryToPeripheral,
-                &'static mut [u16; SAMPLE_BUFFER_SIZE],
+                &'static mut SampleBuffer,
                hal::dma::DBTransfer,
            >,
        }
@ -198,33 +196,26 @@ macro_rules! dac_output {
                        trigger_config,
                    );

-                Self {
-                    transfer,
-                    // Note(unsafe): This buffer is only used once and provided for the next DMA transfer.
-                    next_buffer: unsafe { Some(&mut DAC_BUF[$index][2]) },
-                }
+                Self { transfer }
            }

            pub fn start(&mut self) {
                self.transfer.start(|spi| spi.start_dma());
            }

-            /// Acquire the next output buffer to populate it with DAC codes.
-            pub fn acquire_buffer(&mut self) -> &mut [u16; SAMPLE_BUFFER_SIZE] {
-                // Note: If a device hangs up, check that this conditional is passing correctly, as
-                // there is no time-out checks here in the interest of execution speed.
-                while !self.transfer.get_transfer_complete_flag() {}
-
-                let next_buffer = self.next_buffer.take().unwrap();
-
-                // Start the next transfer.
-                let (prev_buffer, _, _) =
-                    self.transfer.next_transfer(next_buffer).unwrap();
-
-                // .unwrap_none() https://github.com/rust-lang/rust/issues/62633
-                self.next_buffer.replace(prev_buffer);
-
-                self.next_buffer.as_mut().unwrap()
+            /// Wait for the transfer of the currently active buffer to complete,
+            /// then call a function on the now inactive buffer and acknowledge the
+            /// transfer complete flag.
+            ///
+            /// NOTE(unsafe): Memory safety and access ordering is not guaranteed
+            /// (see the HAL DMA docs).
+            pub fn with_buffer<F, R>(&mut self, f: F) -> R
+            where
+                F: FnOnce(&mut SampleBuffer) -> R,
+            {
+                unsafe {
+                    self.transfer.next_dbm_transfer_with(|buf, _current| f(buf))
+                }
            }
        }
    };
--- a/src/hardware/design_parameters.rs
+++ b/src/hardware/design_parameters.rs
@ -50,5 +50,7 @@ pub const ADC_SAMPLE_TICKS: u16 = 1 << ADC_SAMPLE_TICKS_LOG2;
 pub const SAMPLE_BUFFER_SIZE_LOG2: u8 = 3;
 pub const SAMPLE_BUFFER_SIZE: usize = 1 << SAMPLE_BUFFER_SIZE_LOG2;

+pub type SampleBuffer = [u16; SAMPLE_BUFFER_SIZE];
+
 // The MQTT broker IPv4 address
 pub const MQTT_BROKER: [u8; 4] = [10, 34, 16, 10];