From 6f3a30292c4ef54ea2c861a66478a7250e12afa1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Robert=20J=C3=B6rdens?= <rj@quartiq.de>
Date: Sat, 20 Feb 2021 20:27:30 +0100
Subject: [PATCH 01/15] iir_int: make miniconf-able

---
 dsp/src/iir_int.rs | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/dsp/src/iir_int.rs b/dsp/src/iir_int.rs
index 4af50f1..02fdeb1 100644
--- a/dsp/src/iir_int.rs
+++ b/dsp/src/iir_int.rs
@@ -1,5 +1,6 @@
 use core::f64::consts::PI;
-use serde::{Deserialize, Serialize};
+use miniconf::StringSet;
+use serde::Deserialize;
 
 /// Generic vector for integer IIR filter.
 /// This struct is used to hold the x/y input/output data vector or the b/a coefficient
@@ -55,7 +56,7 @@ fn macc(y0: i32, x: &[i32], a: &[i32], shift: u32) -> i32 {
 /// See `dsp::iir::IIR` for general implementation details.
 /// Offset and limiting disabled to suit lowpass applications.
 /// Coefficient scaling fixed and optimized.
-#[derive(Copy, Clone, Default, Deserialize, Serialize)]
+#[derive(Copy, Clone, Default, Debug, StringSet, Deserialize)]
 pub struct IIR {
     pub ba: Vec5,
     // pub y_offset: i32,

From f671e0c94252568fb00cfc709ac5dfa81e43ffa3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Robert=20J=C3=B6rdens?= <rj@quartiq.de>
Date: Mon, 22 Feb 2021 16:36:56 +0100
Subject: [PATCH 02/15] complex: add saturating_add/sub

---
 dsp/src/complex.rs | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/dsp/src/complex.rs b/dsp/src/complex.rs
index 3349460..dc36349 100644
--- a/dsp/src/complex.rs
+++ b/dsp/src/complex.rs
@@ -8,6 +8,8 @@ pub trait ComplexExt<T, U> {
     fn abs_sqr(&self) -> U;
     fn log2(&self) -> T;
     fn arg(&self) -> T;
+    fn saturating_add(&self, other: Self) -> Self;
+    fn saturating_sub(&self, other: Self) -> Self;
 }
 
 impl ComplexExt<i32, u32> for Complex<i32> {
@@ -23,7 +25,7 @@ impl ComplexExt<i32, u32> for Complex<i32> {
     /// ```
     fn from_angle(angle: i32) -> Self {
         let (c, s) = cossin(angle);
-        Self { re: c, im: s }
+        Self::new(c, s)
     }
 
     /// Return the absolute square (the squared magnitude).
@@ -85,6 +87,15 @@ impl ComplexExt<i32, u32> for Complex<i32> {
     fn arg(&self) -> i32 {
         atan2(self.im, self.re)
     }
+
+    fn saturating_add(&self, other: Self) -> Self {
+        Self::new(self.re.saturating_add(other.re), self.im.saturating_add(other.im))
+    }
+
+    fn saturating_sub(&self, other: Self) -> Self {
+        Self::new(self.re.saturating_sub(other.re), self.im.saturating_sub(other.im))
+    }
+
 }
 
 /// Full scale fixed point multiplication.

From ffdf4026fbdbf9a899aec7025b5710542ced6870 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Robert=20J=C3=B6rdens?= <rj@quartiq.de>
Date: Mon, 22 Feb 2021 16:40:51 +0100
Subject: [PATCH 03/15] fmt

---
 dsp/src/complex.rs | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/dsp/src/complex.rs b/dsp/src/complex.rs
index dc36349..1eb003f 100644
--- a/dsp/src/complex.rs
+++ b/dsp/src/complex.rs
@@ -89,13 +89,18 @@ impl ComplexExt<i32, u32> for Complex<i32> {
     }
 
     fn saturating_add(&self, other: Self) -> Self {
-        Self::new(self.re.saturating_add(other.re), self.im.saturating_add(other.im))
+        Self::new(
+            self.re.saturating_add(other.re),
+            self.im.saturating_add(other.im),
+        )
     }
 
     fn saturating_sub(&self, other: Self) -> Self {
-        Self::new(self.re.saturating_sub(other.re), self.im.saturating_sub(other.im))
+        Self::new(
+            self.re.saturating_sub(other.re),
+            self.im.saturating_sub(other.im),
+        )
     }
-
 }
 
 /// Full scale fixed point multiplication.

From 6c6c2e64a7d97aef57fda7034cf4062ae04ee5fb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Robert=20J=C3=B6rdens?= <rj@quartiq.de>
Date: Tue, 23 Feb 2021 16:46:16 +0100
Subject: [PATCH 04/15] lockin: make order generic

---
 Cargo.lock                 | 1 +
 Cargo.toml                 | 1 +
 dsp/src/lockin.rs          | 8 ++++----
 src/bin/lockin-external.rs | 8 +++-----
 src/bin/lockin-internal.rs | 3 ++-
 5 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 7fa03f0..0999288 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -724,6 +724,7 @@ dependencies = [
  "dsp",
  "embedded-hal",
  "enum-iterator",
+ "generic-array 0.14.4",
  "heapless",
  "log",
  "mcp23017",
diff --git a/Cargo.toml b/Cargo.toml
index 599e595..f384fe9 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -46,6 +46,7 @@ dsp = { path = "dsp" }
 ad9959 = { path = "ad9959" }
 smoltcp-nal = "0.1.0"
 miniconf = "0.1"
+generic-array = "0.14"
 
 [patch.crates-io.miniconf]
 git = "https://github.com/quartiq/miniconf.git"
diff --git a/dsp/src/lockin.rs b/dsp/src/lockin.rs
index 6a977b4..613a133 100644
--- a/dsp/src/lockin.rs
+++ b/dsp/src/lockin.rs
@@ -1,12 +1,12 @@
 use super::{Complex, ComplexExt, Lowpass, MulScaled};
-use generic_array::typenum::U2;
+use generic_array::ArrayLength;
 
 #[derive(Clone, Default)]
-pub struct Lockin {
-    state: [Lowpass<U2>; 2],
+pub struct Lockin<N: ArrayLength<i32>> {
+    state: [Lowpass<N>; 2],
 }
 
-impl Lockin {
+impl<N: ArrayLength<i32>> Lockin<N> {
     /// Update the lockin with a sample taken at a given phase.
     pub fn update(&mut self, sample: i32, phase: i32, k: u8) -> Complex<i32> {
         // Get the LO signal for demodulation and mix the sample;
diff --git a/src/bin/lockin-external.rs b/src/bin/lockin-external.rs
index 3b7c6a8..e44c17e 100644
--- a/src/bin/lockin-external.rs
+++ b/src/bin/lockin-external.rs
@@ -2,10 +2,8 @@
 #![no_std]
 #![no_main]
 
-use stm32h7xx_hal as hal;
-
+use generic_array::typenum::U4;
 use stabilizer::{hardware, hardware::design_parameters};
-
 use dsp::{Accu, Complex, ComplexExt, Lockin, RPLL};
 use hardware::{
     Adc0Input, Adc1Input, Dac0Output, Dac1Output, InputStamper, AFE0, AFE1,
@@ -20,7 +18,7 @@ const APP: () = {
 
         timestamper: InputStamper,
         pll: RPLL,
-        lockin: Lockin,
+        lockin: Lockin<U4>,
     }
 
     #[init]
@@ -156,7 +154,7 @@ const APP: () = {
 
     #[task(binds = ETH, priority = 1)]
     fn eth(_: eth::Context) {
-        unsafe { hal::ethernet::interrupt_handler() }
+        unsafe { stm32h7xx_hal::ethernet::interrupt_handler() }
     }
 
     #[task(binds = SPI2, priority = 3)]
diff --git a/src/bin/lockin-internal.rs b/src/bin/lockin-internal.rs
index 1f9df6e..7ea7e1d 100644
--- a/src/bin/lockin-internal.rs
+++ b/src/bin/lockin-internal.rs
@@ -2,6 +2,7 @@
 #![no_std]
 #![no_main]
 
+use generic_array::typenum::U2;
 use dsp::{Accu, Complex, ComplexExt, Lockin};
 use hardware::{Adc1Input, Dac0Output, Dac1Output, AFE0, AFE1};
 use stabilizer::{hardware, hardware::design_parameters};
@@ -20,7 +21,7 @@ const APP: () = {
         adc: Adc1Input,
         dacs: (Dac0Output, Dac1Output),
 
-        lockin: Lockin,
+        lockin: Lockin<U2>,
     }
 
     #[init]

From e86f449dc0aa103701ba268b6dbc3f9fb5b1f683 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Robert=20J=C3=B6rdens?= <rj@quartiq.de>
Date: Tue, 23 Feb 2021 16:57:09 +0100
Subject: [PATCH 05/15] lockin bins: remove stale todos, align and document
 [nfc]

---
 src/bin/lockin-external.rs | 21 +++++++---------
 src/bin/lockin-internal.rs | 49 +++++++++++++-------------------------
 2 files changed, 26 insertions(+), 44 deletions(-)

diff --git a/src/bin/lockin-external.rs b/src/bin/lockin-external.rs
index e44c17e..1b11c76 100644
--- a/src/bin/lockin-external.rs
+++ b/src/bin/lockin-external.rs
@@ -2,12 +2,12 @@
 #![no_std]
 #![no_main]
 
-use generic_array::typenum::U4;
-use stabilizer::{hardware, hardware::design_parameters};
 use dsp::{Accu, Complex, ComplexExt, Lockin, RPLL};
+use generic_array::typenum::U4;
 use hardware::{
     Adc0Input, Adc1Input, Dac0Output, Dac1Output, InputStamper, AFE0, AFE1,
 };
+use stabilizer::{hardware, hardware::design_parameters};
 
 #[rtic::app(device = stm32h7xx_hal::stm32, peripherals = true, monotonic = rtic::cyccnt::CYCCNT)]
 const APP: () = {
@@ -86,22 +86,20 @@ const APP: () = {
             .map(|t| t as i32);
         let (pll_phase, pll_frequency) = c.resources.pll.update(
             timestamp,
-            21, // frequency settling time (log2 counter cycles), TODO: expose
-            21, // phase settling time, TODO: expose
+            21, // frequency settling time (log2 counter cycles),
+            21, // phase settling time
         );
 
         // Harmonic index of the LO: -1 to _de_modulate the fundamental (complex conjugate)
-        let harmonic: i32 = -1; // TODO: expose
+        let harmonic: i32 = -1;
 
         // Demodulation LO phase offset
-        let phase_offset: i32 = 0; // TODO: expose
+        let phase_offset: i32 = 0;
 
         // Log2 lowpass time constant
-        let time_constant: u8 = 6; // TODO: expose
+        let time_constant: u8 = 6;
 
         let sample_frequency = ((pll_frequency
-            // half-up rounding bias
-            // .wrapping_add(1 << design_parameters::SAMPLE_BUFFER_SIZE_LOG2 - 1)
             >> design_parameters::SAMPLE_BUFFER_SIZE_LOG2)
             as i32)
             .wrapping_mul(harmonic);
@@ -129,7 +127,7 @@ const APP: () = {
             Quadrature,
         }
 
-        let conf = Conf::FrequencyDiscriminator; // TODO: expose
+        let conf = Conf::FrequencyDiscriminator;
         let output = match conf {
             // Convert from IQ to power and phase.
             Conf::PowerPhase => [(output.log2() << 24) as _, output.arg()],
@@ -147,7 +145,6 @@ const APP: () = {
     #[idle(resources=[afes])]
     fn idle(_: idle::Context) -> ! {
         loop {
-            // TODO: Implement network interface.
             cortex_m::asm::wfi();
         }
     }
@@ -164,7 +161,7 @@ const APP: () = {
 
     #[task(binds = SPI3, priority = 3)]
     fn spi3(_: spi3::Context) {
-        panic!("ADC0 input overrun");
+        panic!("ADC1 input overrun");
     }
 
     #[task(binds = SPI4, priority = 3)]
diff --git a/src/bin/lockin-internal.rs b/src/bin/lockin-internal.rs
index 7ea7e1d..d4e90e6 100644
--- a/src/bin/lockin-internal.rs
+++ b/src/bin/lockin-internal.rs
@@ -2,8 +2,8 @@
 #![no_std]
 #![no_main]
 
-use generic_array::typenum::U2;
 use dsp::{Accu, Complex, ComplexExt, Lockin};
+use generic_array::typenum::U2;
 use hardware::{Adc1Input, Dac0Output, Dac1Output, AFE0, AFE1};
 use stabilizer::{hardware, hardware::design_parameters};
 
@@ -45,24 +45,13 @@ const APP: () = {
         }
     }
 
-    /// Main DSP processing routine for Stabilizer.
+    /// Main DSP processing routine.
     ///
-    /// # Note
-    /// Processing time for the DSP application code is bounded by the following constraints:
+    /// See `dual-iir` for general notes on processing time and timing.
     ///
-    /// DSP application code starts after the ADC has generated a batch of samples and must be
-    /// completed by the time the next batch of ADC samples has been acquired (plus the FIFO buffer
-    /// time). If this constraint is not met, firmware will panic due to an ADC input overrun.
-    ///
-    /// The DSP application code must also fill out the next DAC output buffer in time such that the
-    /// DAC can switch to it when it has completed the current buffer. If this constraint is not met
-    /// it's possible that old DAC codes will be generated on the output and the output samples will
-    /// be delayed by 1 batch.
-    ///
-    /// Because the ADC and DAC operate at the same rate, these two constraints actually implement
-    /// the same time bounds, meeting one also means the other is also met.
-    ///
-    /// TODO: Document
+    /// This is an implementation of an internal-reference lockin on the ADC1 signal.
+    /// The reference at f_sample/8 is output on DAC0 and the phase of the demodulated
+    /// signal on DAC1.
     #[task(binds=DMA1_STR4, resources=[adc, dacs, lockin], priority=2)]
     fn process(c: process::Context) {
         let lockin = c.resources.lockin;
@@ -72,29 +61,23 @@ const APP: () = {
             c.resources.dacs.1.acquire_buffer(),
         ];
 
-        // DAC0 always generates a fixed sinusoidal output.
-        dac_samples[0]
-            .iter_mut()
-            .zip(DAC_SEQUENCE.iter())
-            .for_each(|(d, s)| *d = *s as u16 ^ 0x8000);
-
         // Reference phase and frequency are known.
-        let pll_phase = 0;
+        let pll_phase = 0i32;
         let pll_frequency =
             1i32 << (32 - design_parameters::SAMPLE_BUFFER_SIZE_LOG2);
 
-        // Harmonic index of the LO: -1 to _de_modulate the fundamental
-        let harmonic: i32 = -1; // TODO: expose
+        // Harmonic index of the LO: -1 to _de_modulate the fundamental (complex conjugate)
+        let harmonic: i32 = -1;
 
         // Demodulation LO phase offset
-        let phase_offset: i32 = (0.25 * i32::MAX as f32) as i32; // TODO: expose
+        let phase_offset: i32 = 1 << 30;
 
         // Log2 lowpass time constant.
         let time_constant: u8 = 8;
 
         let sample_frequency = (pll_frequency as i32).wrapping_mul(harmonic);
-        let sample_phase = phase_offset
-            .wrapping_add((pll_phase as i32).wrapping_mul(harmonic));
+        let sample_phase =
+            phase_offset.wrapping_add(pll_phase.wrapping_mul(harmonic));
 
         let output: Complex<i32> = adc_samples
             .iter()
@@ -110,15 +93,17 @@ const APP: () = {
             .unwrap()
             * 2; // Full scale assuming the 2f component is gone.
 
-        for value in dac_samples[1].iter_mut() {
-            *value = (output.arg() >> 16) as u16 ^ 0x8000;
+        // Convert to DAC data.
+        for i in 0..dac_samples[0].len() {
+            // DAC0 always generates a fixed sinusoidal output.
+            dac_samples[0][i] = DAC_SEQUENCE[i] as u16 ^ 0x8000;
+            dac_samples[1][i] = (output.arg() >> 16) as u16 ^ 0x8000;
         }
     }
 
     #[idle(resources=[afes])]
     fn idle(_: idle::Context) -> ! {
         loop {
-            // TODO: Implement network interface.
             cortex_m::asm::wfi();
         }
     }

From 016323c94d2da871d1e08eeecc3f13af8ecfb406 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Robert=20J=C3=B6rdens?= <rj@quartiq.de>
Date: Tue, 23 Feb 2021 17:15:07 +0100
Subject: [PATCH 06/15] avoid clippy

---
 src/bin/lockin-internal.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/bin/lockin-internal.rs b/src/bin/lockin-internal.rs
index d4e90e6..1e7ebc0 100644
--- a/src/bin/lockin-internal.rs
+++ b/src/bin/lockin-internal.rs
@@ -94,7 +94,7 @@ const APP: () = {
             * 2; // Full scale assuming the 2f component is gone.
 
         // Convert to DAC data.
-        for i in 0..dac_samples[0].len() {
+        for i in 0..dac_samples[1].len() {
             // DAC0 always generates a fixed sinusoidal output.
             dac_samples[0][i] = DAC_SEQUENCE[i] as u16 ^ 0x8000;
             dac_samples[1][i] = (output.arg() >> 16) as u16 ^ 0x8000;

From 41f6d971c3f00dd11547303f002b416a607aaf12 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Robert=20J=C3=B6rdens?= <rj@quartiq.de>
Date: Tue, 23 Feb 2021 17:22:53 +0100
Subject: [PATCH 07/15] avoid clippy better

---
 src/bin/lockin-internal.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/bin/lockin-internal.rs b/src/bin/lockin-internal.rs
index 1e7ebc0..ee9da2f 100644
--- a/src/bin/lockin-internal.rs
+++ b/src/bin/lockin-internal.rs
@@ -94,9 +94,9 @@ const APP: () = {
             * 2; // Full scale assuming the 2f component is gone.
 
         // Convert to DAC data.
-        for i in 0..dac_samples[1].len() {
+        for (i, data) in DAC_SEQUENCE.iter().enumerate() {
             // DAC0 always generates a fixed sinusoidal output.
-            dac_samples[0][i] = DAC_SEQUENCE[i] as u16 ^ 0x8000;
+            dac_samples[0][i] = *data as u16 ^ 0x8000;
             dac_samples[1][i] = (output.arg() >> 16) as u16 ^ 0x8000;
         }
     }

From a2e0539218422e1b0d6c00e4241f4656d7bfc56d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Robert=20J=C3=B6rdens?= <rj@quartiq.de>
Date: Tue, 23 Feb 2021 17:31:44 +0100
Subject: [PATCH 08/15] ci: cron daily

---
 .github/workflows/ci.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index f4b66b0..cf68b3f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -5,7 +5,9 @@ on:
     branches: [master, staging, trying]
   pull_request:
     branches: [master]
-
+  schedule:
+    # UTC
+    - cron: '48 4 * * *'
 env:
   CARGO_TERM_COLOR: always
 

From b071fef15c4dbd8c4b66b4f8b97cf1256592966a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Robert=20J=C3=B6rdens?= <rj@quartiq.de>
Date: Thu, 25 Feb 2021 16:38:57 +0100
Subject: [PATCH 09/15] iir_int: apply rounding bias summarily for speed

---
 dsp/src/lowpass.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dsp/src/lowpass.rs b/dsp/src/lowpass.rs
index 5ab803d..10d5838 100644
--- a/dsp/src/lowpass.rs
+++ b/dsp/src/lowpass.rs
@@ -27,10 +27,10 @@ impl<N: ArrayLength<i32>> Lowpass<N> {
         // Note T-DF-I and the zeros at Nyquist.
         let mut x = x;
         for y in self.y.iter_mut() {
-            let dy = x.saturating_sub(*y).saturating_add(1 << (k - 1)) >> k;
+            let dy = x.saturating_sub(*y) >> k;
             *y += dy;
             x = *y - (dy >> 1);
         }
-        x
+        x.saturating_add((self.y.len() as i32) << (k - 1))
     }
 }

From 067ef79560623ff6ae461173ad313aa546763ffd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Robert=20J=C3=B6rdens?= <rj@quartiq.de>
Date: Fri, 26 Feb 2021 19:47:48 +0100
Subject: [PATCH 10/15] cossin: shave off a few more instructions

---
 dsp/build.rs      |  8 ++++----
 dsp/src/cossin.rs | 44 +++++++++++++++++++++++---------------------
 2 files changed, 27 insertions(+), 25 deletions(-)

diff --git a/dsp/build.rs b/dsp/build.rs
index ca43736..5e5fe1d 100644
--- a/dsp/build.rs
+++ b/dsp/build.rs
@@ -15,7 +15,7 @@ fn write_cossin_table() {
         .unwrap();
     write!(
         file,
-        "pub(crate) const COSSIN: [(u16, u16); 1 << COSSIN_DEPTH] = ["
+        "pub(crate) const COSSIN: [u32; 1 << COSSIN_DEPTH] = ["
     )
     .unwrap();
 
@@ -29,12 +29,12 @@ fn write_cossin_table() {
         // use midpoint samples to save one entry in the LUT
         let phase = (PI / 4. / (1 << DEPTH) as f64) * (i as f64 + 0.5);
         // add one bit accuracy to cos due to 0.5 < cos(z) <= 1 for |z| < pi/4
-        let cos = ((phase.cos() - 0.5) * 2. * AMPLITUDE).round() as u16;
-        let sin = (phase.sin() * AMPLITUDE).round() as u16;
+        let cos = ((phase.cos() - 0.5) * 2. * AMPLITUDE).round() as u32 - 1;
+        let sin = (phase.sin() * AMPLITUDE).round() as u32;
         if i % 4 == 0 {
             write!(file, "\n   ").unwrap();
         }
-        write!(file, " ({}, {}),", cos, sin).unwrap();
+        write!(file, " {},", cos + (sin << 16)).unwrap();
     }
     writeln!(file, "\n];").unwrap();
 
diff --git a/dsp/src/cossin.rs b/dsp/src/cossin.rs
index 4a5cccd..2f21cec 100644
--- a/dsp/src/cossin.rs
+++ b/dsp/src/cossin.rs
@@ -14,7 +14,7 @@ include!(concat!(env!("OUT_DIR"), "/cossin_table.rs"));
 /// tuple. With a 7-bit deep LUT there is 1e-5 max and 6e-8 RMS error
 /// in each quadrature over 20 bit phase.
 pub fn cossin(phase: i32) -> (i32, i32) {
-    // Phase bits excluding the three highes MSB
+    // Phase bits excluding the three highest MSB
     const OCTANT_BITS: usize = 32 - 3;
 
     // This is a slightly more compact way to compute the four flags for
@@ -22,50 +22,52 @@ pub fn cossin(phase: i32) -> (i32, i32) {
     let mut octant = (phase as u32) >> OCTANT_BITS;
     octant ^= octant << 1;
 
-    // Mask off octant bits. This leaves the angle in the range [0, pi/4).
-    let mut phase = phase & ((1 << OCTANT_BITS) - 1);
-
+    let mut phase = phase;
     if octant & 1 != 0 {
         // phase = pi/4 - phase
-        phase = (1 << OCTANT_BITS) - 1 - phase;
+        phase = !phase;
     }
 
-    let lookup = COSSIN[(phase >> (OCTANT_BITS - COSSIN_DEPTH)) as usize];
-    // 1/2 < cos(0 <= x <= pi/4) <= 1: Shift the cos
-    // values and scale the sine values as encoded in the LUT.
-    let mut cos = lookup.0 as i32 + u16::MAX as i32;
-    let mut sin = (lookup.1 as i32) << 1;
+    // Mask off octant bits. This leaves the angle in the range [0, pi/4).
+    phase &= (1 << OCTANT_BITS) - 1;
 
     // 16 + 1 bits for cos/sin and 15 for dphi to saturate the i32 range.
     const ALIGN_MSB: usize = 32 - 16 - 1;
     phase >>= OCTANT_BITS - COSSIN_DEPTH - ALIGN_MSB;
+
+    let lookup = COSSIN[(phase >> ALIGN_MSB) as usize];
     phase &= (1 << ALIGN_MSB) - 1;
+
     // The phase values used for the LUT are at midpoint for the truncated phase.
     // Interpolate relative to the LUT entry midpoint.
+    // Also cancel the -1 bias that was conditionally introduced above.
     phase -= (1 << (ALIGN_MSB - 1)) - (octant & 1) as i32;
-    // Fixed point pi/4.
-    const PI4: i32 = (PI / 4. * (1 << (32 - ALIGN_MSB)) as f64) as i32;
-    // No rounding bias necessary here since we keep enough low bits.
-    let dphi = (phase * PI4) >> (32 - ALIGN_MSB);
 
-    // Make room for the sign bit.
-    let dcos = (sin * dphi) >> (COSSIN_DEPTH + 1);
+    // Fixed point pi/4.
+    const PI4: i32 = (PI / 4. * (1 << 16) as f64) as i32;
+    // No rounding bias necessary here since we keep enough low bits.
+    let dphi = (phase * PI4) >> 16;
+
+    // 1/2 < cos(0 <= x <= pi/4) <= 1: Shift the cos
+    // values and scale the sine values as encoded in the LUT.
+    let mut cos = (lookup & 0xffff) as i32 + (1 << 16);
+    let mut sin = (lookup >> 16) as i32;
+
+    let dcos = (sin * dphi) >> COSSIN_DEPTH;
     let dsin = (cos * dphi) >> (COSSIN_DEPTH + 1);
 
     cos = (cos << (ALIGN_MSB - 1)) - dcos;
-    sin = (sin << (ALIGN_MSB - 1)) + dsin;
+    sin = (sin << ALIGN_MSB) + dsin;
 
     // Unmap using octant bits.
     if octant & 2 != 0 {
         core::mem::swap(&mut sin, &mut cos);
     }
-
     if octant & 4 != 0 {
-        cos *= -1;
+        cos = -cos;
     }
-
     if octant & 8 != 0 {
-        sin *= -1;
+        sin = -sin;
     }
 
     (cos, sin)

From f4c6e07a381352a35a12e0b42ace635100fde958 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Robert=20J=C3=B6rdens?= <rj@quartiq.de>
Date: Sat, 27 Feb 2021 14:54:46 +0100
Subject: [PATCH 11/15] dsp/bench: add lowpass

---
 dsp/benches/micro.rs | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/dsp/benches/micro.rs b/dsp/benches/micro.rs
index 1f2cc1f..b3c1549 100644
--- a/dsp/benches/micro.rs
+++ b/dsp/benches/micro.rs
@@ -1,8 +1,9 @@
 use core::f32::consts::PI;
-use dsp::{atan2, cossin};
-use dsp::{iir, iir_int};
-use dsp::{PLL, RPLL};
+
 use easybench::bench_env;
+use generic_array::typenum::U4;
+
+use dsp::{Lowpass, atan2, cossin, iir, iir_int, PLL, RPLL};
 
 fn atan2_bench() {
     let xi = (10 << 16) as i32;
@@ -70,6 +71,18 @@ fn iir_bench() {
     );
 }
 
+fn lowpass_bench() {
+    let mut dut = Lowpass::<U4>::default();
+    println!(
+        "Lowpass::<U4>::update(x, k): {}",
+        bench_env((0x32421, 14), |(x, k)| dut.update(*x, *k))
+    );
+    println!(
+        "Lowpass::<U4>::update(x, 14): {}",
+        bench_env(0x32421, |x| dut.update(*x, 14))
+    );
+}
+
 fn main() {
     atan2_bench();
     cossin_bench();
@@ -77,4 +90,5 @@ fn main() {
     pll_bench();
     iir_int_bench();
     iir_bench();
+    lowpass_bench();
 }

From 1805961d5dade1ed90361a8de6bc49b7c2d4a928 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Robert=20J=C3=B6rdens?= <rj@quartiq.de>
Date: Thu, 25 Feb 2021 18:01:53 +0100
Subject: [PATCH 12/15] macc_i32: move to tools

---
 dsp/src/iir_int.rs | 14 ++------------
 dsp/src/tools.rs   | 11 +++++++++++
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/dsp/src/iir_int.rs b/dsp/src/iir_int.rs
index 02fdeb1..ef0c1dc 100644
--- a/dsp/src/iir_int.rs
+++ b/dsp/src/iir_int.rs
@@ -1,3 +1,4 @@
+use super::tools::macc_i32;
 use core::f64::consts::PI;
 use miniconf::StringSet;
 use serde::Deserialize;
@@ -40,17 +41,6 @@ impl Coeff for Vec5 {
     }
 }
 
-fn macc(y0: i32, x: &[i32], a: &[i32], shift: u32) -> i32 {
-    // Rounding bias, half up
-    let y0 = ((y0 as i64) << shift) + (1 << (shift - 1));
-    let y = x
-        .iter()
-        .zip(a)
-        .map(|(x, a)| *x as i64 * *a as i64)
-        .fold(y0, |y, xa| y + xa);
-    (y >> shift) as i32
-}
-
 /// Integer biquad IIR
 ///
 /// See `dsp::iir::IIR` for general implementation details.
@@ -86,7 +76,7 @@ impl IIR {
         // Store x0            x0 x1 x2 y1 y2
         xy[0] = x0;
         // Compute y0 by multiply-accumulate
-        let y0 = macc(0, xy, &self.ba, IIR::SHIFT);
+        let y0 = macc_i32(0, xy, &self.ba, IIR::SHIFT);
         // Limit y0
         // let y0 = y0.max(self.y_min).min(self.y_max);
         // Store y0            x0 x1 y0 y1 y2
diff --git a/dsp/src/tools.rs b/dsp/src/tools.rs
index 378734f..6fc2b12 100644
--- a/dsp/src/tools.rs
+++ b/dsp/src/tools.rs
@@ -77,6 +77,17 @@ where
         .fold(y0, |y, xa| y + xa)
 }
 
+pub fn macc_i32(y0: i32, x: &[i32], a: &[i32], shift: u32) -> i32 {
+    // Rounding bias, half up
+    let y0 = ((y0 as i64) << shift) + (1 << (shift - 1));
+    let y = x
+        .iter()
+        .zip(a)
+        .map(|(x, a)| *x as i64 * *a as i64)
+        .fold(y0, |y, xa| y + xa);
+    (y >> shift) as i32
+}
+
 /// Combine high and low i32 into a single downscaled i32, saturating the type.
 pub fn saturating_scale(lo: i32, hi: i32, shift: u32) -> i32 {
     debug_assert!(shift & 31 == shift);

From b2d6b5c10c74f1ddad710588baa40892870ab0ee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Robert=20J=C3=B6rdens?= <rj@quartiq.de>
Date: Sat, 27 Feb 2021 15:02:16 +0100
Subject: [PATCH 13/15] dsp/bench: fmt

---
 dsp/benches/micro.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dsp/benches/micro.rs b/dsp/benches/micro.rs
index b3c1549..0902a4c 100644
--- a/dsp/benches/micro.rs
+++ b/dsp/benches/micro.rs
@@ -3,7 +3,7 @@ use core::f32::consts::PI;
 use easybench::bench_env;
 use generic_array::typenum::U4;
 
-use dsp::{Lowpass, atan2, cossin, iir, iir_int, PLL, RPLL};
+use dsp::{atan2, cossin, iir, iir_int, Lowpass, PLL, RPLL};
 
 fn atan2_bench() {
     let xi = (10 << 16) as i32;

From cd289687eafd20f24c5d81ac9a0fe15a42152278 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Robert=20J=C3=B6rdens?= <rj@quartiq.de>
Date: Sat, 27 Feb 2021 15:37:15 +0100
Subject: [PATCH 14/15] cossin: tighten test

---
 dsp/src/cossin.rs | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/dsp/src/cossin.rs b/dsp/src/cossin.rs
index 2f21cec..9240d0d 100644
--- a/dsp/src/cossin.rs
+++ b/dsp/src/cossin.rs
@@ -11,7 +11,7 @@ include!(concat!(env!("OUT_DIR"), "/cossin_table.rs"));
 ///
 /// # Returns
 /// The cos and sin values of the provided phase as a `(i32, i32)`
-/// tuple. With a 7-bit deep LUT there is 1e-5 max and 6e-8 RMS error
+/// tuple. With a 7-bit deep LUT there is 9e-6 max and 6e-8 RMS error
 /// in each quadrature over 20 bit phase.
 pub fn cossin(phase: i32) -> (i32, i32) {
     // Phase bits excluding the three highest MSB
@@ -81,7 +81,7 @@ mod tests {
     #[test]
     fn cossin_error_max_rms_all_phase() {
         // Constant amplitude error due to LUT data range.
-        const AMPLITUDE: f64 = ((1i64 << 31) - (1i64 << 15)) as _;
+        const AMPLITUDE: f64 = (1i64 << 31) as f64 - 0.85 * (1i64 << 15) as f64;
         const MAX_PHASE: f64 = (1i64 << 32) as _;
         let mut rms_err = (0f64, 0f64);
         let mut sum_err = (0f64, 0f64);
@@ -144,7 +144,7 @@ mod tests {
         assert!(rms_err.0.sqrt() < 6e-8);
         assert!(rms_err.1.sqrt() < 6e-8);
 
-        assert!(max_err.0 < 1.1e-5);
-        assert!(max_err.1 < 1.1e-5);
+        assert!(max_err.0 < 9e-6);
+        assert!(max_err.1 < 9e-6);
     }
 }

From 0d6b297d4e0e983d39b6118f8f9f69bd5958aea4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Robert=20J=C3=B6rdens?= <rj@quartiq.de>
Date: Mon, 1 Mar 2021 13:29:59 +0100
Subject: [PATCH 15/15] cossin: don't cancel -1 phase offset

---
 dsp/build.rs      | 12 +++++++-----
 dsp/src/cossin.rs | 27 +++++++++++++++------------
 2 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/dsp/build.rs b/dsp/build.rs
index 5e5fe1d..edd56b0 100644
--- a/dsp/build.rs
+++ b/dsp/build.rs
@@ -26,14 +26,16 @@ fn write_cossin_table() {
     const AMPLITUDE: f64 = u16::MAX as f64;
 
     for i in 0..(1 << DEPTH) {
-        // use midpoint samples to save one entry in the LUT
-        let phase = (PI / 4. / (1 << DEPTH) as f64) * (i as f64 + 0.5);
-        // add one bit accuracy to cos due to 0.5 < cos(z) <= 1 for |z| < pi/4
-        let cos = ((phase.cos() - 0.5) * 2. * AMPLITUDE).round() as u32 - 1;
-        let sin = (phase.sin() * AMPLITUDE).round() as u32;
         if i % 4 == 0 {
             write!(file, "\n   ").unwrap();
         }
+        // Use midpoint samples to save one entry in the LUT
+        let (sin, cos) =
+            (PI / 4. * ((i as f64 + 0.5) / (1 << DEPTH) as f64)).sin_cos();
+        // Add one bit accuracy to cos due to 0.5 < cos(z) <= 1 for |z| < pi/4
+        // The -1 LSB is cancelled when unscaling with the biased half amplitude
+        let cos = ((cos * 2. - 1.) * AMPLITUDE - 1.).round() as u32;
+        let sin = (sin * AMPLITUDE).round() as u32;
         write!(file, " {},", cos + (sin << 16)).unwrap();
     }
     writeln!(file, "\n];").unwrap();
diff --git a/dsp/src/cossin.rs b/dsp/src/cossin.rs
index 9240d0d..5dae7c3 100644
--- a/dsp/src/cossin.rs
+++ b/dsp/src/cossin.rs
@@ -11,7 +11,7 @@ include!(concat!(env!("OUT_DIR"), "/cossin_table.rs"));
 ///
 /// # Returns
 /// The cos and sin values of the provided phase as a `(i32, i32)`
-/// tuple. With a 7-bit deep LUT there is 9e-6 max and 6e-8 RMS error
+/// tuple. With a 7-bit deep LUT there is 9e-6 max and 4e-6 RMS error
 /// in each quadrature over 20 bit phase.
 pub fn cossin(phase: i32) -> (i32, i32) {
     // Phase bits excluding the three highest MSB
@@ -40,11 +40,14 @@ pub fn cossin(phase: i32) -> (i32, i32) {
 
     // The phase values used for the LUT are at midpoint for the truncated phase.
     // Interpolate relative to the LUT entry midpoint.
-    // Also cancel the -1 bias that was conditionally introduced above.
-    phase -= (1 << (ALIGN_MSB - 1)) - (octant & 1) as i32;
+    phase -= 1 << (ALIGN_MSB - 1);
+
+    // Cancel the -1 bias that was conditionally introduced above.
+    // This lowers the DC spur from 2e-8 to 2e-10 magnitude.
+    // phase += (octant & 1) as i32;
 
     // Fixed point pi/4.
-    const PI4: i32 = (PI / 4. * (1 << 16) as f64) as i32;
+    const PI4: i32 = (PI / 4. * (1 << 16) as f64) as _;
     // No rounding bias necessary here since we keep enough low bits.
     let dphi = (phase * PI4) >> 16;
 
@@ -123,8 +126,8 @@ mod tests {
             max_err.0 = max_err.0.max(err.0.abs());
             max_err.1 = max_err.1.max(err.1.abs());
         }
-        rms_err.0 /= MAX_PHASE;
-        rms_err.1 /= MAX_PHASE;
+        rms_err.0 /= (1 << PHASE_DEPTH) as f64;
+        rms_err.1 /= (1 << PHASE_DEPTH) as f64;
 
         println!("sum: {:.2e} {:.2e}", sum.0, sum.1);
         println!("demod: {:.2e} {:.2e}", demod.0, demod.1);
@@ -133,18 +136,18 @@ mod tests {
         println!("max: {:.2e} {:.2e}", max_err.0, max_err.1);
 
         assert!(sum.0.abs() < 4e-10);
-        assert!(sum.1.abs() < 4e-10);
+        assert!(sum.1.abs() < 3e-8);
 
         assert!(demod.0.abs() < 4e-10);
-        assert!(demod.1.abs() < 4e-10);
+        assert!(demod.1.abs() < 1e-8);
 
         assert!(sum_err.0.abs() < 4e-10);
         assert!(sum_err.1.abs() < 4e-10);
 
-        assert!(rms_err.0.sqrt() < 6e-8);
-        assert!(rms_err.1.sqrt() < 6e-8);
+        assert!(rms_err.0.sqrt() < 4e-6);
+        assert!(rms_err.1.sqrt() < 4e-6);
 
-        assert!(max_err.0 < 9e-6);
-        assert!(max_err.1 < 9e-6);
+        assert!(max_err.0 < 1e-5);
+        assert!(max_err.1 < 1e-5);
     }
 }