diff --git a/.cargo/config b/.cargo/config
index 382c36e..ea1d6c0 100644
--- a/.cargo/config
+++ b/.cargo/config
@@ -1,6 +1,16 @@
 [target.'cfg(all(target_arch = "arm", target_os = "none"))']
 runner = "gdb-multiarch -q -x openocd.gdb"
-rustflags = ["-C", "link-arg=-Tlink.x"]
+rustflags = [
+    "-C", "link-arg=-Tlink.x",
+# The target (below) defaults to cortex-m4
+# There currently are two different options to go beyond that:
+# 1. cortex-m7 has the right flags and instructions (FPU) but no instruction schedule yet
+    "-C", "target-cpu=cortex-m7",
+# 2. cortex-m4 with the additional fpv5 instructions and a potentially
+# better-than-nothing instruction schedule
+    "-C", "target-feature=+fp-armv8d16",
+# When combined they are equivalent to (1) alone
+]
 
 [build]
 target = "thumbv7em-none-eabihf"
diff --git a/Cargo.toml b/Cargo.toml
index 301956c..896eecf 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -62,7 +62,7 @@ branch = "dma"
 [features]
 semihosting = ["panic-semihosting", "cortex-m-log/semihosting"]
 bkpt = [ ]
-nightly = ["cortex-m/inline-asm"]
+nightly = ["cortex-m/inline-asm", "dsp/nightly"]
 
 [profile.dev]
 codegen-units = 1
diff --git a/dsp/Cargo.toml b/dsp/Cargo.toml
index 625d0f0..c8ef52b 100644
--- a/dsp/Cargo.toml
+++ b/dsp/Cargo.toml
@@ -6,3 +6,6 @@ edition = "2018"
 
 [dependencies]
 serde = { version = "1.0", features = ["derive"], default-features = false }
+
+[features]
+nightly = []
diff --git a/dsp/src/iir.rs b/dsp/src/iir.rs
index fac1c4c..c6f2100 100644
--- a/dsp/src/iir.rs
+++ b/dsp/src/iir.rs
@@ -1,4 +1,4 @@
-use core::ops::{Add, Mul};
+use core::ops::{Add, Mul, Neg};
 use serde::{Deserialize, Serialize};
 
 use core::f32;
@@ -8,23 +8,35 @@ use core::f32;
 // `compiler-intrinsics`/llvm should have better (robust, universal, and
 // faster) implementations.
 
-fn abs(x: f32) -> f32 {
-    if x >= 0. {
+fn abs<T>(x: T) -> T
+where
+    T: PartialOrd + Default + Neg<Output = T>,
+{
+    if x >= T::default() {
         x
     } else {
         -x
     }
 }
 
-fn copysign(x: f32, y: f32) -> f32 {
-    if (x >= 0. && y >= 0.) || (x <= 0. && y <= 0.) {
+fn copysign<T>(x: T, y: T) -> T
+where
+    T: PartialOrd + Default + Neg<Output = T>,
+{
+    if (x >= T::default() && y >= T::default())
+        || (x <= T::default() && y <= T::default())
+    {
         x
     } else {
         -x
     }
 }
 
-fn max(x: f32, y: f32) -> f32 {
+#[cfg(not(feature = "nightly"))]
+fn max<T>(x: T, y: T) -> T
+where
+    T: PartialOrd,
+{
     if x > y {
         x
     } else {
@@ -32,7 +44,11 @@ fn max(x: f32, y: f32) -> f32 {
     }
 }
 
-fn min(x: f32, y: f32) -> f32 {
+#[cfg(not(feature = "nightly"))]
+fn min<T>(x: T, y: T) -> T
+where
+    T: PartialOrd,
+{
     if x < y {
         x
     } else {
@@ -40,6 +56,16 @@ fn min(x: f32, y: f32) -> f32 {
     }
 }
 
+#[cfg(feature = "nightly")]
+fn max(x: f32, y: f32) -> f32 {
+    core::intrinsics::maxnumf32(x, y)
+}
+
+#[cfg(feature = "nightly")]
+fn min(x: f32, y: f32) -> f32 {
+    core::intrinsics::minnumf32(x, y)
+}
+
 // Multiply-accumulate vectors `x` and `a`.
 //
 // A.k.a. dot product.
@@ -50,7 +76,7 @@ where
 {
     x.iter()
         .zip(a)
-        .map(|(&x, &a)| x * a)
+        .map(|(x, a)| *x * *a)
         .fold(y0, |y, xa| y + xa)
 }
 
@@ -58,10 +84,10 @@ where
 ///
 /// To represent the IIR state (input and output memory) during the filter update
 /// this contains the three inputs (x0, x1, x2) and the two outputs (y1, y2)
-/// concatenated.
+/// concatenated. Lower indices correspond to more recent samples.
 /// To represent the IIR coefficients, this contains the feed-forward
-/// coefficients (b0, b1, b2) followd by the feed-back coefficients (a1, a2),
-/// all normalized such that a0 = 1.
+/// coefficients (b0, b1, b2) followed by the negated feed-back coefficients
+/// (-a1, -a2), all five normalized such that a0 = 1.
 pub type IIRState = [f32; 5];
 
 /// IIR configuration.
@@ -159,10 +185,13 @@ impl IIR {
     /// * `xy` - Current filter state.
     /// * `x0` - New input.
     pub fn update(&self, xy: &mut IIRState, x0: f32) -> f32 {
+        let n = self.ba.len();
+        debug_assert!(xy.len() == n);
         // `xy` contains       x0 x1 y0 y1 y2
         // Increment time      x1 x2 y1 y2 y3
-        // Rotate              y3 x1 x2 y1 y2
-        xy.rotate_right(1);
+        // Shift               x1 x1 x2 y1 y2
+        // This unrolls better than xy.rotate_right(1)
+        xy.copy_within(0..n - 1, 1);
         // Store x0            x0 x1 x2 y1 y2
         xy[0] = x0;
         // Compute y0 by multiply-accumulate
@@ -170,7 +199,7 @@ impl IIR {
         // Limit y0
         let y0 = max(self.y_min, min(self.y_max, y0));
         // Store y0            x0 x1 y0 y1 y2
-        xy[xy.len() / 2] = y0;
+        xy[n / 2] = y0;
         y0
     }
 }
diff --git a/dsp/src/lib.rs b/dsp/src/lib.rs
index 3c44bbc..b2acf34 100644
--- a/dsp/src/lib.rs
+++ b/dsp/src/lib.rs
@@ -1,3 +1,4 @@
 #![no_std]
+#![cfg_attr(feature = "nightly", feature(asm, core_intrinsics))]
 
 pub mod iir;
diff --git a/src/main.rs b/src/main.rs
index e6f83b5..845a9b4 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -13,6 +13,9 @@
 fn panic(_info: &core::panic::PanicInfo) -> ! {
     let gpiod = unsafe { &*hal::stm32::GPIOD::ptr() };
     gpiod.odr.modify(|_, w| w.odr6().high().odr12().high()); // FP_LED_1, FP_LED_3
+    #[cfg(feature = "nightly")]
+    core::intrinsics::abort();
+    #[cfg(not(feature = "nightly"))]
     unsafe {
         core::intrinsics::abort();
     }
@@ -760,7 +763,11 @@ const APP: () = {
                 let x = f32::from(adc_samples[channel][sample] as i16);
                 let y = c.resources.iir_ch[channel]
                     .update(&mut c.resources.iir_state[channel], x);
-                dac_samples[channel][sample] = y as i16 as u16 ^ 0x8000;
+                // Note(unsafe): `update()` clamps `y` to the filter limits, which must lie within the `i16` range, and `y` must not be NaN.
+                // The conversion truncates toward zero, which introduces 1/2 LSB distortion.
+                let y = unsafe { y.to_int_unchecked::<i16>() };
+                // Convert to DAC code
+                dac_samples[channel][sample] = y as u16 ^ 0x8000;
             }
         }
         let [dac0, dac1] = dac_samples;