diff --git a/dsp/Cargo.toml b/dsp/Cargo.toml index 8313a49..548e64f 100644 --- a/dsp/Cargo.toml +++ b/dsp/Cargo.toml @@ -12,7 +12,7 @@ serde = { version = "1.0", features = ["derive"], default-features = false } criterion = "0.3" [[bench]] -name = "cossin" +name = "trig" harness = false [features] diff --git a/dsp/benches/cossin.rs b/dsp/benches/cossin.rs deleted file mode 100644 index 4e23774..0000000 --- a/dsp/benches/cossin.rs +++ /dev/null @@ -1,13 +0,0 @@ -use core::f32::consts::PI; -use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use dsp::trig::cossin; - -fn cossin_bench(c: &mut Criterion) { - let zi = -0x7304_2531_i32; - let zf = zi as f32 / i32::MAX as f32 * PI; - c.bench_function("cossin(zi)", |b| b.iter(|| cossin(black_box(zi)))); - c.bench_function("zf.sin_cos()", |b| b.iter(|| black_box(zf).sin_cos())); -} - -criterion_group!(benches, cossin_bench); -criterion_main!(benches); diff --git a/dsp/benches/trig.rs b/dsp/benches/trig.rs new file mode 100644 index 0000000..19b6cce --- /dev/null +++ b/dsp/benches/trig.rs @@ -0,0 +1,28 @@ +use core::f32::consts::PI; +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use dsp::trig::{atan2, cossin}; + +fn atan2_bench(c: &mut Criterion) { + let xi = (10 << 16) as i32; + let xf = xi as f32 / i32::MAX as f32; + + let yi = (-26_328 << 16) as i32; + let yf = yi as f32 / i32::MAX as f32; + + c.bench_function("atan2(y, x)", |b| { + b.iter(|| atan2(black_box(yi), black_box(xi))) + }); + c.bench_function("y.atan2(x)", |b| { + b.iter(|| black_box(yf).atan2(black_box(xf))) + }); +} + +fn cossin_bench(c: &mut Criterion) { + let zi = -0x7304_2531_i32; + let zf = zi as f32 / i32::MAX as f32 * PI; + c.bench_function("cossin(zi)", |b| b.iter(|| cossin(black_box(zi)))); + c.bench_function("zf.sin_cos()", |b| b.iter(|| black_box(zf).sin_cos())); +} + +criterion_group!(benches, atan2_bench, cossin_bench); +criterion_main!(benches); diff --git a/dsp/src/iir.rs b/dsp/src/iir.rs index c6f2100..dbec8d8 100644 --- a/dsp/src/iir.rs +++ b/dsp/src/iir.rs @@ -1,85 +1,8 @@ -use core::ops::{Add, Mul, Neg}; use serde::{Deserialize, Serialize}; +use super::{abs, copysign, macc, max, min}; use core::f32; -// These are implemented here because core::f32 doesn't have them (yet). -// They are naive and don't handle inf/nan. -// `compiler-intrinsics`/llvm should have better (robust, universal, and -// faster) implementations. - -fn abs(x: T) -> T -where - T: PartialOrd + Default + Neg, -{ - if x >= T::default() { - x - } else { - -x - } -} - -fn copysign(x: T, y: T) -> T -where - T: PartialOrd + Default + Neg, -{ - if (x >= T::default() && y >= T::default()) - || (x <= T::default() && y <= T::default()) - { - x - } else { - -x - } -} - -#[cfg(not(feature = "nightly"))] -fn max(x: T, y: T) -> T -where - T: PartialOrd, -{ - if x > y { - x - } else { - y - } -} - -#[cfg(not(feature = "nightly"))] -fn min(x: T, y: T) -> T -where - T: PartialOrd, -{ - if x < y { - x - } else { - y - } -} - -#[cfg(feature = "nightly")] -fn max(x: f32, y: f32) -> f32 { - core::intrinsics::maxnumf32(x, y) -} - -#[cfg(feature = "nightly")] -fn min(x: f32, y: f32) -> f32 { - core::intrinsics::minnumf32(x, y) -} - -// Multiply-accumulate vectors `x` and `a`. -// -// A.k.a. dot product. -// Rust/LLVM optimize this nicely. -fn macc(y0: T, x: &[T], a: &[T]) -> T -where - T: Add + Mul + Copy, -{ - x.iter() - .zip(a) - .map(|(x, a)| *x * *a) - .fold(y0, |y, xa| y + xa) -} - /// IIR state and coefficients type. /// /// To represent the IIR state (input and output memory) during the filter update diff --git a/dsp/src/lib.rs b/dsp/src/lib.rs index 6dd20f7..67b1882 100644 --- a/dsp/src/lib.rs +++ b/dsp/src/lib.rs @@ -1,6 +1,8 @@ #![cfg_attr(not(test), no_std)] #![cfg_attr(feature = "nightly", feature(asm, core_intrinsics))] +use core::ops::{Add, Mul, Neg}; + pub type Complex = (T, T); /// Round up half. @@ -18,6 +20,83 @@ pub fn shift_round(x: i32, shift: usize) -> i32 { (x + (1 << (shift - 1))) >> shift } +fn abs(x: T) -> T +where + T: PartialOrd + Default + Neg, +{ + if x >= T::default() { + x + } else { + -x + } +} + +// These are implemented here because core::f32 doesn't have them (yet). +// They are naive and don't handle inf/nan. +// `compiler-intrinsics`/llvm should have better (robust, universal, and +// faster) implementations. + +fn copysign(x: T, y: T) -> T +where + T: PartialOrd + Default + Neg, +{ + if (x >= T::default() && y >= T::default()) + || (x <= T::default() && y <= T::default()) + { + x + } else { + -x + } +} + +#[cfg(not(feature = "nightly"))] +fn max(x: T, y: T) -> T +where + T: PartialOrd, +{ + if x > y { + x + } else { + y + } +} + +#[cfg(not(feature = "nightly"))] +fn min(x: T, y: T) -> T +where + T: PartialOrd, +{ + if x < y { + x + } else { + y + } +} + +#[cfg(feature = "nightly")] +fn max(x: f32, y: f32) -> f32 { + core::intrinsics::maxnumf32(x, y) +} + +#[cfg(feature = "nightly")] +fn min(x: f32, y: f32) -> f32 { + core::intrinsics::minnumf32(x, y) +} + +// Multiply-accumulate vectors `x` and `a`. +// +// A.k.a. dot product. +// Rust/LLVM optimize this nicely. +fn macc(y0: T, x: &[T], a: &[T]) -> T +where + T: Add + Mul + Copy, +{ + x.iter() + .zip(a) + .map(|(x, a)| *x * *a) + .fold(y0, |y, xa| y + xa) +} + pub mod iir; pub mod lockin; pub mod pll; diff --git a/dsp/src/testing.rs b/dsp/src/testing.rs index 1a8e109..098ec87 100644 --- a/dsp/src/testing.rs +++ b/dsp/src/testing.rs @@ -1,6 +1,10 @@ use super::Complex; -pub fn isclose(a: f32, b: f32, rtol: f32, atol: f32) -> bool { +pub fn isclose(a: f64, b: f64, rtol: f64, atol: f64) -> bool { + (a - b).abs() <= a.abs().max(b.abs()) * rtol + atol +} + +pub fn isclosef(a: f32, b: f32, rtol: f32, atol: f32) -> bool { (a - b).abs() <= a.abs().max(b.abs()) * rtol + atol } @@ -10,7 +14,7 @@ pub fn complex_isclose( rtol: f32, atol: f32, ) -> bool { - isclose(a.0, b.0, rtol, atol) && isclose(a.1, b.1, rtol, atol) + isclosef(a.0, b.0, rtol, atol) && isclosef(a.1, b.1, rtol, atol) } pub fn complex_allclose( diff --git a/dsp/src/trig.rs b/dsp/src/trig.rs index 5a99232..9873d7e 100644 --- a/dsp/src/trig.rs +++ b/dsp/src/trig.rs @@ -1,8 +1,95 @@ -use super::Complex; +use super::{shift_round, Complex}; use core::f64::consts::PI; include!(concat!(env!("OUT_DIR"), "/cossin_table.rs")); +/// 2-argument arctangent function. +/// +/// This implementation uses all integer arithmetic for fast +/// computation. It is designed to have high accuracy near the axes +/// and lower away from the axes. It is additionally designed so that +/// the error changes slowly with respect to the angle. +/// +/// # Arguments +/// +/// * `y` - Y-axis component. +/// * `x` - X-axis component. +/// +/// # Returns +/// +/// The angle between the x-axis and the ray to the point (x,y). The +/// result range is from i32::MIN to i32::MAX, where i32::MIN +/// represents -pi and, equivalently, +pi. i32::MAX represents one +/// count less than +pi. +pub fn atan2(y: i32, x: i32) -> i32 { + let mut y = y >> 16; + let mut x = x >> 16; + + let sign = ((y >> 14) & 2) | ((x >> 15) & 1); + if sign & 1 == 1 { + x *= -1; + } + if sign & 2 == 2 { + y *= -1; + } + + let y_greater = y > x; + + // Uses the general procedure described in the following + // Mathematics stack exchange answer: + // + // https://math.stackexchange.com/a/1105038/583981 + // + // The atan approximation method has been modified to be cheaper + // to compute and to be more compatible with integer + // arithmetic. The approximation technique used here is + // + // pi / 4 * x + 0.285 * x * (1 - abs(x)) + // + // which is taken from Rajan 2006: Efficient Approximations for + // the Arctangent Function. + if y_greater { + core::mem::swap(&mut x, &mut y); + } + + if x == 0 { + return 0; + } + + // We need to share the 31 available non-sign bits between the + // atan argument and constant factors used in the atan + // approximation. Sharing the bits roughly equally between them + // gives good accuracy. Additionally, we cannot increase the + // number of atan argument bits beyond 15 because we must square + // it. + const ATAN_ARGUMENT_BITS: usize = 15; + let ratio = (y << ATAN_ARGUMENT_BITS) / x; + + let mut angle = { + const K1: i32 = ((1. / 4. + 0.285 / PI) + * (1 << (31 - ATAN_ARGUMENT_BITS)) as f64) + as i32; + const K2: i32 = + ((0.285 / PI) * (1 << (31 - ATAN_ARGUMENT_BITS)) as f64) as i32; + + ratio * K1 - K2 * shift_round(ratio * ratio, ATAN_ARGUMENT_BITS) + }; + + if y_greater { + angle = (i32::MAX >> 1) - angle; + } + + if sign & 1 == 1 { + angle = i32::MAX - angle; + } + + if sign & 2 == 2 { + angle *= -1; + } + + angle +} + /// Compute the cosine and sine of an angle. /// This is ported from the MiSoC cossin core. /// (https://github.com/m-labs/misoc/blob/master/misoc/cores/cossin.py) @@ -75,8 +162,75 @@ pub fn cossin(phase: i32) -> Complex { #[cfg(test)] mod tests { use super::*; + use crate::testing::isclose; + use core::f64::consts::PI; + + fn angle_to_axis(angle: f64) -> f64 { + let angle = angle % (PI / 2.); + (PI / 2. - angle).min(angle) + } + #[test] - fn error_max_rms_all_phase() { + fn atan2_absolute_error() { + const NUM_VALS: usize = 1_001; + let mut test_vals: [f64; NUM_VALS] = [0.; NUM_VALS]; + let val_bounds: (f64, f64) = (-1., 1.); + let val_delta: f64 = + (val_bounds.1 - val_bounds.0) / (NUM_VALS - 1) as f64; + for i in 0..NUM_VALS { + test_vals[i] = val_bounds.0 + i as f64 * val_delta; + } + + let atol: f64 = 4e-5; + let rtol: f64 = 0.127; + for &x in test_vals.iter() { + for &y in test_vals.iter() { + let actual = (y.atan2(x) as f64 * i16::MAX as f64).round() + / i16::MAX as f64; + let tol = atol + rtol * angle_to_axis(actual).abs(); + let computed = (atan2( + ((y * i16::MAX as f64) as i32) << 16, + ((x * i16::MAX as f64) as i32) << 16, + ) >> 16) as f64 + / i16::MAX as f64 + * PI; + + if !isclose(computed, actual, 0., tol) { + println!("(x, y) : {}, {}", x, y); + println!("actual : {}", actual); + println!("computed : {}", computed); + println!("tolerance: {}\n", tol); + assert!(false); + } + } + } + + // test min and max explicitly + for (x, y) in [ + ((i16::MIN as i32 + 1) << 16, -(1 << 16) as i32), + ((i16::MIN as i32 + 1) << 16, (1 << 16) as i32), + ] + .iter() + { + let yf = *y as f64 / ((i16::MAX as i32) << 16) as f64; + let xf = *x as f64 / ((i16::MAX as i32) << 16) as f64; + let actual = + (yf.atan2(xf) * i16::MAX as f64).round() / i16::MAX as f64; + let computed = (atan2(*y, *x) >> 16) as f64 / i16::MAX as f64 * PI; + let tol = atol + rtol * angle_to_axis(actual).abs(); + + if !isclose(computed, actual, 0., tol) { + println!("(x, y) : {}, {}", *x, *y); + println!("actual : {}", actual); + println!("computed : {}", computed); + println!("tolerance: {}\n", tol); + assert!(false); + } + } + } + + #[test] + fn cossin_error_max_rms_all_phase() { // Constant amplitude error due to LUT data range. const AMPLITUDE: f64 = ((1i64 << 31) - (1i64 << 15)) as f64; const MAX_PHASE: f64 = (1i64 << 32) as f64;