diff --git a/dsp/Cargo.toml b/dsp/Cargo.toml
index 8313a49..548e64f 100644
--- a/dsp/Cargo.toml
+++ b/dsp/Cargo.toml
@@ -12,7 +12,7 @@ serde = { version = "1.0", features = ["derive"], default-features = false }
 criterion = "0.3"
 
 [[bench]]
-name = "cossin"
+name = "trig"
 harness = false
 
 [features]
diff --git a/dsp/benches/cossin.rs b/dsp/benches/cossin.rs
deleted file mode 100644
index 4e23774..0000000
--- a/dsp/benches/cossin.rs
+++ /dev/null
@@ -1,13 +0,0 @@
-use core::f32::consts::PI;
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
-use dsp::trig::cossin;
-
-fn cossin_bench(c: &mut Criterion) {
-    let zi = -0x7304_2531_i32;
-    let zf = zi as f32 / i32::MAX as f32 * PI;
-    c.bench_function("cossin(zi)", |b| b.iter(|| cossin(black_box(zi))));
-    c.bench_function("zf.sin_cos()", |b| b.iter(|| black_box(zf).sin_cos()));
-}
-
-criterion_group!(benches, cossin_bench);
-criterion_main!(benches);
diff --git a/dsp/benches/trig.rs b/dsp/benches/trig.rs
new file mode 100644
index 0000000..19b6cce
--- /dev/null
+++ b/dsp/benches/trig.rs
@@ -0,0 +1,28 @@
+use core::f32::consts::PI;
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use dsp::trig::{atan2, cossin};
+
+fn atan2_bench(c: &mut Criterion) {
+    let xi = (10 << 16) as i32;
+    let xf = xi as f32 / i32::MAX as f32;
+
+    let yi = (-26_328 << 16) as i32;
+    let yf = yi as f32 / i32::MAX as f32;
+
+    c.bench_function("atan2(y, x)", |b| {
+        b.iter(|| atan2(black_box(yi), black_box(xi)))
+    });
+    c.bench_function("y.atan2(x)", |b| {
+        b.iter(|| black_box(yf).atan2(black_box(xf)))
+    });
+}
+
+fn cossin_bench(c: &mut Criterion) {
+    let zi = -0x7304_2531_i32;
+    let zf = zi as f32 / i32::MAX as f32 * PI;
+    c.bench_function("cossin(zi)", |b| b.iter(|| cossin(black_box(zi))));
+    c.bench_function("zf.sin_cos()", |b| b.iter(|| black_box(zf).sin_cos()));
+}
+
+criterion_group!(benches, atan2_bench, cossin_bench);
+criterion_main!(benches);
diff --git a/dsp/src/iir.rs b/dsp/src/iir.rs
index c6f2100..dbec8d8 100644
--- a/dsp/src/iir.rs
+++ b/dsp/src/iir.rs
@@ -1,85 +1,8 @@
-use core::ops::{Add, Mul, Neg};
 use serde::{Deserialize, Serialize};
 
+use super::{abs, copysign, macc, max, min};
 use core::f32;
 
-// These are implemented here because core::f32 doesn't have them (yet).
-// They are naive and don't handle inf/nan.
-// `compiler-intrinsics`/llvm should have better (robust, universal, and
-// faster) implementations.
-
-fn abs<T>(x: T) -> T
-where
-    T: PartialOrd + Default + Neg<Output = T>,
-{
-    if x >= T::default() {
-        x
-    } else {
-        -x
-    }
-}
-
-fn copysign<T>(x: T, y: T) -> T
-where
-    T: PartialOrd + Default + Neg<Output = T>,
-{
-    if (x >= T::default() && y >= T::default())
-        || (x <= T::default() && y <= T::default())
-    {
-        x
-    } else {
-        -x
-    }
-}
-
-#[cfg(not(feature = "nightly"))]
-fn max<T>(x: T, y: T) -> T
-where
-    T: PartialOrd,
-{
-    if x > y {
-        x
-    } else {
-        y
-    }
-}
-
-#[cfg(not(feature = "nightly"))]
-fn min<T>(x: T, y: T) -> T
-where
-    T: PartialOrd,
-{
-    if x < y {
-        x
-    } else {
-        y
-    }
-}
-
-#[cfg(feature = "nightly")]
-fn max(x: f32, y: f32) -> f32 {
-    core::intrinsics::maxnumf32(x, y)
-}
-
-#[cfg(feature = "nightly")]
-fn min(x: f32, y: f32) -> f32 {
-    core::intrinsics::minnumf32(x, y)
-}
-
-// Multiply-accumulate vectors `x` and `a`.
-//
-// A.k.a. dot product.
-// Rust/LLVM optimize this nicely.
-fn macc<T>(y0: T, x: &[T], a: &[T]) -> T
-where
-    T: Add<Output = T> + Mul<Output = T> + Copy,
-{
-    x.iter()
-        .zip(a)
-        .map(|(x, a)| *x * *a)
-        .fold(y0, |y, xa| y + xa)
-}
-
 /// IIR state and coefficients type.
 ///
 /// To represent the IIR state (input and output memory) during the filter update
diff --git a/dsp/src/lib.rs b/dsp/src/lib.rs
index 6dd20f7..67b1882 100644
--- a/dsp/src/lib.rs
+++ b/dsp/src/lib.rs
@@ -1,6 +1,8 @@
 #![cfg_attr(not(test), no_std)]
 #![cfg_attr(feature = "nightly", feature(asm, core_intrinsics))]
 
+use core::ops::{Add, Mul, Neg};
+
 pub type Complex<T> = (T, T);
 
 /// Round up half.
@@ -18,6 +20,83 @@ pub fn shift_round(x: i32, shift: usize) -> i32 {
     (x + (1 << (shift - 1))) >> shift
 }
 
+fn abs<T>(x: T) -> T
+where
+    T: PartialOrd + Default + Neg<Output = T>,
+{
+    if x >= T::default() {
+        x
+    } else {
+        -x
+    }
+}
+
+// These are implemented here because core::f32 doesn't have them (yet).
+// They are naive and don't handle inf/nan.
+// `compiler-intrinsics`/llvm should have better (robust, universal, and
+// faster) implementations.
+
+fn copysign<T>(x: T, y: T) -> T
+where
+    T: PartialOrd + Default + Neg<Output = T>,
+{
+    if (x >= T::default() && y >= T::default())
+        || (x <= T::default() && y <= T::default())
+    {
+        x
+    } else {
+        -x
+    }
+}
+
+#[cfg(not(feature = "nightly"))]
+fn max<T>(x: T, y: T) -> T
+where
+    T: PartialOrd,
+{
+    if x > y {
+        x
+    } else {
+        y
+    }
+}
+
+#[cfg(not(feature = "nightly"))]
+fn min<T>(x: T, y: T) -> T
+where
+    T: PartialOrd,
+{
+    if x < y {
+        x
+    } else {
+        y
+    }
+}
+
+#[cfg(feature = "nightly")]
+fn max(x: f32, y: f32) -> f32 {
+    core::intrinsics::maxnumf32(x, y)
+}
+
+#[cfg(feature = "nightly")]
+fn min(x: f32, y: f32) -> f32 {
+    core::intrinsics::minnumf32(x, y)
+}
+
+// Multiply-accumulate vectors `x` and `a`.
+//
+// A.k.a. dot product.
+// Rust/LLVM optimize this nicely.
+fn macc<T>(y0: T, x: &[T], a: &[T]) -> T
+where
+    T: Add<Output = T> + Mul<Output = T> + Copy,
+{
+    x.iter()
+        .zip(a)
+        .map(|(x, a)| *x * *a)
+        .fold(y0, |y, xa| y + xa)
+}
+
 pub mod iir;
 pub mod lockin;
 pub mod pll;
diff --git a/dsp/src/testing.rs b/dsp/src/testing.rs
index 1a8e109..098ec87 100644
--- a/dsp/src/testing.rs
+++ b/dsp/src/testing.rs
@@ -1,6 +1,10 @@
 use super::Complex;
 
-pub fn isclose(a: f32, b: f32, rtol: f32, atol: f32) -> bool {
+pub fn isclose(a: f64, b: f64, rtol: f64, atol: f64) -> bool {
+    (a - b).abs() <= a.abs().max(b.abs()) * rtol + atol
+}
+
+pub fn isclosef(a: f32, b: f32, rtol: f32, atol: f32) -> bool {
     (a - b).abs() <= a.abs().max(b.abs()) * rtol + atol
 }
 
@@ -10,7 +14,7 @@ pub fn complex_isclose(
     rtol: f32,
     atol: f32,
 ) -> bool {
-    isclose(a.0, b.0, rtol, atol) && isclose(a.1, b.1, rtol, atol)
+    isclosef(a.0, b.0, rtol, atol) && isclosef(a.1, b.1, rtol, atol)
 }
 
 pub fn complex_allclose(
diff --git a/dsp/src/trig.rs b/dsp/src/trig.rs
index 5a99232..9873d7e 100644
--- a/dsp/src/trig.rs
+++ b/dsp/src/trig.rs
@@ -1,8 +1,95 @@
-use super::Complex;
+use super::{shift_round, Complex};
 use core::f64::consts::PI;
 
 include!(concat!(env!("OUT_DIR"), "/cossin_table.rs"));
 
+/// 2-argument arctangent function.
+///
+/// This implementation uses all integer arithmetic for fast
+/// computation. It is designed to have high accuracy near the axes
+/// and lower away from the axes. It is additionally designed so that
+/// the error changes slowly with respect to the angle.
+///
+/// # Arguments
+///
+/// * `y` - Y-axis component.
+/// * `x` - X-axis component.
+///
+/// # Returns
+///
+/// The angle between the x-axis and the ray to the point (x,y). The
+/// result range is from i32::MIN to i32::MAX, where i32::MIN
+/// represents -pi and, equivalently, +pi. i32::MAX represents one
+/// count less than +pi.
+pub fn atan2(y: i32, x: i32) -> i32 {
+    let mut y = y >> 16;
+    let mut x = x >> 16;
+
+    let sign = ((y >> 14) & 2) | ((x >> 15) & 1);
+    if sign & 1 == 1 {
+        x *= -1;
+    }
+    if sign & 2 == 2 {
+        y *= -1;
+    }
+
+    let y_greater = y > x;
+
+    // Uses the general procedure described in the following
+    // Mathematics stack exchange answer:
+    //
+    // https://math.stackexchange.com/a/1105038/583981
+    //
+    // The atan approximation method has been modified to be cheaper
+    // to compute and to be more compatible with integer
+    // arithmetic. The approximation technique used here is
+    //
+    // pi / 4 * x + 0.285 * x * (1 - abs(x))
+    //
+    // which is taken from Rajan 2006: Efficient Approximations for
+    // the Arctangent Function.
+    if y_greater {
+        core::mem::swap(&mut x, &mut y);
+    }
+
+    if x == 0 {
+        return 0;
+    }
+
+    // We need to share the 31 available non-sign bits between the
+    // atan argument and constant factors used in the atan
+    // approximation. Sharing the bits roughly equally between them
+    // gives good accuracy. Additionally, we cannot increase the
+    // number of atan argument bits beyond 15 because we must square
+    // it.
+    const ATAN_ARGUMENT_BITS: usize = 15;
+    let ratio = (y << ATAN_ARGUMENT_BITS) / x;
+
+    let mut angle = {
+        const K1: i32 = ((1. / 4. + 0.285 / PI)
+            * (1 << (31 - ATAN_ARGUMENT_BITS)) as f64)
+            as i32;
+        const K2: i32 =
+            ((0.285 / PI) * (1 << (31 - ATAN_ARGUMENT_BITS)) as f64) as i32;
+
+        ratio * K1 - K2 * shift_round(ratio * ratio, ATAN_ARGUMENT_BITS)
+    };
+
+    if y_greater {
+        angle = (i32::MAX >> 1) - angle;
+    }
+
+    if sign & 1 == 1 {
+        angle = i32::MAX - angle;
+    }
+
+    if sign & 2 == 2 {
+        angle *= -1;
+    }
+
+    angle
+}
+
 /// Compute the cosine and sine of an angle.
 /// This is ported from the MiSoC cossin core.
 /// (https://github.com/m-labs/misoc/blob/master/misoc/cores/cossin.py)
@@ -75,8 +162,75 @@ pub fn cossin(phase: i32) -> Complex<i32> {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::testing::isclose;
+    use core::f64::consts::PI;
+
+    fn angle_to_axis(angle: f64) -> f64 {
+        let angle = angle % (PI / 2.);
+        (PI / 2. - angle).min(angle)
+    }
+
     #[test]
-    fn error_max_rms_all_phase() {
+    fn atan2_absolute_error() {
+        const NUM_VALS: usize = 1_001;
+        let mut test_vals: [f64; NUM_VALS] = [0.; NUM_VALS];
+        let val_bounds: (f64, f64) = (-1., 1.);
+        let val_delta: f64 =
+            (val_bounds.1 - val_bounds.0) / (NUM_VALS - 1) as f64;
+        for i in 0..NUM_VALS {
+            test_vals[i] = val_bounds.0 + i as f64 * val_delta;
+        }
+
+        let atol: f64 = 4e-5;
+        let rtol: f64 = 0.127;
+        for &x in test_vals.iter() {
+            for &y in test_vals.iter() {
+                let actual = (y.atan2(x) as f64 * i16::MAX as f64).round()
+                    / i16::MAX as f64;
+                let tol = atol + rtol * angle_to_axis(actual).abs();
+                let computed = (atan2(
+                    ((y * i16::MAX as f64) as i32) << 16,
+                    ((x * i16::MAX as f64) as i32) << 16,
+                ) >> 16) as f64
+                    / i16::MAX as f64
+                    * PI;
+
+                if !isclose(computed, actual, 0., tol) {
+                    println!("(x, y)   : {}, {}", x, y);
+                    println!("actual   : {}", actual);
+                    println!("computed : {}", computed);
+                    println!("tolerance: {}\n", tol);
+                    assert!(false);
+                }
+            }
+        }
+
+        // test min and max explicitly
+        for (x, y) in [
+            ((i16::MIN as i32 + 1) << 16, -(1 << 16) as i32),
+            ((i16::MIN as i32 + 1) << 16, (1 << 16) as i32),
+        ]
+        .iter()
+        {
+            let yf = *y as f64 / ((i16::MAX as i32) << 16) as f64;
+            let xf = *x as f64 / ((i16::MAX as i32) << 16) as f64;
+            let actual =
+                (yf.atan2(xf) * i16::MAX as f64).round() / i16::MAX as f64;
+            let computed = (atan2(*y, *x) >> 16) as f64 / i16::MAX as f64 * PI;
+            let tol = atol + rtol * angle_to_axis(actual).abs();
+
+            if !isclose(computed, actual, 0., tol) {
+                println!("(x, y)   : {}, {}", *x, *y);
+                println!("actual   : {}", actual);
+                println!("computed : {}", computed);
+                println!("tolerance: {}\n", tol);
+                assert!(false);
+            }
+        }
+    }
+
+    #[test]
+    fn cossin_error_max_rms_all_phase() {
         // Constant amplitude error due to LUT data range.
         const AMPLITUDE: f64 = ((1i64 << 31) - (1i64 << 15)) as f64;
         const MAX_PHASE: f64 = (1i64 << 32) as f64;