artiq/artiq/gateware/dsp/fir.py

from math import floor
from operator import add
from functools import reduce
from collections import namedtuple

import numpy as np

from migen import *


def halfgen4(width, n, df=1e-3):
    """Design a half-band FIR filter by iteratively reweighted least squares.

    Based on http://recycle.lbl.gov/~ldoolitt/halfband

    :param width: passband/stopband width, as a fraction of the input
        sampling rate.
    :param n: order of the half-band filter to generate (number of
        independent non-zero taps on each side of the center tap).
    :param df: convergence tolerance. It sets both the relative change of
        the peak-to-mean error ratio below which iteration stops and the
        band (relative to the peak error) of grid points that get their
        weight increased in each pass.
    :return: the full set of FIR coefficients as a 1-D array of length
        ``4*n - 1``. The center tap is 0.5 and every other tap counted
        from the center is zero, so implement wisely.
    """
    n_points = n*40
    w_edge = 2*np.pi*width
    # Column vector of fit frequencies running from w_edge down to 0; the
    # quadratic mapping places the grid points more densely near w_edge.
    w = (1 - np.linspace(0, 1, n_points)[:, None]**2)*w_edge

    goal = .5*np.ones_like(w)
    # One column per odd harmonic 1, 3, ..., 2*n - 1.
    cosines = np.cos(w*np.arange(1, 2*n, 2))
    weights = np.ones_like(w)

    prev_ratio = None

    for iteration in range(40):
        # Weighted least-squares solution for the current weights.
        taps = np.linalg.pinv(cosines*weights)@(goal*weights)
        deviation = np.fabs(cosines@taps - .5)
        peak = np.max(deviation)
        ratio = peak/np.mean(deviation)
        # Stop once the peak-to-mean error ratio no longer improves.
        if prev_ratio and (prev_ratio - ratio)/(prev_ratio + ratio) < df/2:
            break
        prev_ratio = ratio
        # Emphasize the grid points at (or very near) the peak error.
        weights[deviation > (1 - df)*peak] *= 1 + 1.5/(iteration + 11)
    # Interleave the solved taps with zeros, then mirror them around the
    # center tap.
    side = np.c_[taps, np.zeros_like(taps)].ravel()[:-1]
    return np.r_[side[::-1], 1, side]/2


# Bit widths of one DSP block. In this file, A sizes the delay-line
# samples, B bounds the quantized coefficients and P sizes the accumulator.
_Widths = namedtuple("_Widths", ["A", "B", "P"])

# Known architectures, looked up by name.
_widths = dict(DSP48E1=_Widths(A=25, B=18, P=48))


class ParallelFIR(Module):
    """Full-rate parallelized finite impulse response filter.

    Tries to use transposed form as much as possible.

    The floating point coefficients are quantized to integers by scaling
    with a power of two chosen from the largest coefficient magnitude and
    the ``B`` width of the selected architecture. Taps that share the same
    quantized coefficient are summed before being multiplied, and zero
    coefficients generate no logic.

    Attributes set by the constructor: ``i`` and ``o`` (lists of
    ``parallelism`` signed signals of ``width`` bits), ``width``,
    ``parallelism``, ``latency`` and ``coefficients`` (the quantized
    integer coefficients).

    :param coefficients: tap coefficients (normalized to 1.),
        increasing delay.
    :param parallelism: number of samples per cycle.
    :param width: bit width of input and output.
    :param arch: architecture (default: "DSP48E1").
    """
    def __init__(self, coefficients, parallelism, width=16,
                 arch="DSP48E1"):
        self.width = width
        self.parallelism = p = parallelism
        n = len(coefficients)
        # input and output: old to new, decreasing delay
        self.i = [Signal((width, True)) for i in range(p)]
        self.o = [Signal((width, True)) for i in range(p)]
        # NOTE(review): presumably the input-to-output latency in clock
        # cycles; the formula is not derived anywhere in this file.
        self.latency = (n + 1)//2//p + 2
        # Raises KeyError for an architecture missing from _widths.
        w = _widths[arch]

        # Quantize: scale every coefficient by 2**c_shift and round to an
        # integer. c_shift is derived from 2**(w.B - 2) divided by the
        # largest coefficient magnitude (all-zero coefficients would divide
        # by zero here).
        c_max = max(abs(c) for c in coefficients)
        c_shift = bits_for(floor((1 << w.B - 2) / c_max))
        self.coefficients = cs = [int(round(c*(1 << c_shift)))
                                  for c in coefficients]
        # Every quantized coefficient must fit into the B width.
        assert max(bits_for(c) for c in cs) <= w.B

        ###

        # Delay line: increasing delay
        x = [Signal((w.A, True), reset_less=True) for _ in range(n + p - 1)]
        # Left-align the input samples within the A width ...
        x_shift = w.A - width
        # reduce by pre-adder gain
        # (headroom for summing the largest group of taps that share one
        # non-zero quantized coefficient)
        x_shift -= bits_for(max(cs.count(c) for c in cs if c) - 1)
        # TODO: reduce by P width limit?
        assert x_shift + width <= w.A

        # The worst-case sum of products must fit into the signed P width.
        assert sum(abs(c)*(1 << w.A - 1) for c in cs) <= (1 << w.P - 1) - 1

        # The first p delay-line entries are loaded from the inputs in
        # reversed order (x[0] takes self.i[-1]), shifted left by x_shift.
        for xi, xj in zip(x, self.i[::-1]):
            self.sync += xi.eq(xj << x_shift)
        # Every remaining entry takes the value p positions earlier, so the
        # whole line advances by p samples per clock cycle.
        for xi, xj in zip(x[len(self.i):], x):
            self.sync += xi.eq(xj)

        # Build one accumulation chain per output sample.
        for delay in range(p):
            # Head of the chain; the output drops the c_shift + x_shift
            # scaling bits again. `delay` still holds the output index here
            # but is modified further down.
            o = Signal((w.P, True), reset_less=True)
            self.comb += self.o[delay].eq(o >> c_shift + x_shift)
            # Make products
            for i, c in enumerate(cs):
                # simplify for halfband and symmetric filters
                # (skip zero coefficients and values already handled at an
                # earlier index)
                if not c or c in cs[:i]:
                    continue
                # Delay-line indices (offset by p - 1) of all taps that
                # share this coefficient value.
                js = [j + p - 1 for j, cj in enumerate(cs) if cj == c]
                # m: registered product for this coefficient.
                m = Signal.like(o)
                # o0 is the current chain node; o becomes the next node
                # further down the chain.
                o0, o = o, Signal.like(o)
                # q: sum of the taps sharing this coefficient.
                q = Signal.like(x[0])
                if delay + p <= js[0]:
                    # The earliest tap is at least p entries deeper in the
                    # delay line: register this adder stage and compensate
                    # by reading the taps p entries earlier from here on.
                    self.sync += o0.eq(o + m)
                    delay += p
                else:
                    # Not enough delay available: combinatorial adder.
                    self.comb += o0.eq(o + m)
                assert js[0] - delay >= 0
                self.comb += q.eq(reduce(add, [x[j - delay] for j in js]))
                self.sync += m.eq(c*q)
            # symmetric rounding
            # (the last chain node is driven with a constant offset of
            # 2**(c_shift + x_shift - 1) - 1 ahead of the final right shift)
            if c_shift + x_shift > 1:
                self.comb += o.eq((1 << c_shift + x_shift - 1) - 1)


class FIR(ParallelFIR):
    """Finite impulse response filter processing one sample per cycle.

    This is :class:`ParallelFIR` with ``parallelism`` fixed to 1. For
    convenience ``i`` and ``o`` are the single input and output signals
    instead of one-element lists.

    :param coefficients: tap coefficients, see :class:`ParallelFIR`.

    Any further positional or keyword arguments (``width``, ``arch``) are
    forwarded to :class:`ParallelFIR` unchanged.
    """
    def __init__(self, coefficients, *args, **kwargs):
        # super() already binds the instance, so `self` must not be passed
        # again: doing so shifted every argument by one position and made
        # the call fail with "multiple values for argument 'parallelism'".
        # Passing the parallelism positionally keeps extra positional
        # arguments (width, arch) aligned with ParallelFIR's signature.
        super().__init__(coefficients, 1, *args, **kwargs)
        self.i = self.i[0]
        self.o = self.o[0]


def halfgen4_cascade(rate, width, order=None):
    """Compute the coefficient sets for a cascade of half-band filters.

    One set is generated per factor-of-two stage. Each set is scaled to a
    gain of two so that the zero stuffing of the interpolation is
    compensated.

    :param rate: upsampling rate. power of two
    :param width: passband/stopband width in units of input sampling rate.
    :param order: highest order, defaults to :param:`rate`
    :return: list of coefficient arrays, first stage first."""
    order = rate if order is None else order
    stages = []
    # Cumulative upsampling factor after the stage being generated.
    factor = 2
    while factor < 2*rate:
        stage_width = width*factor/rate/2
        stage_order = order*factor//rate
        stages.append(2*halfgen4(stage_width, stage_order))
        factor *= 2
    return stages


class ParallelHBFUpsampler(Module):
    """Parallel, power-of-two, half-band, cascading upsampler.

    A chain of :class:`ParallelFIR` stages is built, one per entry of
    ``coefficients``, and every stage doubles the number of samples per
    cycle. ``i`` is the single input signal, ``o`` is the list of output
    signals of the last stage. ``parallelism`` and ``latency`` are
    accumulated over all stages.

    Coefficients should be normalized to overall gain of 2
    (highest/center coefficient being 1).

    :param coefficients: iterable of per-stage coefficient sets.
    :param width: bit width of input and output.

    Additional keyword arguments are passed on to every
    :class:`ParallelFIR` stage."""
    def __init__(self, coefficients, width=16, **kwargs):
        self.width = width
        self.i = Signal((width, True))
        self.parallelism = 1  # doubled by every stage
        self.latency = 0  # summed over all stages

        ###

        samples = [self.i]
        for taps in coefficients:
            self.parallelism *= 2
            stage = ParallelFIR(taps, self.parallelism, width, **kwargs)
            self.submodules += stage
            # Only every second stage input (the odd indices) is driven by
            # the previous samples; the others are left unconnected.
            for sink, source in zip(stage.i[1::2], samples):
                self.comb += sink.eq(source)
            self.latency += stage.latency
            samples = stage.o
        self.o = samples
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`from math import floor`
gateware/dsp: add FIR and test 2016-12-08 02:14:23 +08:00			`from operator import add`
			`from functools import reduce`
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`from collections import namedtuple`

gateware/dsp: add FIR and test 2016-12-08 02:14:23 +08:00			`import numpy as np`
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00
gateware/dsp: add FIR and test 2016-12-08 02:14:23 +08:00			`from migen import *`


fir: cleanup halfgen4 2016-12-18 04:19:46 +08:00			`def halfgen4(width, n, df=1e-3):`
gateware/dsp: add FIR and test 2016-12-08 02:14:23 +08:00			`"""`
			`http://recycle.lbl.gov/~ldoolitt/halfband`

			`params:`
fir: add ParallelHBFCascade 2016-12-08 22:30:26 +08:00			* `up` is the passband/stopband width, as a fraction of
			`input sampling rate`
gateware/dsp: add FIR and test 2016-12-08 02:14:23 +08:00			* `n is the order of half-band filter to generate
			`returns:`
			* `a` is the full set of FIR coefficients, `4*n-1` long.
			`implement wisely.`
			`"""`

			`npt = n*40`
fir: add ParallelHBFCascade 2016-12-08 22:30:26 +08:00			`wmax = 2np.piwidth`
gateware/dsp: add FIR and test 2016-12-08 02:14:23 +08:00			`wfit = (1 - np.linspace(0, 1, npt)[:, None]*2)wmax`

			`target = .5*np.ones_like(wfit)`
			`basis = np.cos(wfitnp.arange(1, 2n, 2))`
			`weight = np.ones_like(wfit)`
fir: cleanup halfgen4 2016-12-18 04:19:46 +08:00
			`f0 = None`

gateware/dsp: add FIR and test 2016-12-08 02:14:23 +08:00			`for i in range(40):`
			`l = np.linalg.pinv(basisweight)@(targetweight)`
fir: cleanup halfgen4 2016-12-18 04:19:46 +08:00			`err = np.fabs(basis@l - .5)`
			`f = np.max(err)/np.mean(err)`
			`if f0 and (f0 - f)/(f0 + f) < df/2:`
			`break`
			`f0 = f`
			`weight[err > (1 - df)np.max(err)] = 1 + 1.5/(i + 11)`
gateware/dsp: add FIR and test 2016-12-08 02:14:23 +08:00			`a = np.c_[l, np.zeros_like(l)].ravel()[:-1]`
			`a = np.r_[a[::-1], 1, a]/2`
			`return a`


fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`_Widths = namedtuple("_Widths", "A B P")`
gateware/dsp: add FIR and test 2016-12-08 02:14:23 +08:00
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`_widths = {`
			`"DSP48E1": _Widths(25, 18, 48),`
			`}`
fir: add ParallelFIR and test 2016-12-08 20:05:13 +08:00

			`class ParallelFIR(Module):`
			`"""Full-rate parallelized finite impulse response filter.`

fir: automatically use transposed topology 2016-12-15 02:15:50 +08:00			`Tries to use transposed form as much as possible.`

fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`:param coefficients: tap coefficients (normalized to 1.),`
			`increasing delay.`
fir: add ParallelFIR and test 2016-12-08 20:05:13 +08:00			`:param parallelism: number of samples per cycle.`
			`:param width: bit width of input and output.`
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`:param arch: architecture (default: "DSP48E1").`
fir: add ParallelFIR and test 2016-12-08 20:05:13 +08:00			`"""`
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`def __init__(self, coefficients, parallelism, width=16,`
			`arch="DSP48E1"):`
fir: add ParallelFIR and test 2016-12-08 20:05:13 +08:00			`self.width = width`
			`self.parallelism = p = parallelism`
			`n = len(coefficients)`
fir: automatically use transposed topology 2016-12-15 02:15:50 +08:00			`# input and output: old to new, decreasing delay`
fir: add ParallelFIR and test 2016-12-08 20:05:13 +08:00			`self.i = [Signal((width, True)) for i in range(p)]`
			`self.o = [Signal((width, True)) for i in range(p)]`
fir: automatically use transposed topology 2016-12-15 02:15:50 +08:00			`self.latency = (n + 1)//2//p + 2`
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`w = _widths[arch]`
fir: add ParallelFIR and test 2016-12-08 20:05:13 +08:00
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`c_max = max(abs(c) for c in coefficients)`
			`c_shift = bits_for(floor((1 << w.B - 2) / c_max))`
			`self.coefficients = cs = [int(round(c*(1 << c_shift)))`
			`for c in coefficients]`
fir: check widths 2017-06-13 02:07:23 +08:00			`assert max(bits_for(c) for c in cs) <= w.B`
fir: add ParallelFIR and test 2016-12-08 20:05:13 +08:00
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`###`
fir: automatically use transposed topology 2016-12-15 02:15:50 +08:00
			`# Delay line: increasing delay`
dsp.fir: use pipelin-reset 2017-06-29 01:09:21 +08:00			`x = [Signal((w.A, True), reset_less=True) for _ in range(n + p - 1)]`
fir: check widths 2017-06-13 02:07:23 +08:00			`x_shift = w.A - width`
			`# reduce by pre-adder gain`
			`x_shift -= bits_for(max(cs.count(c) for c in cs if c) - 1)`
			`# TODO: reduce by P width limit?`
			`assert x_shift + width <= w.A`

			`assert sum(abs(c)*(1 << w.A - 1) for c in cs) <= (1 << w.P - 1) - 1`

fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`for xi, xj in zip(x, self.i[::-1]):`
			`self.sync += xi.eq(xj << x_shift)`
			`for xi, xj in zip(x[len(self.i):], x):`
			`self.sync += xi.eq(xj)`
fir: add ParallelFIR and test 2016-12-08 20:05:13 +08:00
fir: automatically use transposed topology 2016-12-15 02:15:50 +08:00			`for delay in range(p):`
dsp.fir: use pipelin-reset 2017-06-29 01:09:21 +08:00			`o = Signal((w.P, True), reset_less=True)`
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`self.comb += self.o[delay].eq(o >> c_shift + x_shift)`
fir: register multiplier output 2016-12-09 00:00:39 +08:00			`# Make products`
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`for i, c in enumerate(cs):`
fir: add ParallelFIR and test 2016-12-08 20:05:13 +08:00			`# simplify for halfband and symmetric filters`
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`if not c or c in cs[:i]:`
fir: add ParallelFIR and test 2016-12-08 20:05:13 +08:00			`continue`
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`js = [j + p - 1 for j, cj in enumerate(cs) if cj == c]`
fir: automatically use transposed topology 2016-12-15 02:15:50 +08:00			`m = Signal.like(o)`
			`o0, o = o, Signal.like(o)`
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`q = Signal.like(x[0])`
fir: automatically use transposed topology 2016-12-15 02:15:50 +08:00			`if delay + p <= js[0]:`
			`self.sync += o0.eq(o + m)`
			`delay += p`
			`else:`
			`self.comb += o0.eq(o + m)`
			`assert js[0] - delay >= 0`
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`self.comb += q.eq(reduce(add, [x[j - delay] for j in js]))`
			`self.sync += m.eq(c*q)`
fir: automatically use transposed topology 2016-12-15 02:15:50 +08:00			`# symmetric rounding`
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`if c_shift + x_shift > 1:`
			`self.comb += o.eq((1 << c_shift + x_shift - 1) - 1)`


			`class FIR(ParallelFIR):`
			`def __init__(self, args, *kwargs):`
			`super().__init__(self, args, parallelism=1, *kwargs)`
			`self.i = self.i[0]`
			`self.o = self.o[0]`
fir: add ParallelHBFCascade 2016-12-08 22:30:26 +08:00

			`def halfgen4_cascade(rate, width, order=None):`
			`"""Generate coefficients for cascaded half-band filters.`
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`Coefficients are normalized to a gain of two per stage to compensate for`
			`the zero stuffing.`
fir: add ParallelHBFCascade 2016-12-08 22:30:26 +08:00
			`:param rate: upsampling rate. power of two`
			`:param width: passband/stopband width in units of input sampling rate.`
			:param order: highest order, defaults to :param:`rate`"""
			`if order is None:`
			`order = rate`
			`coeff = []`
			`p = 1`
			`while p < rate:`
			`p *= 2`
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`coeff.append(2halfgen4(widthp/rate/2, order*p//rate))`
fir: add ParallelHBFCascade 2016-12-08 22:30:26 +08:00			`return coeff`


			`class ParallelHBFUpsampler(Module):`
			`"""Parallel, power-of-two, half-band, cascading upsampler.`

			`Coefficients should be normalized to overall gain of 2`
			`(highest/center coefficient being 1)."""`
			`def __init__(self, coefficients, width=16, **kwargs):`
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`self.parallelism = 1 # accumulate`
			`self.latency = 0 # accumulate`
fir: add ParallelHBFCascade 2016-12-08 22:30:26 +08:00			`self.width = width`
			`self.i = Signal((width, True))`

			`###`

			`i = [self.i]`
			`for coeff in coefficients:`
			`self.parallelism *= 2`
Revert "fir/ParallelHBFUpsampler: add headroom (gain=2)" This reverts commit 6ac9d0c41efba9d9a6f85754b8ec131a9e74f0cc. Overshooting behavior must to be handled outside the FIR. 2017-06-13 02:07:25 +08:00			`hbf = ParallelFIR(coeff, self.parallelism, width, **kwargs)`
fir: add ParallelHBFCascade 2016-12-08 22:30:26 +08:00			`self.submodules += hbf`
fir: simplify latency compensation Don't try to tweak out the last bit of latency by feeding the HBF input early. Instead feed it late so the interpolated samples are early and the latency is an even multiple of the super-sample cycle. 2017-06-29 01:13:43 +08:00			`self.comb += [a.eq(b) for a, b in zip(hbf.i[1::2], i)]`
fir: add ParallelHBFCascade 2016-12-08 22:30:26 +08:00			`i = hbf.o`
			`self.latency += hbf.latency`
			`self.o = i`