# artiq/artiq/gateware/dsp/fir.py

from math import floor
from operator import add
from functools import reduce
from collections import namedtuple

import numpy as np

from migen import *


def halfgen4(width, n, df=1e-3):
    """Compute half-band FIR coefficients by reweighted least squares.

    Adapted from http://recycle.lbl.gov/~ldoolitt/halfband

    The non-zero side taps are fitted so that a sum of odd-harmonic cosines
    approximates 0.5 over the fit band. After every fit, the points whose
    error is close to the current maximum get a larger weight and the fit
    is repeated (at most 40 times).

    :param width: passband/stopband width, as a fraction of the input
        sampling rate.
    :param n: order of the half-band filter to generate (number of
        distinct non-zero side taps).
    :param df: relative tolerance; the iteration stops once the
        peak-to-mean error ratio improves by less than ``df/2``
        (relative), and points with an error above ``(1 - df)`` times the
        peak error are re-weighted.
    :return: the full set of FIR coefficients, ``4*n - 1`` long, with a
        center tap of 0.5. Implement wisely.
    """
    num_points = n*40
    omega_max = 2*np.pi*width
    # Column vector of fit frequencies, denser towards omega_max.
    omega = (1 - np.linspace(0, 1, num_points)[:, None]**2)*omega_max

    odd_harmonics = np.arange(1, 2*n, 2)
    basis = np.cos(omega*odd_harmonics)
    target = .5*np.ones_like(omega)
    weight = np.ones_like(omega)

    previous_ratio = None
    for iteration in range(40):
        # Weighted least-squares solution for the side taps.
        taps = np.linalg.pinv(basis*weight)@(target*weight)
        err = np.fabs(basis@taps - .5)
        peak = np.max(err)
        ratio = peak/np.mean(err)
        if previous_ratio and \
                (previous_ratio - ratio)/(previous_ratio + ratio) < df/2:
            break
        previous_ratio = ratio
        # Emphasize the worst points; the boost shrinks with each iteration.
        weight[err > (1 - df)*peak] *= 1 + 1.5/(iteration + 11)

    # Interleave the fitted taps with zeros, then mirror around the
    # center tap and halve everything.
    one_side = np.c_[taps, np.zeros_like(taps)].ravel()[:-1]
    return np.r_[one_side[::-1], 1, one_side]/2


# Bit widths associated with a DSP architecture. In this file, `A` sizes the
# sample delay line, `B` bounds the quantized coefficients and `P` sizes the
# product/accumulator signals (presumably the DSP slice port widths).
_Widths = namedtuple("_Widths", ["A", "B", "P"])

# Supported architectures, keyed by name.
_widths = dict(
    DSP48E1=_Widths(A=25, B=18, P=48),
)


class ParallelFIR(Module):
    """Full-rate parallelized finite impulse response filter.

    Tries to use transposed form as much as possible.

    :param coefficients: tap coefficients (normalized to 1.),
        increasing delay.
    :param parallelism: number of samples per cycle.
    :param width: bit width of input and output.
    :param arch: architecture (default: "DSP48E1").
    :param cull_delays: collection of delays (taken modulo `parallelism`)
        whose delay-line taps are left out of the products; the rounding
        offset is also left out for output lanes whose index is in this
        collection. Presumably meant for inputs known to be zero (e.g.
        zero-stuffed samples) -- confirm against callers.

    Attributes set by the constructor: `i` and `o` (lists of `parallelism`
    signed signals), `latency`, `coefficients` (the integer-scaled taps),
    `width` and `parallelism`.
    """
    def __init__(self, coefficients, parallelism, width=16,
                 arch="DSP48E1", cull_delays=()):
        self.width = width
        self.parallelism = p = parallelism
        n = len(coefficients)
        # input and output: old to new, decreasing delay
        self.i = [Signal((width, True)) for i in range(p)]
        self.o = [Signal((width, True)) for i in range(p)]
        # NOTE(review): presumably counted in clock cycles -- confirm.
        self.latency = (n + 1)//2//p + 2
        w = _widths[arch]

        # Scale the coefficients to integers: c_shift is the bit count of
        # floor(2**(w.B - 2)/c_max), and every coefficient is multiplied by
        # 2**c_shift before rounding.
        c_max = max(abs(c) for c in coefficients)
        c_shift = bits_for(floor((1 << w.B - 2) / c_max))
        self.coefficients = cs = [int(round(c*(1 << c_shift)))
                                  for c in coefficients]
        # Every scaled coefficient must fit into w.B bits (as counted by
        # bits_for).
        assert max(bits_for(c) for c in cs) <= w.B
        # Bound on the accumulated sum (each tap times 2**(w.A - 1)); it
        # must fit into a signed w.P-bit value.
        max_out = sum(abs(c)*(1 << w.A - 1) for c in cs)
        assert max_out <= (1 << w.P - 1) - 1, (bits_for(max_out), w)

        ###

        # Delay line: increasing delay
        x = [Signal((w.A, True), reset_less=True) for _ in range(n + p - 1)]

        # The first p entries follow the inputs combinatorially, in reversed
        # order (newest sample first).
        for xi, xj in zip(x, self.i[::-1]):
            self.comb += xi.eq(xj)
        # Every further entry is a registered copy of the entry p positions
        # earlier.
        for xi, xj in zip(x[len(self.i):], x):
            self.sync += xi.eq(xj)

        # Build one adder chain per output lane.
        for delay in range(p):
            o = Signal((w.P, True), reset_less=True)
            # Registered output, scaled back down by the coefficient shift.
            self.sync += self.o[delay].eq(o >> c_shift)
            # Make products
            # `tap` is the delay-line offset for this lane; it grows by p
            # for every registered adder stage inserted below.
            tap = delay
            for i, c in enumerate(cs):
                # simplify for halfband and symmetric filters
                # (zero coefficients produce nothing; a repeated value was
                # already handled at its first occurrence via `js`)
                if not c or c in cs[:i]:
                    continue
                # Delay-line positions (offset by p - 1) of all taps that
                # share this coefficient value.
                js = [j + p - 1 for j, cj in enumerate(cs) if cj == c]
                m = Signal.like(o)
                # o0 is the current node of the chain; the new o feeds it
                # together with the product m.
                o0, o = o, Signal.like(o)
                q = Signal.like(x[0])
                if tap + p <= js[0]:
                    # Enough delay left: register this adder stage and take
                    # the following taps p positions earlier to compensate.
                    self.sync += o0.eq(o + m)
                    tap += p
                else:
                    self.comb += o0.eq(o + m)
                assert min(js) - tap >= 0
                # Drop taps whose delay (modulo p) is listed in cull_delays.
                js = [j for j in js
                        if (p - 1 - j - tap) % p not in cull_delays]
                if not js:
                    # Nothing left for this coefficient: no product is
                    # generated and m stays undriven.
                    continue
                # Sum the samples sharing this coefficient first, then
                # multiply once and register the product.
                self.comb += q.eq(reduce(add, [x[j - tap] for j in js]))
                self.sync += m.eq(c*q)
            # symmetric rounding
            # The last node of the chain gets the constant
            # 2**(c_shift - 1) - 1 (`-` binds tighter than `<<`).
            if c_shift > 1 and delay not in cull_delays:
                self.comb += o.eq((1 << c_shift - 1) - 1)


class FIR(ParallelFIR):
    """Finite impulse response filter processing one sample per cycle.

    Thin wrapper around :class:`ParallelFIR` with the parallelism fixed
    to 1. The `i` and `o` attributes are the single input and output
    signals instead of one-element lists.

    :param coefficients: tap coefficients, forwarded to
        :class:`ParallelFIR`.

    Any further positional or keyword arguments (`width`, `arch`,
    `cull_delays`) are forwarded to :class:`ParallelFIR` unchanged.
    """
    def __init__(self, coefficients, *args, **kwargs):
        # super() already binds `self`; passing it again would shift every
        # argument by one and make the parallelism collide with the first
        # real argument (TypeError on every instantiation).
        super().__init__(coefficients, 1, *args, **kwargs)
        # Unwrap the single-lane lists created by ParallelFIR.
        self.i = self.i[0]
        self.o = self.o[0]


def halfgen4_cascade(rate, width, order=None):
    """Compute the coefficient sets for a cascade of half-band filters.

    One set is produced per factor-of-two stage, first stage first. Each
    set is scaled to a gain of two to make up for the zero stuffing.

    :param rate: upsampling rate. power of two
    :param width: passband/stopband width in units of input sampling rate.
    :param order: highest order, defaults to :param:`rate`
    :return: list with one coefficient array per stage.
    """
    top_order = rate if order is None else order
    stages = []
    upsampling = 1
    while upsampling < rate:
        upsampling *= 2
        # Width and order of this stage, scaled by how far the cascade
        # has progressed towards the full rate.
        stage_width = width*upsampling/rate/2
        stage_order = top_order*upsampling//rate
        stages.append(2*halfgen4(stage_width, stage_order))
    return stages


class ParallelHBFUpsampler(Module):
    """Parallel, power-of-two, half-band, cascading upsampler.

    Chains one :class:`ParallelFIR` per coefficient set; every stage
    doubles the number of samples per cycle.

    Coefficients should be normalized to overall gain of 2
    (highest/center coefficient being 1).

    :param coefficients: iterable of coefficient sets, one per stage,
        first stage first.
    :param width: bit width of the input and of every stage.
    :param kwargs: forwarded to every :class:`ParallelFIR` stage.

    `i` is the single input signal and `o` the list of outputs of the last
    stage. `parallelism` and `latency` hold the totals over all stages.
    """
    def __init__(self, coefficients, width=16, **kwargs):
        self.width = width
        self.i = Signal((width, True))
        self.parallelism = 1  # doubled by every stage
        self.latency = 0  # summed over all stages

        ###

        samples = [self.i]
        for stage_coefficients in coefficients:
            self.parallelism *= 2
            stage = ParallelFIR(stage_coefficients, self.parallelism,
                                width=width, cull_delays={0}, **kwargs)
            self.submodules += stage
            # Only the odd-indexed stage inputs are driven; the
            # even-indexed ones are left unconnected.
            for dst, src in zip(stage.i[1::2], samples):
                self.comb += dst.eq(src)
            self.latency += stage.latency
            samples = stage.o
        self.o = samples
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`from math import floor`
gateware/dsp: add FIR and test 2016-12-08 02:14:23 +08:00			`from operator import add`
			`from functools import reduce`
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`from collections import namedtuple`

gateware/dsp: add FIR and test 2016-12-08 02:14:23 +08:00			`import numpy as np`
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00
gateware/dsp: add FIR and test 2016-12-08 02:14:23 +08:00			`from migen import *`


fir: cleanup halfgen4 2016-12-18 04:19:46 +08:00			`def halfgen4(width, n, df=1e-3):`
gateware/dsp: add FIR and test 2016-12-08 02:14:23 +08:00			`"""`
			`http://recycle.lbl.gov/~ldoolitt/halfband`

			`params:`
fir: add ParallelHBFCascade 2016-12-08 22:30:26 +08:00			* `up` is the passband/stopband width, as a fraction of
			`input sampling rate`
gateware/dsp: add FIR and test 2016-12-08 02:14:23 +08:00			* `n is the order of half-band filter to generate
			`returns:`
			* `a` is the full set of FIR coefficients, `4*n-1` long.
			`implement wisely.`
			`"""`

			`npt = n*40`
fir: add ParallelHBFCascade 2016-12-08 22:30:26 +08:00			`wmax = 2np.piwidth`
gateware/dsp: add FIR and test 2016-12-08 02:14:23 +08:00			`wfit = (1 - np.linspace(0, 1, npt)[:, None]*2)wmax`

			`target = .5*np.ones_like(wfit)`
			`basis = np.cos(wfitnp.arange(1, 2n, 2))`
			`weight = np.ones_like(wfit)`
fir: cleanup halfgen4 2016-12-18 04:19:46 +08:00
			`f0 = None`

gateware/dsp: add FIR and test 2016-12-08 02:14:23 +08:00			`for i in range(40):`
			`l = np.linalg.pinv(basisweight)@(targetweight)`
fir: cleanup halfgen4 2016-12-18 04:19:46 +08:00			`err = np.fabs(basis@l - .5)`
			`f = np.max(err)/np.mean(err)`
			`if f0 and (f0 - f)/(f0 + f) < df/2:`
			`break`
			`f0 = f`
			`weight[err > (1 - df)np.max(err)] = 1 + 1.5/(i + 11)`
gateware/dsp: add FIR and test 2016-12-08 02:14:23 +08:00			`a = np.c_[l, np.zeros_like(l)].ravel()[:-1]`
			`a = np.r_[a[::-1], 1, a]/2`
			`return a`


fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`_Widths = namedtuple("_Widths", "A B P")`
gateware/dsp: add FIR and test 2016-12-08 02:14:23 +08:00
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`_widths = {`
			`"DSP48E1": _Widths(25, 18, 48),`
			`}`
fir: add ParallelFIR and test 2016-12-08 20:05:13 +08:00

			`class ParallelFIR(Module):`
			`"""Full-rate parallelized finite impulse response filter.`

fir: automatically use transposed topology 2016-12-15 02:15:50 +08:00			`Tries to use transposed form as much as possible.`

fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`:param coefficients: tap coefficients (normalized to 1.),`
			`increasing delay.`
fir: add ParallelFIR and test 2016-12-08 20:05:13 +08:00			`:param parallelism: number of samples per cycle.`
			`:param width: bit width of input and output.`
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`:param arch: architecture (default: "DSP48E1").`
fir: add ParallelFIR and test 2016-12-08 20:05:13 +08:00			`"""`
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`def __init__(self, coefficients, parallelism, width=16,`
dsp.fir: actively cull zero delays 2017-06-29 17:24:56 +08:00			`arch="DSP48E1", cull_delays=()):`
fir: add ParallelFIR and test 2016-12-08 20:05:13 +08:00			`self.width = width`
			`self.parallelism = p = parallelism`
			`n = len(coefficients)`
fir: automatically use transposed topology 2016-12-15 02:15:50 +08:00			`# input and output: old to new, decreasing delay`
fir: add ParallelFIR and test 2016-12-08 20:05:13 +08:00			`self.i = [Signal((width, True)) for i in range(p)]`
			`self.o = [Signal((width, True)) for i in range(p)]`
dsp.fir: pipeline final systolic adder 2017-06-29 17:33:19 +08:00			`self.latency = (n + 1)//2//p + 2`
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`w = _widths[arch]`
fir: add ParallelFIR and test 2016-12-08 20:05:13 +08:00
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`c_max = max(abs(c) for c in coefficients)`
			`c_shift = bits_for(floor((1 << w.B - 2) / c_max))`
			`self.coefficients = cs = [int(round(c*(1 << c_shift)))`
			`for c in coefficients]`
fir: check widths 2017-06-13 02:07:23 +08:00			`assert max(bits_for(c) for c in cs) <= w.B`
dsp.fir: cleanup 2017-06-29 18:08:43 +08:00			`max_out = sum(abs(c)*(1 << w.A - 1) for c in cs)`
			`assert max_out <= (1 << w.P - 1) - 1, (bits_for(max_out), w)`
fir: add ParallelFIR and test 2016-12-08 20:05:13 +08:00
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`###`
fir: automatically use transposed topology 2016-12-15 02:15:50 +08:00
			`# Delay line: increasing delay`
dsp.fir: use pipelin-reset 2017-06-29 01:09:21 +08:00			`x = [Signal((w.A, True), reset_less=True) for _ in range(n + p - 1)]`
fir: check widths 2017-06-13 02:07:23 +08:00
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`for xi, xj in zip(x, self.i[::-1]):`
dsp.fir: drop x shift 2017-06-29 01:55:15 +08:00			`self.comb += xi.eq(xj)`
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`for xi, xj in zip(x[len(self.i):], x):`
			`self.sync += xi.eq(xj)`
fir: add ParallelFIR and test 2016-12-08 20:05:13 +08:00
fir: automatically use transposed topology 2016-12-15 02:15:50 +08:00			`for delay in range(p):`
dsp.fir: use pipelin-reset 2017-06-29 01:09:21 +08:00			`o = Signal((w.P, True), reset_less=True)`
dsp.fir: pipeline final systolic adder 2017-06-29 17:33:19 +08:00			`self.sync += self.o[delay].eq(o >> c_shift)`
fir: register multiplier output 2016-12-09 00:00:39 +08:00			`# Make products`
dsp.fir: actively cull zero delays 2017-06-29 17:24:56 +08:00			`tap = delay`
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`for i, c in enumerate(cs):`
fir: add ParallelFIR and test 2016-12-08 20:05:13 +08:00			`# simplify for halfband and symmetric filters`
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`if not c or c in cs[:i]:`
fir: add ParallelFIR and test 2016-12-08 20:05:13 +08:00			`continue`
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`js = [j + p - 1 for j, cj in enumerate(cs) if cj == c]`
fir: automatically use transposed topology 2016-12-15 02:15:50 +08:00			`m = Signal.like(o)`
			`o0, o = o, Signal.like(o)`
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`q = Signal.like(x[0])`
dsp.fir: actively cull zero delays 2017-06-29 17:24:56 +08:00			`if tap + p <= js[0]:`
fir: automatically use transposed topology 2016-12-15 02:15:50 +08:00			`self.sync += o0.eq(o + m)`
dsp.fir: actively cull zero delays 2017-06-29 17:24:56 +08:00			`tap += p`
fir: automatically use transposed topology 2016-12-15 02:15:50 +08:00			`else:`
			`self.comb += o0.eq(o + m)`
dsp.fir: actively cull zero delays 2017-06-29 17:24:56 +08:00			`assert min(js) - tap >= 0`
dsp.fir: cleanup 2017-06-29 18:08:43 +08:00			`js = [j for j in js`
			`if (p - 1 - j - tap) % p not in cull_delays]`
dsp.fir: actively cull zero delays 2017-06-29 17:24:56 +08:00			`if not js:`
			`continue`
			`self.comb += q.eq(reduce(add, [x[j - tap] for j in js]))`
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`self.sync += m.eq(c*q)`
fir: automatically use transposed topology 2016-12-15 02:15:50 +08:00			`# symmetric rounding`
dsp.fir: actively cull zero delays 2017-06-29 17:24:56 +08:00			`if c_shift > 1 and delay not in cull_delays:`
dsp.fir: drop x shift 2017-06-29 01:55:15 +08:00			`self.comb += o.eq((1 << c_shift - 1) - 1)`
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00

			`class FIR(ParallelFIR):`
			`def __init__(self, args, *kwargs):`
			`super().__init__(self, args, parallelism=1, *kwargs)`
			`self.i = self.i[0]`
			`self.o = self.o[0]`
fir: add ParallelHBFCascade 2016-12-08 22:30:26 +08:00

			`def halfgen4_cascade(rate, width, order=None):`
			`"""Generate coefficients for cascaded half-band filters.`
dsp.fir: cleanup 2017-06-29 18:08:43 +08:00
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`Coefficients are normalized to a gain of two per stage to compensate for`
			`the zero stuffing.`
fir: add ParallelHBFCascade 2016-12-08 22:30:26 +08:00
			`:param rate: upsampling rate. power of two`
			`:param width: passband/stopband width in units of input sampling rate.`
			:param order: highest order, defaults to :param:`rate`"""
			`if order is None:`
			`order = rate`
			`coeff = []`
			`p = 1`
			`while p < rate:`
			`p *= 2`
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`coeff.append(2halfgen4(widthp/rate/2, order*p//rate))`
fir: add ParallelHBFCascade 2016-12-08 22:30:26 +08:00			`return coeff`


			`class ParallelHBFUpsampler(Module):`
			`"""Parallel, power-of-two, half-band, cascading upsampler.`

			`Coefficients should be normalized to overall gain of 2`
			`(highest/center coefficient being 1)."""`
			`def __init__(self, coefficients, width=16, **kwargs):`
fir: streamline, optimize DSP extraction, left-align inputs 2016-12-21 04:39:51 +08:00			`self.parallelism = 1 # accumulate`
			`self.latency = 0 # accumulate`
fir: add ParallelHBFCascade 2016-12-08 22:30:26 +08:00			`self.width = width`
			`self.i = Signal((width, True))`

			`###`

			`i = [self.i]`
			`for coeff in coefficients:`
			`self.parallelism *= 2`
dsp.fir: actively cull zero delays 2017-06-29 17:24:56 +08:00			`hbf = ParallelFIR(coeff, self.parallelism, width=width,`
			`cull_delays={0}, **kwargs)`
fir: add ParallelHBFCascade 2016-12-08 22:30:26 +08:00			`self.submodules += hbf`
fir: simplify latency compensation Don't try to tweak out the last bit of latency by feeding the HBF input early. Instead feed it late so the interpolated samples are early and the latency is an even multiple of the super-sample cycle. 2017-06-29 01:13:43 +08:00			`self.comb += [a.eq(b) for a, b in zip(hbf.i[1::2], i)]`
fir: add ParallelHBFCascade 2016-12-08 22:30:26 +08:00			`i = hbf.o`
			`self.latency += hbf.latency`
			`self.o = i`