fir: streamline, optimize DSP extraction, left-align inputs

This commit is contained in:
Robert Jördens 2016-12-20 21:39:51 +01:00
parent cfb66117af
commit f5f662200b
3 changed files with 61 additions and 108 deletions

View File

@ -1,6 +1,10 @@
from math import floor
from operator import add from operator import add
from functools import reduce from functools import reduce
from collections import namedtuple
import numpy as np import numpy as np
from migen import * from migen import *
@ -40,56 +44,11 @@ def halfgen4(width, n, df=1e-3):
return a return a
class FIR(Module): _Widths = namedtuple("_Widths", "A B P")
"""Full-rate finite impulse response filter.
Tries to use transposed form (adder chain instead of adder tree) _widths = {
as much as possible. "DSP48E1": _Widths(25, 18, 48),
}
:param coefficients: integer taps, increasing delay.
:param width: bit width of input and output.
:param shift: scale factor (as power of two).
"""
def __init__(self, coefficients, width=16, shift=None):
self.width = width
self.i = Signal((width, True))
self.o = Signal((width, True))
n = len(coefficients)
self.latency = n//2 + 3
###
if shift is None:
shift = bits_for(sum(abs(c) for c in coefficients)) - 1
# Delay line: increasing delay
x = [Signal((width, True)) for _ in range(n)]
self.sync += [xi.eq(xj) for xi, xj in zip(x, [self.i] + x)]
o = Signal((width + shift + 1, True))
self.comb += self.o.eq(o >> shift)
delay = -1
# Make products
for i, c in enumerate(coefficients):
# simplify for halfband and symmetric filters
if not c or c in coefficients[:i]:
continue
js = [j for j, cj in enumerate(coefficients) if cj == c]
m = Signal.like(o)
o0, o = o, Signal.like(o)
if delay < js[0]:
self.sync += o0.eq(o + m)
delay += 1
else:
self.comb += o0.eq(o + m)
assert js[0] - delay >= 0
xs = [x[j - delay] for j in js]
s = Signal((bits_for(len(xs)) - 1 + len(xs[0]), True))
self.comb += s.eq(sum(xs))
self.sync += m.eq(c*s)
# symmetric rounding
if shift:
self.comb += o.eq((1 << shift - 1) - 1)
class ParallelFIR(Module): class ParallelFIR(Module):
@ -97,12 +56,14 @@ class ParallelFIR(Module):
Tries to use transposed form as much as possible. Tries to use transposed form as much as possible.
:param coefficients: integer taps, increasing delay. :param coefficients: tap coefficients (normalized to 1.),
increasing delay.
:param parallelism: number of samples per cycle. :param parallelism: number of samples per cycle.
:param width: bit width of input and output. :param width: bit width of input and output.
:param shift: scale factor (as power of two). :param arch: architecture (default: "DSP48E1").
""" """
def __init__(self, coefficients, parallelism, width=16, shift=None): def __init__(self, coefficients, parallelism, width=16,
arch="DSP48E1"):
self.width = width self.width = width
self.parallelism = p = parallelism self.parallelism = p = parallelism
n = len(coefficients) n = len(coefficients)
@ -111,45 +72,60 @@ class ParallelFIR(Module):
self.o = [Signal((width, True)) for i in range(p)] self.o = [Signal((width, True)) for i in range(p)]
self.latency = (n + 1)//2//p + 2 self.latency = (n + 1)//2//p + 2
# ... plus one sample # ... plus one sample
w = _widths[arch]
c_max = max(abs(c) for c in coefficients)
c_shift = bits_for(floor((1 << w.B - 2) / c_max))
self.coefficients = cs = [int(round(c*(1 << c_shift)))
for c in coefficients]
### ###
if shift is None:
shift = bits_for(sum(abs(c) for c in coefficients)) - 1
# Delay line: increasing delay # Delay line: increasing delay
x = [Signal((width, True)) for _ in range(n + p - 1)] x = [Signal((w.A, True)) for _ in range(n + p - 1)]
self.sync += [xi.eq(xj) for xi, xj in zip(x, self.i[::-1] + x)] x_shift = w.A - width - bits_for(
max(cs.count(c) for c in cs if c) - 1)
for xi, xj in zip(x, self.i[::-1]):
self.sync += xi.eq(xj << x_shift)
for xi, xj in zip(x[len(self.i):], x):
self.sync += xi.eq(xj)
for delay in range(p): for delay in range(p):
o = Signal((width + shift + 1, True)) o = Signal((w.P, True))
self.comb += self.o[delay].eq(o >> shift) self.comb += self.o[delay].eq(o >> c_shift + x_shift)
# Make products # Make products
for i, c in enumerate(coefficients): for i, c in enumerate(cs):
# simplify for halfband and symmetric filters # simplify for halfband and symmetric filters
if not c or c in coefficients[:i]: if not c or c in cs[:i]:
continue continue
js = [j + p - 1 for j, cj in enumerate(coefficients) js = [j + p - 1 for j, cj in enumerate(cs) if cj == c]
if cj == c]
m = Signal.like(o) m = Signal.like(o)
o0, o = o, Signal.like(o) o0, o = o, Signal.like(o)
q = Signal.like(x[0])
if delay + p <= js[0]: if delay + p <= js[0]:
self.sync += o0.eq(o + m) self.sync += o0.eq(o + m)
delay += p delay += p
else: else:
self.comb += o0.eq(o + m) self.comb += o0.eq(o + m)
assert js[0] - delay >= 0 assert js[0] - delay >= 0
xs = [x[j - delay] for j in js] self.comb += q.eq(reduce(add, [x[j - delay] for j in js]))
s = Signal((bits_for(len(xs)) - 1 + len(xs[0]), True)) self.sync += m.eq(c*q)
self.comb += s.eq(sum(xs))
self.sync += m.eq(c*s)
# symmetric rounding # symmetric rounding
if shift: if c_shift + x_shift > 1:
self.comb += o.eq((1 << shift - 1) - 1) self.comb += o.eq((1 << c_shift + x_shift - 1) - 1)
class FIR(ParallelFIR):
def __init__(self, *args, **kwargs):
super().__init__(self, *args, parallelism=1, **kwargs)
self.i = self.i[0]
self.o = self.o[0]
def halfgen4_cascade(rate, width, order=None): def halfgen4_cascade(rate, width, order=None):
"""Generate coefficients for cascaded half-band filters. """Generate coefficients for cascaded half-band filters.
Coefficients are normalized to a gain of two per stage to compensate for
the zero stuffing.
:param rate: upsampling rate. power of two :param rate: upsampling rate. power of two
:param width: passband/stopband width in units of input sampling rate. :param width: passband/stopband width in units of input sampling rate.
@ -160,7 +136,7 @@ def halfgen4_cascade(rate, width, order=None):
p = 1 p = 1
while p < rate: while p < rate:
p *= 2 p *= 2
coeff.append(halfgen4(width*p/rate/2, order*p//rate)) coeff.append(2*halfgen4(width*p/rate/2, order*p//rate))
return coeff return coeff
@ -170,8 +146,8 @@ class ParallelHBFUpsampler(Module):
Coefficients should be normalized to overall gain of 2 Coefficients should be normalized to overall gain of 2
(highest/center coefficient being 1).""" (highest/center coefficient being 1)."""
def __init__(self, coefficients, width=16, **kwargs): def __init__(self, coefficients, width=16, **kwargs):
self.parallelism = 1 self.parallelism = 1 # accumulate
self.latency = 0 self.latency = 0 # accumulate
self.width = width self.width = width
self.i = Signal((width, True)) self.i = Signal((width, True))
@ -180,7 +156,6 @@ class ParallelHBFUpsampler(Module):
i = [self.i] i = [self.i]
for coeff in coefficients: for coeff in coefficients:
self.parallelism *= 2 self.parallelism *= 2
# assert coeff[len(coeff)//2 + 1] == 1
hbf = ParallelFIR(coeff, self.parallelism, width, **kwargs) hbf = ParallelFIR(coeff, self.parallelism, width, **kwargs)
self.submodules += hbf self.submodules += hbf
self.comb += [a.eq(b) for a, b in zip(hbf.i[::2], i)] self.comb += [a.eq(b) for a, b in zip(hbf.i[::2], i)]

View File

@ -128,10 +128,8 @@ class Channel(Module, SatAddMixin):
self.submodules.a1 = a1 = SplineParallelDDS(widths, orders) self.submodules.a1 = a1 = SplineParallelDDS(widths, orders)
self.submodules.a2 = a2 = SplineParallelDDS(widths, orders) self.submodules.a2 = a2 = SplineParallelDDS(widths, orders)
coeff = [[int(round((1 << 18)*ci)) for ci in c] coeff = halfgen4_cascade(parallelism, width=.4, order=8)
for c in halfgen4_cascade(parallelism, width=.4, order=8)] hbf = [ParallelHBFUpsampler(coeff, width=width) for i in range(2)]
hbf = [ParallelHBFUpsampler(coeff, width=width, shift=17)
for i in range(2)]
self.submodules.b = b = SplineParallelDUC( self.submodules.b = b = SplineParallelDUC(
widths._replace(a=len(hbf[0].o[0]), f=widths.f - width), orders, widths._replace(a=len(hbf[0].o[0]), f=widths.f - width), orders,
parallelism=parallelism) parallelism=parallelism)

View File

@ -11,16 +11,16 @@ class Transfer(Module):
self.submodules.dut = dut self.submodules.dut = dut
def drive(self, x): def drive(self, x):
for xi in x: for xi in x.reshape(-1, self.dut.parallelism):
yield self.dut.i.eq(int(xi)) yield [ij.eq(int(xj)) for ij, xj in zip(self.dut.i, xi)]
yield yield
def record(self, y): def record(self, y):
for i in range(self.dut.latency): for i in range(self.dut.latency):
yield yield
for i in range(len(y)): for yi in y.reshape(-1, self.dut.parallelism):
yield yield
y[i] = (yield self.dut.o) yi[:] = (yield from [(yield o) for o in self.dut.o])
def run(self, samples, amplitude=1.): def run(self, samples, amplitude=1.):
w = 2**(self.dut.width - 1) - 1 w = 2**(self.dut.width - 1) - 1
@ -63,21 +63,7 @@ class Transfer(Module):
return fig return fig
class ParallelTransfer(Transfer): class UpTransfer(Transfer):
def drive(self, x):
for xi in x.reshape(-1, self.dut.parallelism):
yield [ij.eq(int(xj)) for ij, xj in zip(self.dut.i, xi)]
yield
def record(self, y):
for i in range(self.dut.latency):
yield
for yi in y.reshape(-1, self.dut.parallelism):
yield
yi[:] = (yield from [(yield o) for o in self.dut.o])
class UpTransfer(ParallelTransfer):
def drive(self, x): def drive(self, x):
x = x.reshape(-1, len(self.dut.o)) x = x.reshape(-1, len(self.dut.o))
x[:, 1:] = 0 x[:, 1:] = 0
@ -94,21 +80,15 @@ class UpTransfer(ParallelTransfer):
def _main(): def _main():
coeff = fir.halfgen4(.4/2, 8) if True:
coeff_int = [int(round(c * (1 << 16 - 1))) for c in coeff] coeff = fir.halfgen4_cascade(8, width=.4, order=8)
if False: dut = fir.ParallelHBFUpsampler(coeff, width=16)
coeff = [[int(round((1 << 19) * ci)) for ci in c]
for c in fir.halfgen4_cascade(8, width=.4, order=8)]
dut = fir.ParallelHBFUpsampler(coeff, width=16, shift=18)
# print(verilog.convert(dut, ios=set([dut.i] + dut.o))) # print(verilog.convert(dut, ios=set([dut.i] + dut.o)))
tb = UpTransfer(dut) tb = UpTransfer(dut)
elif True:
dut = fir.ParallelFIR(coeff_int, parallelism=4, width=16)
# print(verilog.convert(dut, ios=set(dut.i + dut.o)))
tb = ParallelTransfer(dut)
else: else:
dut = fir.FIR(coeff_int, width=16) coeff = fir.halfgen4(.4/2, 8)
# print(verilog.convert(dut, ios={dut.i, dut.o})) dut = fir.ParallelFIR(coeff, parallelism=4, width=16)
# print(verilog.convert(dut, ios=set(dut.i + dut.o)))
tb = Transfer(dut) tb = Transfer(dut)
x, y = tb.run(samples=1 << 10, amplitude=.5) x, y = tb.run(samples=1 << 10, amplitude=.5)