cxp: add PHY and pipeline

testing: add loopback tx for rx testing
testing: add trigger, trigger ack for testing
cxp: add upconn & downconn phy
cxp: add upconn & downconn pipeline
cxp: add rtlink
cxp: add test packet & error counter CSR
cxp: fix ch1 rx mem cannot be read
cxp: add frame buffer to use KiB instead of KB
cxp: add heartbeat & hostid csr
This commit is contained in:
morgan 2024-06-14 17:17:56 +08:00
parent b51c5efdd4
commit 77e7e51982

602
src/gateware/cxp.py Normal file
View File

@ -0,0 +1,602 @@
from migen import *
from migen.genlib.cdc import MultiReg, PulseSynchronizer
from misoc.interconnect.csr import *
from misoc.interconnect.stream import StrideConverter
from artiq.gateware.rtio import rtlink
from cxp_downconn import CXP_RXPHYs
from cxp_upconn import CXP_TXPHYs
from cxp_pipeline import (
Command_Packet_Reader,
Command_Test_Packet_Writer,
Duplicated_Char_Decoder,
Heartbeat_Packet_Reader,
Idle_Word_Inserter,
Packet_Arbiter,
Packet_Wrapper,
Test_Sequence_Checker,
Trigger_ACK_Inserter,
Trigger_ACK_Reader,
Trigger_Inserter,
Trigger_Reader,
)
from cxp_frame_pipeline import *
import cxp_router
from types import SimpleNamespace
class CXP_PHYS(Module, AutoCSR):
def __init__(self, refclk, upconn_pads, downconn_pads, sys_clk_freq, master=0):
assert len(upconn_pads) == len(downconn_pads)
self.submodules.tx = CXP_TXPHYs(upconn_pads, sys_clk_freq)
self.submodules.rx = CXP_RXPHYs(refclk, downconn_pads, sys_clk_freq, master)
self.phys = []
for tx, rx in zip(self.tx.phys, self.rx.phys):
phy = SimpleNamespace()
phy.tx, phy.rx = tx, rx
self.phys.append(phy)
class CXP_Core(Module, AutoCSR):
def __init__(self, phy, command_buffer_depth=32, nrxslot=4):
# control buffer is only 32 words (128 bytes) for compatibility with version1.x compliant Devices
# Section 12.1.6 (CXP-001-2021)
self.buffer_depth, self.nslots = command_buffer_depth, nrxslot
self.submodules.tx = TX_Pipeline(phy.tx, command_buffer_depth, False)
self.submodules.rx = RX_Pipeline(phy.rx, command_buffer_depth, nrxslot, False)
def get_tx_port(self):
return self.tx.writer.mem.get_port(write_capable=True)
def get_rx_port(self):
return self.rx.command_reader.mem.get_port(write_capable=False)
def get_mem_size(self):
return word_width * self.buffer_depth * self.nslots // 8
class RX_Pipeline(Module, AutoCSR):
def __init__(self, phy, command_buffer_depth, nslot, with_trigger):
self.ready = CSRStatus()
self.trigger_ack = CSR()
self.pending_packet = CSR()
self.read_ptr = CSRStatus(log2_int(nslot))
self.reader_buffer_err = CSR()
self.reader_decode_err = CSR()
self.test_error_counter = CSRStatus(16)
self.test_packet_counter = CSRStatus(16)
self.test_counts_reset = CSR()
self.heartbeat = CSR()
self.host_id = CSRStatus(32)
self.device_time = CSRStatus(64)
# for multilane router
self.active = Signal()
if with_trigger:
self.trig = Signal()
self.trig_delay = Signal(char_width)
self.trig_linktrigger_n = Signal(char_width)
# # #
gtx = phy.gtx
self.sync += [
self.ready.status.eq(gtx.rx_ready),
self.active.eq(gtx.rx_ready),
]
# Host rx pipeline
#
# 32 32+8(dchar)
# PHY ───/───> dchar ─────/─────> trigger ─────> trigger ack ─────> packet parser ─────> EOP Marker ─────> stream data packet
# decoder reader reader │ │ │ with CRC
# (optional) │ │ └──────> test sequence checker
# │ │
# │ └─────────> heartbeat packet reader
# │
# └────────────> command packet reader
#
cdr = ClockDomainsRenamer("cxp_gtx_rx")
# decode all incoming data as duplicate char and inject the result into the bus for downstream modules
self.submodules.dchar_decoder = dchar_decoder = cdr(Duplicated_Char_Decoder())
# Priority level 0 packet - Trigger packet
if with_trigger:
self.submodules.trig_reader = trig_reader = cdr(Trigger_Reader())
self.sync.cxp_gtx_rx += [
self.trig.eq(trig_reader.trig),
self.trig_delay.eq(trig_reader.delay),
self.trig_linktrigger_n.eq(trig_reader.linktrigger_n),
]
# Priority level 1 packet - Trigger ack packet
self.submodules.trig_ack_reader= trig_ack_reader = cdr(Trigger_ACK_Reader())
self.submodules.trig_ack_ps = trig_ack_ps = PulseSynchronizer("cxp_gtx_rx", "sys")
self.sync.cxp_gtx_rx += trig_ack_ps.i.eq(trig_ack_reader.ack)
self.sync += [
If(trig_ack_ps.o,
self.trigger_ack.w.eq(1),
).Elif(self.trigger_ack.re,
self.trigger_ack.w.eq(0),
),
]
# Priority level 2 packet - stream, test, heartbeat and command packets
self.submodules.arbiter = arbiter = cdr(Packet_Arbiter())
self.submodules.decode_err_ps = decode_err_ps = PulseSynchronizer("cxp_gtx_rx", "sys")
self.sync.cxp_gtx_rx += decode_err_ps.i.eq(arbiter.decode_err)
self.sync += [
If(decode_err_ps.o,
self.reader_decode_err.w.eq(1),
).Elif(self.reader_decode_err.re,
self.reader_decode_err.w.eq(0),
),
]
if with_trigger:
rx_pipeline = [phy, dchar_decoder, trig_reader, trig_ack_reader, arbiter]
else:
rx_pipeline = [phy, dchar_decoder, trig_ack_reader, arbiter]
for s, d in zip(rx_pipeline, rx_pipeline[1:]):
self.comb += s.source.connect(d.sink)
# Stream packet
# Drop the K29.7 and mark the EOP for downstream
self.submodules.eop_marker = eop_marker = cdr(EOP_Marker())
self.comb += arbiter.source_stream.connect(eop_marker.sink)
# set pipeline source to output stream packet
self.source = eop_marker.source
# Test packet
self.submodules.test_seq_checker = test_seq_checker = cdr(Test_Sequence_Checker())
self.comb += arbiter.source_test.connect(test_seq_checker.sink)
self.submodules.test_reset_ps = test_reset_ps = PulseSynchronizer("sys", "cxp_gtx_rx")
self.comb += test_reset_ps.i.eq(self.test_counts_reset.re),
test_err_cnt_rx = Signal.like(self.test_error_counter.status)
test_pak_cnt_rx = Signal.like(self.test_packet_counter.status)
test_err_r, test_pak_r = Signal(), Signal()
self.sync.cxp_gtx_rx += [
test_err_r.eq(test_seq_checker.error),
test_pak_r.eq(arbiter.recv_test_pak),
If(test_reset_ps.o,
test_err_cnt_rx.eq(test_err_cnt_rx.reset),
).Elif(test_err_r,
test_err_cnt_rx.eq(test_err_cnt_rx + 1),
),
If(test_reset_ps.o,
test_pak_cnt_rx.eq(test_pak_cnt_rx.reset),
).Elif(test_pak_r,
test_pak_cnt_rx.eq(test_pak_cnt_rx + 1),
),
]
self.specials += [
MultiReg(test_err_cnt_rx, self.test_error_counter.status),
MultiReg(test_pak_cnt_rx, self.test_packet_counter.status),
]
# Command packet
self.submodules.command_reader = command_reader = cdr(Command_Packet_Reader(command_buffer_depth, nslot))
self.comb += arbiter.source_command.connect(command_reader.sink)
# nslot buffers control interface
write_ptr_sys = Signal.like(command_reader.write_ptr)
self.specials += [
MultiReg(self.read_ptr.status, command_reader.read_ptr, odomain="cxp_gtx_rx"),
MultiReg(command_reader.write_ptr, write_ptr_sys)
]
self.sync += [
self.pending_packet.w.eq(self.read_ptr.status != write_ptr_sys),
If(~gtx.rx_ready,
self.read_ptr.status.eq(0),
).Elif(self.pending_packet.re & self.pending_packet.w,
self.read_ptr.status.eq(self.read_ptr.status + 1),
)
]
self.submodules.buffer_err_ps = buffer_err_ps = PulseSynchronizer("cxp_gtx_rx", "sys")
self.sync.cxp_gtx_rx += buffer_err_ps.i.eq(command_reader.buffer_err),
self.sync += [
If(buffer_err_ps.o,
self.reader_buffer_err.w.eq(1),
).Elif(self.reader_buffer_err.re,
self.reader_buffer_err.w.eq(0),
),
]
# Heartbeat packet
self.submodules.heartbeat_reader = heartbeat_reader = cdr(Heartbeat_Packet_Reader())
self.comb += arbiter.source_heartbeat.connect(heartbeat_reader.sink)
self.specials += [
MultiReg(heartbeat_reader.host_id, self.host_id.status),
MultiReg(heartbeat_reader.heartbeat, self.device_time.status),
]
self.submodules.heartbeat_ps = heartbeat_ps = PulseSynchronizer("cxp_gtx_rx", "sys")
self.sync.cxp_gtx_rx += heartbeat_ps.i.eq(arbiter.recv_heartbeat)
self.sync += [
If(heartbeat_ps.o,
self.heartbeat.w.eq(1),
).Elif(self.heartbeat.re,
self.heartbeat.w.eq(0),
),
]
class TX_Pipeline(Module, AutoCSR):
def __init__(self, phy, command_buffer_depth, with_trigger_ack):
self.trig_stb = Signal()
self.trig_delay = Signal(char_width)
self.trig_linktrigger_mode = Signal()
if with_trigger_ack:
self.trig_ack_stb = Signal()
# # #
# Host tx pipeline
#
# 32 32 8
# command/test ───/───> packet ─────> idle word ─────> trigger ack ───/───> conv ───/───> trigger ─────> PHY
# packet writer wrapper inserter inserter inserter
# (optional)
#
# Equivalent transmission priority:
# trigger > tigger ack > idle word > command/test packet
#
# The pipeline is splited into 32 and 8 bits section to handle the word and char boundary priority insertion requirement:
# Insertion @ char boundary: trigger packets
# Insertion @ word boundary: idle packets and trigger ack packet
# - Section 9.2.4 (CXP-001-2021)
#
# The idle inserter is placed between the trigger ack inserter and command/test packet writer to maintain the trigger performance,
# as idle word should not be inserted into trigger and trigger ack packet - Section 9.2.5.1 (CXP-001-2021)
#
# Priority level 0 packet - Trigger packet
self.submodules.trig = trig = Trigger_Inserter()
self.comb += [
trig.stb.eq(self.trig_stb),
trig.delay.eq(self.trig_delay),
trig.linktrig_mode.eq(self.trig_linktrigger_mode)
]
# Priority level 1 packet - Trigger ack
if with_trigger_ack:
self.submodules.trig_ack = trig_ack = Trigger_ACK_Inserter()
self.comb += self.trig_ack_stb.eq(trig_ack.stb)
# Priority level 2 packet - command and test packet
# Control is not timing dependent, all the data packets are handled in firmware
self.submodules.writer = writer = Command_Test_Packet_Writer(command_buffer_depth)
# writer memory control interface
self.writer_word_len = CSRStorage(log2_int(command_buffer_depth))
self.writer_stb = CSR()
self.writer_stb_testseq = CSR()
self.writer_busy = CSRStatus()
self.sync += [
writer.word_len.eq(self.writer_word_len.storage),
writer.stb.eq(self.writer_stb.re),
writer.stb_testseq.eq(self.writer_stb_testseq.re),
self.writer_busy.status.eq(writer.busy),
]
# Misc
self.submodules.pak_wrp = pak_wrp = Packet_Wrapper()
self.submodules.idle = idle = Idle_Word_Inserter()
self.submodules.converter = converter = StrideConverter(word_layout, char_layout)
if with_trigger_ack:
tx_pipeline = [writer, pak_wrp, idle, trig_ack, converter, trig, phy]
else:
tx_pipeline = [writer, pak_wrp, idle, converter, trig, phy]
for s, d in zip(tx_pipeline, tx_pipeline[1:]):
self.comb += s.source.connect(d.sink)
class CXP_Frame_Pipeline(Module, AutoCSR):
# optimal stream packet size is 2 KiB - Section 9.5.2 (CXP-001-2021)
def __init__(self, pipelines, pmod_pads, roi_engine_count=1, res_width=16, count_width=31, master=0, packet_size=16384):
n_channels = len(pipelines)
assert n_channels > 0
assert count_width <= 31
# Trigger rtio
nbit_trigdelay = 8
nbit_linktrig = 1
self.trigger = rtlink.Interface(
rtlink.OInterface(nbit_trigdelay + nbit_linktrig),
rtlink.IInterface(word_width, timestamped=False)
)
self.sync.rio += [
If(self.trigger.o.stb,
pipelines[master].tx.trig_delay.eq(self.trigger.o.data[nbit_linktrig:]),
pipelines[master].tx.trig_linktrigger_mode.eq(self.trigger.o.data[:nbit_linktrig]),
),
pipelines[master].tx.trig_stb.eq(self.trigger.o.stb),
]
# ROI rtio
# 4 cfg (x0, y0, x1, y1) per roi_engine
self.config = rtlink.Interface(rtlink.OInterface(res_width, bits_for(4*roi_engine_count-1)))
# select which roi engine can output rtio_input signal
self.gate_data = rtlink.Interface(
rtlink.OInterface(roi_engine_count),
# the 32th bits is for sentinel (gate detection)
rtlink.IInterface(count_width+1, timestamped=False)
)
self.crc_error_cnt = CSRStatus(16)
self.crc_error_reset = CSR()
# DEBUG: csr
self.arbiter_active_ch = CSRStatus(n_channels)
self.roi_counter = CSRStatus(count_width)
self.roi_update = CSR()
self.pix_y = CSRStatus(res_width)
self.header_l_size = CSRStatus(3*char_width)
self.header_x_size = CSRStatus(3*char_width)
self.header_y_size = CSRStatus(3*char_width)
self.header_new_line = CSRStatus(3*char_width)
# # #
cdr = ClockDomainsRenamer("cxp_gtx_rx")
debug_out = False
if not debug_out:
self.submodules.pixel_pipeline = pixel_pipeline = cdr(Pixel_Pipeline(res_width, count_width, packet_size))
# CRC error counter
self.submodules.crc_reset_ps = crc_reset_ps = PulseSynchronizer("sys", "cxp_gtx_rx")
self.comb += crc_reset_ps.i.eq(self.crc_error_reset.re)
crc_error_cnt_rx = Signal.like(self.crc_error_cnt.status)
crc_error_r = Signal()
self.sync.cxp_gtx_rx += [
# to improve timinig
crc_error_r.eq(pixel_pipeline.crc_checker.error),
If(crc_reset_ps.o,
crc_error_cnt_rx.eq(crc_error_cnt_rx.reset),
).Elif(crc_error_r,
crc_error_cnt_rx.eq(crc_error_cnt_rx + 1),
),
]
self.specials += MultiReg(crc_error_cnt_rx, self.crc_error_cnt.status)
# RTIO interface
n = 0
cfg = pixel_pipeline.roi.cfg
for offset, target in enumerate([cfg.x0, cfg.y0, cfg.x1, cfg.y1]):
roi_boundary = Signal.like(target)
self.sync.rio += If(self.config.o.stb & (self.config.o.address == 4*n+offset),
roi_boundary.eq(self.config.o.data))
self.specials += MultiReg(roi_boundary, target, "cxp_gtx_rx")
roi_out = pixel_pipeline.roi.out
update = Signal()
self.submodules.ps = ps = PulseSynchronizer("cxp_gtx_rx", "sys")
self.sync.cxp_gtx_rx += ps.i.eq(roi_out.update)
self.sync += update.eq(ps.o)
sentinel = 2**count_width
count_sys = Signal.like(roi_out.count)
self.specials += MultiReg(roi_out.count, count_sys),
self.sync.rio += [
# TODO: add gating
self.gate_data.i.stb.eq(update),
# without the slice, unspecified bits will be 1 for some reason
# i.e. data[count_wdith:] = 0b111111... when using data.eq(count_sys)
self.gate_data.i.data[:count_width].eq(count_sys),
]
# TODO: fix count_sys seems like end of frame is broken
# BUG: it maybe related to the KCode bug that only happens after frameheader decoder COPY (i.e. when sending pixel data)
# DEBUG:
new_line_cnt_rx, new_line_cnt_sys = Signal(3*char_width), Signal(3*char_width)
l_size_rx, l_size_sys = Signal(3*char_width), Signal(3*char_width)
x_size_rx, x_size_sys = Signal(3*char_width), Signal(3*char_width)
y_size_rx, y_size_sys = Signal(3*char_width), Signal(3*char_width)
y_pix_rx, y_pix_sys = Signal(res_width), Signal(res_width)
self.sync.cxp_gtx_rx += [
If(pixel_pipeline.header_decoder.new_line,
new_line_cnt_rx.eq(new_line_cnt_rx + 1),
),
l_size_rx.eq(pixel_pipeline.header_decoder.metadata.l_size),
x_size_rx.eq(pixel_pipeline.header_decoder.metadata.x_size),
y_size_rx.eq(pixel_pipeline.header_decoder.metadata.y_size),
y_pix_rx.eq(pixel_pipeline.parser.pixel4x[0].y),
]
self.specials += [
MultiReg(new_line_cnt_rx, new_line_cnt_sys),
MultiReg(l_size_rx, l_size_sys),
MultiReg(x_size_rx, x_size_sys),
MultiReg(y_size_rx, y_size_sys),
MultiReg(y_pix_rx, y_pix_sys),
]
self.sync += [
self.header_new_line.status.eq(new_line_cnt_sys),
self.pix_y.status.eq(y_pix_sys),
self.header_l_size.status.eq(l_size_sys),
self.header_x_size.status.eq(x_size_sys),
self.header_y_size.status.eq(y_size_sys),
self.roi_counter.status.eq(count_sys),
If(update,
self.roi_update.w.eq(1),
).Elif(self.roi_update.re,
self.roi_update.w.eq(0),
),
]
else:
# DEBUG:
crc_checker = cdr(CXPCRC32_Checker())
# TODO: handle full buffer gracefully
# TODO: investigate why there is a heartbeat message in the middle of the frame with k27.7 code too???
# NOTE: sometimes there are 0xFBFBFBFB K=0b1111
# perhaps the buffer is full overflowing and doing strange stuff
# it should be mem block not "cycle buffer"
# self.submodules.dropper = dropper = cdr(DChar_Dropper())
buffer = cdr(Buffer(word_layout_dchar)) # crcchecker timinig is bad
buffer_cdc_fifo = cdr(Buffer(word_layout_dchar)) # to improve timing
cdc_fifo = stream.AsyncFIFO(word_layout_dchar, 2**log2_int(packet_size//word_width))
self.submodules += buffer, crc_checker, buffer_cdc_fifo
self.submodules += ClockDomainsRenamer({"write": "cxp_gtx_rx", "read": "sys"})(cdc_fifo)
self.submodules.debug_out = debug_out = RX_Debug_Buffer(word_layout_dchar, 2**log2_int(packet_size//word_width))
pipeline = [buffer, crc_checker, buffer_cdc_fifo, cdc_fifo, debug_out]
for s, d in zip(pipeline, pipeline[1:]):
self.comb += s.source.connect(d.sink)
# Routing WIP
# +---------+ +-------------+
# downconn pipline ----->| | | |------> crc checker ------> raw stream data
# | arbiter |---->| broadcaster |
# downconn pipline ----->| | | |------> crc checker ------> raw stream data
# +---------+ +-------------+
#
self.submodules.arbiter = arbiter = cdr(Stream_Arbiter(n_channels))
self.submodules.broadcaster = broadcaster = cdr(Stream_Broadcaster())
# Connect pipeline
for i, p in enumerate(pipelines):
# Assume downconns pipeline already marks the eop
self.comb += p.rx.source.connect(arbiter.sinks[i])
self.comb += arbiter.source.connect(broadcaster.sink)
if not debug_out:
self.comb += broadcaster.sources[0].connect(pixel_pipeline.sink),
else:
self.comb += broadcaster.sources[0].connect(pipeline[0].sink),
# Control interface
# only the simple topology MASTER:ch0, extension:ch1,2,3 is supported right now
active_channels_sys = Signal(n_channels)
for i, p in enumerate(pipelines):
# TODO: change this to non csr signal?
self.sync += active_channels_sys[i].eq(p.rx.active)
self.specials += MultiReg(active_channels_sys, arbiter.active_channels, odomain="cxp_gtx_rx"),
# DEBUG:
self.sync += self.arbiter_active_ch.status.eq(active_channels_sys)
for i, p in enumerate(pipelines):
# self.comb += p.rx.source.ack.eq(1)
rx_stb = Signal()
self.sync.cxp_gtx_rx += rx_stb.eq(p.rx.source.stb)
self.specials += [
# Instance("OBUF", i_I=rx_stb, o_O=pmod_pads[i]),
# Instance("OBUF", i_I=arbiter.sinks[i].stb, o_O=pmod_pads[i]),
]
class NEO_CXP_Frame_pipeline(Module):
# optimal stream packet size is 2 KiB - Section 9.5.2 (CXP-001-2021)
def __init__(self, pipelines, pmod_pads, roi_engine_count=1, res_width=16, count_width=31, master=0, packet_size=16384):
n_channels = len(pipelines)
assert n_channels > 0
assert count_width <= 31
# Trigger rtio
nbit_trigdelay = 8
nbit_linktrig = 1
self.trigger = rtlink.Interface(
rtlink.OInterface(nbit_trigdelay + nbit_linktrig),
rtlink.IInterface(word_width, timestamped=False)
)
self.sync.rio += [
If(self.trigger.o.stb,
pipelines[master].tx.trig.delay.eq(self.trigger.o.data[nbit_linktrig:]),
pipelines[master].tx.trig.linktrig_mode.eq(self.trigger.o.data[:nbit_linktrig]),
),
pipelines[master].tx.trig.stb.eq(self.trigger.o.stb),
]
# ROI rtio
# 4 cfg (x0, y0, x1, y1) per roi_engine
self.config = rtlink.Interface(rtlink.OInterface(res_width, bits_for(4*roi_engine_count-1)))
# select which roi engine can output rtio_input signal
self.gate_data = rtlink.Interface(
rtlink.OInterface(roi_engine_count),
# the 32th bits is for sentinel (gate detection)
rtlink.IInterface(count_width+1, timestamped=False)
)
# # #
cdr = ClockDomainsRenamer("cxp_gtx_rx")
debug_out = False
if debug_out:
pass
#
# downconn pipline -----> router -----> arbiter ------> crc checker ------> 4x converters
#
# Connect pipeline
for i, p in enumerate(pipelines):
broadcaster = cdr(cxp_router.Stream_Router()) # strip the packet id, tag & packet size
crc_checker = cdr(CXPCRC32_Checker())
self.submodules += broadcaster, crc_checker
self.comb += [
p.rx.source.connect(broadcaster.sink),
broadcaster.sources[0].connect(crc_checker),
]