diff --git a/src/gateware/cxp.py b/src/gateware/cxp.py new file mode 100644 index 0000000..d400431 --- /dev/null +++ b/src/gateware/cxp.py @@ -0,0 +1,602 @@ +from migen import * +from migen.genlib.cdc import MultiReg, PulseSynchronizer +from misoc.interconnect.csr import * +from misoc.interconnect.stream import StrideConverter + +from artiq.gateware.rtio import rtlink + +from cxp_downconn import CXP_RXPHYs +from cxp_upconn import CXP_TXPHYs +from cxp_pipeline import ( + Command_Packet_Reader, + Command_Test_Packet_Writer, + Duplicated_Char_Decoder, + Heartbeat_Packet_Reader, + Idle_Word_Inserter, + Packet_Arbiter, + Packet_Wrapper, + Test_Sequence_Checker, + Trigger_ACK_Inserter, + Trigger_ACK_Reader, + Trigger_Inserter, + Trigger_Reader, +) +from cxp_frame_pipeline import * + +import cxp_router + +from types import SimpleNamespace + +class CXP_PHYS(Module, AutoCSR): + def __init__(self, refclk, upconn_pads, downconn_pads, sys_clk_freq, master=0): + assert len(upconn_pads) == len(downconn_pads) + + self.submodules.tx = CXP_TXPHYs(upconn_pads, sys_clk_freq) + self.submodules.rx = CXP_RXPHYs(refclk, downconn_pads, sys_clk_freq, master) + + self.phys = [] + for tx, rx in zip(self.tx.phys, self.rx.phys): + phy = SimpleNamespace() + phy.tx, phy.rx = tx, rx + self.phys.append(phy) + + +class CXP_Core(Module, AutoCSR): + def __init__(self, phy, command_buffer_depth=32, nrxslot=4): + # control buffer is only 32 words (128 bytes) for compatibility with version1.x compliant Devices + # Section 12.1.6 (CXP-001-2021) + self.buffer_depth, self.nslots = command_buffer_depth, nrxslot + + self.submodules.tx = TX_Pipeline(phy.tx, command_buffer_depth, False) + self.submodules.rx = RX_Pipeline(phy.rx, command_buffer_depth, nrxslot, False) + + def get_tx_port(self): + return self.tx.writer.mem.get_port(write_capable=True) + + def get_rx_port(self): + return self.rx.command_reader.mem.get_port(write_capable=False) + + def get_mem_size(self): + return word_width * self.buffer_depth * self.nslots // 8 + +class RX_Pipeline(Module, AutoCSR): + def __init__(self, phy, command_buffer_depth, nslot, with_trigger): + self.ready = CSRStatus() + + self.trigger_ack = CSR() + + self.pending_packet = CSR() + self.read_ptr = CSRStatus(log2_int(nslot)) + self.reader_buffer_err = CSR() + + self.reader_decode_err = CSR() + self.test_error_counter = CSRStatus(16) + self.test_packet_counter = CSRStatus(16) + self.test_counts_reset = CSR() + + self.heartbeat = CSR() + self.host_id = CSRStatus(32) + self.device_time = CSRStatus(64) + + # for multilane router + self.active = Signal() + + if with_trigger: + self.trig = Signal() + self.trig_delay = Signal(char_width) + self.trig_linktrigger_n = Signal(char_width) + + # # # + + gtx = phy.gtx + self.sync += [ + self.ready.status.eq(gtx.rx_ready), + self.active.eq(gtx.rx_ready), + ] + + # Host rx pipeline + # + # 32 32+8(dchar) + # PHY ───/───> dchar ─────/─────> trigger ─────> trigger ack ─────> packet parser ─────> EOP Marker ─────> stream data packet + # decoder reader reader │ │ │ with CRC + # (optional) │ │ └──────> test sequence checker + # │ │ + # │ └─────────> heartbeat packet reader + # │ + # └────────────> command packet reader + # + cdr = ClockDomainsRenamer("cxp_gtx_rx") + + # decode all incoming data as duplicate char and inject the result into the bus for downstream modules + self.submodules.dchar_decoder = dchar_decoder = cdr(Duplicated_Char_Decoder()) + + # Priority level 0 packet - Trigger packet + if with_trigger: + self.submodules.trig_reader = trig_reader = 
cdr(Trigger_Reader()) + self.sync.cxp_gtx_rx += [ + self.trig.eq(trig_reader.trig), + self.trig_delay.eq(trig_reader.delay), + self.trig_linktrigger_n.eq(trig_reader.linktrigger_n), + ] + + # Priority level 1 packet - Trigger ack packet + self.submodules.trig_ack_reader= trig_ack_reader = cdr(Trigger_ACK_Reader()) + + self.submodules.trig_ack_ps = trig_ack_ps = PulseSynchronizer("cxp_gtx_rx", "sys") + self.sync.cxp_gtx_rx += trig_ack_ps.i.eq(trig_ack_reader.ack) + self.sync += [ + If(trig_ack_ps.o, + self.trigger_ack.w.eq(1), + ).Elif(self.trigger_ack.re, + self.trigger_ack.w.eq(0), + ), + ] + + # Priority level 2 packet - stream, test, heartbeat and command packets + self.submodules.arbiter = arbiter = cdr(Packet_Arbiter()) + + self.submodules.decode_err_ps = decode_err_ps = PulseSynchronizer("cxp_gtx_rx", "sys") + self.sync.cxp_gtx_rx += decode_err_ps.i.eq(arbiter.decode_err) + self.sync += [ + If(decode_err_ps.o, + self.reader_decode_err.w.eq(1), + ).Elif(self.reader_decode_err.re, + self.reader_decode_err.w.eq(0), + ), + ] + + if with_trigger: + rx_pipeline = [phy, dchar_decoder, trig_reader, trig_ack_reader, arbiter] + else: + rx_pipeline = [phy, dchar_decoder, trig_ack_reader, arbiter] + for s, d in zip(rx_pipeline, rx_pipeline[1:]): + self.comb += s.source.connect(d.sink) + + # Stream packet + # Drop the K29.7 and mark the EOP for downstream + self.submodules.eop_marker = eop_marker = cdr(EOP_Marker()) + self.comb += arbiter.source_stream.connect(eop_marker.sink) + + # set pipeline source to output stream packet + self.source = eop_marker.source + + + # Test packet + self.submodules.test_seq_checker = test_seq_checker = cdr(Test_Sequence_Checker()) + self.comb += arbiter.source_test.connect(test_seq_checker.sink) + + self.submodules.test_reset_ps = test_reset_ps = PulseSynchronizer("sys", "cxp_gtx_rx") + self.comb += test_reset_ps.i.eq(self.test_counts_reset.re), + + test_err_cnt_rx = Signal.like(self.test_error_counter.status) + test_pak_cnt_rx = Signal.like(self.test_packet_counter.status) + test_err_r, test_pak_r = Signal(), Signal() + self.sync.cxp_gtx_rx += [ + test_err_r.eq(test_seq_checker.error), + test_pak_r.eq(arbiter.recv_test_pak), + + If(test_reset_ps.o, + test_err_cnt_rx.eq(test_err_cnt_rx.reset), + ).Elif(test_err_r, + test_err_cnt_rx.eq(test_err_cnt_rx + 1), + ), + If(test_reset_ps.o, + test_pak_cnt_rx.eq(test_pak_cnt_rx.reset), + ).Elif(test_pak_r, + test_pak_cnt_rx.eq(test_pak_cnt_rx + 1), + ), + ] + self.specials += [ + MultiReg(test_err_cnt_rx, self.test_error_counter.status), + MultiReg(test_pak_cnt_rx, self.test_packet_counter.status), + ] + + # Command packet + self.submodules.command_reader = command_reader = cdr(Command_Packet_Reader(command_buffer_depth, nslot)) + self.comb += arbiter.source_command.connect(command_reader.sink) + + # nslot buffers control interface + write_ptr_sys = Signal.like(command_reader.write_ptr) + + self.specials += [ + MultiReg(self.read_ptr.status, command_reader.read_ptr, odomain="cxp_gtx_rx"), + MultiReg(command_reader.write_ptr, write_ptr_sys) + ] + self.sync += [ + self.pending_packet.w.eq(self.read_ptr.status != write_ptr_sys), + If(~gtx.rx_ready, + self.read_ptr.status.eq(0), + ).Elif(self.pending_packet.re & self.pending_packet.w, + self.read_ptr.status.eq(self.read_ptr.status + 1), + ) + ] + + self.submodules.buffer_err_ps = buffer_err_ps = PulseSynchronizer("cxp_gtx_rx", "sys") + self.sync.cxp_gtx_rx += buffer_err_ps.i.eq(command_reader.buffer_err), + self.sync += [ + If(buffer_err_ps.o, + self.reader_buffer_err.w.eq(1), 
+            ).Elif(self.reader_buffer_err.re,
+                self.reader_buffer_err.w.eq(0),
+            ),
+        ]
+
+        # Heartbeat packet
+        self.submodules.heartbeat_reader = heartbeat_reader = cdr(Heartbeat_Packet_Reader())
+        self.comb += arbiter.source_heartbeat.connect(heartbeat_reader.sink)
+
+        self.specials += [
+            MultiReg(heartbeat_reader.host_id, self.host_id.status),
+            MultiReg(heartbeat_reader.heartbeat, self.device_time.status),
+        ]
+
+        self.submodules.heartbeat_ps = heartbeat_ps = PulseSynchronizer("cxp_gtx_rx", "sys")
+        self.sync.cxp_gtx_rx += heartbeat_ps.i.eq(arbiter.recv_heartbeat)
+        self.sync += [
+            If(heartbeat_ps.o,
+                self.heartbeat.w.eq(1),
+            ).Elif(self.heartbeat.re,
+                self.heartbeat.w.eq(0),
+            ),
+        ]
+
+
+class TX_Pipeline(Module, AutoCSR):
+    def __init__(self, phy, command_buffer_depth, with_trigger_ack):
+        self.trig_stb = Signal()
+        self.trig_delay = Signal(char_width)
+        self.trig_linktrigger_mode = Signal()
+
+        if with_trigger_ack:
+            self.trig_ack_stb = Signal()
+
+        # # #
+
+        # Host tx pipeline
+        #
+        #                32                                                  32            8
+        # command/test ───/───> packet ─────> idle word ─────> trigger ack ───/───> conv ───/───> trigger ─────> PHY
+        # packet writer         wrapper       inserter         inserter                           inserter
+        #                                                      (optional)
+        #
+        # Equivalent transmission priority:
+        # trigger > trigger ack > idle word > command/test packet
+        #
+        # The pipeline is split into a 32-bit and an 8-bit section to handle the word and char boundary priority insertion requirements:
+        # Insertion @ char boundary: trigger packets
+        # Insertion @ word boundary: idle packets and trigger ack packets
+        # - Section 9.2.4 (CXP-001-2021)
+        #
+        # The idle word inserter is placed between the trigger ack inserter and the command/test packet writer to maintain trigger performance,
+        # as idle words must not be inserted into trigger and trigger ack packets - Section 9.2.5.1 (CXP-001-2021)
+        #
+
+        # Priority level 0 packet - Trigger packet
+        self.submodules.trig = trig = Trigger_Inserter()
+        self.comb += [
+            trig.stb.eq(self.trig_stb),
+            trig.delay.eq(self.trig_delay),
+            trig.linktrig_mode.eq(self.trig_linktrigger_mode)
+        ]
+
+        # Priority level 1 packet - Trigger ack
+        if with_trigger_ack:
+            self.submodules.trig_ack = trig_ack = Trigger_ACK_Inserter()
+            self.comb += trig_ack.stb.eq(self.trig_ack_stb)
+
+        # Priority level 2 packet - command and test packet
+        # Control is not timing critical; all data packets are handled in firmware
+        self.submodules.writer = writer = Command_Test_Packet_Writer(command_buffer_depth)
+
+        # writer memory control interface
+        self.writer_word_len = CSRStorage(log2_int(command_buffer_depth))
+        self.writer_stb = CSR()
+        self.writer_stb_testseq = CSR()
+        self.writer_busy = CSRStatus()
+
+        self.sync += [
+            writer.word_len.eq(self.writer_word_len.storage),
+            writer.stb.eq(self.writer_stb.re),
+            writer.stb_testseq.eq(self.writer_stb_testseq.re),
+            self.writer_busy.status.eq(writer.busy),
+        ]
+
+        # Misc
+        self.submodules.pak_wrp = pak_wrp = Packet_Wrapper()
+        self.submodules.idle = idle = Idle_Word_Inserter()
+        self.submodules.converter = converter = StrideConverter(word_layout, char_layout)
+
+        if with_trigger_ack:
+            tx_pipeline = [writer, pak_wrp, idle, trig_ack, converter, trig, phy]
+        else:
+            tx_pipeline = [writer, pak_wrp, idle, converter, trig, phy]
+
+        for s, d in zip(tx_pipeline, tx_pipeline[1:]):
+            self.comb += s.source.connect(d.sink)
+
+
+class CXP_Frame_Pipeline(Module, AutoCSR):
+    # optimal stream packet size is 2 KiB - Section 9.5.2 (CXP-001-2021)
+    def __init__(self, pipelines, pmod_pads, roi_engine_count=1, res_width=16, count_width=31, master=0, packet_size=16384):
+        n_channels = len(pipelines)
+        assert n_channels > 0
+        assert count_width <= 31
+
+        # Trigger rtio
+        nbit_trigdelay = 8
+        nbit_linktrig = 1
+        self.trigger = rtlink.Interface(
+            rtlink.OInterface(nbit_trigdelay + nbit_linktrig),
+            rtlink.IInterface(word_width, timestamped=False)
+        )
+
+        self.sync.rio += [
+            If(self.trigger.o.stb,
+                pipelines[master].tx.trig_delay.eq(self.trigger.o.data[nbit_linktrig:]),
+                pipelines[master].tx.trig_linktrigger_mode.eq(self.trigger.o.data[:nbit_linktrig]),
+            ),
+            pipelines[master].tx.trig_stb.eq(self.trigger.o.stb),
+        ]
+
+        # ROI rtio
+
+        # 4 cfg (x0, y0, x1, y1) per roi_engine
+        self.config = rtlink.Interface(rtlink.OInterface(res_width, bits_for(4*roi_engine_count-1)))
+
+        # select which roi engine can output rtio_input signal
+        self.gate_data = rtlink.Interface(
+            rtlink.OInterface(roi_engine_count),
+            # the 32nd bit is the sentinel (gate detection)
+            rtlink.IInterface(count_width+1, timestamped=False)
+        )
+
+        self.crc_error_cnt = CSRStatus(16)
+        self.crc_error_reset = CSR()
+
+        # DEBUG: csr
+        self.arbiter_active_ch = CSRStatus(n_channels)
+
+        self.roi_counter = CSRStatus(count_width)
+        self.roi_update = CSR()
+        self.pix_y = CSRStatus(res_width)
+
+        self.header_l_size = CSRStatus(3*char_width)
+        self.header_x_size = CSRStatus(3*char_width)
+        self.header_y_size = CSRStatus(3*char_width)
+        self.header_new_line = CSRStatus(3*char_width)
+
+        # # #
+
+        cdr = ClockDomainsRenamer("cxp_gtx_rx")
+
+        debug_out = False
+
+        if not debug_out:
+            self.submodules.pixel_pipeline = pixel_pipeline = cdr(Pixel_Pipeline(res_width, count_width, packet_size))
+
+            # CRC error counter
+            self.submodules.crc_reset_ps = crc_reset_ps = PulseSynchronizer("sys", "cxp_gtx_rx")
+            self.comb += crc_reset_ps.i.eq(self.crc_error_reset.re)
+
+            crc_error_cnt_rx = Signal.like(self.crc_error_cnt.status)
+
+            crc_error_r = Signal()
+            self.sync.cxp_gtx_rx += [
+                # to improve timing
+                crc_error_r.eq(pixel_pipeline.crc_checker.error),
+
+                If(crc_reset_ps.o,
+                    crc_error_cnt_rx.eq(crc_error_cnt_rx.reset),
+                ).Elif(crc_error_r,
+                    crc_error_cnt_rx.eq(crc_error_cnt_rx + 1),
+                ),
+            ]
+            self.specials += MultiReg(crc_error_cnt_rx, self.crc_error_cnt.status)
+
+            # RTIO interface
+
+            n = 0
+            cfg = pixel_pipeline.roi.cfg
+            for offset, target in enumerate([cfg.x0, cfg.y0, cfg.x1, cfg.y1]):
+                roi_boundary = Signal.like(target)
+                self.sync.rio += If(self.config.o.stb & (self.config.o.address == 4*n+offset),
+                    roi_boundary.eq(self.config.o.data))
+                self.specials += MultiReg(roi_boundary, target, "cxp_gtx_rx")
+
+            roi_out = pixel_pipeline.roi.out
+            update = Signal()
+            self.submodules.ps = ps = PulseSynchronizer("cxp_gtx_rx", "sys")
+            self.sync.cxp_gtx_rx += ps.i.eq(roi_out.update)
+            self.sync += update.eq(ps.o)
+
+            sentinel = 2**count_width
+            count_sys = Signal.like(roi_out.count)
+
+            self.specials += MultiReg(roi_out.count, count_sys),
+            self.sync.rio += [
+                # TODO: add gating
+                self.gate_data.i.stb.eq(update),
+                # without the slice, unspecified bits will be 1 for some reason
+                # i.e. data[count_width:] = 0b111111... when using data.eq(count_sys)
+                self.gate_data.i.data[:count_width].eq(count_sys),
+            ]
+
+            # TODO: fix count_sys, the end of frame seems broken
+            # BUG: it may be related to the KCode bug that only happens after the frame header decoder COPY (i.e. when sending pixel data)
+            # DEBUG:
+
+            new_line_cnt_rx, new_line_cnt_sys = Signal(3*char_width), Signal(3*char_width)
+            l_size_rx, l_size_sys = Signal(3*char_width), Signal(3*char_width)
+            x_size_rx, x_size_sys = Signal(3*char_width), Signal(3*char_width)
+            y_size_rx, y_size_sys = Signal(3*char_width), Signal(3*char_width)
+            y_pix_rx, y_pix_sys = Signal(res_width), Signal(res_width)
+            self.sync.cxp_gtx_rx += [
+                If(pixel_pipeline.header_decoder.new_line,
+                    new_line_cnt_rx.eq(new_line_cnt_rx + 1),
+                ),
+
+                l_size_rx.eq(pixel_pipeline.header_decoder.metadata.l_size),
+                x_size_rx.eq(pixel_pipeline.header_decoder.metadata.x_size),
+                y_size_rx.eq(pixel_pipeline.header_decoder.metadata.y_size),
+
+                y_pix_rx.eq(pixel_pipeline.parser.pixel4x[0].y),
+            ]
+            self.specials += [
+                MultiReg(new_line_cnt_rx, new_line_cnt_sys),
+                MultiReg(l_size_rx, l_size_sys),
+                MultiReg(x_size_rx, x_size_sys),
+                MultiReg(y_size_rx, y_size_sys),
+                MultiReg(y_pix_rx, y_pix_sys),
+            ]
+            self.sync += [
+                self.header_new_line.status.eq(new_line_cnt_sys),
+                self.pix_y.status.eq(y_pix_sys),
+                self.header_l_size.status.eq(l_size_sys),
+                self.header_x_size.status.eq(x_size_sys),
+                self.header_y_size.status.eq(y_size_sys),
+                self.roi_counter.status.eq(count_sys),
+                If(update,
+                    self.roi_update.w.eq(1),
+                ).Elif(self.roi_update.re,
+                    self.roi_update.w.eq(0),
+                ),
+            ]
+
+        else:
+            # DEBUG:
+            crc_checker = cdr(CXPCRC32_Checker())
+
+            # TODO: handle a full buffer gracefully
+            # TODO: investigate why there is a heartbeat message with a K27.7 code in the middle of the frame
+            # NOTE: sometimes there are 0xFBFBFBFB words with K=0b1111,
+            # perhaps the buffer is overflowing and doing strange stuff
+
+            # it should be a mem block, not a "cycle buffer"
+            # self.submodules.dropper = dropper = cdr(DChar_Dropper())
+            buffer = cdr(Buffer(word_layout_dchar)) # CRC checker timing is bad
+            buffer_cdc_fifo = cdr(Buffer(word_layout_dchar)) # to improve timing
+            cdc_fifo = stream.AsyncFIFO(word_layout_dchar, 2**log2_int(packet_size//word_width))
+            self.submodules += buffer, crc_checker, buffer_cdc_fifo
+            self.submodules += ClockDomainsRenamer({"write": "cxp_gtx_rx", "read": "sys"})(cdc_fifo)
+            self.submodules.debug_out = debug_out = RX_Debug_Buffer(word_layout_dchar, 2**log2_int(packet_size//word_width))
+
+            pipeline = [buffer, crc_checker, buffer_cdc_fifo, cdc_fifo, debug_out]
+            for s, d in zip(pipeline, pipeline[1:]):
+                self.comb += s.source.connect(d.sink)
+
+        # Routing WIP
+        #                          +---------+     +-------------+
+        # downconn pipeline ----->|         |     |             |------> crc checker ------> raw stream data
+        #                         | arbiter |---->| broadcaster |
+        # downconn pipeline ----->|         |     |             |------> crc checker ------> raw stream data
+        #                          +---------+     +-------------+
+        #
+
+        self.submodules.arbiter = arbiter = cdr(Stream_Arbiter(n_channels))
+        self.submodules.broadcaster = broadcaster = cdr(Stream_Broadcaster())
+
+        # Connect pipeline
+        for i, p in enumerate(pipelines):
+            # Assume the downconn pipelines already mark the EOP
+            self.comb += p.rx.source.connect(arbiter.sinks[i])
+
+        self.comb += arbiter.source.connect(broadcaster.sink)
+
+        if not debug_out:
+            self.comb += broadcaster.sources[0].connect(pixel_pipeline.sink),
+        else:
+            self.comb += broadcaster.sources[0].connect(pipeline[0].sink),
+
+        # Control interface
+        # only the simple topology MASTER:ch0, extension:ch1,2,3 is supported right now
+        active_channels_sys = Signal(n_channels)
+        for i, p in enumerate(pipelines):
+            # TODO: change this to a non-CSR signal?
+            self.sync += active_channels_sys[i].eq(p.rx.active)
+        self.specials += MultiReg(active_channels_sys, arbiter.active_channels, odomain="cxp_gtx_rx"),
+
+        # DEBUG:
+        self.sync += self.arbiter_active_ch.status.eq(active_channels_sys)
+
+        for i, p in enumerate(pipelines):
+            # self.comb += p.rx.source.ack.eq(1)
+
+            rx_stb = Signal()
+            self.sync.cxp_gtx_rx += rx_stb.eq(p.rx.source.stb)
+            self.specials += [
+                # Instance("OBUF", i_I=rx_stb, o_O=pmod_pads[i]),
+                # Instance("OBUF", i_I=arbiter.sinks[i].stb, o_O=pmod_pads[i]),
+            ]
+
+
+class NEO_CXP_Frame_pipeline(Module):
+    # optimal stream packet size is 2 KiB - Section 9.5.2 (CXP-001-2021)
+    def __init__(self, pipelines, pmod_pads, roi_engine_count=1, res_width=16, count_width=31, master=0, packet_size=16384):
+        n_channels = len(pipelines)
+        assert n_channels > 0
+        assert count_width <= 31
+
+        # Trigger rtio
+        nbit_trigdelay = 8
+        nbit_linktrig = 1
+        self.trigger = rtlink.Interface(
+            rtlink.OInterface(nbit_trigdelay + nbit_linktrig),
+            rtlink.IInterface(word_width, timestamped=False)
+        )
+
+        self.sync.rio += [
+            If(self.trigger.o.stb,
+                pipelines[master].tx.trig.delay.eq(self.trigger.o.data[nbit_linktrig:]),
+                pipelines[master].tx.trig.linktrig_mode.eq(self.trigger.o.data[:nbit_linktrig]),
+            ),
+            pipelines[master].tx.trig.stb.eq(self.trigger.o.stb),
+        ]
+
+        # ROI rtio
+
+        # 4 cfg (x0, y0, x1, y1) per roi_engine
+        self.config = rtlink.Interface(rtlink.OInterface(res_width, bits_for(4*roi_engine_count-1)))
+
+        # select which roi engine can output rtio_input signal
+        self.gate_data = rtlink.Interface(
+            rtlink.OInterface(roi_engine_count),
+            # the 32nd bit is the sentinel (gate detection)
+            rtlink.IInterface(count_width+1, timestamped=False)
+        )
+
+        # # #
+
+        cdr = ClockDomainsRenamer("cxp_gtx_rx")
+
+        debug_out = False
+
+        if debug_out:
+            pass
+
+        #
+        # downconn pipeline -----> router -----> arbiter ------> crc checker ------> 4x converters
+        #
+
+        # Connect pipeline
+        for i, p in enumerate(pipelines):
+            broadcaster = cdr(cxp_router.Stream_Router()) # strip the packet id, tag & packet size
+            crc_checker = cdr(CXPCRC32_Checker())
+            self.submodules += broadcaster, crc_checker
+
+            self.comb += [
+                p.rx.source.connect(broadcaster.sink),
+                broadcaster.sources[0].connect(crc_checker.sink),
+            ]
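For context, here is a hypothetical host-side helper (not part of this patch) showing how the writer_word_len, writer_stb and writer_busy CSRs of TX_Pipeline appear intended to be sequenced when sending a command packet that was already loaded into the TX buffer via CXP_Core.get_tx_port(). The write_csr/read_csr accessors are placeholders; only the CSR names come from the code above, and the ordering is an inference from writer.stb.eq(self.writer_stb.re) and writer_busy.status.eq(writer.busy).

    # Hypothetical sketch, assuming write_csr(name, value) and read_csr(name)
    # wrap whatever CSR access mechanism the firmware provides.
    def send_command_packet(write_csr, read_csr, n_words):
        """Strobe the command/test packet writer for a packet of n_words 32-bit words.

        Assumes the packet payload has already been written into the TX buffer
        (CXP_Core.get_tx_port()) before calling this.
        """
        # wait for any previous packet to leave the writer
        while read_csr("writer_busy"):
            pass
        # number of words to read out of the command buffer
        write_csr("writer_word_len", n_words)
        # writer.stb is derived from writer_stb.re, so a single write acts as a one-shot strobe
        write_csr("writer_stb", 1)
        # optionally block until the writer is idle again
        while read_csr("writer_busy"):
            pass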