diff --git a/src/gateware/cxp.py b/src/gateware/cxp.py
new file mode 100644
index 0000000..72f0305
--- /dev/null
+++ b/src/gateware/cxp.py
@@ -0,0 +1,342 @@
+from migen import *
+from migen.genlib.cdc import MultiReg, PulseSynchronizer, BusSynchronizer
+from misoc.interconnect.csr import *
+
+from artiq.gateware.rtio import rtlink
+
+from cxp_downconn import CXP_DownConn_PHYS
+from cxp_upconn import CXP_UpConn_PHYS
+from cxp_pipeline import *
+from cxp_frame_pipeline import *
+
+from functools import reduce
+from operator import add
+
+
+class CXP_PHYS(Module, AutoCSR):
+    def __init__(self, refclk, upconn_pads, downconn_pads, sys_clk_freq, debug_sma, pmod_pads):
+        assert len(upconn_pads) == len(downconn_pads)
+
+        self.submodules.upconn = CXP_UpConn_PHYS(upconn_pads, sys_clk_freq, debug_sma, pmod_pads)
+        self.submodules.downconn = CXP_DownConn_PHYS(refclk, downconn_pads, sys_clk_freq, debug_sma, pmod_pads)
+
+
+@FullMemoryWE()
+class CXP_Interface(Module, AutoCSR):
+    def __init__(self, upconn_phy, downconn_phy, debug_sma, pmod_pads):
+        self.submodules.upconn = UpConn_Interface(upconn_phy, debug_sma, pmod_pads)
+        self.submodules.downconn = DownConn_Interface(downconn_phy, debug_sma, pmod_pads)
+
+    def get_tx_port(self):
+        return self.upconn.bootstrap.mem.get_port(write_capable=True)
+
+    def get_tx_mem_size(self):
+        # FIXME: if the tx mem size is NOT the same as the rx mem size, then for some reason
+        # once the rx mem is written, the tx mem can no longer be accessed and each tx mem
+        # read returns rx mem contents instead (fixed by reordering the mem allocation)
+        # FIXME: there seems to be an address alignment issue: if the tx mem size is 0x800,
+        # the mem following the tx mem cannot be read correctly. However, if the tx mem size
+        # is 0x2000 (the same size as the rx mem), the following rx mem is read correctly.
+        return self.upconn.bootstrap.mem.depth * self.upconn.bootstrap.mem.width // 8  # 0x800
+        # return self.downconn.bootstrap.mem.depth * self.downconn.bootstrap.mem.width // 8  # 0x2000
+
+    def get_mem_size(self):
+        return word_dw * buffer_count * buffer_depth // 8
+
+    def get_rx_port(self):
+        return self.downconn.bootstrap.mem.get_port(write_capable=False)
+
+    def get_rx_mem_size(self):
+        return self.downconn.bootstrap.mem.depth * self.downconn.bootstrap.mem.width // 8
+
+    def get_rx_downconn(self):
+        return self.downconn
+
+
+class CXP_Master(CXP_Interface):
+    def __init__(self, upconn_phy, downconn_phy, debug_sma, pmod_pads):
+        CXP_Interface.__init__(self, upconn_phy, downconn_phy, debug_sma, pmod_pads)
+        nbit_trigdelay = 8
+        nbit_linktrig = 1
+
+        self.rtlink = rtlink.Interface(
+            rtlink.OInterface(nbit_trigdelay + nbit_linktrig),
+            rtlink.IInterface(word_dw, timestamped=False)
+        )
+
+        self.sync.rio += [
+            If(self.rtlink.o.stb,
+                self.upconn.trig.delay.eq(self.rtlink.o.data[nbit_linktrig:]),
+                self.upconn.trig.linktrig_mode.eq(self.rtlink.o.data[:nbit_linktrig]),
+            ),
+            self.upconn.trig.stb.eq(self.rtlink.o.stb),
+        ]
+
+        # DEBUG: out
+        self.specials += Instance("OBUF", i_I=self.rtlink.o.stb, o_O=debug_sma.p_tx)
+        # self.specials += Instance("OBUF", i_I=self.rtlink.o.stb, o_O=debug_sma.n_rx)
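+
+        # A minimal sketch (documentation only, not used by the gateware) of how
+        # the 9-bit rtlink word above is packed: bit 0 selects the linktrig mode
+        # and bits 8:1 carry the trigger delay. The kernel-side helper and
+        # rtio_output() target below are illustrative assumptions, not part of
+        # this patch:
+        #
+        #   def pack_trigger_word(delay, linktrig_mode):
+        #       assert 0 <= delay < 2**8 and 0 <= linktrig_mode < 2**1
+        #       return (delay << 1) | linktrig_mode
+        #
+        #   rtio_output(cxp_master_channel << 8, pack_trigger_word(delay, mode))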
+
+
+class CXP_Extension(CXP_Interface):
+    def __init__(self, upconn_phy, downconn_phy, debug_sma, pmod_pads):
+        CXP_Interface.__init__(self, upconn_phy, downconn_phy, debug_sma, pmod_pads)
+
+
+class DownConn_Interface(Module, AutoCSR):
+    def __init__(self, phy, debug_sma, pmod_pads):
+        self.rx_ready = CSRStatus()
+
+        # # #
+
+        gtx = phy.gtx
+
+        # GTX status
+        self.sync += self.rx_ready.status.eq(gtx.rx_ready)
+
+        # DEBUG: init status
+        self.txinit_phaligndone = CSRStatus()
+        self.rxinit_phaligndone = CSRStatus()
+        self.comb += [
+            self.txinit_phaligndone.status.eq(gtx.tx_init.Xxphaligndone),
+            self.rxinit_phaligndone.status.eq(gtx.rx_init.Xxphaligndone),
+        ]
+
+        # GTX DRP (Dynamic Reconfiguration Port) interface
+        self.gtx_daddr = CSRStorage(9)
+        self.gtx_dread = CSR()
+        self.gtx_din_stb = CSR()
+        self.gtx_din = CSRStorage(16)
+
+        self.gtx_dout = CSRStatus(16)
+        self.gtx_dready = CSR()
+
+        self.comb += gtx.dclk.eq(ClockSignal("sys"))
+        self.sync += [
+            gtx.daddr.eq(self.gtx_daddr.storage),
+            gtx.den.eq(self.gtx_dread.re | self.gtx_din_stb.re),
+            gtx.dwen.eq(self.gtx_din_stb.re),
+            gtx.din.eq(self.gtx_din.storage),
+            If(gtx.dready,
+                self.gtx_dready.w.eq(1),
+                self.gtx_dout.status.eq(gtx.dout),
+            ).Elif(self.gtx_dready.re,
+                self.gtx_dready.w.eq(0),
+            ),
+        ]
+
+        # Receiver pipeline (WIP)
+        #
+        #      32            32+8 (dchar)
+        # PHY ---/---> dchar -----/-----> trigger ack ------> packet ------> CDC FIFO ------> debug buffer
+        #             decoder              checker            decoder
+        #
+        cdr = ClockDomainsRenamer("cxp_gtx_rx")
+
+        # decode duplicated characters in all incoming data and inject the result
+        # into the bus for downstream modules
+        self.submodules.dchar_decoder = dchar_decoder = cdr(Duplicated_Char_Decoder())
+
+        # Priority level 1 packet - trigger ack packet
+        self.submodules.trig_ack_checker = trig_ack_checker = cdr(Trigger_Ack_Checker())
+
+        self.submodules.trig_ack_ps = trig_ack_ps = PulseSynchronizer("cxp_gtx_rx", "sys")
+        self.sync.cxp_gtx_rx += trig_ack_ps.i.eq(trig_ack_checker.ack)
+
+        self.trig_ack = Signal()
+        self.trig_clr = Signal()
+        # the acknowledgment is latched until cleared
+        self.sync += [
+            If(trig_ack_ps.o,
+                self.trig_ack.eq(1),
+            ).Elif(self.trig_clr,
+                self.trig_ack.eq(0),
+            ),
+        ]
+
+        # Priority level 2 packets - data & test packets
+        self.submodules.bootstrap = bootstrap = cdr(RX_Bootstrap())
+
+        self.bootstrap_decoder_err = CSR()
+        self.bootstrap_buffer_err = CSR()
+
+        decode_err_ps = PulseSynchronizer("cxp_gtx_rx", "sys")
+        buffer_err_ps = PulseSynchronizer("cxp_gtx_rx", "sys")
+        self.submodules += decode_err_ps, buffer_err_ps
+        self.sync.cxp_gtx_rx += [
+            decode_err_ps.i.eq(bootstrap.decode_err),
+            buffer_err_ps.i.eq(bootstrap.buffer_err),
+        ]
+        # errors are latched until read
+        self.sync += [
+            If(decode_err_ps.o,
+                self.bootstrap_decoder_err.w.eq(1),
+            ).Elif(self.bootstrap_decoder_err.re,
+                self.bootstrap_decoder_err.w.eq(0),
+            ),
+            If(buffer_err_ps.o,
+                self.bootstrap_buffer_err.w.eq(1),
+            ).Elif(self.bootstrap_buffer_err.re,
+                self.bootstrap_buffer_err.w.eq(0),
+            ),
+        ]
+
+        # test packet error & packet counters
+        self.bootstrap_test_error_counter = CSRStatus(len(bootstrap.test_err_cnt))
+        self.bootstrap_test_packet_counter = CSRStatus(len(bootstrap.test_pak_cnt))
+        self.bootstrap_test_counts_reset = CSR()
+
+        test_reset_ps = PulseSynchronizer("sys", "cxp_gtx_rx")
+        self.submodules += test_reset_ps
+        self.sync += test_reset_ps.i.eq(self.bootstrap_test_counts_reset.re)
+        self.sync.cxp_gtx_rx += bootstrap.test_cnt_reset.eq(test_reset_ps.o)
+        self.specials += [
+            MultiReg(bootstrap.test_err_cnt, self.bootstrap_test_error_counter.status),
+            MultiReg(bootstrap.test_pak_cnt, self.bootstrap_test_packet_counter.status),
+        ]
+
+        # Circular buffer interface
+        self.packet_type = CSRStatus(8)
+        self.pending_packet = CSR()
+        self.read_ptr = CSRStatus(log2_int(buffer_count))
+
+        self.specials += [
+            MultiReg(bootstrap.packet_type, self.packet_type.status),
+            MultiReg(self.read_ptr.status, bootstrap.read_ptr_rx, odomain="cxp_gtx_rx"),
+        ]
+        self.sync += [
+            self.pending_packet.w.eq(self.read_ptr.status != bootstrap.write_ptr_sys),
+            If(~gtx.rx_ready,
+                self.read_ptr.status.eq(0),
+            ).Elif(self.pending_packet.re & self.pending_packet.w,
+                self.read_ptr.status.eq(self.read_ptr.status + 1),
+            )
+        ]
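+
+        # Firmware-side sketch (illustrative pseudocode, not part of this patch;
+        # csr_read/csr_write are hypothetical accessors) of the intended
+        # read-pointer handshake implemented by the sync block above:
+        #
+        #   while csr_read("pending_packet"):      # read_ptr != write_ptr
+        #       handle(csr_read("packet_type"), rx_mem, csr_read("read_ptr"))
+        #       csr_write("pending_packet", 1)     # any write acks & advances read_ptr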
+
+        # DEBUG:
+        # # add a buffer to improve timing & relax tight setup/hold constraints
+        # self.submodules.buffer_cdc_fifo = buffer_cdc_fifo = cdr(Buffer(word_layout_dchar))
+        # cdc_fifo = stream.AsyncFIFO(word_layout_dchar, 512)
+        # self.submodules += ClockDomainsRenamer({"write": "cxp_gtx_rx", "read": "sys"})(cdc_fifo)
+        # self.submodules.debug_out = debug_out = RX_Debug_Buffer(word_layout_dchar)
+
+        rx_pipeline = [phy, dchar_decoder, trig_ack_checker, bootstrap]
+        for s, d in zip(rx_pipeline, rx_pipeline[1:]):
+            self.comb += s.source.connect(d.sink)
+        self.source = rx_pipeline[-1].source
+
+        # DEBUG: CSR
+        self.trigger_ack = CSR()
+        self.sync += [
+            self.trig_clr.eq(self.trigger_ack.re),
+            self.trigger_ack.w.eq(self.trig_ack),
+        ]
+
+
+class UpConn_Interface(Module, AutoCSR):
+    def __init__(self, phy, debug_sma, pmod_pads):
+        # Transmission pipeline
+        #
+        #             32                                       32             8
+        # ctrl/test ---/---> packet -----> idle word -----> trigger ack ---/--> conv ---/---> trigger -----> PHY
+        #   packet          wrapper        inserter          inserter                        inserter
+        #
+        # Equivalent transmission priority:
+        #   trigger > trigger ack > idle > test/data packet
+        # To maintain trigger latency performance, idle words are never inserted
+        # into trigger or trigger ack packets.
+        #
+        # In low-speed CoaXPress, higher-priority packets can be inserted at two
+        # types of boundary:
+        #   Insertion @ char boundary: trigger packets
+        #   Insertion @ word boundary: trigger ack & IDLE packets
+        # The 32-bit part of the pipeline handles word-boundary insertion while
+        # the 8-bit part handles char-boundary insertion.
+
+        # Packet FIFOs with transmission priority
+        # 0: Trigger packet
+        self.submodules.trig = trig = TX_Trigger()
+
+        # DEBUG: INPUT
+        self.trig_stb = CSR()
+        self.trig_delay = CSRStorage(8)
+        self.linktrigger = CSRStorage()
+
+        # self.sync += [
+        #     trig.stb.eq(self.trig_stb.re),
+        #     trig.delay.eq(self.trig_delay.storage),
+        #     trig.linktrig_mode.eq(self.linktrigger.storage),
+        # ]
+
+        # 1: IO acknowledgment for trigger packet
+        self.submodules.trig_ack = trig_ack = Trigger_ACK_Inserter()
+
+        # DEBUG: INPUT
+        self.ack = CSR()
+        self.sync += trig_ack.stb.eq(self.ack.re)
+
+        # 2: All other packets (data & test packets)
+        # Control is not timing-critical; all data packets are handled in firmware.
+        self.submodules.bootstrap = bootstrap = TX_Bootstrap()
+
+        self.submodules.pak_wrp = pak_wrp = Packet_Wrapper()
+        self.submodules.idle = idle = Idle_Word_Inserter()
+
+        self.submodules.converter = converter = stream.StrideConverter(word_layout, char_layout)
+
+        tx_pipeline = [bootstrap, pak_wrp, idle, trig_ack, converter, trig, phy]
+        for s, d in zip(tx_pipeline, tx_pipeline[1:]):
+            self.comb += s.source.connect(d.sink)
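+
+# For debugging the CRC stage used below: a software model of CRC-32 with the
+# reflected IEEE 802.3 polynomial (illustrative sketch only; the exact init
+# value, final XOR and byte order are assumptions to be checked against
+# CXPCRC32_Checker and CXP-001-2021):
+#
+#   def crc32_sw(data):
+#       crc = 0xFFFFFFFF
+#       for byte in data:
+#           crc ^= byte
+#           for _ in range(8):
+#               crc = (crc >> 1) ^ (0xEDB88320 if crc & 1 else 0)
+#       return crc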
+
+
+class CXP_Frame_Buffer(Module, AutoCSR):
+    # the optimal stream packet size is 2 KiB - Section 9.5.2 (CXP-001-2021)
+    def __init__(self, downconns, pmod_pads, packet_size=16384, n_buffer=2):
+        n_downconn = len(downconns)
+
+        framebuffers = []
+        arr_csr = []
+        cdr = ClockDomainsRenamer("cxp_gtx_rx")
+        for i in range(n_buffer):
+            # TODO: change this to rtio
+            if i > 0:
+                name = "buffer_" + str(i) + "_routingid"
+                csr = CSRStorage(char_width, name=name, reset=i)
+                arr_csr.append(csr)
+                setattr(self, name, csr)
+
+            crc_checker = cdr(CXPCRC32_Checker())
+
+            # TODO: handle a full buffer gracefully
+            # TODO: investigate why a heartbeat message appears in the middle of
+            # the frame with a K27.7 code
+            # NOTE: sometimes 0xFBFBFBFB words with K=0b1111 show up;
+            # perhaps the buffer is overflowing and producing garbage
+            # it should be a mem block, not a circular buffer
+            # self.submodules.dropper = dropper = cdr(DChar_Dropper())
+            buffer_cdc_fifo = cdr(Buffer(word_layout_dchar))  # to improve timing
+            cdc_fifo = stream.AsyncFIFO(word_layout_dchar, 2**log2_int(packet_size//word_dw))
+            self.submodules += crc_checker, buffer_cdc_fifo
+            self.submodules += ClockDomainsRenamer({"write": "cxp_gtx_rx", "read": "sys"})(cdc_fifo)
+
+            pipeline = [crc_checker, buffer_cdc_fifo, cdc_fifo]
+            for s, d in zip(pipeline, pipeline[1:]):
+                self.comb += s.source.connect(d.sink)
+            framebuffers.append(pipeline[0])
+
+            # DEBUG:
+            if i == 0:
+                self.submodules.debug_out = debug_out = RX_Debug_Buffer(word_layout_dchar, 2**log2_int(packet_size//word_dw))
+                self.comb += pipeline[-1].source.connect(debug_out.sink)
+            else:
+                # remove any backpressure
+                self.comb += pipeline[-1].source.ack.eq(1)
+
+        self.submodules.router = router = cdr(Frame_Packet_Router(downconns, framebuffers, packet_size, pmod_pads))
+
+        for i, csr in enumerate(arr_csr):
+            self.specials += MultiReg(csr.storage, router.routing_table[i], odomain="cxp_gtx_rx")
+
+        # only the simple topology master:ch0, extensions:ch1,2,3 is supported right now
+        active_extensions = Signal(max=n_downconn)
+        self.sync += active_extensions.eq(reduce(add, [d.rx_ready.status for d in downconns[1:]]))
+        self.specials += MultiReg(active_extensions, router.n_ext_active, odomain="cxp_gtx_rx")
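+
+# Example top-level integration (illustrative sketch only; the pad objects and
+# per-channel PHY attribute names, e.g. phys.upconn.phys, are placeholders and
+# not part of this patch):
+#
+#   self.submodules.cxp_phys = phys = CXP_PHYS(
+#       refclk, upconn_pads, downconn_pads, sys_clk_freq, debug_sma, pmod_pads)
+#   master = CXP_Master(phys.upconn.phys[0], phys.downconn.phys[0],
+#                       debug_sma, pmod_pads)
+#   extensions = [CXP_Extension(phys.upconn.phys[i], phys.downconn.phys[i],
+#                               debug_sma, pmod_pads)
+#                 for i in range(1, len(downconn_pads))]
+#   downconns = [c.downconn for c in [master] + extensions]
+#   self.submodules.frame_buffer = CXP_Frame_Buffer(downconns, pmod_pads)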