From 7d5e3c1ef9d5d0ed21c15a993f65855b13b7b284 Mon Sep 17 00:00:00 2001 From: morgan Date: Wed, 5 Jun 2024 13:01:17 +0800 Subject: [PATCH] cxp downconn: add high speed serial cxp downconn: add bruteforcephase aligner cxp downconn: add gtx with mmcm for TXUSRCLK freq requirement cxp downconn: add loopback mode parameter for testing --- src/gateware/cxp_downconn.py | 426 +++++++++++++++++++++++++++++++++++ 1 file changed, 426 insertions(+) create mode 100644 src/gateware/cxp_downconn.py diff --git a/src/gateware/cxp_downconn.py b/src/gateware/cxp_downconn.py new file mode 100644 index 0000000..b5d56f1 --- /dev/null +++ b/src/gateware/cxp_downconn.py @@ -0,0 +1,426 @@ +from migen import * +from migen.genlib.resetsync import AsyncResetSynchronizer +from migen.genlib.cdc import MultiReg, PulseSynchronizer + +from misoc.cores.code_8b10b import Encoder, Decoder +from misoc.interconnect.csr import * + +from artiq.gateware.drtio.transceiver.gtx_7series_init import * + +from operator import add +from math import ceil +from functools import reduce + +# Changes the phase of the transceiver RX clock to align the comma to +# the LSBs of RXDATA, fixing the latency. +# +# This is implemented by repeatedly resetting the transceiver until it +# gives out the correct phase. Each reset gives a random phase. +# +# If Xilinx had designed the GTX transceiver correctly, RXSLIDE_MODE=PMA +# would achieve this faster and in a cleaner way. But: +# * the phase jumps are of 2 UI at every second RXSLIDE pulse, instead +# of 1 UI at every pulse. It is unclear what the latency becomes. +# * RXSLIDE_MODE=PMA cannot be used with the RX buffer bypassed. +# Those design flaws make RXSLIDE_MODE=PMA yet another broken and useless +# transceiver "feature". +# +# Warning: Xilinx transceivers are LSB first, and comma needs to be flipped +# compared to the usual 8b10b binary representation. 
class CXP_BruteforceClockAligner(Module):
    """Request transceiver RX restarts until the comma lands in RXDATA[:10].

    Operation, as implemented below:

    * A counter in the ``sys`` domain pulses ``check`` once every
      ``ceil(check_period*sys_clk_freq)`` (+1) cycles.
    * In the ``cxp_gtx_rx`` domain two sticky flags are collected between
      checks: ``comma_seen`` (low 10 bits of ``rxdata`` equal ``comma`` or
      its 10-bit inverse) and ``error_seen`` (number of ones in the low
      10 bits is not 4, 5 or 6).
    * The FSM goes WAIT_COMMA -> WAIT_NOERROR -> READY, pulsing ``restart``
      whenever a check fails, and falls back to WAIT_COMMA from READY as
      soon as an error is seen.

    Parameters:
        comma: 10-bit comma pattern to look for (already bit-flipped for
            the LSB-first transceiver, see the note preceding this class).
        sys_clk_freq: frequency of the ``sys`` clock in Hz.
        check_period: time between two checks, in seconds.

    Signals:
        rxdata (in, 20 bits): raw data from the transceiver.
        restart (out): pulsed to request a transceiver RX reset.
        ready (out): high while the FSM is in the READY state.
    """
    def __init__(self, comma, sys_clk_freq, check_period):
        self.rxdata = Signal(20)
        self.restart = Signal()

        self.ready = Signal()

        # Periodic "check" strobe generated in the sys domain.
        check_max_val = ceil(check_period*sys_clk_freq)
        check_counter = Signal(max=check_max_val+1)
        check = Signal()
        reset_check_counter = Signal()
        self.sync += [
            check.eq(0),
            If(reset_check_counter,
                check_counter.eq(check_max_val)
            ).Else(
                If(check_counter == 0,
                    check.eq(1),
                    check_counter.eq(check_max_val)
                ).Else(
                    check_counter.eq(check_counter-1)
                )
            )
        ]

        # Clears the sticky flags below; crosses from sys to the RX domain.
        checks_reset = PulseSynchronizer("sys", "cxp_gtx_rx")
        self.submodules += checks_reset

        # Sticky "comma seen" flag: either polarity of the comma counts.
        comma_n = ~comma & 0b1111111111
        comma_seen_rxclk = Signal()
        comma_seen = Signal()
        comma_seen_rxclk.attr.add("no_retiming")
        self.specials += MultiReg(comma_seen_rxclk, comma_seen)
        self.sync.cxp_gtx_rx += \
            If(checks_reset.o,
                comma_seen_rxclk.eq(0)
            ).Elif((self.rxdata[:10] == comma) | (self.rxdata[:10] == comma_n),
                comma_seen_rxclk.eq(1)
            )

        # Sticky "error seen" flag: the low 10-bit symbol must contain
        # 4, 5 or 6 ones, anything else is flagged as an error.
        error_seen_rxclk = Signal()
        error_seen = Signal()
        error_seen_rxclk.attr.add("no_retiming")
        self.specials += MultiReg(error_seen_rxclk, error_seen)
        rx1cnt = Signal(max=11)
        self.sync.cxp_gtx_rx += [
            rx1cnt.eq(reduce(add, [self.rxdata[i] for i in range(10)])),
            If(checks_reset.o,
                error_seen_rxclk.eq(0)
            ).Elif((rx1cnt != 4) & (rx1cnt != 5) & (rx1cnt != 6),
                error_seen_rxclk.eq(1)
            )
        ]

        fsm = FSM(reset_state="WAIT_COMMA")
        self.submodules += fsm

        fsm.act("WAIT_COMMA",
            If(check,
                # Errors are still OK at this stage, as the transceiver
                # has just been reset and may output garbage data.
                If(comma_seen,
                    NextState("WAIT_NOERROR")
                ).Else(
                    self.restart.eq(1)
                ),
                checks_reset.i.eq(1)
            )
        )
        fsm.act("WAIT_NOERROR",
            If(check,
                If(comma_seen & ~error_seen,
                    NextState("READY")
                ).Else(
                    self.restart.eq(1),
                    NextState("WAIT_COMMA")
                ),
                checks_reset.i.eq(1)
            )
        )
        fsm.act("READY",
            # The periodic check is not used while ready; any error
            # immediately triggers a restart.
            reset_check_counter.eq(1),
            self.ready.eq(1),
            If(error_seen,
                checks_reset.i.eq(1),
                self.restart.eq(1),
                NextState("WAIT_COMMA")
            )
        )



class CXP_DownConn(Module):
    """Single GTXE2_CHANNEL with 8b10b coding and brute-force RX alignment.

    Creates the ``cxp_gtx_tx`` and ``cxp_gtx_rx`` clock domains, a 2-word
    8b10b ``encoder`` (TX domain), two 8b10b ``decoders`` (RX domain), the
    TX/RX ``GTXInit`` sequencers and a ``CXP_BruteforceClockAligner`` that
    drives ``rx_init.restart``.

    Parameters:
        refclk: signal connected to GTREFCLK0.
        pads: object providing ``rxp``, ``rxn``, ``txp`` and ``txn``.
        sys_clk_freq: ``sys`` clock frequency in Hz (forwarded to GTXInit
            and the clock aligner, and used for the PLL CLKIN1_PERIOD).
        tx_mode, rx_mode: "single", "master" or "slave"; forwarded to
            GTXInit and used to select manual phase alignment.

    Signals:
        tx_restart (in): forwarded to ``tx_init.restart``.
        rx_restart (in): declared but not connected inside this class.
        loopback_mode (in, 3 bits): drives the GTX LOOPBACK port.
        txenable (in): TXINHIBIT is driven with its inverse.
        rx_ready (out): ``ready`` output of the clock aligner.
        txoutclk, rxoutclk (out): raw transceiver clock outputs.
    """
    # Settings:
    # * GTX reference clock @ 125MHz
    # * GTX data width = 20
    # * GTX PLL frequency @ 3.125GHz
    # * GTX line rate (TX & RX) @ 3.125Gb/s
    # * GTX TX/RX USRCLK @ PLL/datawidth = 156MHz
    # NOTE(review): the 3.125GHz/156MHz figures appear to correspond to
    # cpll_div = 5; with cpll_div = 4 as set below the rates presumably
    # scale to 2.5GHz/125MHz - confirm against UG476.
    def __init__(self, refclk, pads, sys_clk_freq, tx_mode="single", rx_mode="single"):
        assert tx_mode in ["single", "master", "slave"]
        assert rx_mode in ["single", "master", "slave"]

        # cpll_div feeds CPLL_FBDIV; pll_div is the matching CLKOUT0 divider
        # of the TXUSRCLK PLL (cpll_div * pll_div == 40).
        cpll_div = 4
        pll_div = int(40/cpll_div)

        self.rx_restart = Signal()
        self.tx_restart = Signal()
        self.loopback_mode = Signal(3)

        self.txenable = Signal()
        self.submodules.encoder = ClockDomainsRenamer("cxp_gtx_tx")(Encoder(2, True))
        self.submodules.decoders = [ClockDomainsRenamer("cxp_gtx_rx")(
            (Decoder(True))) for _ in range(2)]
        self.rx_ready = Signal()

        # transceiver direct clock outputs
        # useful to specify clock constraints in a way palatable to Vivado
        self.txoutclk = Signal()
        self.rxoutclk = Signal()

        # # #

        cpllreset = Signal()
        cplllock = Signal()
        # TX generates cxp_tx clock, init must be in system domain
        self.submodules.tx_init = tx_init = GTXInit(sys_clk_freq, False, mode=tx_mode)
        # RX receives restart commands from RTIO domain
        self.submodules.rx_init = rx_init = GTXInit(sys_clk_freq, True, mode=rx_mode)
        # Only tx_init controls the CPLL reset; both sequencers observe lock.
        self.comb += [
            cpllreset.eq(tx_init.cpllreset),
            tx_init.cplllock.eq(cplllock),
            rx_init.cplllock.eq(cplllock)
        ]

        # 2 x 10-bit symbols; bits 8 and 9 of each symbol are routed through
        # the CHARDISPVAL/CHARDISPMODE (TX) and CHARISK/DISPERR (RX) ports.
        txdata = Signal(20)
        rxdata = Signal(20)
        # Note: the following parameters were set after consulting AR45360
        self.specials += \
            Instance("GTXE2_CHANNEL",
                # PMA Attributes
                p_PMA_RSV=0x00018480,
                p_PMA_RSV2=0x2050,              # PMA_RSV2[5] = 0: Eye scan feature disabled
                p_PMA_RSV3=0,
                p_PMA_RSV4=1,                   # PMA_RSV[4],RX_CM_TRIM[2:0] = 0b1010: Common mode 800mV
                p_RX_BIAS_CFG=0b000000000100,
                p_RX_OS_CFG=0b0000010000000,
                p_RX_CLK25_DIV=5,
                p_TX_CLK25_DIV=5,

                # Power-Down Attributes
                p_PD_TRANS_TIME_FROM_P2=0x3c,
                p_PD_TRANS_TIME_NONE_P2=0x3c,
                p_PD_TRANS_TIME_TO_P2=0x64,

                # CPLL
                p_CPLL_CFG=0xBC07DC,
                p_CPLL_FBDIV=cpll_div,
                p_CPLL_FBDIV_45=5,
                p_CPLL_REFCLK_DIV=1,
                p_RXOUT_DIV=2,
                p_TXOUT_DIV=2,
                p_CPLL_INIT_CFG=0x00001E,
                p_CPLL_LOCK_CFG=0x01E8,
                i_CPLLRESET=cpllreset,
                i_CPLLPD=cpllreset,
                o_CPLLLOCK=cplllock,
                i_CPLLLOCKEN=1,
                i_CPLLREFCLKSEL=0b001,
                i_TSTIN=2**20-1,
                i_GTREFCLK0=refclk,

                # TX clock
                p_TXBUF_EN="FALSE",
                p_TX_XCLK_SEL="TXUSR",
                o_TXOUTCLK=self.txoutclk,
                i_TXSYSCLKSEL=0b00,
                i_TXOUTCLKSEL=0b11,

                # TX Startup/Reset
                i_TXPHDLYRESET=0,
                i_TXDLYBYPASS=0,
                i_TXPHALIGNEN=1 if tx_mode != "single" else 0,
                i_GTTXRESET=tx_init.gtXxreset,
                o_TXRESETDONE=tx_init.Xxresetdone,
                i_TXDLYSRESET=tx_init.Xxdlysreset,
                o_TXDLYSRESETDONE=tx_init.Xxdlysresetdone,
                i_TXPHINIT=tx_init.txphinit if tx_mode != "single" else 0,
                o_TXPHINITDONE=tx_init.txphinitdone if tx_mode != "single" else Signal(),
                i_TXPHALIGN=tx_init.Xxphalign if tx_mode != "single" else 0,
                i_TXDLYEN=tx_init.Xxdlyen if tx_mode != "single" else 0,
                o_TXPHALIGNDONE=tx_init.Xxphaligndone,
                i_TXUSERRDY=tx_init.Xxuserrdy,
                p_TXPMARESET_TIME=1,
                p_TXPCSRESET_TIME=1,
                i_TXINHIBIT=~self.txenable,

                # TX data
                p_TX_DATA_WIDTH=20,
                p_TX_INT_DATAWIDTH=0,
                i_TXCHARDISPMODE=Cat(txdata[9], txdata[19]),
                i_TXCHARDISPVAL=Cat(txdata[8], txdata[18]),
                i_TXDATA=Cat(txdata[:8], txdata[10:18]),
                i_TXUSRCLK=ClockSignal("cxp_gtx_tx"),
                i_TXUSRCLK2=ClockSignal("cxp_gtx_tx"),

                # TX electrical
                i_TXBUFDIFFCTRL=0b100,
                i_TXDIFFCTRL=0b1000,

                # RX Startup/Reset
                i_RXPHDLYRESET=0,
                i_RXDLYBYPASS=0,
                i_RXPHALIGNEN=1 if rx_mode != "single" else 0,
                i_GTRXRESET=rx_init.gtXxreset,
                o_RXRESETDONE=rx_init.Xxresetdone,
                i_RXDLYSRESET=rx_init.Xxdlysreset,
                o_RXDLYSRESETDONE=rx_init.Xxdlysresetdone,
                i_RXPHALIGN=rx_init.Xxphalign if rx_mode != "single" else 0,
                i_RXDLYEN=rx_init.Xxdlyen if rx_mode != "single" else 0,
                o_RXPHALIGNDONE=rx_init.Xxphaligndone,
                i_RXUSERRDY=rx_init.Xxuserrdy,
                p_RXPMARESET_TIME=1,
                p_RXPCSRESET_TIME=1,

                # RX AFE
                p_RX_DFE_XYD_CFG=0,
                p_RX_CM_SEL=0b11,               # RX_CM_SEL = 0b11: Common mode is programmable
                p_RX_CM_TRIM=0b010,             # PMA_RSV[4],RX_CM_TRIM[2:0] = 0b1010: Common mode 800mV
                i_RXDFEXYDEN=1,
                i_RXDFEXYDHOLD=0,
                i_RXDFEXYDOVRDEN=0,
                i_RXLPMEN=0,                    # RXLPMEN = 0: DFE mode is enabled
                p_RX_DFE_GAIN_CFG=0x0207EA,
                p_RX_DFE_VP_CFG=0b00011111100000011,
                p_RX_DFE_UT_CFG=0b10001000000000000,
                p_RX_DFE_KL_CFG=0b0000011111110,
                p_RX_DFE_KL_CFG2=0x3788140A,
                p_RX_DFE_H2_CFG=0b000110000000,
                p_RX_DFE_H3_CFG=0b000110000000,
                p_RX_DFE_H4_CFG=0b00011100000,
                p_RX_DFE_H5_CFG=0b00011100000,
                p_RX_DFE_LPM_CFG=0x0904,        # RX_DFE_LPM_CFG = 0x0904: linerate <= 6.6Gb/s
                                                #                = 0x0104: linerate > 6.6Gb/s

                # RX clock
                i_RXDDIEN=1,
                i_RXSYSCLKSEL=0b00,
                i_RXOUTCLKSEL=0b010,
                o_RXOUTCLK=self.rxoutclk,
                i_RXUSRCLK=ClockSignal("cxp_gtx_rx"),
                i_RXUSRCLK2=ClockSignal("cxp_gtx_rx"),

                # RX Clock Correction Attributes
                p_CLK_CORRECT_USE="FALSE",
                p_CLK_COR_SEQ_1_1=0b0100000000,
                p_CLK_COR_SEQ_2_1=0b0100000000,
                p_CLK_COR_SEQ_1_ENABLE=0b1111,
                p_CLK_COR_SEQ_2_ENABLE=0b1111,

                # RX data
                p_RX_DATA_WIDTH=20,
                p_RX_INT_DATAWIDTH=0,
                o_RXDISPERR=Cat(rxdata[9], rxdata[19]),
                o_RXCHARISK=Cat(rxdata[8], rxdata[18]),
                o_RXDATA=Cat(rxdata[:8], rxdata[10:18]),

                # RX Byte and Word Alignment Attributes
                p_ALIGN_COMMA_DOUBLE="FALSE",
                p_ALIGN_COMMA_ENABLE=0b1111111111,
                p_ALIGN_COMMA_WORD=1,
                p_ALIGN_MCOMMA_DET="TRUE",
                p_ALIGN_MCOMMA_VALUE=0b1010000011,
                p_ALIGN_PCOMMA_DET="TRUE",
                p_ALIGN_PCOMMA_VALUE=0b0101111100,
                p_SHOW_REALIGN_COMMA="FALSE",
                p_RXSLIDE_AUTO_WAIT=7,
                p_RXSLIDE_MODE="PCS",
                p_RX_SIG_VALID_DLY=10,

                # RX 8B/10B Decoder Attributes
                p_RX_DISPERR_SEQ_MATCH="FALSE",
                p_DEC_MCOMMA_DETECT="TRUE",
                p_DEC_PCOMMA_DETECT="TRUE",
                p_DEC_VALID_COMMA_ONLY="FALSE",

                # RX Buffer Attributes
                p_RXBUF_ADDR_MODE="FAST",
                p_RXBUF_EIDLE_HI_CNT=0b1000,
                p_RXBUF_EIDLE_LO_CNT=0b0000,
                p_RXBUF_EN="FALSE",
                p_RX_BUFFER_CFG=0b000000,
                p_RXBUF_RESET_ON_CB_CHANGE="TRUE",
                p_RXBUF_RESET_ON_COMMAALIGN="FALSE",
                p_RXBUF_RESET_ON_EIDLE="FALSE",     # RXBUF_RESET_ON_EIDLE = FALSE: OOB is disabled
                p_RXBUF_RESET_ON_RATE_CHANGE="TRUE",
                p_RXBUFRESET_TIME=0b00001,
                p_RXBUF_THRESH_OVFLW=61,
                p_RXBUF_THRESH_OVRD="FALSE",
                p_RXBUF_THRESH_UNDFLW=4,
                p_RXDLY_CFG=0x001F,
                p_RXDLY_LCFG=0x030,
                p_RXDLY_TAP_CFG=0x0000,
                p_RXPH_CFG=0xC00002,
                p_RXPHDLY_CFG=0x084020,
                p_RXPH_MONITOR_SEL=0b00000,
                p_RX_XCLK_SEL="RXUSR",
                p_RX_DDI_SEL=0b000000,
                p_RX_DEFER_RESET_BUF_EN="TRUE",

                # CDR Attributes
                p_RXCDR_CFG=0x03_0000_23FF_1040_0020,   # DFE @ <= 6.6Gb/s, 8B/10B encoded data, CDR setting < +/- 200ppm
                                                        # (See UG476 (v1.12.1), p.205)
                p_RXCDR_FR_RESET_ON_EIDLE=0b0,
                p_RXCDR_HOLD_DURING_EIDLE=0b0,
                p_RXCDR_PH_RESET_ON_EIDLE=0b0,
                p_RXCDR_LOCK_CFG=0b010101,

                # Pads
                i_GTXRXP=pads.rxp,
                i_GTXRXN=pads.rxn,
                o_GTXTXP=pads.txp,
                o_GTXTXN=pads.txn,

                # ! loopback for debugging
                i_LOOPBACK = self.loopback_mode,
                p_TX_LOOPBACK_DRIVE_HIZ = "FALSE",
                p_RXPRBS_ERR_LOOPBACK = 0b0,

                # Other parameters
                p_PCS_RSVD_ATTR=(
                    (tx_mode != "single") << 1 |    # PCS_RSVD_ATTR[1] = 0: TX Single Lane Auto Mode
                                                    #                  = 1: TX Manual Mode
                    (rx_mode != "single") << 2 |    #              [2] = 0: RX Single Lane Auto Mode
                                                    #                  = 1: RX Manual Mode
                    0 << 8                          #              [8] = 0: OOB is disabled
                ),
                i_RXELECIDLEMODE=0b11,          # RXELECIDLEMODE = 0b11: OOB is disabled
                p_RX_DFE_LPM_HOLD_DURING_EIDLE=0b0,
                p_ES_EYE_SCAN_EN="TRUE",        # Must be TRUE for GTX
            )


        # TX clocking
        # A PLL is used to generate the correct frequency for TXUSRCLK (UG476 Equation 3-1)
        self.clock_domains.cd_cxp_gtx_tx = ClockDomain()
        txpll_fb_clk = Signal()
        # NOTE(review): txpll_reset is never driven in this class, so the
        # PLL RST input stays at the signal's reset value - confirm intent.
        txpll_reset = Signal()
        txpll_locked = Signal()
        txoutclk_buf = Signal()
        txpll_clkout = Signal()
        self.specials += [
            Instance("PLLE2_ADV",
                p_BANDWIDTH="HIGH",
                o_LOCKED=txpll_locked,
                i_RST=txpll_reset,

                # NOTE(review): CLKIN1 is fed from TXOUTCLK, but its period
                # is computed from sys_clk_freq; this is only right when
                # both run at the same frequency - confirm.
                p_CLKIN1_PERIOD=1e9/sys_clk_freq, # ns
                i_CLKIN1=txoutclk_buf,

                # VCO @ 1.25GHz
                p_CLKFBOUT_MULT=10, p_DIVCLK_DIVIDE=1,
                i_CLKFBIN=txpll_fb_clk, o_CLKFBOUT=txpll_fb_clk,

                # 156.25MHz
                # NOTE(review): 1.25GHz / pll_div; with pll_div = 10 this
                # would be 125MHz rather than 156.25MHz - confirm.
                p_CLKOUT0_DIVIDE=pll_div, p_CLKOUT0_PHASE=0.0, o_CLKOUT0=txpll_clkout,

                # TODO: DRP for line rate change
            ),
            Instance("BUFG", i_I=self.txoutclk, o_O=txoutclk_buf),
            Instance("BUFG", i_I=txpll_clkout, o_O=self.cd_cxp_gtx_tx.clk),
            # Keep the TX domain in reset until BOTH the PLL has locked and
            # the TX init sequence is done. The reset condition is thus the
            # OR of the two "not ready" flags; ANDing them would release
            # the reset as soon as either one became ready.
            AsyncResetSynchronizer(self.cd_cxp_gtx_tx, ~txpll_locked | ~tx_init.done)
        ]

        # RX clocking
        # the CDR matches the required frequency for RXUSRCLK, no need for PLL
        self.clock_domains.cd_cxp_gtx_rx = ClockDomain()
        self.specials += [
            Instance("BUFG", i_I=self.rxoutclk, o_O=self.cd_cxp_gtx_rx.clk),
            AsyncResetSynchronizer(self.cd_cxp_gtx_rx, ~rx_init.done)
        ]

        # Encoder word 0 occupies the low 10 bits of txdata, word 1 the
        # high 10 bits; the decoders split rxdata the same way.
        self.comb += [
            txdata.eq(Cat(self.encoder.output[0], self.encoder.output[1])),
            self.decoders[0].input.eq(rxdata[:10]),
            self.decoders[1].input.eq(rxdata[10:])
        ]

        # 6e-3 is too slow for 3.25Gbps line rate
        clock_aligner = CXP_BruteforceClockAligner(0b0101111100, sys_clk_freq, check_period=1e-2)
        self.submodules += clock_aligner
        # The aligner, not self.rx_restart, drives the RX restart request.
        self.comb += [
            clock_aligner.rxdata.eq(rxdata),
            rx_init.restart.eq(clock_aligner.restart),
            self.rx_ready.eq(clock_aligner.ready),
            tx_init.restart.eq(self.tx_restart)
        ]