From b6ac052e9f83e96b3f4f312f8afea966eb9012d3 Mon Sep 17 00:00:00 2001
From: mwojcik <mw@m-labs.hk>
Date: Thu, 18 Apr 2024 16:55:35 +0800
Subject: [PATCH] aux_controller: multiple receiver buffers

---
 artiq/gateware/drtio/aux_controller.py        | 70 +++++++++++--------
 artiq/gateware/targets/efc.py                 |  5 +-
 artiq/gateware/targets/kasli.py               | 21 +++---
 artiq/gateware/targets/kc705.py               | 14 ++--
 .../test/drtio/test_aux_controller.py         | 24 ++++---
 5 files changed, 81 insertions(+), 53 deletions(-)

diff --git a/artiq/gateware/drtio/aux_controller.py b/artiq/gateware/drtio/aux_controller.py
index 5f91b1f45..27ca2a15d 100644
--- a/artiq/gateware/drtio/aux_controller.py
+++ b/artiq/gateware/drtio/aux_controller.py
@@ -10,6 +10,7 @@ from misoc.interconnect import wishbone
 
 
 max_packet = 1024
+aux_buffer_count = 8
 
 
 class Transmitter(Module, AutoCSR):
@@ -95,13 +96,13 @@ class Transmitter(Module, AutoCSR):
 
 class Receiver(Module, AutoCSR):
     def __init__(self, link_layer, min_mem_dw):
-        self.aux_rx_length = CSRStatus(bits_for(max_packet))
         self.aux_rx_present = CSR()
         self.aux_rx_error = CSR()
+        self.aux_read_pointer = CSR(log2_int(aux_buffer_count))
 
         ll_dw = len(link_layer.rx_aux_data)
         mem_dw = max(min_mem_dw, ll_dw)
-        self.specials.mem = Memory(mem_dw, max_packet//(mem_dw//8))
+        self.specials.mem = Memory(mem_dw, aux_buffer_count*max_packet//(mem_dw//8))
 
         converter = ClockDomainsRenamer("rtio_rx")(
             stream.Converter(ll_dw, mem_dw))
@@ -123,30 +124,45 @@ class Receiver(Module, AutoCSR):
         mem_port = self.mem.get_port(write_capable=True, clock_domain="rtio_rx")
         self.specials += mem_port
 
+        # write pointer represents where the gateware is
+        write_pointer = Signal(log2_int(aux_buffer_count))
+        write_pointer_sys = Signal.like(write_pointer)
+        # read pointer represents where CPU is
+        # write reaching read is an error, read reaching write is buffer clear
+        read_pointer = Signal.like(write_pointer)
+        read_pointer_rx = Signal.like(write_pointer)
+
+        read_pointer.attr.add("no_retiming")
+        write_pointer.attr.add("no_retiming")
+        signal_error = PulseSynchronizer("rtio_rx", "sys")
+
+        self.specials += [
+            MultiReg(read_pointer, read_pointer_rx, "rtio_rx"),
+            MultiReg(write_pointer, write_pointer_sys)
+        ]
+
         frame_counter_nbits = bits_for(max_packet) - log2_int(mem_dw//8)
         frame_counter = Signal(frame_counter_nbits)
+        # frame counter requires one more bit to represent overflow (bits_for)
+        # actual valid packet will be addressed with one bit less
+        packet_nbits = frame_counter_nbits - 1
         self.comb += [
-            mem_port.adr.eq(frame_counter),
+            mem_port.adr[:packet_nbits].eq(frame_counter),
+            # bits above the frame counter point to current frame
+            mem_port.adr[packet_nbits:].eq(write_pointer),
             mem_port.dat_w.eq(converter.source.data),
-            converter.source.ack.eq(1)
+            converter.source.ack.eq(1),
+            self.aux_read_pointer.w.eq(read_pointer)
         ]
 
-        frame_counter.attr.add("no_retiming")
-        frame_counter_sys = Signal(frame_counter_nbits)
-        self.specials += MultiReg(frame_counter, frame_counter_sys)
-        self.comb += self.aux_rx_length.status.eq(frame_counter_sys << log2_int(mem_dw//8))
-
-        signal_frame = PulseSynchronizer("rtio_rx", "sys")
-        frame_ack = PulseSynchronizer("sys", "rtio_rx")
-        signal_error = PulseSynchronizer("rtio_rx", "sys")
-        self.submodules += signal_frame, frame_ack, signal_error
+        self.submodules += signal_error
         self.sync += [
-            If(self.aux_rx_present.re, self.aux_rx_present.w.eq(0)),
-            If(signal_frame.o, self.aux_rx_present.w.eq(1)),
             If(self.aux_rx_error.re, self.aux_rx_error.w.eq(0)),
-            If(signal_error.o, self.aux_rx_error.w.eq(1))
+            If(signal_error.o, self.aux_rx_error.w.eq(1)),
+            self.aux_rx_present.w.eq(~(read_pointer == write_pointer_sys)),
+            If(self.aux_rx_present.re & self.aux_rx_present.w,
+                read_pointer.eq(read_pointer + 1)),
         ]
-        self.comb += frame_ack.i.eq(self.aux_rx_present.re)
 
         fsm = ClockDomainsRenamer("rtio_rx")(FSM(reset_state="IDLE"))
         self.submodules += fsm
@@ -171,7 +187,7 @@ class Receiver(Module, AutoCSR):
                     NextState("FRAME")
                 )
             ).Else(
-                NextValue(frame_counter, 0)
+                NextValue(frame_counter, 0),
             )
         )
         fsm.act("FRAME",
@@ -189,14 +205,12 @@ class Receiver(Module, AutoCSR):
             )
         )
         fsm.act("SIGNAL_FRAME",
-            signal_frame.i.eq(1),
-            NextState("WAIT_ACK"),
-            If(converter.source.stb, signal_error.i.eq(1))
-        )
-        fsm.act("WAIT_ACK",
-            If(frame_ack.o,
-                NextValue(frame_counter, 0), 
-                NextState("IDLE")
+            NextState("IDLE"),
+            If((write_pointer + 1) == read_pointer_rx,
+                # on full buffer, overwrite only current frame
+                signal_error.i.eq(1)
+            ).Else(
+                NextValue(write_pointer, write_pointer + 1)
             ),
             If(converter.source.stb, signal_error.i.eq(1))
         )
@@ -215,8 +229,8 @@ class DRTIOAuxController(Module):
         tx_sdram_if = wishbone.SRAM(self.transmitter.mem, read_only=False, data_width=dw)
         rx_sdram_if = wishbone.SRAM(self.receiver.mem, read_only=True, data_width=dw)
         decoder = wishbone.Decoder(self.bus,
-            [(lambda a: a[log2_int(max_packet)-wsb] == 0, tx_sdram_if.bus),
-             (lambda a: a[log2_int(max_packet)-wsb] == 1, rx_sdram_if.bus)],
+            [(lambda a: a[log2_int(max_packet*aux_buffer_count)-wsb] == 0, tx_sdram_if.bus),
+             (lambda a: a[log2_int(max_packet*aux_buffer_count)-wsb] == 1, rx_sdram_if.bus)],
             register=True)
         self.submodules += tx_sdram_if, rx_sdram_if, decoder
 
diff --git a/artiq/gateware/targets/efc.py b/artiq/gateware/targets/efc.py
index 42a4f031b..ee26c5e0e 100644
--- a/artiq/gateware/targets/efc.py
+++ b/artiq/gateware/targets/efc.py
@@ -82,9 +82,10 @@ class Satellite(BaseSoC, AMPSoC):
             core.link_layer, self.cpu_dw))
         self.csr_devices.append("drtioaux0")
 
+        drtio_aux_mem_size = 1024 * 16 # max_packet * 8 buffers * 2 (tx, rx halves)
         memory_address = self.mem_map["drtioaux"]
-        self.add_wb_slave(memory_address, 0x800, self.drtioaux0.bus)
-        self.add_memory_region("drtioaux0_mem", memory_address | self.shadow_base, 0x800)
+        self.add_wb_slave(memory_address, drtio_aux_mem_size, self.drtioaux0.bus)
+        self.add_memory_region("drtioaux0_mem", memory_address | self.shadow_base, drtio_aux_mem_size)
 
         self.config["HAS_DRTIO"] = None
         self.add_csr_group("drtioaux", ["drtioaux0"])
diff --git a/artiq/gateware/targets/kasli.py b/artiq/gateware/targets/kasli.py
index d9ed6f446..cd1404322 100755
--- a/artiq/gateware/targets/kasli.py
+++ b/artiq/gateware/targets/kasli.py
@@ -236,10 +236,11 @@ class MasterBase(MiniSoC, AMPSoC):
             setattr(self.submodules, coreaux_name, coreaux)
             self.csr_devices.append(coreaux_name)
 
-            memory_address = self.mem_map["drtioaux"] + 0x800*i
-            self.add_wb_slave(memory_address, 0x800,
+            drtio_aux_mem_size = 1024 * 16 # max_packet * 8 buffers * 2 (tx, rx halves)
+            memory_address = self.mem_map["drtioaux"] + drtio_aux_mem_size*i
+            self.add_wb_slave(memory_address, drtio_aux_mem_size,
                               coreaux.bus)
-            self.add_memory_region(memory_name, memory_address | self.shadow_base, 0x800)
+            self.add_memory_region(memory_name, memory_address | self.shadow_base, drtio_aux_mem_size)
         self.config["HAS_DRTIO"] = None
         self.config["HAS_DRTIO_ROUTING"] = None
         self.config["DRTIO_ROLE"] = "master"
@@ -318,10 +319,11 @@ class MasterBase(MiniSoC, AMPSoC):
             setattr(self.submodules, coreaux_name, coreaux)
             self.csr_devices.append(coreaux_name)
 
-            memory_address = self.mem_map["drtioaux"] + 0x800*channel
-            self.add_wb_slave(memory_address, 0x800,
+            drtio_aux_mem_size = 1024 * 16 # max_packet * 8 buffers * 2 (tx, rx halves)
+            memory_address = self.mem_map["drtioaux"] + drtio_aux_mem_size*channel
+            self.add_wb_slave(memory_address, drtio_aux_mem_size,
                             coreaux.bus)
-            self.add_memory_region(memory_name, memory_address | self.shadow_base, 0x800)
+            self.add_memory_region(memory_name, memory_address | self.shadow_base, drtio_aux_mem_size)
 
     def add_drtio_cpuif_groups(self):
         self.add_csr_group("drtio", self.drtio_csr_group)
@@ -486,10 +488,11 @@ class SatelliteBase(BaseSoC, AMPSoC):
             setattr(self.submodules, coreaux_name, coreaux)
             self.csr_devices.append(coreaux_name)
 
-            memory_address = self.mem_map["drtioaux"] + 0x800*i
-            self.add_wb_slave(memory_address, 0x800,
+            drtio_aux_mem_size = 1024 * 16 # max_packet * 8 buffers * 2 (tx, rx halves)
+            memory_address = self.mem_map["drtioaux"] + drtio_aux_mem_size * i
+            self.add_wb_slave(memory_address, drtio_aux_mem_size,
                               coreaux.bus)
-            self.add_memory_region(memory_name, memory_address | self.shadow_base, 0x800)
+            self.add_memory_region(memory_name, memory_address | self.shadow_base, drtio_aux_mem_size)
         self.config["HAS_DRTIO"] = None
         self.config["HAS_DRTIO_ROUTING"] = None
         self.config["DRTIO_ROLE"] = "satellite"
diff --git a/artiq/gateware/targets/kc705.py b/artiq/gateware/targets/kc705.py
index b3b5a4af7..4200919ae 100755
--- a/artiq/gateware/targets/kc705.py
+++ b/artiq/gateware/targets/kc705.py
@@ -247,10 +247,11 @@ class _MasterBase(MiniSoC, AMPSoC):
             setattr(self.submodules, coreaux_name, coreaux)
             self.csr_devices.append(coreaux_name)
 
-            memory_address = self.mem_map["drtioaux"] + 0x800*i
-            self.add_wb_slave(memory_address, 0x800,
+            drtio_aux_mem_size = 1024 * 16 # max_packet * 8 buffers * 2 (tx, rx halves)
+            memory_address = self.mem_map["drtioaux"] + drtio_aux_mem_size*i
+            self.add_wb_slave(memory_address, drtio_aux_mem_size,
                               coreaux.bus)
-            self.add_memory_region(memory_name, memory_address | self.shadow_base, 0x800)
+            self.add_memory_region(memory_name, memory_address | self.shadow_base, drtio_aux_mem_size)
         self.config["HAS_DRTIO"] = None
         self.config["HAS_DRTIO_ROUTING"] = None
         self.add_csr_group("drtio", drtio_csr_group)
@@ -405,10 +406,11 @@ class _SatelliteBase(BaseSoC, AMPSoC):
             setattr(self.submodules, coreaux_name, coreaux)
             self.csr_devices.append(coreaux_name)
 
-            memory_address = self.mem_map["drtioaux"] + 0x800*i
-            self.add_wb_slave(memory_address, 0x800,
+            drtio_aux_mem_size = 1024 * 16 # max_packet * 8 buffers * 2 (tx, rx halves)
+            memory_address = self.mem_map["drtioaux"] + drtio_aux_mem_size*i
+            self.add_wb_slave(memory_address, drtio_aux_mem_size,
                               coreaux.bus)
-            self.add_memory_region(memory_name, memory_address | self.shadow_base, 0x800)
+            self.add_memory_region(memory_name, memory_address | self.shadow_base, drtio_aux_mem_size)
         self.config["HAS_DRTIO"] = None
         self.config["HAS_DRTIO_ROUTING"] = None
         self.add_csr_group("drtioaux", drtioaux_csr_group)
diff --git a/artiq/gateware/test/drtio/test_aux_controller.py b/artiq/gateware/test/drtio/test_aux_controller.py
index 6c65307ed..1ea809c0b 100644
--- a/artiq/gateware/test/drtio/test_aux_controller.py
+++ b/artiq/gateware/test/drtio/test_aux_controller.py
@@ -60,28 +60,36 @@ class TestAuxController(unittest.TestCase):
             while (yield from dut[dw].aux_controller.transmitter.aux_tx.read()):
                 yield
 
-        def receive_packet(dw):
+        def receive_packet(pkt_no, len, dw):
             while not (yield from dut[dw].aux_controller.receiver.aux_rx_present.read()):
                 yield
-            length = yield from dut[dw].aux_controller.receiver.aux_rx_length.read()
+            p = yield from dut[dw].aux_controller.receiver.aux_read_pointer.read()
+            self.assertEqual(pkt_no, p)
             r = []
-            for i in range(length//(dw//8)):
+            # packet length is derived by software now, so we pass it for the test
+            for i in range(len):
+                if dw == 64:
+                    offset = 2048
+                if dw == 32:
+                    offset = 1024
+                print(dw)
+                max_packet = 1024
                 r.append((yield from dut[dw].aux_controller.bus.read(256+i)))
             yield from dut[dw].aux_controller.receiver.aux_rx_present.write(1)
             return r
 
         prng = random.Random(0)
 
-        def send_and_check_packet(dw):
+        def send_and_check_packet(i, dw):
             data = [prng.randrange(2**dw-1) for _ in range(prng.randrange(1, 16))]
             yield from send_packet(data, dw)
-            received = yield from receive_packet(dw)
+            received = yield from receive_packet(i, len(data), dw)
             self.assertEqual(data, received)
 
         def sim(dw):
             yield from link_init(dw)
             for i in range(8):
-                yield from send_and_check_packet(dw)
+                yield from send_and_check_packet(i, dw)
 
         @passive
         def rt_traffic(dw):
@@ -95,5 +103,5 @@ class TestAuxController(unittest.TestCase):
                 yield dut[dw].link_layer.tx_rt_frame.eq(0)
                 yield
 
-        run_simulation(dut[32], [sim(32), rt_traffic(32)])
-        run_simulation(dut[64], [sim(64), rt_traffic(64)])
+        run_simulation(dut[32], [sim(32), rt_traffic(32)], vcd_name="32bit.vcd")
+        run_simulation(dut[64], [sim(64), rt_traffic(64)], vcd_name="64bit.vcd")