diff --git a/axi/dma/rtl/v2/AxiStreamDmaV2Write.vhd b/axi/dma/rtl/v2/AxiStreamDmaV2Write.vhd index cf585424a8..4fa3726d77 100755 --- a/axi/dma/rtl/v2/AxiStreamDmaV2Write.vhd +++ b/axi/dma/rtl/v2/AxiStreamDmaV2Write.vhd @@ -383,9 +383,28 @@ begin if r.awlen = 0 then -- Set the flag v.wMaster.wlast := '1'; - -- If next state has not already been updated go to idle + -- If next state has not already been updated, decide where + -- to go next. if v.state = MOVE_S then - v.state := IDLE_S; + -- If this burst exactly filled the buffer while the + -- frame continues, start the continue here (return the + -- descriptor with continue set) rather than re-entering + -- ADDR_S. Re-entering ADDR_S with maxSize=0 would issue + -- a stray zero-length burst at the next buffer's base + -- address; on a host DMA that off-by-one write lands + -- past the mapped page and raises an IOMMU page fault. + -- (contEn=0 keeps the legacy overflow/drop path.) + if (v.dmaWrTrack.maxSize(31 downto log2(DATA_BYTES_C)) = 0) and (r.dmaWrTrack.contEn = '1') then + v.continue := '1'; + v.dmaWrTrack.inUse := '0'; + if r.dmaWrTrack.metaEnable = '1' then + v.state := META_S; + else + v.state := RETURN_S; + end if; + else + v.state := IDLE_S; + end if; end if; else -- Decrement the transaction counter diff --git a/tests/axi/dma/test_AxiStreamDmaV2FifoLoopback.py b/tests/axi/dma/test_AxiStreamDmaV2FifoLoopback.py new file mode 100644 index 0000000000..deee5ac728 --- /dev/null +++ b/tests/axi/dma/test_AxiStreamDmaV2FifoLoopback.py @@ -0,0 +1,122 @@ +############################################################################## +## This file is part of 'SLAC Firmware Standard Library'. +## It is subject to the license terms in the LICENSE.txt file found in the +## top-level directory of this distribution and at: +## https://confluence.slac.stanford.edu/display/ppareg/LICENSE.html. 
+## No part of 'SLAC Firmware Standard Library', including this file,
+## may be copied, modified, propagated, or distributed except according to
+## the terms contained in the LICENSE.txt file.
+##############################################################################
+
+# Test methodology:
+# - Datapath: Drive a single AXI-Stream frame into the integrated store-and-forward
+#   FIFO (sAxis), let it buffer through the AXI4 memory model (M_AXI -> AxiRam), and
+#   capture the forwarded frame on the output (mAxis).
+# - Boundary: The IP-integrator wrapper bakes BUFF_FRAME_WIDTH_G = 8, i.e. a 256-byte
+#   per-buffer frame. A frame larger than 256 B is split across multiple buffers with
+#   the "continue" bit set on every buffer except the last, then re-merged on readback
+#   (AxiStreamDmaV2Read drives tLast := not continue). This is the same mechanism that,
+#   on the XilinxVariumC1100 HBM buffer (BUFF_FRAME_WIDTH_G = 19), splits frames at
+#   512 KiB. Testing at the 256-byte sim boundary exercises identical logic.
+# - Check: The forwarded frame must be byte-for-byte identical to the injected frame
+#   and arrive as ONE frame (single tLast). A truncated/fragmented output frame means
+#   the continue re-merge is broken.
+
+import os
+
+import cocotb
+import pytest
+from cocotb.triggers import RisingEdge, Timer
+from cocotbext.axi import (
+    AxiBus,
+    AxiRam,
+    AxiStreamBus,
+    AxiStreamFrame,
+    AxiStreamSink,
+    AxiStreamSource,
+)
+
+from tests.common.regression_utils import run_surf_vhdl_test, start_lockstep_clocks
+
+# Per-buffer frame size baked into AxiStreamDmaV2FifoIpIntegrator (BUFF_FRAME_WIDTH_G=8).
+BUFFER_FRAME_BYTES = 256
+
+
+class TB:  # Testbench harness: starts the clocks, holds reset, and attaches the stream/RAM models.
+    def __init__(self, dut) -> None:
+        self.dut = dut
+
+        start_lockstep_clocks(dut.axiClk, dut.axilClk, period_ns=5.0)
+        dut.axiRst.setimmediatevalue(1)  # both resets start asserted; reset() releases them
+        dut.axilRst.setimmediatevalue(1)
+        dut.axiReady.setimmediatevalue(1)
+
+        # Source drives sAxis, sink captures mAxis, AxiRam backs the M_AXI store.
+        self.source = AxiStreamSource(AxiStreamBus.from_prefix(dut, "S_AXIS"), dut.axiClk, dut.axiRst)
+        self.sink = AxiStreamSink(AxiStreamBus.from_prefix(dut, "M_AXIS"), dut.axiClk, dut.axiRst)
+        self.ram = AxiRam(AxiBus.from_prefix(dut, "M_AXI"), dut.axiClk, dut.axiRst, size=2 ** 16)
+
+    async def cycle(self, count: int = 1) -> None:  # wait `count` rising edges of axiClk
+        for _ in range(count):
+            await RisingEdge(self.dut.axiClk)
+        await Timer(1, unit="ns")  # then step 1 ns past the edge
+
+    async def reset(self) -> None:  # assert both resets for 8 cycles, release, then wait 64 cycles
+        self.dut.axiRst.value = 1
+        self.dut.axilRst.value = 1
+        await self.cycle(8)
+        self.dut.axiRst.value = 0
+        self.dut.axilRst.value = 0
+        # The FIFO self-loads its internal free list (INIT_S) after reset; give it
+        # time to populate before injecting traffic.
+        await self.cycle(64)
+
+
+@cocotb.test(timeout_time=2, timeout_unit="ms")
+async def fifo_loopback_frame_test(dut) -> None:  # send one frame in, expect the same single frame out
+    tb = TB(dut)
+    await tb.reset()
+
+    size = int(os.environ.get("FRAME_SIZE", "260"))  # default is 4 bytes over BUFFER_FRAME_BYTES
+    payload = bytes((i & 0xFF) for i in range(size))  # incrementing byte pattern, wraps at 256
+
+    await tb.source.send(AxiStreamFrame(payload, tdest=0, tid=0))
+    rx = await tb.sink.recv()  # only the first output frame is examined
+    got = bytes(rx.tdata)
+
+    crosses = size > BUFFER_FRAME_BYTES  # True when the frame needs more than one buffer
+    assert len(got) == size, (
+        f"output frame size {len(got)} != injected {size} "
+        f"(buffer frame = {BUFFER_FRAME_BYTES} B; "
+        f"{'continue re-merge produced a fragmented frame' if crosses else 'single-buffer frame corrupted'})"
+    )
+    assert got == payload, (
+        f"payload mismatch through store-and-forward buffer (size={size}, "
+        f"crosses 256 B boundary={crosses})"
+    )
+
+
+@pytest.mark.parametrize(
+    "parameters",
+    [
+        pytest.param({"FRAME_SIZE": 256}, id="one_buffer_256B"),
+        pytest.param({"FRAME_SIZE": 260}, id="continue_boundary_260B"),
+        pytest.param({"FRAME_SIZE": 1024}, id="multi_buffer_1024B"),
+    ],
+)
+def test_AxiStreamDmaV2FifoLoopback(parameters: dict) -> None:  # pytest entry: one simulator run per FRAME_SIZE
+    run_surf_vhdl_test(
+        test_file=__file__,
+        toplevel="surf.axistreamdmav2fifoipintegrator",
+        # FRAME_SIZE is consumed by the bench via the environment, not a VHDL generic.
+        extra_env=parameters,
+        extra_vhdl_sources={
+            "surf": [
+                "axi/axi-lite/ip_integrator/SlaveAxiLiteIpIntegrator.vhd",
+                "axi/axi-stream/ip_integrator/SlaveAxiStreamIpIntegrator.vhd",
+                "axi/axi-stream/ip_integrator/MasterAxiStreamIpIntegrator.vhd",
+                "axi/axi4/ip_integrator/MasterAxiIpIntegrator.vhd",
+                "axi/dma/ip_integrator/AxiStreamDmaV2FifoIpIntegrator.vhd",
+            ],
+        },
+    )
diff --git a/tests/axi/dma/test_AxiStreamDmaV2WriteContinue.py b/tests/axi/dma/test_AxiStreamDmaV2WriteContinue.py
new file mode 100644
index 0000000000..c3e30cafed
--- /dev/null
+++ b/tests/axi/dma/test_AxiStreamDmaV2WriteContinue.py
@@ -0,0 +1,192 @@
+##############################################################################
+## This file is part of 'SLAC Firmware Standard Library'.
+## It is subject to the license terms in the LICENSE.txt file found in the
+## top-level directory of this distribution and at:
+##    https://confluence.slac.stanford.edu/display/ppareg/LICENSE.html.
+## No part of 'SLAC Firmware Standard Library', including this file,
+## may be copied, modified, propagated, or distributed except according to
+## the terms contained in the LICENSE.txt file.
+##############################################################################
+
+# Test methodology:
+# - Drive a continued ("multi-buffer") frame: a single AXI-Stream frame longer
+#   than the descriptor maxSize, with contEn=1, so the write engine fills one
+#   buffer, returns it with continue=1, requests the NEXT descriptor at a
+#   DIFFERENT memory address, and carries on there until the frame ends.
+# - This mirrors the main PCIe DMA (AxiStreamDmaV2Desc) path on hardware, where
+#   each continued buffer is a separately-mapped host page. A stray write burst
+#   to the wrong / boundary address is exactly what triggers an IOMMU
+#   IO_PAGE_FAULT on real hardware once a frame crosses a buffer boundary.
+# - Checks:
+#     * Every returned descriptor carries continue=1 except the last (continue=0).
+#     * Each buffer's bytes land at its own address.
+#     * EVERY M_AXI write burst targets one of the declared buffer windows
+#       -- no stray burst to an unmapped address.
+
+import os
+
+import cocotb
+import pytest
+from cocotb.clock import Clock
+from cocotb.triggers import RisingEdge, Timer
+from cocotbext.axi import AxiRamWrite, AxiStreamBus, AxiStreamFrame, AxiStreamSource, AxiWriteBus
+
+from tests.common.regression_utils import hdl_parameters_from, run_surf_vhdl_test
+
+
+def logic_int(value) -> int:  # int(value), or 0 when the conversion raises ValueError
+    try:
+        return int(value)
+    except ValueError:  # e.g. a simulator value that is not a resolved 0/1
+        return 0
+
+
+class TB:  # Testbench harness: clock, descriptor responder, AW monitor, stream source and write RAM.
+    def __init__(self, dut) -> None:
+        self.dut = dut
+        self.source = None  # created lazily by start_agents()
+        self.ram = None  # created lazily by start_agents()
+        self.aw_log = []  # (AWADDR, AWLEN) of every accepted write-address beat
+        # Buffer i is handed out at ADDR_BASE + i * ADDR_STRIDE on the i-th
+        # descriptor request. With the defaults (0x1000 base and stride, 32 B
+        # buffers) the buffers are 4 kB-aligned and widely spaced, so a write
+        # that overruns a buffer boundary lands in the gap between buffers
+        # (outside every buffer window) and is detectable.
+        self.base = int(os.environ.get("ADDR_BASE", "0x1000"), 0)
+        self.stride = int(os.environ.get("ADDR_STRIDE", "0x1000"), 0)
+        self.max_size = int(os.environ.get("DESC_MAX_SIZE", "32"), 0)
+        self.req_count = 0  # number of descriptor requests answered so far
+
+        cocotb.start_soon(Clock(dut.axiClk, 5.0, unit="ns").start())
+        dut.axiRst.setimmediatevalue(1)
+        dut.axiWriteCtrlPause.setimmediatevalue(0)
+        dut.axiWriteCtrlOver.setimmediatevalue(0)
+        dut.dmaWrDescAckValid.setimmediatevalue(0)
+        dut.dmaWrDescRetAck.setimmediatevalue(0)
+        cocotb.start_soon(self._descriptor_responder())  # both tasks run for the whole test,
+        cocotb.start_soon(self._monitor_aw())  # starting before reset() is even called
+
+    def buf_addr(self, i: int) -> int:  # base address of the i-th buffer handed out
+        return self.base + i * self.stride
+
+    async def cycle(self, count: int = 1) -> None:  # wait `count` rising edges of axiClk
+        for _ in range(count):
+            await RisingEdge(self.dut.axiClk)
+        await Timer(1, unit="ns")  # then step 1 ns past the edge
+
+    async def reset(self) -> None:  # 3 cycles in reset, then 3 cycles out of reset
+        self.dut.axiRst.setimmediatevalue(1)
+        await self.cycle(3)
+        self.dut.axiRst.value = 0
+        await self.cycle(3)
+
+    def start_agents(self) -> None:  # idempotent; NOTE(review): source binds to the "M_AXIS" prefix -- confirm it is the stream input
+        if self.source is None:
+            self.source = AxiStreamSource(AxiStreamBus.from_prefix(self.dut, "M_AXIS"), self.dut.axiClk, self.dut.axiRst)
+        if self.ram is None:
+            self.ram = AxiRamWrite(AxiWriteBus.from_prefix(self.dut, "M_AXI"), self.dut.axiClk, self.dut.axiRst, size=2 ** 16)
+
+    async def _descriptor_responder(self) -> None:  # answer each descriptor request with the next buffer
+        acked = False  # True while the currently asserted request has already been answered
+        while True:
+            await RisingEdge(self.dut.axiClk)
+            await Timer(1, unit="ns")
+            self.dut.dmaWrDescAckValid.value = 0  # ack is a one-cycle pulse
+            req = logic_int(self.dut.dmaWrDescReqValid.value)  # tolerate unresolved values, as _monitor_aw does
+            if not req:
+                acked = False  # request dropped: re-arm for the next one
+            if req and not acked:
+                acked = True
+                addr = self.buf_addr(self.req_count)
+                self.req_count += 1
+                self.dut.dmaWrDescAckAddress.value = addr
+                self.dut.dmaWrDescAckMetaEnable.value = 0
+                self.dut.dmaWrDescAckMetaAddr.value = 0
+                self.dut.dmaWrDescAckDropEn.value = 0
+                self.dut.dmaWrDescAckMaxSize.value = self.max_size
+                self.dut.dmaWrDescAckContEn.value = 1  # allow multi-buffer continue
+                self.dut.dmaWrDescAckBuffId.value = 0x1000 + self.req_count
+                self.dut.dmaWrDescAckTimeout.value = 0x1000
+                self.dut.dmaWrDescAckValid.value = 1
+
+    async def _monitor_aw(self) -> None:  # log every write-address beat accepted on M_AXI
+        while True:
+            await RisingEdge(self.dut.axiClk)
+            await Timer(1, unit="ns")
+            if logic_int(self.dut.M_AXI_AWVALID.value) and logic_int(self.dut.M_AXI_AWREADY.value):
+                self.aw_log.append((int(self.dut.M_AXI_AWADDR.value), int(self.dut.M_AXI_AWLEN.value)))
+
+
+@cocotb.test(timeout_time=200, timeout_unit="us")
+async def continue_multibuffer_write_test(dut) -> None:  # one frame spread over several descriptors
+    tb = TB(dut)
+    await tb.reset()
+    tb.start_agents()
+
+    # Frame spans ceil(total/max_size) buffers: every buffer but the last returns
+    # continue=1, the final buffer (partial or exactly full) returns continue=0.
+    total = int(os.environ.get("FRAME_SIZE", "48"), 0)
+    payload = bytes((i & 0xFF) for i in range(total))
+    await tb.source.send(AxiStreamFrame(payload, tdest=0, tid=0))
+
+    n_bufs = (total + tb.max_size - 1) // tb.max_size
+    sizes = [tb.max_size] * (n_bufs - 1) + [total - tb.max_size * (n_bufs - 1)]
+
+    # Collect every descriptor return (continue flag and size), acking each one.
+    returns = []
+    deadline = 40000  # cycle budget for the polling loop below
+    while len(returns) < n_bufs and deadline > 0:
+        await tb.cycle(1)
+        deadline -= 1
+        if logic_int(dut.dmaWrDescRetValid.value):  # unresolved valid reads as "no return yet"
+            returns.append(
+                (int(dut.dmaWrDescRetContinue.value), int(dut.dmaWrDescRetSize.value))
+            )
+            dut.dmaWrDescRetAck.value = 1  # one-cycle ack pulse
+            await tb.cycle(1)
+            dut.dmaWrDescRetAck.value = 0
+
+    assert len(returns) == n_bufs, f"expected {n_bufs} descriptor returns, got {returns}"
+    for i, (cont, size) in enumerate(returns):
+        exp_cont = 0 if i == n_bufs - 1 else 1
+        assert cont == exp_cont, f"buffer {i} continue={cont}, expected {exp_cont}; returns={returns}"
+        assert size == sizes[i], f"buffer {i} size={size}, expected {sizes[i]}; returns={returns}"
+
+    # Data integrity: each buffer holds its own slice at its own address.
+    off = 0
+    for i, sz in enumerate(sizes):
+        assert tb.ram.read(tb.buf_addr(i), sz) == payload[off:off + sz], f"buffer {i} payload mismatch"
+        off += sz
+
+    # No stray write: every AW burst must start inside one of the buffer windows.
+    # A burst in the gap between buffers is the IOMMU IO_PAGE_FAULT signature.
+    windows = [(tb.buf_addr(i), tb.buf_addr(i) + sizes[i]) for i in range(n_bufs)]
+    for addr, length in tb.aw_log:
+        in_window = any(lo <= addr < hi for lo, hi in windows)  # only the burst start address is checked
+        assert in_window, (
+            f"stray write burst at addr=0x{addr:x} len={length} outside buffer windows "
+            f"{[hex(w[0]) for w in windows]} -- this is the IOMMU IO_PAGE_FAULT signature"
+        )
+
+
+@pytest.mark.parametrize(
+    "parameters",
+    [
+        pytest.param({"DESC_MAX_SIZE": 32, "FRAME_SIZE": 48}, id="continue_48B_over_32B"),
+        pytest.param({"DESC_MAX_SIZE": 32, "FRAME_SIZE": 64}, id="continue_64B_two_full"),
+        pytest.param({"DESC_MAX_SIZE": 16, "FRAME_SIZE": 48, "BURST_BYTES_G": 8}, id="continue_48B_over_16B"),
+    ],
+)
+def test_AxiStreamDmaV2WriteContinue(parameters: dict) -> None:  # pytest entry: one simulator run per case
+    run_surf_vhdl_test(
+        test_file=__file__,
+        toplevel="surf.axistreamdmav2writeipintegrator",
+        parameters=hdl_parameters_from(parameters),
+        extra_env=parameters,  # the same dict also reaches the bench through the environment
+        extra_vhdl_sources={
+            "surf": [
+                "axi/axi4/ip_integrator/MasterAxiIpIntegrator.vhd",
+                "axi/dma/ip_integrator/AxiStreamDmaV2WriteIpIntegrator.vhd",
+            ],
+        },
+    )