diff --git a/src/dma/rtl/taxi_dma_psdpram_async.sv b/src/dma/rtl/taxi_dma_psdpram_async.sv
new file mode 100644
index 0000000..55c6e0c
--- /dev/null
+++ b/src/dma/rtl/taxi_dma_psdpram_async.sv
@@ -0,0 +1,129 @@
+// SPDX-License-Identifier: CERN-OHL-S-2.0
+/*
+
+Copyright (c) 2019-2025 FPGA Ninja, LLC
+
+Authors:
+- Alex Forencich
+
+*/
+
+`resetall
+`timescale 1ns / 1ps
+`default_nettype none
+
+/*
+ * DMA parallel simple dual port RAM (asynchronous)
+ */
+module taxi_dma_psdpram_async #
+(
+    // RAM size
+    parameter SIZE = 4096,
+    // Read data output pipeline stages
+    parameter PIPELINE = 2
+)
+(
+    /*
+     * Write port
+     */
+    input wire clk_wr,
+    input wire rst_wr,
+    taxi_dma_ram_if.wr_slv dma_ram_wr,
+
+    /*
+     * Read port
+     */
+    input wire clk_rd,
+    input wire rst_rd,
+    taxi_dma_ram_if.rd_slv dma_ram_rd
+);
+
+localparam SEGS = dma_ram_wr.SEGS;
+localparam SEG_ADDR_W = dma_ram_wr.SEG_ADDR_W;
+localparam SEG_DATA_W = dma_ram_wr.SEG_DATA_W;
+localparam SEG_BE_W = dma_ram_wr.SEG_BE_W;
+
+localparam INT_ADDR_W = $clog2(SIZE/(SEGS*SEG_BE_W));
+
+// check configuration
+if (SEG_ADDR_W < INT_ADDR_W)
+    $fatal(0, "Error: SEG_ADDR_W not sufficient for requested size (min %d for size %d) (instance %m)", INT_ADDR_W, SIZE);
+
+for (genvar n = 0; n < SEGS; n = n + 1) begin
+
+    (* ramstyle = "no_rw_check" *)
+    logic [SEG_DATA_W-1:0] mem_reg[2**INT_ADDR_W];
+
+    logic wr_done_reg = 1'b0;
+
+    logic [PIPELINE-1:0] rd_resp_valid_pipe_reg = '0;
+    logic [SEG_DATA_W-1:0] rd_resp_data_pipe_reg[PIPELINE];
+
+    initial begin
+        // two nested loops for smaller number of iterations per loop
+        // workaround for synthesizer complaints about large loop counts
+        for (integer i = 0; i < 2**INT_ADDR_W; i = i + 2**(INT_ADDR_W/2)) begin
+            for (integer j = i; j < i + 2**(INT_ADDR_W/2); j = j + 1) begin
+                mem_reg[j] = '0;
+            end
+        end
+
+        for (integer i = 0; i < PIPELINE; i = i + 1) begin
+            rd_resp_data_pipe_reg[i] = '0;
+        end
+    end
+
+    always_ff @(posedge clk_wr) begin
+        wr_done_reg <= 1'b0;
+
+        for (integer i = 0; i < SEG_BE_W; i = i + 1) begin
+            if (dma_ram_wr.wr_cmd_valid[n] && dma_ram_wr.wr_cmd_be[n][i]) begin
+                mem_reg[dma_ram_wr.wr_cmd_addr[n][INT_ADDR_W-1:0]][i*8 +: 8] <= dma_ram_wr.wr_cmd_data[n][i*8 +: 8];
+            end
+            wr_done_reg <= dma_ram_wr.wr_cmd_valid[n];
+        end
+
+        if (rst_wr) begin
+            wr_done_reg <= 1'b0;
+        end
+    end
+
+    // write commands are always accepted; done is reported one cycle later
+    assign dma_ram_wr.wr_cmd_ready[n] = 1'b1;
+    assign dma_ram_wr.wr_done[n] = wr_done_reg;
+
+    always_ff @(posedge clk_rd) begin
+        if (dma_ram_rd.rd_resp_ready[n]) begin
+            rd_resp_valid_pipe_reg[PIPELINE-1] <= 1'b0;
+        end
+
+        for (integer j = PIPELINE-1; j > 0; j = j - 1) begin
+            if (dma_ram_rd.rd_resp_ready[n] || (PIPELINE'(~rd_resp_valid_pipe_reg) >> j) != 0) begin
+                rd_resp_valid_pipe_reg[j] <= rd_resp_valid_pipe_reg[j-1];
+                rd_resp_data_pipe_reg[j] <= rd_resp_data_pipe_reg[j-1];
+                rd_resp_valid_pipe_reg[j-1] <= 1'b0;
+            end
+        end
+
+        if (dma_ram_rd.rd_cmd_valid[n] && dma_ram_rd.rd_cmd_ready[n]) begin
+            rd_resp_valid_pipe_reg[0] <= 1'b1;
+            rd_resp_data_pipe_reg[0] <= mem_reg[dma_ram_rd.rd_cmd_addr[n][INT_ADDR_W-1:0]];
+        end
+
+        if (rst_rd) begin
+            rd_resp_valid_pipe_reg <= '0;
+        end
+    end
+
+    // accept a read command when the response is being consumed or the pipeline
+    // is not full; accepting while full and stalled would overwrite stage 0
+    assign dma_ram_rd.rd_cmd_ready[n] = dma_ram_rd.rd_resp_ready[n] || !(&rd_resp_valid_pipe_reg);
+
+    assign dma_ram_rd.rd_resp_valid[n] = rd_resp_valid_pipe_reg[PIPELINE-1];
+    assign dma_ram_rd.rd_resp_data[n] = rd_resp_data_pipe_reg[PIPELINE-1];
+
+end
+
+endmodule
+
+`resetall
diff --git a/src/dma/tb/taxi_dma_psdpram_async/Makefile b/src/dma/tb/taxi_dma_psdpram_async/Makefile
new file mode 100644
index 0000000..26a097a
--- /dev/null
+++ b/src/dma/tb/taxi_dma_psdpram_async/Makefile
@@ -0,0 +1,56 @@
+# SPDX-License-Identifier: CERN-OHL-S-2.0
+#
+# Copyright (c) 2023-2025 FPGA Ninja, LLC
+#
+# Authors:
+# - Alex Forencich
+
+TOPLEVEL_LANG = verilog
+
+SIM ?= verilator
+WAVES ?= 0
+
+COCOTB_HDL_TIMEUNIT = 1ns
+COCOTB_HDL_TIMEPRECISION = 1ps
+
+RTL_DIR = ../../rtl
+LIB_DIR = ../../lib
+TAXI_SRC_DIR = $(LIB_DIR)/taxi/src
+
+DUT = taxi_dma_psdpram_async
+COCOTB_TEST_MODULES = test_$(DUT)
+COCOTB_TOPLEVEL = test_$(DUT)
+MODULE = $(COCOTB_TEST_MODULES)
+TOPLEVEL = $(COCOTB_TOPLEVEL)
+VERILOG_SOURCES += $(COCOTB_TOPLEVEL).sv
+VERILOG_SOURCES += $(RTL_DIR)/$(DUT).sv
+VERILOG_SOURCES += $(RTL_DIR)/taxi_dma_ram_if.sv
+
+# handle file list files
+process_f_file = $(call process_f_files,$(addprefix $(dir $1),$(shell cat $1)))
+process_f_files = $(foreach f,$1,$(if $(filter %.f,$f),$(call process_f_file,$f),$f))
+uniq_base = $(if $1,$(call uniq_base,$(foreach f,$1,$(if $(filter-out $(notdir $(lastword $1)),$(notdir $f)),$f,))) $(lastword $1))
+VERILOG_SOURCES := $(call uniq_base,$(call process_f_files,$(VERILOG_SOURCES)))
+
+# module parameters
+export PARAM_SIZE := 65536
+export PARAM_SEGS := 2
+export PARAM_SEG_DATA_W := 32
+export PARAM_SEG_BE_W := $(shell expr $(PARAM_SEG_DATA_W) / 8 )
+export PARAM_SEG_ADDR_W := $(shell python -c "print(($(PARAM_SIZE)//($(PARAM_SEGS)*$(PARAM_SEG_BE_W))-1).bit_length())")
+export PARAM_PIPELINE := 2
+
+ifeq ($(SIM), icarus)
+	PLUSARGS += -fst
+
+	COMPILE_ARGS += $(foreach v,$(filter PARAM_%,$(.VARIABLES)),-P $(COCOTB_TOPLEVEL).$(subst PARAM_,,$(v))=$($(v)))
+else ifeq ($(SIM), verilator)
+	COMPILE_ARGS += $(foreach v,$(filter PARAM_%,$(.VARIABLES)),-G$(subst PARAM_,,$(v))=$($(v)))
+
+	ifeq ($(WAVES), 1)
+		COMPILE_ARGS += --trace-fst
+		VERILATOR_TRACE = 1
+	endif
+endif
+
+include $(shell cocotb-config --makefiles)/Makefile.sim
diff --git a/src/dma/tb/taxi_dma_psdpram_async/dma_psdp_ram.py b/src/dma/tb/taxi_dma_psdpram_async/dma_psdp_ram.py
new file mode 120000
index 0000000..6613351
--- /dev/null
+++ b/src/dma/tb/taxi_dma_psdpram_async/dma_psdp_ram.py
@@ -0,0 +1 @@
+../dma_psdp_ram.py
\ No newline at end of file
diff --git a/src/dma/tb/taxi_dma_psdpram_async/test_taxi_dma_psdpram_async.py b/src/dma/tb/taxi_dma_psdpram_async/test_taxi_dma_psdpram_async.py
new file mode 100644
index 0000000..534ff36
--- /dev/null
+++ b/src/dma/tb/taxi_dma_psdpram_async/test_taxi_dma_psdpram_async.py
@@ -0,0 +1,247 @@
+#!/usr/bin/env python
+# SPDX-License-Identifier: CERN-OHL-S-2.0
+"""
+
+Copyright (c) 2023-2025 FPGA Ninja, LLC
+
+Authors:
+- Alex Forencich
+
+"""
+
+import itertools
+import logging
+import os
+import random
+import sys
+
+import cocotb_test.simulator
+import pytest
+
+import cocotb
+from cocotb.clock import Clock
+from cocotb.triggers import RisingEdge, Timer
+from cocotb.regression import TestFactory
+
+try:
+    from dma_psdp_ram import PsdpRamMasterWrite, PsdpRamMasterRead, PsdpRamWriteBus, PsdpRamReadBus
+except ImportError:
+    # attempt import from current directory
+    sys.path.insert(0, os.path.join(os.path.dirname(__file__)))
+    try:
+        from dma_psdp_ram import PsdpRamMasterWrite, PsdpRamMasterRead, PsdpRamWriteBus, PsdpRamReadBus
+    finally:
+        del sys.path[0]
+
+
+class TB(object):
+    """Testbench wrapper: starts both clocks and attaches RAM write/read masters to the DUT."""
+    def __init__(self, dut):
+        self.dut = dut
+
+        self.log = logging.getLogger("cocotb.tb")
+        self.log.setLevel(logging.DEBUG)
+
+        cocotb.start_soon(Clock(dut.clk_wr, 10, units="ns").start())
+        cocotb.start_soon(Clock(dut.clk_rd, 11, units="ns").start())
+
+        # DMA RAM
+        self.dma_ram_master_wr = PsdpRamMasterWrite(PsdpRamWriteBus.from_entity(dut.dma_ram), dut.clk_wr, dut.rst_wr)
+        self.dma_ram_master_rd = PsdpRamMasterRead(PsdpRamReadBus.from_entity(dut.dma_ram), dut.clk_rd, dut.rst_rd)
+
+    def set_idle_generator(self, generator=None):
+        if generator:
+            self.dma_ram_master_wr.set_pause_generator(generator())
+            self.dma_ram_master_rd.set_pause_generator(generator())
+
+    def set_backpressure_generator(self, generator=None):
+        if generator:
+            pass
+
+    async def cycle_reset(self):
+        self.dut.rst_wr.setimmediatevalue(0)
+        self.dut.rst_rd.setimmediatevalue(0)
+        await RisingEdge(self.dut.clk_wr)
+        await RisingEdge(self.dut.clk_wr)
+        self.dut.rst_wr.value = 1
+        self.dut.rst_rd.value = 1
+        await RisingEdge(self.dut.clk_wr)
+        await RisingEdge(self.dut.clk_wr)
+        self.dut.rst_wr.value = 0
+        self.dut.rst_rd.value = 0
+        await RisingEdge(self.dut.clk_wr)
+        await RisingEdge(self.dut.clk_wr)
+
+
+async def run_test_write(dut, data_in=None, idle_inserter=None, backpressure_inserter=None, size=None):
+    """Write at varying lengths/offsets over a 0xAA background and check neighbouring bytes are untouched."""
+
+    tb = TB(dut)
+
+    byte_lanes = tb.dma_ram_master_wr.byte_lanes
+
+    await tb.cycle_reset()
+
+    tb.set_idle_generator(idle_inserter)
+    tb.set_backpressure_generator(backpressure_inserter)
+
+    for length in list(range(1, byte_lanes*2))+[1024]:
+        for offset in list(range(byte_lanes, byte_lanes*2))+list(range(4096-byte_lanes, 4096)):
+            tb.log.info("length %d, offset %d", length, offset)
+            addr = offset+0x1000
+            test_data = bytearray([x % 256 for x in range(length)])
+
+            await tb.dma_ram_master_wr.write(addr-4, b'\xaa'*(length+8))
+
+            await tb.dma_ram_master_wr.write(addr, test_data)
+
+            data = await tb.dma_ram_master_rd.read(addr-1, length+2)
+
+            assert data.data == b'\xaa'+test_data+b'\xaa'
+
+    await RisingEdge(dut.clk_wr)
+    await RisingEdge(dut.clk_wr)
+
+
+async def run_test_read(dut, data_in=None, idle_inserter=None, backpressure_inserter=None, size=None):
+    """Write then read back data at varying lengths and offsets."""
+
+    tb = TB(dut)
+
+    byte_lanes = tb.dma_ram_master_wr.byte_lanes
+
+    await tb.cycle_reset()
+
+    tb.set_idle_generator(idle_inserter)
+    tb.set_backpressure_generator(backpressure_inserter)
+
+    for length in list(range(1, byte_lanes*2))+[1024]:
+        for offset in list(range(byte_lanes, byte_lanes*2))+list(range(4096-byte_lanes, 4096)):
+            tb.log.info("length %d, offset %d", length, offset)
+            addr = offset+0x1000
+            test_data = bytearray([x % 256 for x in range(length)])
+
+            await tb.dma_ram_master_wr.write(addr, test_data)
+
+            data = await tb.dma_ram_master_rd.read(addr, length)
+
+            assert data.data == test_data
+
+    await RisingEdge(dut.clk_wr)
+    await RisingEdge(dut.clk_wr)
+
+
+async def run_stress_test(dut, idle_inserter=None, backpressure_inserter=None):
+    """Run 16 concurrent workers doing randomized write/read-back, each in its own 0x1000-byte aperture."""
+
+    tb = TB(dut)
+
+    await tb.cycle_reset()
+
+    tb.set_idle_generator(idle_inserter)
+    tb.set_backpressure_generator(backpressure_inserter)
+
+    async def worker(master_wr, master_rd, offset, aperture, count=16):
+        for k in range(count):
+            length = random.randint(1, min(512, aperture))
+            addr = offset+random.randint(0, aperture-length)
+            test_data = bytearray([x % 256 for x in range(length)])
+
+            await Timer(random.randint(1, 100), 'ns')
+
+            await master_wr.write(addr, test_data)
+
+            await Timer(random.randint(1, 100), 'ns')
+
+            data = await master_rd.read(addr, length)
+            assert data.data == test_data
+
+    workers = []
+
+    for k in range(16):
+        workers.append(cocotb.start_soon(worker(tb.dma_ram_master_wr, tb.dma_ram_master_rd, k*0x1000, 0x1000, count=16)))
+
+    while workers:
+        await workers.pop(0).join()
+
+    await RisingEdge(dut.clk_wr)
+    await RisingEdge(dut.clk_wr)
+
+
+def cycle_pause():
+    """Return an endless 1, 1, 1, 0 pattern for the masters' pause generators."""
+    return itertools.cycle([1, 1, 1, 0])
+
+
+if cocotb.SIM_NAME:
+
+    for test in [run_test_write, run_test_read, run_stress_test]:
+
+        factory = TestFactory(test)
+        factory.add_option("idle_inserter", [None, cycle_pause])
+        factory.add_option("backpressure_inserter", [None, cycle_pause])
+        factory.generate_tests()
+
+
+# cocotb-test
+
+tests_dir = os.path.dirname(__file__)
+rtl_dir = os.path.abspath(os.path.join(tests_dir, '..', '..', 'rtl'))
+lib_dir = os.path.abspath(os.path.join(tests_dir, '..', '..', 'lib'))
+taxi_src_dir = os.path.abspath(os.path.join(lib_dir, 'taxi', 'src'))
+
+
+def process_f_files(files):
+    """Expand .f file lists recursively, keeping one path per basename (the last one seen wins)."""
+    lst = {}
+    for f in files:
+        if f[-2:].lower() == '.f':
+            with open(f, 'r') as fp:
+                entries = fp.read().split()
+            for path in process_f_files([os.path.join(os.path.dirname(f), x) for x in entries]):
+                lst[os.path.basename(path)] = path
+        else:
+            lst[os.path.basename(f)] = f
+    return list(lst.values())
+
+
+@pytest.mark.parametrize("seg_data_w", [32, 64])
+@pytest.mark.parametrize("segs", [2, 4])
+def test_taxi_dma_psdpram_async(request, seg_data_w, segs):
+    """Build and run the cocotb testbench under Verilator for one parameter combination."""
+    dut = "taxi_dma_psdpram_async"
+    module = os.path.splitext(os.path.basename(__file__))[0]
+    toplevel = module
+
+    verilog_sources = [
+        os.path.join(tests_dir, f"{toplevel}.sv"),
+        os.path.join(rtl_dir, f"{dut}.sv"),
+        os.path.join(rtl_dir, "taxi_dma_ram_if.sv"),
+    ]
+
+    verilog_sources = process_f_files(verilog_sources)
+
+    parameters = {}
+
+    parameters['SIZE'] = 65536
+    parameters['SEGS'] = segs
+    parameters['SEG_DATA_W'] = seg_data_w
+    parameters['SEG_BE_W'] = parameters['SEG_DATA_W'] // 8
+    parameters['SEG_ADDR_W'] = (parameters['SIZE']//(parameters['SEGS']*parameters['SEG_BE_W'])-1).bit_length()
+    parameters['PIPELINE'] = 2
+
+    extra_env = {f'PARAM_{k}': str(v) for k, v in parameters.items()}
+
+    sim_build = os.path.join(tests_dir, "sim_build",
+        request.node.name.replace('[', '-').replace(']', ''))
+
+    cocotb_test.simulator.run(
+        simulator="verilator",
+        python_search=[tests_dir],
+        verilog_sources=verilog_sources,
+        toplevel=toplevel,
+        module=module,
+        parameters=parameters,
+        sim_build=sim_build,
+        extra_env=extra_env,
+    )
diff --git a/src/dma/tb/taxi_dma_psdpram_async/test_taxi_dma_psdpram_async.sv b/src/dma/tb/taxi_dma_psdpram_async/test_taxi_dma_psdpram_async.sv
new file mode 100644
index 0000000..df1225c
--- /dev/null
+++ b/src/dma/tb/taxi_dma_psdpram_async/test_taxi_dma_psdpram_async.sv
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: CERN-OHL-S-2.0
+/*
+
+Copyright (c) 2025 FPGA Ninja, LLC
+
+Authors:
+- Alex Forencich
+
+*/
+
+`resetall
+`timescale 1ns / 1ps
+`default_nettype none
+
+/*
+ * DMA parallel simple dual port RAM (asynchronous) testbench
+ */
+module test_taxi_dma_psdpram_async #
+(
+    /* verilator lint_off WIDTHTRUNC */
+    parameter SIZE = 4096,
+    parameter SEGS = 2,
+    parameter SEG_DATA_W = 128,
+    parameter SEG_BE_W = SEG_DATA_W/8,
+    parameter SEG_ADDR_W = $clog2(SIZE/(SEGS*SEG_BE_W)),
+    parameter PIPELINE = 2
+    /* verilator lint_on WIDTHTRUNC */
+)
+();
+
+logic clk_wr;
+logic rst_wr;
+
+logic clk_rd;
+logic rst_rd;
+
+taxi_dma_ram_if #(
+    .SEGS(SEGS),
+    .SEG_ADDR_W(SEG_ADDR_W),
+    .SEG_DATA_W(SEG_DATA_W),
+    .SEG_BE_W(SEG_BE_W)
+) dma_ram();
+
+taxi_dma_psdpram_async #(
+    .SIZE(SIZE),
+    .PIPELINE(PIPELINE)
+)
+uut (
+    /*
+     * Write port
+     */
+    .clk_wr(clk_wr),
+    .rst_wr(rst_wr),
+    .dma_ram_wr(dma_ram),
+
+    /*
+     * Read port
+     */
+    .clk_rd(clk_rd),
+    .rst_rd(rst_rd),
+    .dma_ram_rd(dma_ram)
+);
+
+endmodule
+
+`resetall