diff --git a/rtl/stats/taxi_stats_counter.sv b/rtl/stats/taxi_stats_counter.sv
new file mode 100644
index 0000000..0cadb68
--- /dev/null
+++ b/rtl/stats/taxi_stats_counter.sv
@@ -0,0 +1,285 @@
+// SPDX-License-Identifier: CERN-OHL-S-2.0
+/*
+
+Copyright (c) 2021-2025 FPGA Ninja, LLC
+
+Authors:
+- Alex Forencich
+
+*/
+
+`resetall
+`timescale 1ns / 1ps
+`default_nettype none
+
+/*
+ * Statistics counter
+ *
+ * Accumulates increments received on s_axis_stat (tdata = increment,
+ * tid = counter index) into a memory of 2**ID_W counters, each
+ * STAT_COUNT_W bits wide.  Counters are read back over the AXI lite read
+ * channel; AXI lite writes are acknowledged and discarded.  After reset,
+ * the memory is zeroed one entry per clock cycle before any read or
+ * increment is accepted.
+ */
+module taxi_stats_counter #
+(
+    // Statistics counter (bits)
+    parameter STAT_COUNT_W = 32,
+    // Pipeline length
+    parameter PIPELINE = 2
+)
+(
+    input  wire logic    clk,
+    input  wire logic    rst,
+
+    /*
+     * Statistics increment input
+     */
+    taxi_axis_if.snk     s_axis_stat,
+
+    /*
+     * AXI Lite register interface
+     */
+    taxi_axil_if.wr_slv  s_axil_wr,
+    taxi_axil_if.rd_slv  s_axil_rd
+);
+
+localparam STAT_INC_W = s_axis_stat.DATA_W;
+localparam STAT_ID_W = s_axis_stat.ID_W;
+
+localparam AXIL_ADDR_W = s_axil_rd.ADDR_W;
+localparam AXIL_DATA_W = s_axil_rd.DATA_W;
+
+// log2 of the byte address stride between consecutive counters
+localparam ID_SHIFT = $clog2(((AXIL_DATA_W > STAT_COUNT_W ? AXIL_DATA_W : STAT_COUNT_W)+7)/8);
+// log2 of the AXI lite data word size in bytes
+localparam WORD_SELECT_SHIFT = $clog2(AXIL_DATA_W/8);
+// number of address bits selecting an AXI lite word within a counter (at least 1)
+localparam WORD_SELECT_W = STAT_COUNT_W > AXIL_DATA_W ? $clog2((STAT_COUNT_W+7)/8) - $clog2(AXIL_DATA_W/8) : 1;
+
+// check configuration
+if (AXIL_ADDR_W < STAT_ID_W+ID_SHIFT)
+    $fatal(0, "Error: AXI lite address width too narrow (instance %m)");
+
+if (PIPELINE < 2)
+    $fatal(0, "Error: PIPELINE must be at least 2 (instance %m)");
+
+logic init_reg = 1'b1, init_next;
+logic [STAT_ID_W-1:0] init_ptr_reg = 0, init_ptr_next;
+
+logic op_acc_pipe_hazard;
+logic stage_active;
+
+logic [PIPELINE-1:0] op_axil_read_pipe_reg = 0, op_axil_read_pipe_next;
+logic [PIPELINE-1:0] op_acc_pipe_reg = 0, op_acc_pipe_next;
+
+logic [STAT_ID_W-1:0] mem_addr_pipeline_reg[PIPELINE], mem_addr_pipeline_next[PIPELINE];
+logic [WORD_SELECT_W-1:0] axil_shift_pipeline_reg[PIPELINE], axil_shift_pipeline_next[PIPELINE];
+logic [STAT_INC_W-1:0] stat_inc_pipeline_reg[PIPELINE], stat_inc_pipeline_next[PIPELINE];
+
+logic s_axis_stat_tready_reg = 1'b0, s_axis_stat_tready_next;
+
+logic s_axil_awready_reg = 0, s_axil_awready_next;
+logic s_axil_wready_reg = 0, s_axil_wready_next;
+logic s_axil_bvalid_reg = 0, s_axil_bvalid_next;
+logic s_axil_arready_reg = 0, s_axil_arready_next;
+logic [AXIL_DATA_W-1:0] s_axil_rdata_reg = 0, s_axil_rdata_next;
+logic s_axil_rvalid_reg = 0, s_axil_rvalid_next;
+
+(* ramstyle = "no_rw_check" *)
+logic [STAT_COUNT_W-1:0] mem[2**STAT_ID_W];
+
+logic [STAT_ID_W-1:0] mem_rd_addr;
+logic [STAT_ID_W-1:0] mem_wr_addr;
+logic [STAT_COUNT_W-1:0] mem_wr_data;
+logic mem_wr_en;
+logic [STAT_COUNT_W-1:0] mem_read_data_reg = 0;
+logic [STAT_COUNT_W-1:0] mem_read_data_pipeline_reg[PIPELINE-1:1];
+
+assign s_axis_stat.tready = s_axis_stat_tready_reg;
+
+assign s_axil_wr.awready = s_axil_awready_reg;
+assign s_axil_wr.wready = s_axil_wready_reg;
+assign s_axil_wr.bresp = 2'b00;
+assign s_axil_wr.bvalid = s_axil_bvalid_reg;
+
+assign s_axil_rd.arready = s_axil_arready_reg;
+assign s_axil_rd.rdata = s_axil_rdata_reg;
+assign s_axil_rd.rresp = 2'b00;
+assign s_axil_rd.rvalid = s_axil_rvalid_reg;
+
+// split read address into counter index and word select
+wire [STAT_ID_W-1:0] s_axil_araddr_id = STAT_ID_W'(s_axil_rd.araddr >> ID_SHIFT);
+wire [WORD_SELECT_W-1:0] s_axil_araddr_shift = WORD_SELECT_W'(s_axil_rd.araddr >> WORD_SELECT_SHIFT);
+
+initial begin
+    // break up loop to work around iteration termination
+    for (integer i = 0; i < 2**STAT_ID_W; i = i + 2**(STAT_ID_W/2)) begin
+        for (integer j = i; j < i + 2**(STAT_ID_W/2); j = j + 1) begin
+            mem[j] = 0;
+        end
+    end
+
+    for (integer i = 0; i < PIPELINE; i = i + 1) begin
+        mem_addr_pipeline_reg[i] = 0;
+        axil_shift_pipeline_reg[i] = 0;
+        stat_inc_pipeline_reg[i] = 0;
+    end
+end
+
+always_comb begin
+    init_next = init_reg;
+    init_ptr_next = init_ptr_reg;
+
+    op_axil_read_pipe_next = PIPELINE'({op_axil_read_pipe_reg, 1'b0});
+    op_acc_pipe_next = PIPELINE'({op_acc_pipe_reg, 1'b0});
+
+    mem_addr_pipeline_next[0] = 0;
+    axil_shift_pipeline_next[0] = 0;
+    stat_inc_pipeline_next[0] = 0;
+    for (integer j = 1; j < PIPELINE; j = j + 1) begin
+        mem_addr_pipeline_next[j] = mem_addr_pipeline_reg[j-1];
+        axil_shift_pipeline_next[j] = axil_shift_pipeline_reg[j-1];
+        stat_inc_pipeline_next[j] = stat_inc_pipeline_reg[j-1];
+    end
+
+    s_axis_stat_tready_next = 1'b0;
+
+    s_axil_awready_next = 1'b0;
+    s_axil_wready_next = 1'b0;
+    // hold write response until accepted on the write modport
+    s_axil_bvalid_next = s_axil_bvalid_reg && !s_axil_wr.bready;
+
+    s_axil_arready_next = 1'b0;
+    s_axil_rdata_next = s_axil_rdata_reg;
+    // hold read response until accepted on the read modport
+    s_axil_rvalid_next = s_axil_rvalid_reg && !s_axil_rd.rready;
+
+    // memory defaults: write path carries the accumulate result, disabled unless requested
+    mem_rd_addr = 0;
+    mem_wr_addr = mem_addr_pipeline_reg[PIPELINE-1];
+    mem_wr_data = mem_read_data_pipeline_reg[PIPELINE-1] + STAT_COUNT_W'(stat_inc_pipeline_reg[PIPELINE-1]);
+    mem_wr_en = 0;
+
+    op_acc_pipe_hazard = 1'b0;
+    stage_active = 1'b0;
+
+    // flag a hazard if the incoming ID matches the address held in any active pipeline stage
+    for (integer j = 0; j < PIPELINE; j = j + 1) begin
+        stage_active = op_axil_read_pipe_reg[j] || op_acc_pipe_reg[j];
+        op_acc_pipe_hazard = op_acc_pipe_hazard || (stage_active && mem_addr_pipeline_reg[j] == s_axis_stat.tid);
+    end
+
+    // discard writes
+    if (s_axil_wr.awvalid && s_axil_wr.wvalid && (!s_axil_wr.bvalid || s_axil_wr.bready) && (!s_axil_wr.awready && !s_axil_wr.wready)) begin
+        s_axil_awready_next = 1'b1;
+        s_axil_wready_next = 1'b1;
+        s_axil_bvalid_next = 1'b1;
+    end
+
+    // pipeline stage 0 - accept request
+    if (init_reg) begin
+        // zero all counters
+        init_ptr_next = init_ptr_reg + 1;
+
+        mem_wr_addr = init_ptr_reg;
+        mem_wr_data = 0;
+        mem_wr_en = 1'b1;
+
+        if (&init_ptr_reg) begin
+            init_next = 1'b0;
+        end
+    end else if (s_axil_rd.arvalid && (!s_axil_rd.rvalid || s_axil_rd.rready) && op_axil_read_pipe_reg == 0) begin
+        // AXIL read (takes priority over accumulate; one read in flight at a time)
+        op_axil_read_pipe_next[0] = 1'b1;
+
+        s_axil_arready_next = 1'b1;
+
+        mem_rd_addr = s_axil_araddr_id;
+        mem_addr_pipeline_next[0] = s_axil_araddr_id;
+        axil_shift_pipeline_next[0] = s_axil_araddr_shift;
+    end else if (s_axis_stat.tvalid && !s_axis_stat.tready && !op_acc_pipe_hazard) begin
+        // accumulate
+        op_acc_pipe_next[0] = 1'b1;
+
+        s_axis_stat_tready_next = 1'b1;
+
+        stat_inc_pipeline_next[0] = s_axis_stat.tdata;
+
+        mem_rd_addr = s_axis_stat.tid;
+        mem_addr_pipeline_next[0] = s_axis_stat.tid;
+    end
+
+    // read complete, perform operation
+    if (op_acc_pipe_reg[PIPELINE-1]) begin
+        // accumulate
+        mem_wr_addr = mem_addr_pipeline_reg[PIPELINE-1];
+        mem_wr_data = mem_read_data_pipeline_reg[PIPELINE-1] + STAT_COUNT_W'(stat_inc_pipeline_reg[PIPELINE-1]);
+        mem_wr_en = 1'b1;
+    end else if (op_axil_read_pipe_reg[PIPELINE-1]) begin
+        // AXIL read
+        s_axil_rvalid_next = 1'b1;
+        s_axil_rdata_next = 0;
+
+        if (STAT_COUNT_W > AXIL_DATA_W) begin
+            s_axil_rdata_next = AXIL_DATA_W'(mem_read_data_pipeline_reg[PIPELINE-1] >> axil_shift_pipeline_reg[PIPELINE-1]*AXIL_DATA_W);
+        end else begin
+            s_axil_rdata_next = AXIL_DATA_W'(mem_read_data_pipeline_reg[PIPELINE-1]);
+        end
+    end
+end
+
+always_ff @(posedge clk) begin
+    init_reg <= init_next;
+    init_ptr_reg <= init_ptr_next;
+
+    op_axil_read_pipe_reg <= op_axil_read_pipe_next;
+    op_acc_pipe_reg <= op_acc_pipe_next;
+
+    s_axis_stat_tready_reg <= s_axis_stat_tready_next;
+
+    s_axil_awready_reg <= s_axil_awready_next;
+    s_axil_wready_reg <= s_axil_wready_next;
+    s_axil_bvalid_reg <= s_axil_bvalid_next;
+    s_axil_arready_reg <= s_axil_arready_next;
+    s_axil_rdata_reg <= s_axil_rdata_next;
+    s_axil_rvalid_reg <= s_axil_rvalid_next;
+
+    for (integer i = 0; i < PIPELINE; i = i + 1) begin
+        mem_addr_pipeline_reg[i] <= mem_addr_pipeline_next[i];
+        axil_shift_pipeline_reg[i] <= axil_shift_pipeline_next[i];
+        stat_inc_pipeline_reg[i] <= stat_inc_pipeline_next[i];
+    end
+
+    if (mem_wr_en) begin
+        mem[mem_wr_addr] <= mem_wr_data;
+    end
+    // registered memory read, followed by the read data pipeline registers
+    mem_read_data_reg <= mem[mem_rd_addr];
+    mem_read_data_pipeline_reg[1] <= mem_read_data_reg;
+    for (integer i = 2; i < PIPELINE; i = i + 1) begin
+        mem_read_data_pipeline_reg[i] <= mem_read_data_pipeline_reg[i-1];
+    end
+
+    // synchronous reset; also restarts zeroing of the counter memory
+    if (rst) begin
+        init_reg <= 1'b1;
+        init_ptr_reg <= 0;
+
+        op_axil_read_pipe_reg <= 0;
+        op_acc_pipe_reg <= 0;
+
+        s_axis_stat_tready_reg <= 1'b0;
+
+        s_axil_awready_reg <= 1'b0;
+        s_axil_wready_reg <= 1'b0;
+        s_axil_bvalid_reg <= 1'b0;
+        s_axil_arready_reg <= 1'b0;
+        s_axil_rvalid_reg <= 1'b0;
+    end
+end
+
+endmodule
+
+`resetall
diff --git a/tb/stats/taxi_stats_counter/Makefile b/tb/stats/taxi_stats_counter/Makefile
new file mode 100644
index 0000000..1a821f4
--- /dev/null
+++ b/tb/stats/taxi_stats_counter/Makefile
@@ -0,0 +1,53 @@
+# SPDX-License-Identifier: CERN-OHL-S-2.0
+#
+# Copyright (c) 2021-2025 FPGA Ninja, LLC
+#
+# Authors:
+# - Alex Forencich
+
+TOPLEVEL_LANG = verilog
+
+SIM ?= verilator
+WAVES ?= 0
+
+COCOTB_HDL_TIMEUNIT = 1ns
+COCOTB_HDL_TIMEPRECISION = 1ps
+
+DUT      = taxi_stats_counter
+COCOTB_TEST_MODULES = test_$(DUT)
+COCOTB_TOPLEVEL     = test_$(DUT)
+MODULE   = $(COCOTB_TEST_MODULES)
+TOPLEVEL = $(COCOTB_TOPLEVEL)
+VERILOG_SOURCES += $(COCOTB_TOPLEVEL).sv
+VERILOG_SOURCES += ../../../rtl/stats/$(DUT).sv
+VERILOG_SOURCES += ../../../rtl/axis/taxi_axis_if.sv
+VERILOG_SOURCES += ../../../rtl/axi/taxi_axil_if.sv
+
+# handle file list files (expand *.f lists recursively, drop entries with duplicate basenames)
+process_f_file = $(call process_f_files,$(addprefix $(dir $1),$(shell cat $1)))
+process_f_files = $(foreach f,$1,$(if $(filter %.f,$f),$(call process_f_file,$f),$f))
+uniq_base = $(if $1,$(call uniq_base,$(foreach f,$1,$(if $(filter-out $(notdir $(lastword $1)),$(notdir $f)),$f,))) $(lastword $1))
+VERILOG_SOURCES := $(call uniq_base,$(call process_f_files,$(VERILOG_SOURCES)))
+
+# module parameters
+export PARAM_STAT_COUNT_W := 32
+export PARAM_PIPELINE := 2
+export PARAM_STAT_INC_W := 16
+export PARAM_STAT_ID_W := 8
+export PARAM_AXIL_DATA_W := 32
+export PARAM_AXIL_ADDR_W := $(shell python -c "print($(PARAM_STAT_ID_W) + (($(PARAM_STAT_COUNT_W)+7)//8-1).bit_length())")
+
+ifeq ($(SIM), icarus)
+	PLUSARGS += -fst
+
+	COMPILE_ARGS += $(foreach v,$(filter PARAM_%,$(.VARIABLES)),-P $(COCOTB_TOPLEVEL).$(subst PARAM_,,$(v))=$($(v)))
+else ifeq ($(SIM), verilator)
+	COMPILE_ARGS += $(foreach v,$(filter PARAM_%,$(.VARIABLES)),-G$(subst PARAM_,,$(v))=$($(v)))
+
+	ifeq ($(WAVES), 1)
+		COMPILE_ARGS += --trace-fst
+		VERILATOR_TRACE = 1
+	endif
+endif
+
+include $(shell cocotb-config --makefiles)/Makefile.sim
diff --git a/tb/stats/taxi_stats_counter/test_taxi_stats_counter.py b/tb/stats/taxi_stats_counter/test_taxi_stats_counter.py
new file mode 100644
index 0000000..402e6d4
--- /dev/null
+++ b/tb/stats/taxi_stats_counter/test_taxi_stats_counter.py
@@ -0,0 +1,237 @@
+#!/usr/bin/env python
+# SPDX-License-Identifier: CERN-OHL-S-2.0
+"""
+
+Copyright (c) 2021-2025 FPGA Ninja, LLC
+
+Authors:
+- Alex Forencich
+
+"""
+
+import itertools
+import logging
+import os
+import random
+
+import cocotb_test.simulator
+import pytest
+
+import cocotb
+from cocotb.clock import Clock
+from cocotb.queue import Queue
+from cocotb.triggers import RisingEdge, Timer
+from cocotb.regression import TestFactory
+
+from cocotbext.axi import AxiLiteBus, AxiLiteMaster
+from cocotbext.axi import AxiStreamBus, AxiStreamSource, AxiStreamFrame
+
+
+class TB(object):
+    """Testbench wrapper: starts the clock and creates the stats source and AXI lite master."""
+    def __init__(self, dut):
+        self.dut = dut
+
+        self.log = logging.getLogger("cocotb.tb")
+        self.log.setLevel(logging.DEBUG)
+
+        cocotb.start_soon(Clock(dut.clk, 10, units="ns").start())
+
+        self.stat_source = AxiStreamSource(AxiStreamBus.from_entity(dut.s_axis_stat), dut.clk, dut.rst)
+
+        self.axil_master = AxiLiteMaster(AxiLiteBus.from_entity(dut.s_axil), dut.clk, dut.rst)
+
+    def set_idle_generator(self, generator=None):
+        """Apply a pause generator to the stats source and the AW, W and AR channels."""
+        if generator:
+            self.stat_source.set_pause_generator(generator())
+            self.axil_master.write_if.aw_channel.set_pause_generator(generator())
+            self.axil_master.write_if.w_channel.set_pause_generator(generator())
+            self.axil_master.read_if.ar_channel.set_pause_generator(generator())
+
+    def set_backpressure_generator(self, generator=None):
+        """Apply a pause generator to the B and R response channels."""
+        if generator:
+            self.axil_master.write_if.b_channel.set_pause_generator(generator())
+            self.axil_master.read_if.r_channel.set_pause_generator(generator())
+
+    async def cycle_reset(self):
+        """Assert rst for two clock edges, with two clock edges of rst low before and after."""
+        self.dut.rst.setimmediatevalue(0)
+        await RisingEdge(self.dut.clk)
+        await RisingEdge(self.dut.clk)
+        self.dut.rst.value = 1
+        await RisingEdge(self.dut.clk)
+        await RisingEdge(self.dut.clk)
+        self.dut.rst.value = 0
+        await RisingEdge(self.dut.clk)
+        await RisingEdge(self.dut.clk)
+
+
+async def run_test_acc(dut, idle_inserter=None, backpressure_inserter=None):
+    """Send increment k to counter k ten times, then check counter k reads back k*10."""
+
+    tb = TB(dut)
+
+    byte_lanes = tb.axil_master.read_if.byte_lanes
+    counter_size = max(dut.STAT_COUNT_W.value // 8, byte_lanes)
+
+    await tb.cycle_reset()
+
+    tb.set_idle_generator(idle_inserter)
+    tb.set_backpressure_generator(backpressure_inserter)
+
+    await Timer(4000, 'ns')
+
+    for n in range(10):
+        for k in range(10):
+            await tb.stat_source.send(AxiStreamFrame([k], tid=k))
+
+    await Timer(1000, 'ns')
+
+    data = await tb.axil_master.read_words(0, 10, ws=counter_size)
+
+    print(data)
+
+    for n in range(10):
+        assert data[n] == n*10
+
+    await RisingEdge(dut.clk)
+    await RisingEdge(dut.clk)
+
+
+async def run_stress_test(dut, idle_inserter=None, backpressure_inserter=None):
+    """Send random increments from 16 concurrent workers and compare every counter to a reference."""
+
+    tb = TB(dut)
+
+    byte_lanes = tb.axil_master.read_if.byte_lanes
+    counter_size = max(dut.STAT_COUNT_W.value // 8, byte_lanes)
+    stat_inc_width = len(dut.s_axis_stat.tdata)
+    stat_id_width = len(dut.s_axis_stat.tid)
+
+    await tb.cycle_reset()
+
+    tb.set_idle_generator(idle_inserter)
+    tb.set_backpressure_generator(backpressure_inserter)
+
+    await Timer(4000, 'ns')
+
+    async def worker(source, queue, count=128):
+        for k in range(count):
+            inc = random.randrange(1, 2**stat_inc_width)
+            num = random.randrange(0, 2**stat_id_width)
+
+            await source.send(AxiStreamFrame([inc], tid=num))
+
+            await queue.put((num, inc))
+
+            await Timer(random.randint(1, 1000), 'ns')
+
+    workers = []
+    queue = Queue()
+
+    for k in range(16):
+        workers.append(cocotb.start_soon(worker(tb.stat_source, queue, count=128)))
+
+    while workers:
+        await workers.pop(0).join()
+
+    await Timer(1000, 'ns')
+
+    data_ref = [0]*2**stat_id_width
+
+    while not queue.empty():
+        num, count = await queue.get()
+        data_ref[num] += count
+
+    print(data_ref)
+
+    data = await tb.axil_master.read_words(0, 2**stat_id_width, ws=counter_size)
+
+    print(data)
+
+    assert data == data_ref
+
+    await RisingEdge(dut.clk)
+    await RisingEdge(dut.clk)
+
+
+def cycle_pause():
+    """Return an endless 1, 1, 1, 0 sequence for use as a pause generator."""
+    return itertools.cycle([1, 1, 1, 0])
+
+
+if cocotb.SIM_NAME:
+
+    for test in [run_test_acc]:
+
+        factory = TestFactory(test)
+        factory.add_option("idle_inserter", [None, cycle_pause])
+        factory.add_option("backpressure_inserter", [None, cycle_pause])
+        factory.generate_tests()
+
+    factory = TestFactory(run_stress_test)
+    factory.generate_tests()
+
+
+# cocotb-test
+
+tests_dir = os.path.dirname(__file__)
+rtl_dir = os.path.abspath(os.path.join(tests_dir, '..', '..', '..', 'rtl'))
+
+
+def process_f_files(files):
+    """Expand .f file lists recursively, keeping the last path seen for each basename."""
+    lst = {}
+    for f in files:
+        if f.lower().endswith('.f'):
+            with open(f, 'r') as fp:
+                entries = fp.read().split()
+            for sub in process_f_files([os.path.join(os.path.dirname(f), x) for x in entries]):
+                lst[os.path.basename(sub)] = sub
+        else:
+            lst[os.path.basename(f)] = f
+    return list(lst.values())
+
+
+@pytest.mark.parametrize("stat_count_w", [32, 64])
+def test_taxi_stats_counter(request, stat_count_w):
+    """Build and run the cocotb testbench under Verilator through cocotb-test."""
+    dut = "taxi_stats_counter"
+    module = os.path.splitext(os.path.basename(__file__))[0]
+    toplevel = module
+
+    verilog_sources = [
+        os.path.join(tests_dir, f"{toplevel}.sv"),
+        os.path.join(rtl_dir, "stats", f"{dut}.sv"),
+        os.path.join(rtl_dir, "axis", "taxi_axis_if.sv"),
+        os.path.join(rtl_dir, "axi", "taxi_axil_if.sv"),
+    ]
+
+    verilog_sources = process_f_files(verilog_sources)
+
+    parameters = {}
+
+    parameters['STAT_COUNT_W'] = stat_count_w
+    parameters['PIPELINE'] = 2
+    parameters['STAT_INC_W'] = 16
+    parameters['STAT_ID_W'] = 8
+    parameters['AXIL_DATA_W'] = 32
+    parameters['AXIL_ADDR_W'] = parameters['STAT_ID_W'] + ((parameters['STAT_COUNT_W']+7)//8-1).bit_length()
+
+    extra_env = {f'PARAM_{k}': str(v) for k, v in parameters.items()}
+
+    sim_build = os.path.join(tests_dir, "sim_build",
+        request.node.name.replace('[', '-').replace(']', ''))
+
+    cocotb_test.simulator.run(
+        simulator="verilator",
+        python_search=[tests_dir],
+        verilog_sources=verilog_sources,
+        toplevel=toplevel,
+        module=module,
+        parameters=parameters,
+        sim_build=sim_build,
+        extra_env=extra_env,
+    )
diff --git a/tb/stats/taxi_stats_counter/test_taxi_stats_counter.sv b/tb/stats/taxi_stats_counter/test_taxi_stats_counter.sv
new file mode 100644
index 0000000..e748ab1
--- /dev/null
+++ b/tb/stats/taxi_stats_counter/test_taxi_stats_counter.sv
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: CERN-OHL-S-2.0
+/*
+
+Copyright (c) 2025 FPGA Ninja, LLC
+
+Authors:
+- Alex Forencich
+
+*/
+
+`resetall
+`timescale 1ns / 1ps
+`default_nettype none
+
+/*
+ * Statistics counter testbench
+ */
+module test_taxi_stats_counter #
+(
+    /* verilator lint_off WIDTHTRUNC */
+    parameter STAT_COUNT_W = 32,
+    parameter PIPELINE = 2,
+    parameter STAT_INC_W = 16,
+    parameter STAT_ID_W = 8,
+    parameter AXIL_DATA_W = 32,
+    parameter AXIL_ADDR_W = STAT_ID_W + $clog2((STAT_COUNT_W+7)/8)
+    /* verilator lint_on WIDTHTRUNC */
+)
+();
+
+// no drivers in this wrapper; clk and rst are driven by the cocotb test
+logic clk;
+logic rst;
+
+taxi_axis_if #(
+    .DATA_W(STAT_INC_W),
+    .KEEP_EN(0),
+    .KEEP_W(1),
+    .ID_EN(1),
+    .ID_W(STAT_ID_W)
+) s_axis_stat();
+
+taxi_axil_if #(
+    .DATA_W(AXIL_DATA_W),
+    .ADDR_W(AXIL_ADDR_W)
+) s_axil();
+
+taxi_stats_counter #(
+    .STAT_COUNT_W(STAT_COUNT_W),
+    .PIPELINE(PIPELINE)
+)
+uut (
+    .clk(clk),
+    .rst(rst),
+
+    /*
+     * Statistics increment input
+     */
+    .s_axis_stat(s_axis_stat),
+
+    /*
+     * AXI Lite register interface
+     */
+    .s_axil_wr(s_axil),
+    .s_axil_rd(s_axil)
+);
+
+endmodule
+
+`resetall