Add correct amount of memory

The initial-state FIFO depth should be 160, not 20: each round is 8 pipeline stages and there are 20 rounds.
This commit is contained in:
Byron Lathi
2025-06-29 13:29:38 -07:00
parent 4c7badbbbb
commit 196ea8e6d3
3 changed files with 79 additions and 25 deletions

View File

@@ -1,5 +1,5 @@
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<efx:project name="chacha20_timing_test" description="" last_change="1751170370" sw_version="2025.1.110" last_run_state="pass" last_run_flow="bitstream" config_result_in_sync="true" design_ood="sync" place_ood="change" route_ood="change" xmlns:efx="http://www.efinixinc.com/enf_proj" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.efinixinc.com/enf_proj enf_proj.xsd"> <efx:project name="chacha20_timing_test" description="" last_change="1751223371" sw_version="2025.1.110" last_run_state="pass" last_run_flow="bitstream" config_result_in_sync="true" design_ood="sync" place_ood="sync" route_ood="sync" xmlns:efx="http://www.efinixinc.com/enf_proj" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.efinixinc.com/enf_proj enf_proj.xsd">
<efx:device_info> <efx:device_info>
<efx:family name="Titanium"/> <efx:family name="Titanium"/>
<efx:device name="Ti375N1156"/> <efx:device name="Ti375N1156"/>

View File

@@ -6,12 +6,17 @@ from chacha_helpers import chacha_block
import struct import struct
import random
from cocotb.clock import Clock from cocotb.clock import Clock
from cocotb.triggers import Timer, RisingEdge, FallingEdge from cocotb.triggers import Timer, RisingEdge, FallingEdge
from cocotb.queue import Queue
CLK_PERIOD = 4 CLK_PERIOD = 4
CONSTANT = [0x65787061, 0x6e642033, 0x322d6279, 0x7465206b]
class TB: class TB:
def __init__(self, dut): def __init__(self, dut):
@@ -20,9 +25,15 @@ class TB:
self.log = logging.getLogger("cocotb.tb") self.log = logging.getLogger("cocotb.tb")
self.log.setLevel(logging.INFO) self.log.setLevel(logging.INFO)
self.input_queue = Queue()
self.expected_queue = Queue()
self.output_queue = Queue()
cocotb.start_soon(Clock(self.dut.i_clk, CLK_PERIOD, units="ns").start()) cocotb.start_soon(Clock(self.dut.i_clk, CLK_PERIOD, units="ns").start())
cocotb.start_soon(self.run_input())
cocotb.start_soon(self.run_output())
async def cycle_reset(self): async def cycle_reset(self):
await self._cycle_reset(self.dut.i_rst, self.dut.i_clk) await self._cycle_reset(self.dut.i_rst, self.dut.i_clk)
@@ -38,7 +49,41 @@ class TB:
await RisingEdge(clk) await RisingEdge(clk)
await RisingEdge(clk) await RisingEdge(clk)
async def write_input(self, key, counter, nonce):
await self.input_queue.put((key, counter, nonce))
data_in = CONSTANT[:]
data_in.extend(struct.unpack("8I", key.to_bytes(32, "little")))
data_in.extend(struct.unpack("2I", counter.to_bytes(8, "little")))
data_in.extend(struct.unpack("2I", nonce.to_bytes(8, "little")))
data_out = chacha_block(data_in)
await self.expected_queue.put(data_out)
async def run_input(self):
while True:
key, counter, nonce = await self.input_queue.get()
self.dut.i_key.value = key
self.dut.i_counter.value = counter
self.dut.i_nonce.value = nonce
self.dut.i_ready.value = 1
self.dut.i_valid.value = 1
await RisingEdge(self.dut.i_clk)
self.dut.i_valid.value = 0
async def run_output(self):
while True:
await RisingEdge(self.dut.i_clk)
if self.dut.o_valid.value:
state = self.dut.o_state.value.integer
state_bytes = int.to_bytes(state, 64, "little")
state_words = struct.unpack("16I", state_bytes)
await self.output_queue.put(state_words)
@cocotb.test @cocotb.test
async def test_sanity(dut): async def test_sanity(dut):
@@ -46,31 +91,27 @@ async def test_sanity(dut):
await tb.cycle_reset() await tb.cycle_reset()
data_in = [0x65787061, 0x6e642033, 0x322d6279, 0x7465206b, count = 1024
0, 0, 0, 0,
0, 0, 0, 0,
0, 0, 0, 0]
data_out = chacha_block(data_in)
tb.dut.i_counter.value = 0 for i in range(count):
tb.dut.i_nonce.value = 0 key = random.randint(0, 2**256-1)
tb.dut.i_key.value = 0 counter = i
tb.dut.i_ready.value = 1 nonce = random.randint(0, 2**64-1)
tb.dut.i_valid.value = 1
await RisingEdge(tb.dut.i_clk) await tb.write_input(key, counter, nonce)
tb.dut.i_valid.value = 0
await RisingEdge(tb.dut.o_valid)
state = tb.dut.o_state.value.integer
state_bytes = int.to_bytes(state, 64, "little")
state_words = struct.unpack("16I", state_bytes)
fail = False fail = False
for i, (sim_val, dut_val) in enumerate(zip(data_out, state_words)): for _ in range(count):
if sim_val != dut_val: sim_vals = await tb.expected_queue.get()
tb.log.info(f"{i}: {sim_val:x} -> {dut_val:x}") dut_vals = await tb.output_queue.get()
fail = True
for i, (sim_val, dut_val) in enumerate(zip(sim_vals, dut_vals)):
if sim_val != dut_val:
tb.log.info(f"{i}: {sim_val:x} -> {dut_val:x}")
fail = True
await Timer(1, "us")
assert not fail assert not fail

View File

@@ -21,6 +21,9 @@ module chacha20_block #(
input logic i_ready input logic i_ready
); );
// each round is 8 stages
localparam PIPE_STAGES = ROUNDS * 8;
`define QR(name, i, n, a, b, c, d) \ `define QR(name, i, n, a, b, c, d) \
chacha20_qr u_chacha20_``name ( \ chacha20_qr u_chacha20_``name ( \
.i_clk (i_clk), \ .i_clk (i_clk), \
@@ -49,9 +52,9 @@ logic [3:0] valid[ROUNDS+1];
// small fifo for storing the initial state. // small fifo for storing the initial state.
// better to store it in a memory than in flops // better to store it in a memory than in flops
logic [$clog2(ROUNDS)-1:0] initial_state_wptr; logic [$clog2(PIPE_STAGES)-1:0] initial_state_wptr;
logic [$clog2(ROUNDS)-1:0] initial_state_rptr; logic [$clog2(PIPE_STAGES)-1:0] initial_state_rptr;
logic [511:0] initial_states [ROUNDS]; logic [511:0] initial_states [PIPE_STAGES];
logic [511:0] state_pre_add; logic [511:0] state_pre_add;
logic pre_add_valid; logic pre_add_valid;
@@ -67,12 +70,22 @@ always_ff @(posedge i_clk) begin
end else begin end else begin
if (i_valid) begin if (i_valid) begin
initial_states[initial_state_wptr] <= write_initial_state; initial_states[initial_state_wptr] <= write_initial_state;
if (initial_state_wptr < PIPE_STAGES-1) begin
initial_state_wptr <= initial_state_wptr + 1;
end else begin
initial_state_wptr <= '0;
end
end end
pre_add_valid <= valid[ROUNDS][0]; pre_add_valid <= valid[ROUNDS][0];
if (valid[ROUNDS][0]) begin if (valid[ROUNDS][0]) begin
read_initial_state <= initial_states[initial_state_rptr]; read_initial_state <= initial_states[initial_state_rptr];
if (initial_state_rptr < PIPE_STAGES-1) begin
initial_state_rptr <= initial_state_rptr + 1;
end else begin
initial_state_rptr <= '0;
end
for (int i = 0; i < 16; i++) begin for (int i = 0; i < 16; i++) begin
state_pre_add[i*32 +: 32] <= state[ROUNDS][i]; state_pre_add[i*32 +: 32] <= state[ROUNDS][i];
end end