From 9b40c886732ca69c07f47b44df2e08759a09e627 Mon Sep 17 00:00:00 2001
From: Byron Lathi
Date: Sun, 18 Jan 2026 21:58:56 -0800
Subject: [PATCH] Whatever I was working on

---
 ChaCha20_Poly1305_64/doc/notes.md          |  54 +++++-
 ChaCha20_Poly1305_64/doc/poly1305.drawio   | 194 ++++++++++-----------
 ChaCha20_Poly1305_64/sim/poly1305_stage.py |   7 +-
 3 files changed, 146 insertions(+), 109 deletions(-)

diff --git a/ChaCha20_Poly1305_64/doc/notes.md b/ChaCha20_Poly1305_64/doc/notes.md
index 4c7cf4d..03606a4 100644
--- a/ChaCha20_Poly1305_64/doc/notes.md
+++ b/ChaCha20_Poly1305_64/doc/notes.md
@@ -63,4 +63,56 @@
 the last written one is.
 We can just say the last written one was 2 I guess
 
-We also need an input that tells it to reset the accumulator
\ No newline at end of file
+We also need an input that tells it to reset the accumulator.
+
+What if, instead of calculating all the way up to r^16, I just calculated up
+to r^8 and then had 2 parallel blocks?
+
+Let's think about the worst-case throughput. The theoretical layout would
+have 8 of these in parallel. A minimum-size packet of 64 bytes, for example,
+is 512 bits. This is less than 128*8, so it would only take one round.
+Therefore, we take 16 cycles to do 64 bytes, or 32 bits per cycle. This is
+only 1/4 of our target throughput of 128 bits per cycle, so we need to find
+more parallelism for small packets.
+
+If the packet is large enough to fill the second phase of the multiplier,
+that phase can run in parallel and give up to 256 bits per 16 cycles. For
+this to happen, the packet size must be greater than 128*16 bits, or 256
+bytes. I would really like to be able to reach our target throughput with
+64-byte packets, so we may need more, smaller multipliers that can run in
+parallel, at the cost of latency for larger packets.
+
+A 64-byte packet is 512 bits, which takes up four 128-bit lanes. If we have
+a group of 2 multipliers, they can do 128\*2\*2 bits per 16 cycles, or 512
+bits per 16 cycles, which is 32 bits per cycle as we said earlier. To hit
+our target of 128 bits per cycle we just instantiate 4 of these groups. This
+results in the same number of multipliers (8), but configured differently to
+prioritize throughput over latency.
+
+We need a demux (or something similar) to route packets between the groups.
+
+If we only do 4 effective lanes in parallel, then we only need to run the
+multiply loop twice:
+
+r -> r^2
+
+This will take 26 cycles, which is not ideal. Could we figure out a way to
+do all of these powers in one step, taking only 13 cycles?
+
+Alternatively, we could do only a single parallel step and just calculate
+r^2. This would mean we have 8 different hashes going on at the same time,
+and would drastically increase latency, but I think that is a fair tradeoff.
+
+So basically we need to store incoming data as 128-bit words. We will first
+get r and s as 128-bit words. We store both and start work on squaring r.
+We will also be receiving data this whole time, at 128 bits per cycle, which
+we store in a FIFO. Once r^2 is calculated, we start running the data
+through the multiplier, with a counter that tells us when we should be using
+r and when we should be using r^2. There is only one special case to worry
+about: when we get the last value, we use r instead of r^2. We also need to
+remember to store the outputs on both of the final cycles, one per lane.
+Since we are storing the data in a FIFO, we will know which word is the
+last. There is also the possibility that the last word will not be a full
+128 bits, so we need to handle adding the leading 1 as well.
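+
+As a quick sanity check on this schedule, here is a rough Python model (the
+function names are made up, and r is assumed to already be clamped): every
+block is multiplied by r^2 except the final one, which is multiplied by r,
+and the two lane accumulators are summed at the end.
+
+```python
+P = (1 << 130) - 5  # the Poly1305 prime
+
+def poly1305_two_lane(r: int, s: int, msg: bytes) -> int:
+    """2-lane model: lane i%2 takes block i, every step multiplies by r^2,
+    and the final block multiplies by r instead."""
+    r2 = r * r % P
+    blocks = [msg[i:i + 16] for i in range(0, len(msg), 16)]
+    h = [0, 0]  # one accumulator per lane
+    for i, b in enumerate(blocks):
+        m = int.from_bytes(b, "little") + (1 << (8 * len(b)))  # leading 1
+        mult = r if i == len(blocks) - 1 else r2
+        h[i % 2] = (h[i % 2] + m) * mult % P
+    return ((h[0] + h[1]) % P + s) % (1 << 128)
+
+def poly1305_ref(r: int, s: int, msg: bytes) -> int:
+    """Plain single-lane Horner evaluation for comparison."""
+    h = 0
+    for i in range(0, len(msg), 16):
+        b = msg[i:i + 16]
+        h = (h + int.from_bytes(b, "little") + (1 << (8 * len(b)))) * r % P
+    return (h + s) % (1 << 128)
+
+r, s = 12345 << 40, 99  # arbitrary test constants, r below the clamp limit
+for n in (16, 32, 48, 64, 100):  # even/odd block counts, partial last block
+    msg = bytes((7 * i + 3) % 256 for i in range(n))
+    assert poly1305_two_lane(r, s, msg) == poly1305_ref(r, s, msg)
+```
+
+If this model is right, the results agree for both even and odd block
+counts, so the special case on the last value is all we need; no extra
+fixup multiply is required.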
+
+We can use 1 multiplier, 2 data FIFOs, and 2 constant buffers.
+
+The utilization of the multiplier is kinda low, though, since it is only
+used once per packet instead of every 16 bytes.
\ No newline at end of file
diff --git a/ChaCha20_Poly1305_64/doc/poly1305.drawio b/ChaCha20_Poly1305_64/doc/poly1305.drawio
index ca04eb4..4affada 100644
--- a/ChaCha20_Poly1305_64/doc/poly1305.drawio
+++ b/ChaCha20_Poly1305_64/doc/poly1305.drawio
@@ -1,144 +1,128 @@
[drawio diagram hunk omitted: the XML element content was lost in extraction, leaving only bare +/- markers]
diff --git a/ChaCha20_Poly1305_64/sim/poly1305_stage.py b/ChaCha20_Poly1305_64/sim/poly1305_stage.py
index 33148c5..76fcb92 100644
--- a/ChaCha20_Poly1305_64/sim/poly1305_stage.py
+++ b/ChaCha20_Poly1305_64/sim/poly1305_stage.py
@@ -5,6 +5,7 @@ import cocotb
 from cocotb.clock import Clock
 from cocotb.triggers import Timer, RisingEdge, FallingEdge
 from cocotb.queue import Queue
+from cocotb.handle import Immediate
 
 from cocotbext.axi import AxiStreamBus, AxiStreamSource
 
@@ -29,7 +30,7 @@ class TB:
         self.expected_queue = Queue()
         self.output_queue = Queue()
 
-        cocotb.start_soon(Clock(self.dut.i_clk, CLK_PERIOD, units="ns").start())
+        cocotb.start_soon(Clock(self.dut.i_clk, CLK_PERIOD, unit="ns").start())
 
         cocotb.start_soon(self.run_input())
         cocotb.start_soon(self.run_output())
@@ -41,7 +42,7 @@ class TB:
         await self._cycle_reset(self.dut.i_rst, self.dut.i_clk)
 
     async def _cycle_reset(self, rst, clk):
-        rst.setimmediatevalue(0)
+        rst.value = Immediate(0)
         await RisingEdge(clk)
         await RisingEdge(clk)
         rst.value = 1
@@ -82,7 +83,7 @@ class TB:
         while True:
             await RisingEdge(self.dut.i_clk)
             if self.dut.o_valid.value:
-                await self.output_queue.put(self.dut.o_result.value.integer)
+                await self.output_queue.put(self.dut.o_result.value.to_unsigned())
 
 @cocotb.test
 async def test_sanity(dut):
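
The sim hunk above is a cocotb 2.0 API migration: Clock's units= keyword
became unit=, setimmediatevalue() is replaced by assigning an Immediate
value, and LogicArray.integer by to_unsigned(). A minimal sketch of the same
three calls, assuming a DUT with the same port names as the testbench above:

```python
import cocotb
from cocotb.clock import Clock
from cocotb.handle import Immediate
from cocotb.triggers import RisingEdge

@cocotb.test
async def smoke(dut):
    # cocotb 2.0: Clock takes unit= (cocotb 1.x used units=)
    cocotb.start_soon(Clock(dut.i_clk, 10, unit="ns").start())

    # cocotb 2.0: Immediate(...) replaces handle.setimmediatevalue(...)
    dut.i_rst.value = Immediate(0)
    await RisingEdge(dut.i_clk)
    dut.i_rst.value = 1
    await RisingEdge(dut.i_clk)
    dut.i_rst.value = 0

    # cocotb 2.0: to_unsigned() replaces the old .integer property
    await RisingEdge(dut.i_clk)
    result = dut.o_result.value.to_unsigned()
    assert isinstance(result, int)
```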