From 80e3faeae6e64f5801d2ff8b7d3ac971dd7f02f3 Mon Sep 17 00:00:00 2001 From: Byron Lathi Date: Mon, 14 Jul 2025 11:10:43 -0700 Subject: [PATCH] ramblings --- ChaCha20_Poly1305_64/doc/notes.md | 47 ++++++++++++++++++- .../poly1305_timing_test/constraints.sdc | 1 + .../poly1305_timing_test/mult_timing_test.sv | 42 +++++++++++++++++ .../poly1305_timing_test.xml | 23 +++++---- ChaCha20_Poly1305_64/sim/poly1305_core.py | 2 + ChaCha20_Poly1305_64/src/poly1305_core.sv | 37 +++++++++++---- 6 files changed, 131 insertions(+), 21 deletions(-) create mode 100644 ChaCha20_Poly1305_64/poly1305_timing_test/constraints.sdc create mode 100644 ChaCha20_Poly1305_64/poly1305_timing_test/mult_timing_test.sv diff --git a/ChaCha20_Poly1305_64/doc/notes.md b/ChaCha20_Poly1305_64/doc/notes.md index 820b30a..ac4f2b2 100644 --- a/ChaCha20_Poly1305_64/doc/notes.md +++ b/ChaCha20_Poly1305_64/doc/notes.md @@ -105,4 +105,49 @@ quarter round module can have 2 different blocks going through it at once. The new one multiplexes 4 quarter rounds between 1 QR module which reduces the logic usage down to only 46k le, of which the vast majority is flops (2k ff per round, -0.5k lut) \ No newline at end of file +0.5k lut) + + +# Modulo 2^130-5 + +We can use the trick here to do modulo reduction much faster. + +If we split the bits at 2^130, leaving 129 high bits and 130 low bits, we now +have a 129 bit value multiplied by 2^130, plus the 130 bit value. We know that +2^130 mod (2^130-5) is 5, so we can replace that 2^130 with 5 and add, then +repeat that step again. + +Ex. + +x = x1*2^130 + x2 +x mod 2^130-5 = x1*5 + x2 -> x1*5+x2 = y +y = y1*2^130 + y2 +x mod 2^130-5 = y1*5+y2 + + +and let's do the math to verify that we only need two rounds. The maximum value +that we could possibly get is 2^131-1 and the maximum value for R is +0x0ffffffc0ffffffc0ffffffc0fffffff. Multiplying these together gives us +0x7fffffe07fffffe07fffffe07ffffff7f0000003f0000003f0000003f0000001. 
+ +Applying the first round to this we get + +0x1ffffff81ffffff81ffffff81ffffffd * 5 + 0x3f0000003f0000003f0000003f0000001 += 0x48fffffdc8fffffdc8fffffdc8ffffff2 + +applying the second round to this we get + +1 * 5 + 0x8fffffdc8fffffdc8fffffdc8ffffff2 = 0x8fffffdc8fffffdc8fffffdc8ffffff7 + +and this is indeed the correct answer. The bottom part is 130 bits but since we +put in the max values and it didn't overflow, I don't think it will overflow here. + +131+128 = 259 bits, only have to do this once + +0xb83fe991ca75d7ef2ab5cba9cccdfd938b73fff384ac90ed284034da565ecf +0x19471c3e3e9c1bfded81da3736e96604a + + +Kind of curious now, at what point does a ripple carry adder using dedicated +CI/CO ports become slower than a more complex adder like carry lookahead or +carry save (Wallace tree)? diff --git a/ChaCha20_Poly1305_64/poly1305_timing_test/constraints.sdc b/ChaCha20_Poly1305_64/poly1305_timing_test/constraints.sdc new file mode 100644 index 0000000..392311c --- /dev/null +++ b/ChaCha20_Poly1305_64/poly1305_timing_test/constraints.sdc @@ -0,0 +1 @@ +create_clock -period 2.5 -name clk [get_ports i_clk] \ No newline at end of file diff --git a/ChaCha20_Poly1305_64/poly1305_timing_test/mult_timing_test.sv b/ChaCha20_Poly1305_64/poly1305_timing_test/mult_timing_test.sv new file mode 100644 index 0000000..6ce5cb8 --- /dev/null +++ b/ChaCha20_Poly1305_64/poly1305_timing_test/mult_timing_test.sv @@ -0,0 +1,42 @@ +module mult_timing_test( + input i_clk, + + input logic [132:0] data_a, + input logic [127:0] data_b, + + output logic [260:0] data_z +); + +logic [132:0] data_a_reg; +logic [127:0] data_b_reg; + + +logic [260:0] partial_result [7]; + +logic [260:0] data_z_temp_1[4]; +logic [260:0] data_z_temp_2_0, data_z_temp_2_1; + +always @(posedge i_clk) begin + data_a_reg <= data_a; + data_b_reg <= data_b; + + for (int i = 0; i < 7; i++) begin + partial_result[i] <= data_a_reg[i*18 +: 18] * data_b_reg; + end + + + data_z_temp_1[0] <= (partial_result[0] << (19*0)) + 
(partial_result[1] << (19*1)); + data_z_temp_1[1] <= (partial_result[2] << (19*0)) + (partial_result[3] << (19*1)); + data_z_temp_1[2] <= (partial_result[4] << (19*0)) + (partial_result[5] << (19*1)); + data_z_temp_1[3] <= (partial_result[6] << (19*0)); + + data_z_temp_2_0 <= data_z_temp_1[0] + (data_z_temp_1[1] << (19*2)); + data_z_temp_2_1 <= data_z_temp_1[2] + (data_z_temp_1[3] << (19*2)); + + data_z <= data_z_temp_2_0 + data_z_temp_2_1; + + // data_z <= data_z_temp_2[0] + (data_z_temp_2[1] << (19*4)); + +end + +endmodule \ No newline at end of file diff --git a/ChaCha20_Poly1305_64/poly1305_timing_test/poly1305_timing_test.xml b/ChaCha20_Poly1305_64/poly1305_timing_test/poly1305_timing_test.xml index 628ee0a..50827ff 100644 --- a/ChaCha20_Poly1305_64/poly1305_timing_test/poly1305_timing_test.xml +++ b/ChaCha20_Poly1305_64/poly1305_timing_test/poly1305_timing_test.xml @@ -1,18 +1,20 @@ - + - + + + @@ -39,15 +41,15 @@ - - - - + + + + - + @@ -56,16 +58,17 @@ - + + - + - + diff --git a/ChaCha20_Poly1305_64/sim/poly1305_core.py b/ChaCha20_Poly1305_64/sim/poly1305_core.py index b339085..1feeaf1 100644 --- a/ChaCha20_Poly1305_64/sim/poly1305_core.py +++ b/ChaCha20_Poly1305_64/sim/poly1305_core.py @@ -70,4 +70,6 @@ async def test_sanity(dut): tb.log.info(f"tag: {tag:x}") + assert tag == result + await Timer(1, "us") \ No newline at end of file diff --git a/ChaCha20_Poly1305_64/src/poly1305_core.sv b/ChaCha20_Poly1305_64/src/poly1305_core.sv index 8d85220..8b27118 100644 --- a/ChaCha20_Poly1305_64/src/poly1305_core.sv +++ b/ChaCha20_Poly1305_64/src/poly1305_core.sv @@ -23,13 +23,15 @@ logic [127:0] poly1305_r, poly1305_s; logic [129:0] accumulator, accumulator_next; logic [129:0] data_one_extended; -logic [130:0] data_post_add; +logic [130:0] data_post_add, data_post_add_reg; logic [257:0] data_post_mul, data_post_mul_reg; -logic phase; +logic [257:0] modulo_stage, modulo_stage_next; -logic [1:0] valid_sr; +logic [2:0] phase; + +logic [3:0] valid_sr; function logic 
[129:0] tkeep_expand (input [15:0] tkeep); tkeep_expand = '0; @@ -40,7 +42,7 @@ endfunction // only ready in phase 0 assign s_data_axis.tready = phase == 0; -assign o_tag_valid = valid_sr[1]; +assign o_tag_valid = valid_sr[3]; always_ff @(posedge i_clk) begin if (i_rst) begin @@ -48,7 +50,10 @@ always_ff @(posedge i_clk) begin valid_sr <= '0; end - valid_sr <= {valid_sr[0], s_data_axis.tlast & s_data_axis.tvalid & s_data_axis.tready & ~phase}; + valid_sr <= {valid_sr[2:0], s_data_axis.tlast & s_data_axis.tvalid & s_data_axis.tready & (phase == 0)}; + data_post_add_reg <= data_post_add; + data_post_mul_reg <= data_post_mul; + modulo_stage <= modulo_stage_next; if (i_otk_valid) begin poly1305_r <= i_otk[255:128] & R_MASK; @@ -56,11 +61,18 @@ always_ff @(posedge i_clk) begin end if (s_data_axis.tvalid && phase == 0) begin - data_post_mul_reg <= data_post_mul; - phase <= '1; + phase <= 1; end - if (phase == '1) begin + if (phase == 1) begin + phase <= 2; + end + + if (phase == 2) begin + phase <= 3; + end + + if (phase == 3) begin accumulator <= accumulator_next; phase <= '0; end @@ -73,10 +85,15 @@ always_comb begin // phase == 0 data_one_extended = (tkeep_expand(s_data_axis.tkeep) + 1) | {2'b0, s_data_axis.tdata}; data_post_add = data_one_extended + accumulator; - data_post_mul = data_post_add * poly1305_r; // phase == 1 - accumulator_next = 130'(data_post_mul_reg % P130M5); + data_post_mul = data_post_add_reg * poly1305_r; + + // phase == 2 + modulo_stage_next = (data_post_mul_reg[257:130] * 5) + 258'(data_post_mul_reg[129:0]); + + // phase == 3 + accumulator_next = 130'((modulo_stage[257:130] * 5) + 258'(modulo_stage[129:0])); end assign o_tag = accumulator[127:0] + poly1305_s;