From 80e3faeae6e64f5801d2ff8b7d3ac971dd7f02f3 Mon Sep 17 00:00:00 2001
From: Byron Lathi <bslathi19@gmail.com>
Date: Mon, 14 Jul 2025 11:10:43 -0700
Subject: [PATCH] ramblings

---
 ChaCha20_Poly1305_64/doc/notes.md             | 47 ++++++++++++++++++-
 .../poly1305_timing_test/constraints.sdc      |  1 +
 .../poly1305_timing_test/mult_timing_test.sv  | 42 +++++++++++++++++
 .../poly1305_timing_test.xml                  | 23 +++++----
 ChaCha20_Poly1305_64/sim/poly1305_core.py     |  2 +
 ChaCha20_Poly1305_64/src/poly1305_core.sv     | 37 +++++++++++----
 6 files changed, 131 insertions(+), 21 deletions(-)
 create mode 100644 ChaCha20_Poly1305_64/poly1305_timing_test/constraints.sdc
 create mode 100644 ChaCha20_Poly1305_64/poly1305_timing_test/mult_timing_test.sv

diff --git a/ChaCha20_Poly1305_64/doc/notes.md b/ChaCha20_Poly1305_64/doc/notes.md
index 820b30a..ac4f2b2 100644
--- a/ChaCha20_Poly1305_64/doc/notes.md
+++ b/ChaCha20_Poly1305_64/doc/notes.md
@@ -105,4 +105,49 @@ quarter round module can have 2 different blocks going through it at once.
 
 The new one multiplexes 4 quarter rounds between 1 QR module which reduces the
 logic usage down to only 46k le, of which the vast majority is flops (2k ff per round,
-0.5k lut)
\ No newline at end of file
+0.5k lut)
+
+
+# Modulo 2^130-5
+
+We can use the trick here to do modulo reduction much faster.
+
+If we split the bits at 2^130, leaving 129 high bits and 130 low bits, we now
+have a 129 bit value multiplied by 2^130, plus the 130 bit value. We know that 
+2^130 mod 2^130-5 is 5, so we can replace that 2^130 with 5 and add, then
+repeat that step again.
+
+Ex.
+
+x = x1*2^130 + x2
+x == x1*5 + x2 (mod 2^130-5)   -> call this sum y, split again: y = y1*2^130 + y2
+y == y1*5 + y2 (mod 2^130-5)
+so x == y1*5 + y2 (mod 2^130-5)
+
+
+Let's do the math to verify that we only need two rounds. The maximum value
+that we could possibly get is 2^131-1 and the maximum value for R is
+0x0ffffffc0ffffffc0ffffffc0fffffff. Multiplying these together gives us 
+0x7fffffe07fffffe07fffffe07ffffff7f0000003f0000003f0000003f0000001.
+
+Applying the first round to this we get 
+
+0x1ffffff81ffffff81ffffff81ffffffd * 5 + 0x3f0000003f0000003f0000003f0000001
+= 0x48fffffdc8fffffdc8fffffdc8ffffff2
+
+applying the second round to this we get
+
+1 * 5 + 0x8fffffdc8fffffdc8fffffdc8ffffff2 = 0x8fffffdc8fffffdc8fffffdc8ffffff7
+
+and this is indeed the correct answer. The bottom part is 130 bits but since we
+put in the max values and it didn't overflow, I don't think it will overflow here.
+
+131+128 = 259 bits, only have to do this once
+
+0xb83fe991ca75d7ef2ab5cba9cccdfd938b73fff384ac90ed284034da565ecf
+0x19471c3e3e9c1bfded81da3736e96604a
+
+
+Kind of curious now: at what point does a ripple carry adder using dedicated
+CI/CO ports become slower than a more complex adder like carry lookahead or
+carry save (Wallace tree)?
diff --git a/ChaCha20_Poly1305_64/poly1305_timing_test/constraints.sdc b/ChaCha20_Poly1305_64/poly1305_timing_test/constraints.sdc
new file mode 100644
index 0000000..392311c
--- /dev/null
+++ b/ChaCha20_Poly1305_64/poly1305_timing_test/constraints.sdc
@@ -0,0 +1 @@
+create_clock -period 2.5 -name clk [get_ports i_clk]
\ No newline at end of file
diff --git a/ChaCha20_Poly1305_64/poly1305_timing_test/mult_timing_test.sv b/ChaCha20_Poly1305_64/poly1305_timing_test/mult_timing_test.sv
new file mode 100644
index 0000000..6ce5cb8
--- /dev/null
+++ b/ChaCha20_Poly1305_64/poly1305_timing_test/mult_timing_test.sv
@@ -0,0 +1,42 @@
+// Pipelined 133x128-bit multiplier used for timing experiments. data_a is split
+// into seven 19-bit limbs (7*19 = 133); limb products are summed in an adder tree.
+module mult_timing_test(
+    input i_clk,
+
+    input logic [132:0] data_a,
+    input logic [127:0] data_b,
+
+    output logic [260:0] data_z
+);
+
+localparam int LIMB_W = 19; // limb width; slices and shifts must both use this
+logic [132:0] data_a_reg;
+logic [127:0] data_b_reg;
+logic [260:0] partial_result [7];
+logic [260:0] data_z_temp_1[4];
+logic [260:0] data_z_temp_2_0, data_z_temp_2_1;
+
+always @(posedge i_clk) begin
+    // Stage 0: register the inputs.
+    data_a_reg <= data_a;
+    data_b_reg <= data_b;
+
+    // Stage 1: one partial product per limb of data_a (limb i has weight 2^(i*LIMB_W)).
+    for (int i = 0; i < 7; i++) begin
+        partial_result[i] <= data_a_reg[i*LIMB_W +: LIMB_W] * data_b_reg;
+    end
+
+    // Stage 2: combine adjacent limbs; the odd limb is one limb width higher.
+    data_z_temp_1[0] <= partial_result[0] + (partial_result[1] << LIMB_W);
+    data_z_temp_1[1] <= partial_result[2] + (partial_result[3] << LIMB_W);
+    data_z_temp_1[2] <= partial_result[4] + (partial_result[5] << LIMB_W);
+    data_z_temp_1[3] <= partial_result[6];
+
+    // Stage 3: combine pairs; the second pair of each sum is two limbs higher.
+    data_z_temp_2_0 <= data_z_temp_1[0] + (data_z_temp_1[1] << (LIMB_W*2));
+    data_z_temp_2_1 <= data_z_temp_1[2] + (data_z_temp_1[3] << (LIMB_W*2));
+
+    // Stage 4: final sum; data_z_temp_2_1 holds limbs 4..6, so weight 2^(4*LIMB_W).
+    data_z <= data_z_temp_2_0 + (data_z_temp_2_1 << (LIMB_W*4));
+end
+endmodule
\ No newline at end of file
diff --git a/ChaCha20_Poly1305_64/poly1305_timing_test/poly1305_timing_test.xml b/ChaCha20_Poly1305_64/poly1305_timing_test/poly1305_timing_test.xml
index 628ee0a..50827ff 100644
--- a/ChaCha20_Poly1305_64/poly1305_timing_test/poly1305_timing_test.xml
+++ b/ChaCha20_Poly1305_64/poly1305_timing_test/poly1305_timing_test.xml
@@ -1,18 +1,20 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<efx:project name="poly1305_timing_test" description="" last_change="1751725092" sw_version="2025.1.110" last_run_state="pass" last_run_flow="bitstream" config_result_in_sync="true" design_ood="sync" place_ood="sync" route_ood="sync" xmlns:efx="http://www.efinixinc.com/enf_proj" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.efinixinc.com/enf_proj enf_proj.xsd">
+<efx:project name="poly1305_timing_test" description="" last_change="1752448578" sw_version="2025.1.110" last_run_state="pass" last_run_flow="bitstream" config_result_in_sync="true" design_ood="sync" place_ood="sync" route_ood="sync" xmlns:efx="http://www.efinixinc.com/enf_proj" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.efinixinc.com/enf_proj enf_proj.xsd">
     <efx:device_info>
         <efx:family name="Titanium"/>
         <efx:device name="Ti375N1156"/>
         <efx:timing_model name="C4"/>
     </efx:device_info>
     <efx:design_info def_veri_version="sv_09" def_vhdl_version="vhdl_2008" unified_flow="false">
-        <efx:top_module name="poly1305_core_wrapper"/>
+        <efx:top_module name="mult_timing_test"/>
         <efx:design_file name="../src/poly1305_core.sv" version="default" library="default"/>
         <efx:design_file name="../../common/sim/sub/taxi/src/axis/rtl/taxi_axis_if.sv" version="default" library="default"/>
         <efx:design_file name="../sim/poly1305_core_wrapper.sv" version="default" library="default"/>
+        <efx:design_file name="mult_timing_test.sv" version="default" library="default"/>
         <efx:top_vhdl_arch name=""/>
     </efx:design_info>
     <efx:constraint_info>
+        <efx:sdc_file name="constraints.sdc"/>
         <efx:inter_file name=""/>
     </efx:constraint_info>
     <efx:sim_info/>
@@ -39,15 +41,15 @@
         <efx:param name="min-sr-fanout" value="0" value_type="e_integer"/>
         <efx:param name="min-ce-fanout" value="0" value_type="e_integer"/>
         <efx:param name="mode" value="speed" value_type="e_option"/>
-        <efx:param name="mult-auto-pipeline" value="0" value_type="e_integer"/>
-        <efx:param name="mult-decomp-retime" value="0" value_type="e_option"/>
-        <efx:param name="operator-sharing" value="0" value_type="e_option"/>
-        <efx:param name="optimize-adder-tree" value="0" value_type="e_option"/>
+        <efx:param name="mult-auto-pipeline" value="1" value_type="e_integer"/>
+        <efx:param name="mult-decomp-retime" value="1" value_type="e_option"/>
+        <efx:param name="operator-sharing" value="1" value_type="e_option"/>
+        <efx:param name="optimize-adder-tree" value="1" value_type="e_option"/>
         <efx:param name="optimize-zero-init-rom" value="1" value_type="e_option"/>
         <efx:param name="peri-syn-instantiation" value="0" value_type="e_option"/>
         <efx:param name="peri-syn-inference" value="0" value_type="e_option"/>
         <efx:param name="ram-decomp-mode" value="0" value_type="e_option"/>
-        <efx:param name="retiming" value="1" value_type="e_option"/>
+        <efx:param name="retiming" value="2" value_type="e_option"/>
         <efx:param name="seq_opt" value="1" value_type="e_option"/>
         <efx:param name="seq-opt-sync-only" value="0" value_type="e_option"/>
         <efx:param name="use-logic-for-small-mem" value="64" value_type="e_integer"/>
@@ -56,16 +58,17 @@
         <efx:param name="dsp-input-regs-packing" value="1" value_type="e_option"/>
         <efx:param name="dsp-output-regs-packing" value="1" value_type="e_option"/>
         <efx:param name="dsp-mac-packing" value="1" value_type="e_option"/>
-        <efx:param name="insert-carry-skip" value="0" value_type="e_option"/>
+        <efx:param name="insert-carry-skip" value="1" value_type="e_option"/>
         <efx:param name="pack-luts-to-comb4" value="0" value_type="e_option"/>
+        <efx:dynparam name="asdf" value="asdf"/>
     </efx:synthesis>
     <efx:place_and_route tool_name="efx_pnr">
         <efx:param name="work_dir" value="work_pnr" value_type="e_string"/>
         <efx:param name="verbose" value="off" value_type="e_bool"/>
         <efx:param name="load_delaym" value="on" value_type="e_bool"/>
-        <efx:param name="optimization_level" value="NULL" value_type="e_option"/>
+        <efx:param name="optimization_level" value="TIMING_3" value_type="e_option"/>
         <efx:param name="seed" value="1" value_type="e_integer"/>
-        <efx:param name="placer_effort_level" value="2" value_type="e_option"/>
+        <efx:param name="placer_effort_level" value="5" value_type="e_option"/>
         <efx:param name="max_threads" value="-1" value_type="e_integer"/>
         <efx:param name="print_critical_path" value="10" value_type="e_integer"/>
         <efx:param name="classic_flow" value="off" value_type="e_noarg"/>
diff --git a/ChaCha20_Poly1305_64/sim/poly1305_core.py b/ChaCha20_Poly1305_64/sim/poly1305_core.py
index b339085..1feeaf1 100644
--- a/ChaCha20_Poly1305_64/sim/poly1305_core.py
+++ b/ChaCha20_Poly1305_64/sim/poly1305_core.py
@@ -70,4 +70,6 @@ async def test_sanity(dut):
     
     tb.log.info(f"tag: {tag:x}")
 
+    assert tag == result
+
     await Timer(1, "us")
\ No newline at end of file
diff --git a/ChaCha20_Poly1305_64/src/poly1305_core.sv b/ChaCha20_Poly1305_64/src/poly1305_core.sv
index 8d85220..8b27118 100644
--- a/ChaCha20_Poly1305_64/src/poly1305_core.sv
+++ b/ChaCha20_Poly1305_64/src/poly1305_core.sv
@@ -23,13 +23,15 @@ logic [127:0] poly1305_r, poly1305_s;
 logic [129:0] accumulator, accumulator_next;
 
 logic [129:0] data_one_extended;
-logic [130:0] data_post_add;
+logic [130:0] data_post_add, data_post_add_reg;
 
 logic [257:0] data_post_mul, data_post_mul_reg;
 
-logic phase;
+logic [257:0] modulo_stage, modulo_stage_next;
 
-logic [1:0] valid_sr;
+logic [2:0] phase;
+
+logic [3:0] valid_sr;
 
 function logic [129:0] tkeep_expand (input [15:0] tkeep);
     tkeep_expand = '0;
@@ -40,7 +42,7 @@ endfunction
 
 // only ready in phase 0
 assign s_data_axis.tready = phase == 0;
-assign o_tag_valid = valid_sr[1];
+assign o_tag_valid = valid_sr[3];
 
 always_ff @(posedge i_clk) begin
     if (i_rst) begin
@@ -48,7 +50,10 @@ always_ff @(posedge i_clk) begin
         valid_sr <= '0;
     end
 
-    valid_sr <= {valid_sr[0], s_data_axis.tlast & s_data_axis.tvalid & s_data_axis.tready & ~phase};
+    valid_sr <= {valid_sr[2:0], s_data_axis.tlast & s_data_axis.tvalid & s_data_axis.tready & (phase == 0)};
+    data_post_add_reg <= data_post_add;
+    data_post_mul_reg <= data_post_mul;
+    modulo_stage <= modulo_stage_next;
 
     if (i_otk_valid) begin
         poly1305_r <= i_otk[255:128] & R_MASK;
@@ -56,11 +61,18 @@ always_ff @(posedge i_clk) begin
     end
 
     if (s_data_axis.tvalid && phase == 0) begin
-        data_post_mul_reg <= data_post_mul;
-        phase <= '1;
+        phase <= 1;
     end
 
-    if (phase == '1) begin
+    if (phase == 1) begin
+        phase <= 2;
+    end
+
+    if (phase == 2) begin
+        phase <= 3;
+    end
+
+    if (phase == 3) begin
         accumulator <= accumulator_next;
         phase <= '0;
     end
@@ -73,10 +85,15 @@ always_comb begin
     // phase == 0
     data_one_extended = (tkeep_expand(s_data_axis.tkeep) + 1) | {2'b0, s_data_axis.tdata};
     data_post_add = data_one_extended + accumulator;
-    data_post_mul = data_post_add * poly1305_r;
 
     // phase == 1
-    accumulator_next = 130'(data_post_mul_reg % P130M5);
+    data_post_mul = data_post_add_reg * poly1305_r;
+
+    // phase == 2
+    modulo_stage_next = (data_post_mul_reg[257:130] * 5) + 258'(data_post_mul_reg[129:0]);
+
+    // phase == 3
+    accumulator_next = 130'((modulo_stage[257:130] * 5) + 258'(modulo_stage[129:0]));
 end
 
 assign o_tag = accumulator[127:0] + poly1305_s;