ramblings

This commit is contained in:
Byron Lathi
2025-07-14 11:10:43 -07:00
parent 2b57079205
commit 80e3faeae6
6 changed files with 131 additions and 21 deletions

View File

@@ -105,4 +105,49 @@ quarter round module can have 2 different blocks going through it at once.
The new one multiplexes 4 quarter rounds between 1 QR module which reduces the The new one multiplexes 4 quarter rounds between 1 QR module which reduces the
logic usage down to only 46k le, of which the vast majority is flops (2k ff per round, logic usage down to only 46k le, of which the vast majority is flops (2k ff per round,
0.5k lut) 0.5k lut)
# Modulo 2^130-5
We can use a trick here to do the modulo reduction much faster than a generic modulo operator.
If we split the bits at 2^130, leaving 129 high bits and 130 low bits, we now
have a 129 bit value multiplied by 2^130, plus the 130 bit value. We know that
2^130 mod 2^130-5 is 5, so we can replace that 2^130 with 5 and add, then
repeat that step again.
Ex.
x = x1*2^130 + x2
x mod 2^130-5 = (x1*5 + x2) mod 2^130-5, so let x3 = x1*5 + x2
x3 = x4*2^130 + x5
x mod 2^130-5 = (x4*5 + x5) mod 2^130-5
Let's do the math to verify that we only need two rounds. The maximum value
that the 131-bit sum going into the multiplier could possibly have is 2^131-1 and the maximum value for R is
0x0ffffffc0ffffffc0ffffffc0fffffff. Multiplying these together gives us
0x7fffffe07fffffe07fffffe07ffffff7f0000003f0000003f0000003f0000001.
Applying the first round to this we get
0x1ffffff81ffffff81ffffff81ffffffd * 5 + 0x3f0000003f0000003f0000003f0000001
= 0x48fffffdc8fffffdc8fffffdc8ffffff2
applying the second round to this we get
1 * 5 + 0x8fffffdc8fffffdc8fffffdc8ffffff2 = 0x8fffffdc8fffffdc8fffffdc8ffffff7
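and this is indeed the correct answer. The bottom part is 130 bits, but the sum
cannot overflow 130 bits: the first round's result is at most
5 * 0x1ffffff81ffffff81ffffff81ffffffd + (2^130-1) = 0x49fffffd89fffffd89fffffd89ffffff0,
so its high part is at most 1, and whenever the high part is 1 the low part is
below 2^128. Note that the result can still land in [2^130-5, 2^130-1], so it is
only fully reduced after one final conditional subtraction of 2^130-5.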
131+128 = 259 bits, only have to do this once
0xb83fe991ca75d7ef2ab5cba9cccdfd938b73fff384ac90ed284034da565ecf
0x19471c3e3e9c1bfded81da3736e96604a
Kind of curious now, at what point does a ripple carry adder using dedicated
CI/CO ports become slower than a more complex adder like carry lookahead or
carry save (wallace tree)?

View File

@@ -0,0 +1 @@
# Constrain the i_clk input port with a clock named "clk", period 2.5
# (400 MHz if the tool's time unit is nanoseconds -- confirm).
create_clock -name clk -period 2.5 [get_ports i_clk]

View File

@@ -0,0 +1,42 @@
// Timing experiment: a pipelined unsigned 133 x 128-bit multiplier.
//
// data_a is cut into seven 19-bit slices (7 * 19 = 133). Each slice is
// multiplied by the full data_b, and the seven partial products are summed by
// a registered adder tree, each term shifted to the bit weight of its slice.
//
// Pipeline (one register per stage, no reset, no valid/enable):
//   1. register the inputs
//   2. seven (slice * data_b) partial products
//   3. sums of neighbouring partial products
//   4. sums of those pairs
//   5. final sum -> data_z
// With the inputs held stable, data_z holds data_a * data_b after the fifth
// rising edge of i_clk.
//
// Ports:
//   i_clk  - clock; every register is updated on its rising edge
//   data_a - 133-bit unsigned multiplicand
//   data_b - 128-bit unsigned multiplier
//   data_z - 261-bit unsigned product (133 + 128 bits)
module mult_timing_test(
    input                i_clk,
    input  logic [132:0] data_a,
    input  logic [127:0] data_b,
    output logic [260:0] data_z
);

    // Slice geometry. The slice width used to cut data_a MUST equal the shift
    // step used when recombining the partial products; 7 * 19 covers all 133
    // bits of data_a exactly.
    // NOTE(review): 19 bits presumably matches the target DSP operand width -- confirm.
    localparam int SLICE_W  = 19;
    localparam int N_SLICES = 7;

    logic [132:0] data_a_reg;
    logic [127:0] data_b_reg;

    logic [260:0] partial_result [N_SLICES];
    logic [260:0] data_z_temp_1 [4];
    logic [260:0] data_z_temp_2_0, data_z_temp_2_1;

    always_ff @(posedge i_clk) begin
        // Stage 1: input registers.
        data_a_reg <= data_a;
        data_b_reg <= data_b;

        // Stage 2: one partial product per 19-bit slice of data_a.
        for (int i = 0; i < N_SLICES; i++) begin
            partial_result[i] <= data_a_reg[i*SLICE_W +: SLICE_W] * data_b_reg;
        end

        // Stage 3: combine neighbouring partial products; the odd one carries
        // one extra slice of weight. The seventh product has no partner.
        data_z_temp_1[0] <= partial_result[0] + (partial_result[1] << SLICE_W);
        data_z_temp_1[1] <= partial_result[2] + (partial_result[3] << SLICE_W);
        data_z_temp_1[2] <= partial_result[4] + (partial_result[5] << SLICE_W);
        data_z_temp_1[3] <= partial_result[6];

        // Stage 4: combine pairs; the upper pair carries two slices of weight.
        data_z_temp_2_0 <= data_z_temp_1[0] + (data_z_temp_1[1] << (SLICE_W*2));
        data_z_temp_2_1 <= data_z_temp_1[2] + (data_z_temp_1[3] << (SLICE_W*2));

        // Stage 5: the upper half (slices 4..6) carries four slices of weight.
        data_z <= data_z_temp_2_0 + (data_z_temp_2_1 << (SLICE_W*4));
    end

endmodule

View File

@@ -1,18 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<efx:project name="poly1305_timing_test" description="" last_change="1751725092" sw_version="2025.1.110" last_run_state="pass" last_run_flow="bitstream" config_result_in_sync="true" design_ood="sync" place_ood="sync" route_ood="sync" xmlns:efx="http://www.efinixinc.com/enf_proj" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.efinixinc.com/enf_proj enf_proj.xsd"> <efx:project name="poly1305_timing_test" description="" last_change="1752448578" sw_version="2025.1.110" last_run_state="pass" last_run_flow="bitstream" config_result_in_sync="true" design_ood="sync" place_ood="sync" route_ood="sync" xmlns:efx="http://www.efinixinc.com/enf_proj" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.efinixinc.com/enf_proj enf_proj.xsd">
<efx:device_info> <efx:device_info>
<efx:family name="Titanium"/> <efx:family name="Titanium"/>
<efx:device name="Ti375N1156"/> <efx:device name="Ti375N1156"/>
<efx:timing_model name="C4"/> <efx:timing_model name="C4"/>
</efx:device_info> </efx:device_info>
<efx:design_info def_veri_version="sv_09" def_vhdl_version="vhdl_2008" unified_flow="false"> <efx:design_info def_veri_version="sv_09" def_vhdl_version="vhdl_2008" unified_flow="false">
<efx:top_module name="poly1305_core_wrapper"/> <efx:top_module name="mult_timing_test"/>
<efx:design_file name="../src/poly1305_core.sv" version="default" library="default"/> <efx:design_file name="../src/poly1305_core.sv" version="default" library="default"/>
<efx:design_file name="../../common/sim/sub/taxi/src/axis/rtl/taxi_axis_if.sv" version="default" library="default"/> <efx:design_file name="../../common/sim/sub/taxi/src/axis/rtl/taxi_axis_if.sv" version="default" library="default"/>
<efx:design_file name="../sim/poly1305_core_wrapper.sv" version="default" library="default"/> <efx:design_file name="../sim/poly1305_core_wrapper.sv" version="default" library="default"/>
<efx:design_file name="mult_timing_test.sv" version="default" library="default"/>
<efx:top_vhdl_arch name=""/> <efx:top_vhdl_arch name=""/>
</efx:design_info> </efx:design_info>
<efx:constraint_info> <efx:constraint_info>
<efx:sdc_file name="constraints.sdc"/>
<efx:inter_file name=""/> <efx:inter_file name=""/>
</efx:constraint_info> </efx:constraint_info>
<efx:sim_info/> <efx:sim_info/>
@@ -39,15 +41,15 @@
<efx:param name="min-sr-fanout" value="0" value_type="e_integer"/> <efx:param name="min-sr-fanout" value="0" value_type="e_integer"/>
<efx:param name="min-ce-fanout" value="0" value_type="e_integer"/> <efx:param name="min-ce-fanout" value="0" value_type="e_integer"/>
<efx:param name="mode" value="speed" value_type="e_option"/> <efx:param name="mode" value="speed" value_type="e_option"/>
<efx:param name="mult-auto-pipeline" value="0" value_type="e_integer"/> <efx:param name="mult-auto-pipeline" value="1" value_type="e_integer"/>
<efx:param name="mult-decomp-retime" value="0" value_type="e_option"/> <efx:param name="mult-decomp-retime" value="1" value_type="e_option"/>
<efx:param name="operator-sharing" value="0" value_type="e_option"/> <efx:param name="operator-sharing" value="1" value_type="e_option"/>
<efx:param name="optimize-adder-tree" value="0" value_type="e_option"/> <efx:param name="optimize-adder-tree" value="1" value_type="e_option"/>
<efx:param name="optimize-zero-init-rom" value="1" value_type="e_option"/> <efx:param name="optimize-zero-init-rom" value="1" value_type="e_option"/>
<efx:param name="peri-syn-instantiation" value="0" value_type="e_option"/> <efx:param name="peri-syn-instantiation" value="0" value_type="e_option"/>
<efx:param name="peri-syn-inference" value="0" value_type="e_option"/> <efx:param name="peri-syn-inference" value="0" value_type="e_option"/>
<efx:param name="ram-decomp-mode" value="0" value_type="e_option"/> <efx:param name="ram-decomp-mode" value="0" value_type="e_option"/>
<efx:param name="retiming" value="1" value_type="e_option"/> <efx:param name="retiming" value="2" value_type="e_option"/>
<efx:param name="seq_opt" value="1" value_type="e_option"/> <efx:param name="seq_opt" value="1" value_type="e_option"/>
<efx:param name="seq-opt-sync-only" value="0" value_type="e_option"/> <efx:param name="seq-opt-sync-only" value="0" value_type="e_option"/>
<efx:param name="use-logic-for-small-mem" value="64" value_type="e_integer"/> <efx:param name="use-logic-for-small-mem" value="64" value_type="e_integer"/>
@@ -56,16 +58,17 @@
<efx:param name="dsp-input-regs-packing" value="1" value_type="e_option"/> <efx:param name="dsp-input-regs-packing" value="1" value_type="e_option"/>
<efx:param name="dsp-output-regs-packing" value="1" value_type="e_option"/> <efx:param name="dsp-output-regs-packing" value="1" value_type="e_option"/>
<efx:param name="dsp-mac-packing" value="1" value_type="e_option"/> <efx:param name="dsp-mac-packing" value="1" value_type="e_option"/>
<efx:param name="insert-carry-skip" value="0" value_type="e_option"/> <efx:param name="insert-carry-skip" value="1" value_type="e_option"/>
<efx:param name="pack-luts-to-comb4" value="0" value_type="e_option"/> <efx:param name="pack-luts-to-comb4" value="0" value_type="e_option"/>
<efx:dynparam name="asdf" value="asdf"/>
</efx:synthesis> </efx:synthesis>
<efx:place_and_route tool_name="efx_pnr"> <efx:place_and_route tool_name="efx_pnr">
<efx:param name="work_dir" value="work_pnr" value_type="e_string"/> <efx:param name="work_dir" value="work_pnr" value_type="e_string"/>
<efx:param name="verbose" value="off" value_type="e_bool"/> <efx:param name="verbose" value="off" value_type="e_bool"/>
<efx:param name="load_delaym" value="on" value_type="e_bool"/> <efx:param name="load_delaym" value="on" value_type="e_bool"/>
<efx:param name="optimization_level" value="NULL" value_type="e_option"/> <efx:param name="optimization_level" value="TIMING_3" value_type="e_option"/>
<efx:param name="seed" value="1" value_type="e_integer"/> <efx:param name="seed" value="1" value_type="e_integer"/>
<efx:param name="placer_effort_level" value="2" value_type="e_option"/> <efx:param name="placer_effort_level" value="5" value_type="e_option"/>
<efx:param name="max_threads" value="-1" value_type="e_integer"/> <efx:param name="max_threads" value="-1" value_type="e_integer"/>
<efx:param name="print_critical_path" value="10" value_type="e_integer"/> <efx:param name="print_critical_path" value="10" value_type="e_integer"/>
<efx:param name="classic_flow" value="off" value_type="e_noarg"/> <efx:param name="classic_flow" value="off" value_type="e_noarg"/>

View File

@@ -70,4 +70,6 @@ async def test_sanity(dut):
tb.log.info(f"tag: {tag:x}") tb.log.info(f"tag: {tag:x}")
assert tag == result
await Timer(1, "us") await Timer(1, "us")

View File

@@ -23,13 +23,15 @@ logic [127:0] poly1305_r, poly1305_s;
logic [129:0] accumulator, accumulator_next; logic [129:0] accumulator, accumulator_next;
logic [129:0] data_one_extended; logic [129:0] data_one_extended;
logic [130:0] data_post_add; logic [130:0] data_post_add, data_post_add_reg;
logic [257:0] data_post_mul, data_post_mul_reg; logic [257:0] data_post_mul, data_post_mul_reg;
logic phase; logic [257:0] modulo_stage, modulo_stage_next;
logic [1:0] valid_sr; logic [2:0] phase;
logic [3:0] valid_sr;
function logic [129:0] tkeep_expand (input [15:0] tkeep); function logic [129:0] tkeep_expand (input [15:0] tkeep);
tkeep_expand = '0; tkeep_expand = '0;
@@ -40,7 +42,7 @@ endfunction
// only ready in phase 0 // only ready in phase 0
assign s_data_axis.tready = phase == 0; assign s_data_axis.tready = phase == 0;
assign o_tag_valid = valid_sr[1]; assign o_tag_valid = valid_sr[3];
always_ff @(posedge i_clk) begin always_ff @(posedge i_clk) begin
if (i_rst) begin if (i_rst) begin
@@ -48,7 +50,10 @@ always_ff @(posedge i_clk) begin
valid_sr <= '0; valid_sr <= '0;
end end
valid_sr <= {valid_sr[0], s_data_axis.tlast & s_data_axis.tvalid & s_data_axis.tready & ~phase}; valid_sr <= {valid_sr[2:0], s_data_axis.tlast & s_data_axis.tvalid & s_data_axis.tready & (phase == 0)};
data_post_add_reg <= data_post_add;
data_post_mul_reg <= data_post_mul;
modulo_stage <= modulo_stage_next;
if (i_otk_valid) begin if (i_otk_valid) begin
poly1305_r <= i_otk[255:128] & R_MASK; poly1305_r <= i_otk[255:128] & R_MASK;
@@ -56,11 +61,18 @@ always_ff @(posedge i_clk) begin
end end
if (s_data_axis.tvalid && phase == 0) begin if (s_data_axis.tvalid && phase == 0) begin
data_post_mul_reg <= data_post_mul; phase <= 1;
phase <= '1;
end end
if (phase == '1) begin if (phase == 1) begin
phase <= 2;
end
if (phase == 2) begin
phase <= 3;
end
if (phase == 3) begin
accumulator <= accumulator_next; accumulator <= accumulator_next;
phase <= '0; phase <= '0;
end end
@@ -73,10 +85,15 @@ always_comb begin
// phase == 0 // phase == 0
data_one_extended = (tkeep_expand(s_data_axis.tkeep) + 1) | {2'b0, s_data_axis.tdata}; data_one_extended = (tkeep_expand(s_data_axis.tkeep) + 1) | {2'b0, s_data_axis.tdata};
data_post_add = data_one_extended + accumulator; data_post_add = data_one_extended + accumulator;
data_post_mul = data_post_add * poly1305_r;
// phase == 1 // phase == 1
accumulator_next = 130'(data_post_mul_reg % P130M5); data_post_mul = data_post_add_reg * poly1305_r;
// phase == 2
modulo_stage_next = (data_post_mul_reg[257:130] * 5) + 258'(data_post_mul_reg[129:0]);
// phase == 3
accumulator_next = 130'((modulo_stage[257:130] * 5) + 258'(modulo_stage[129:0]));
end end
assign o_tag = accumulator[127:0] + poly1305_s; assign o_tag = accumulator[127:0] + poly1305_s;