ramblings

This commit is contained in:
Byron Lathi
2025-07-14 11:10:43 -07:00
parent 2b57079205
commit 80e3faeae6
6 changed files with 131 additions and 21 deletions

View File

@@ -105,4 +105,49 @@ quarter round module can have 2 different blocks going through it at once.
The new one multiplexes 4 quarter rounds between 1 QR module which reduces the
logic usage down to only 46k le, of which the vast majority is flops (2k ff per round,
0.5k lut)
0.5k lut)
# Modulo 2^130-5
We can use the trick here to do modulo reduction much faster.
If we split the bits at 2^130, leaving 129 high bits and 130 low bits, we now
have a 129 bit value multiplied by 2^130, plus the 130 bit value. We know that
2^130 mod 2^130-5 is 5, so we can replace that 2^130 with 5 and add, then
repeat that step again.
Ex.
x = x1*2^130 + x2
x mod 2^130-5 = (x1*5 + x2) mod 2^130-5 -> let x3 = x1*5 + x2
x3 = x4*2^130 + x5
x mod 2^130-5 = x4*5 + x5
and let's do the math to verify that we only need two rounds. The maximum value
that we could possibly get is 2^131-1 and the maximum value for R is
0x0ffffffc0ffffffc0ffffffc0fffffff. Multiplying these together gives us
0x7fffffe07fffffe07fffffe07ffffff7f0000003f0000003f0000003f0000001.
Applying the first round to this we get
0x1ffffff81ffffff81ffffff81ffffffd * 5 + 0x3f0000003f0000003f0000003f0000001
= 0x48fffffdc8fffffdc8fffffdc8ffffff2
applying the second round to this we get
1 * 5 + 0x8fffffdc8fffffdc8fffffdc8ffffff2 = 0x8fffffdc8fffffdc8fffffdc8ffffff7
and this is indeed the correct answer. The bottom part is 130 bits but since we
put in the max values and it didn't overflow, I don't think it will overflow here.
131+128 = 259 bits, only have to do this once
0xb83fe991ca75d7ef2ab5cba9cccdfd938b73fff384ac90ed284034da565ecf
0x19471c3e3e9c1bfded81da3736e96604a
Kind of curious now, at what point does a ripple carry adder using dedicated
CI/CO ports become slower than a more complex adder like carry lookahead or
carry save (wallace tree)?

View File

@@ -0,0 +1 @@
# Define a clock named "clk" on the top-level port i_clk with a period of 2.5.
# NOTE(review): presumably the unit is ns (i.e. a 400 MHz target) — confirm
# against the tool's SDC time unit.
create_clock -period 2.5 -name clk [get_ports i_clk]

View File

@@ -0,0 +1,42 @@
// Pipelined unsigned 133 x 128 bit multiplier used as a timing experiment.
//
// data_a is split into 18-bit limbs; each limb is multiplied by the full
// data_b, and the partial products are summed in a registered binary adder
// tree, each term weighted by its limb position:
//
//   data_z = sum_{i} (data_a[18*i +: 18] * data_b) << (18*i)
//
// Ports:
//   i_clk  - clock; every stage is registered on its rising edge.
//   data_a - 133-bit unsigned multiplicand.
//   data_b - 128-bit unsigned multiplier.
//   data_z - 261-bit product, valid 5 clock edges after the inputs are
//            presented (input reg, partial products, two adder levels,
//            output reg).
module mult_timing_test(
input i_clk,
input logic [132:0] data_a,
input logic [127:0] data_b,
output logic [260:0] data_z
);

    // Width of one limb of data_a and the number of limbs needed to cover
    // all 133 bits (7 full limbs + one 7-bit limb, zero padded to 18 bits).
    localparam int LIMB_W  = 18;
    localparam int N_LIMBS = 8;

    logic [132:0] data_a_reg;
    logic [127:0] data_b_reg;

    // data_a_reg zero-extended to a whole number of limbs so that every limb
    // can be selected with the same indexed part-select.
    logic [N_LIMBS*LIMB_W-1:0] data_a_padded;

    logic [260:0] partial_result [N_LIMBS];
    logic [260:0] data_z_temp_1 [4];
    logic [260:0] data_z_temp_2_0, data_z_temp_2_1;

    // Unsigned assignment to a wider vector zero-extends.
    assign data_a_padded = data_a_reg;

    always_ff @(posedge i_clk) begin
        // Stage 0: register the inputs.
        data_a_reg <= data_a;
        data_b_reg <= data_b;

        // Stage 1: one limb x data_b product per limb. The operands are
        // extended to the 261-bit width of the destination before the
        // multiply, so no product bits are lost.
        for (int i = 0; i < N_LIMBS; i++) begin
            partial_result[i] <= data_a_padded[i*LIMB_W +: LIMB_W] * data_b_reg;
        end

        // Stage 2: combine adjacent limbs. The odd limb of each pair is one
        // limb width more significant than the even limb.
        for (int i = 0; i < 4; i++) begin
            data_z_temp_1[i] <= partial_result[2*i] + (partial_result[2*i + 1] << LIMB_W);
        end

        // Stage 3: combine adjacent pairs; the upper pair is two limbs more
        // significant than the lower pair.
        data_z_temp_2_0 <= data_z_temp_1[0] + (data_z_temp_1[1] << (2*LIMB_W));
        data_z_temp_2_1 <= data_z_temp_1[2] + (data_z_temp_1[3] << (2*LIMB_W));

        // Stage 4: the upper half of the tree covers limbs 4..7 and is
        // therefore four limbs more significant than the lower half.
        data_z <= data_z_temp_2_0 + (data_z_temp_2_1 << (4*LIMB_W));
    end

endmodule

View File

@@ -1,18 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<efx:project name="poly1305_timing_test" description="" last_change="1751725092" sw_version="2025.1.110" last_run_state="pass" last_run_flow="bitstream" config_result_in_sync="true" design_ood="sync" place_ood="sync" route_ood="sync" xmlns:efx="http://www.efinixinc.com/enf_proj" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.efinixinc.com/enf_proj enf_proj.xsd">
<efx:project name="poly1305_timing_test" description="" last_change="1752448578" sw_version="2025.1.110" last_run_state="pass" last_run_flow="bitstream" config_result_in_sync="true" design_ood="sync" place_ood="sync" route_ood="sync" xmlns:efx="http://www.efinixinc.com/enf_proj" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.efinixinc.com/enf_proj enf_proj.xsd">
<efx:device_info>
<efx:family name="Titanium"/>
<efx:device name="Ti375N1156"/>
<efx:timing_model name="C4"/>
</efx:device_info>
<efx:design_info def_veri_version="sv_09" def_vhdl_version="vhdl_2008" unified_flow="false">
<efx:top_module name="poly1305_core_wrapper"/>
<efx:top_module name="mult_timing_test"/>
<efx:design_file name="../src/poly1305_core.sv" version="default" library="default"/>
<efx:design_file name="../../common/sim/sub/taxi/src/axis/rtl/taxi_axis_if.sv" version="default" library="default"/>
<efx:design_file name="../sim/poly1305_core_wrapper.sv" version="default" library="default"/>
<efx:design_file name="mult_timing_test.sv" version="default" library="default"/>
<efx:top_vhdl_arch name=""/>
</efx:design_info>
<efx:constraint_info>
<efx:sdc_file name="constraints.sdc"/>
<efx:inter_file name=""/>
</efx:constraint_info>
<efx:sim_info/>
@@ -39,15 +41,15 @@
<efx:param name="min-sr-fanout" value="0" value_type="e_integer"/>
<efx:param name="min-ce-fanout" value="0" value_type="e_integer"/>
<efx:param name="mode" value="speed" value_type="e_option"/>
<efx:param name="mult-auto-pipeline" value="0" value_type="e_integer"/>
<efx:param name="mult-decomp-retime" value="0" value_type="e_option"/>
<efx:param name="operator-sharing" value="0" value_type="e_option"/>
<efx:param name="optimize-adder-tree" value="0" value_type="e_option"/>
<efx:param name="mult-auto-pipeline" value="1" value_type="e_integer"/>
<efx:param name="mult-decomp-retime" value="1" value_type="e_option"/>
<efx:param name="operator-sharing" value="1" value_type="e_option"/>
<efx:param name="optimize-adder-tree" value="1" value_type="e_option"/>
<efx:param name="optimize-zero-init-rom" value="1" value_type="e_option"/>
<efx:param name="peri-syn-instantiation" value="0" value_type="e_option"/>
<efx:param name="peri-syn-inference" value="0" value_type="e_option"/>
<efx:param name="ram-decomp-mode" value="0" value_type="e_option"/>
<efx:param name="retiming" value="1" value_type="e_option"/>
<efx:param name="retiming" value="2" value_type="e_option"/>
<efx:param name="seq_opt" value="1" value_type="e_option"/>
<efx:param name="seq-opt-sync-only" value="0" value_type="e_option"/>
<efx:param name="use-logic-for-small-mem" value="64" value_type="e_integer"/>
@@ -56,16 +58,17 @@
<efx:param name="dsp-input-regs-packing" value="1" value_type="e_option"/>
<efx:param name="dsp-output-regs-packing" value="1" value_type="e_option"/>
<efx:param name="dsp-mac-packing" value="1" value_type="e_option"/>
<efx:param name="insert-carry-skip" value="0" value_type="e_option"/>
<efx:param name="insert-carry-skip" value="1" value_type="e_option"/>
<efx:param name="pack-luts-to-comb4" value="0" value_type="e_option"/>
<efx:dynparam name="asdf" value="asdf"/>
</efx:synthesis>
<efx:place_and_route tool_name="efx_pnr">
<efx:param name="work_dir" value="work_pnr" value_type="e_string"/>
<efx:param name="verbose" value="off" value_type="e_bool"/>
<efx:param name="load_delaym" value="on" value_type="e_bool"/>
<efx:param name="optimization_level" value="NULL" value_type="e_option"/>
<efx:param name="optimization_level" value="TIMING_3" value_type="e_option"/>
<efx:param name="seed" value="1" value_type="e_integer"/>
<efx:param name="placer_effort_level" value="2" value_type="e_option"/>
<efx:param name="placer_effort_level" value="5" value_type="e_option"/>
<efx:param name="max_threads" value="-1" value_type="e_integer"/>
<efx:param name="print_critical_path" value="10" value_type="e_integer"/>
<efx:param name="classic_flow" value="off" value_type="e_noarg"/>

View File

@@ -70,4 +70,6 @@ async def test_sanity(dut):
tb.log.info(f"tag: {tag:x}")
assert tag == result
await Timer(1, "us")

View File

@@ -23,13 +23,15 @@ logic [127:0] poly1305_r, poly1305_s;
logic [129:0] accumulator, accumulator_next;
logic [129:0] data_one_extended;
logic [130:0] data_post_add;
logic [130:0] data_post_add, data_post_add_reg;
logic [257:0] data_post_mul, data_post_mul_reg;
logic phase;
logic [257:0] modulo_stage, modulo_stage_next;
logic [1:0] valid_sr;
logic [2:0] phase;
logic [3:0] valid_sr;
function logic [129:0] tkeep_expand (input [15:0] tkeep);
tkeep_expand = '0;
@@ -40,7 +42,7 @@ endfunction
// only ready in phase 0
assign s_data_axis.tready = phase == 0;
assign o_tag_valid = valid_sr[1];
assign o_tag_valid = valid_sr[3];
always_ff @(posedge i_clk) begin
if (i_rst) begin
@@ -48,7 +50,10 @@ always_ff @(posedge i_clk) begin
valid_sr <= '0;
end
valid_sr <= {valid_sr[0], s_data_axis.tlast & s_data_axis.tvalid & s_data_axis.tready & ~phase};
valid_sr <= {valid_sr[2:0], s_data_axis.tlast & s_data_axis.tvalid & s_data_axis.tready & (phase == 0)};
data_post_add_reg <= data_post_add;
data_post_mul_reg <= data_post_mul;
modulo_stage <= modulo_stage_next;
if (i_otk_valid) begin
poly1305_r <= i_otk[255:128] & R_MASK;
@@ -56,11 +61,18 @@ always_ff @(posedge i_clk) begin
end
if (s_data_axis.tvalid && phase == 0) begin
data_post_mul_reg <= data_post_mul;
phase <= '1;
phase <= 1;
end
if (phase == '1) begin
if (phase == 1) begin
phase <= 2;
end
if (phase == 2) begin
phase <= 3;
end
if (phase == 3) begin
accumulator <= accumulator_next;
phase <= '0;
end
@@ -73,10 +85,15 @@ always_comb begin
// phase == 0
data_one_extended = (tkeep_expand(s_data_axis.tkeep) + 1) | {2'b0, s_data_axis.tdata};
data_post_add = data_one_extended + accumulator;
data_post_mul = data_post_add * poly1305_r;
// phase == 1
accumulator_next = 130'(data_post_mul_reg % P130M5);
data_post_mul = data_post_add_reg * poly1305_r;
// phase == 2
modulo_stage_next = (data_post_mul_reg[257:130] * 5) + 258'(data_post_mul_reg[129:0]);
// phase == 3
accumulator_next = 130'((modulo_stage[257:130] * 5) + 258'(modulo_stage[129:0]));
end
assign o_tag = accumulator[127:0] + poly1305_s;