ramblings
This commit is contained in:
@@ -105,4 +105,49 @@ quarter round module can have 2 different blocks going through it at once.
|
||||
|
||||
The new one multiplexes 4 quarter rounds between 1 QR module which reduces the
|
||||
logic usage down to only 46k le, of which the vast majority is flops (2k ff per round,
|
||||
0.5k lut)
|
||||
0.5k lut)
|
||||
|
||||
|
||||
# Modulo 2^130-5
|
||||
|
||||
We can use the trick here to do modulo reduction much faster.
|
||||
|
||||
If we split the bits at 2^130, leaving 129 high bits and 130 low bits, we now
|
||||
have a 129 bit value multiplied by 2^130, plus the 130 bit value. We know that
|
||||
2^130 mod (2^130-5) is 5, so we can replace that 2^130 with 5 and add, then
|
||||
repeat that step again.
|
||||
|
||||
Ex.
|
||||
|
||||
x = x1*2^130 + x2
|
||||
x mod (2^130-5) = (x1*5 + x2) mod (2^130-5), so let x3 = x1*5 + x2
|
||||
x3 = x4*2^130 + x5
|
||||
x mod (2^130-5) = (x4*5 + x5) mod (2^130-5)
|
||||
|
||||
|
||||
Let's do the math to verify that we only need two rounds. The maximum value
|
||||
that we could possibly get is 2^131-1 and the maximum value for R is
|
||||
0x0ffffffc0ffffffc0ffffffc0fffffff. Multiplying these together gives us
|
||||
0x7fffffe07fffffe07fffffe07ffffff7f0000003f0000003f0000003f0000001.
|
||||
|
||||
Applying the first round to this we get
|
||||
|
||||
0x1ffffff81ffffff81ffffff81ffffffd * 5 + 0x3f0000003f0000003f0000003f0000001
|
||||
= 0x48fffffdc8fffffdc8fffffdc8ffffff2
|
||||
|
||||
applying the second round to this we get
|
||||
|
||||
1 * 5 + 0x8fffffdc8fffffdc8fffffdc8ffffff2 = 0x8fffffdc8fffffdc8fffffdc8ffffff7
|
||||
|
||||
and this is indeed the correct answer. The bottom part is 130 bits but since we
|
||||
put in the max values and it didn't overflow, I don't think it will overflow here.
|
||||
|
||||
131+128 = 259 bits, only have to do this once
|
||||
|
||||
0xb83fe991ca75d7ef2ab5cba9cccdfd938b73fff384ac90ed284034da565ecf
|
||||
0x19471c3e3e9c1bfded81da3736e96604a
|
||||
|
||||
|
||||
Kind of curious now, at what point does a ripple carry adder using dedicated
|
||||
CI/CO ports become slower than a more complex adder like carry lookahead or
|
||||
carry save (Wallace tree)?
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
create_clock -period 2.5 -name clk [get_ports i_clk]
|
||||
@@ -0,0 +1,42 @@
|
||||
// Timing-test harness: pipelined 133 x 128 bit unsigned multiplier.
//
// data_a is cut into NUM_LIMBS limbs of LIMB_W bits (7 * 19 = 133, the full
// width of data_a). Each limb is multiplied by the whole of data_b, and the
// partial products are recombined by a registered binary adder tree, each
// partial product being weighted by 2^(LIMB_W * limb index).
//
// Ports:
//   i_clk  - clock; every stage below is registered on its rising edge
//   data_a - 133-bit multiplicand
//   data_b - 128-bit multiplier
//   data_z - 261-bit product, valid 5 clock edges after the operands are
//            presented (input regs, limb products, two adder-tree levels,
//            final add)
module mult_timing_test(
    input i_clk,

    input logic [132:0] data_a,
    input logic [127:0] data_b,

    output logic [260:0] data_z
);

// Limb geometry. The slice width and the recombination stride must be the
// same constant, otherwise the partial products are summed at the wrong
// weights and the top bits of data_a are dropped.
localparam int LIMB_W    = 19;
localparam int NUM_LIMBS = 7;

// Stage 0: registered copies of the operands.
logic [132:0] data_a_reg;
logic [127:0] data_b_reg;

// Stage 1: one (LIMB_W x 128)-bit product per limb, held at full result width
// so the shifts below cannot truncate.
logic [260:0] partial_result [NUM_LIMBS];

// Stage 2 / stage 3: adder-tree intermediates.
logic [260:0] data_z_temp_1[4];
logic [260:0] data_z_temp_2_0, data_z_temp_2_1;

always_ff @(posedge i_clk) begin
    // Stage 0: capture inputs.
    data_a_reg <= data_a;
    data_b_reg <= data_b;

    // Stage 1: limb i of A times all of B. The assignment target is 261 bits
    // wide, so the multiplication is evaluated at 261 bits.
    for (int i = 0; i < NUM_LIMBS; i++) begin
        partial_result[i] <= data_a_reg[i*LIMB_W +: LIMB_W] * data_b_reg;
    end

    // Stage 2: pair up neighbouring limbs (odd limb weighted by 2^LIMB_W).
    data_z_temp_1[0] <= (partial_result[0] << (LIMB_W*0)) + (partial_result[1] << (LIMB_W*1));
    data_z_temp_1[1] <= (partial_result[2] << (LIMB_W*0)) + (partial_result[3] << (LIMB_W*1));
    data_z_temp_1[2] <= (partial_result[4] << (LIMB_W*0)) + (partial_result[5] << (LIMB_W*1));
    data_z_temp_1[3] <= (partial_result[6] << (LIMB_W*0));

    // Stage 3: pair up the pairs (upper pair weighted by 2^(2*LIMB_W)).
    data_z_temp_2_0 <= data_z_temp_1[0] + (data_z_temp_1[1] << (LIMB_W*2));
    data_z_temp_2_1 <= data_z_temp_1[2] + (data_z_temp_1[3] << (LIMB_W*2));

    // Stage 4: limbs 4..6 live in data_z_temp_2_1 and carry weight
    // 2^(4*LIMB_W) relative to limbs 0..3.
    data_z <= data_z_temp_2_0 + (data_z_temp_2_1 << (LIMB_W*4));
end

endmodule
|
||||
@@ -1,18 +1,20 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<efx:project name="poly1305_timing_test" description="" last_change="1751725092" sw_version="2025.1.110" last_run_state="pass" last_run_flow="bitstream" config_result_in_sync="true" design_ood="sync" place_ood="sync" route_ood="sync" xmlns:efx="http://www.efinixinc.com/enf_proj" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.efinixinc.com/enf_proj enf_proj.xsd">
|
||||
<efx:project name="poly1305_timing_test" description="" last_change="1752448578" sw_version="2025.1.110" last_run_state="pass" last_run_flow="bitstream" config_result_in_sync="true" design_ood="sync" place_ood="sync" route_ood="sync" xmlns:efx="http://www.efinixinc.com/enf_proj" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.efinixinc.com/enf_proj enf_proj.xsd">
|
||||
<efx:device_info>
|
||||
<efx:family name="Titanium"/>
|
||||
<efx:device name="Ti375N1156"/>
|
||||
<efx:timing_model name="C4"/>
|
||||
</efx:device_info>
|
||||
<efx:design_info def_veri_version="sv_09" def_vhdl_version="vhdl_2008" unified_flow="false">
|
||||
<efx:top_module name="poly1305_core_wrapper"/>
|
||||
<efx:top_module name="mult_timing_test"/>
|
||||
<efx:design_file name="../src/poly1305_core.sv" version="default" library="default"/>
|
||||
<efx:design_file name="../../common/sim/sub/taxi/src/axis/rtl/taxi_axis_if.sv" version="default" library="default"/>
|
||||
<efx:design_file name="../sim/poly1305_core_wrapper.sv" version="default" library="default"/>
|
||||
<efx:design_file name="mult_timing_test.sv" version="default" library="default"/>
|
||||
<efx:top_vhdl_arch name=""/>
|
||||
</efx:design_info>
|
||||
<efx:constraint_info>
|
||||
<efx:sdc_file name="constraints.sdc"/>
|
||||
<efx:inter_file name=""/>
|
||||
</efx:constraint_info>
|
||||
<efx:sim_info/>
|
||||
@@ -39,15 +41,15 @@
|
||||
<efx:param name="min-sr-fanout" value="0" value_type="e_integer"/>
|
||||
<efx:param name="min-ce-fanout" value="0" value_type="e_integer"/>
|
||||
<efx:param name="mode" value="speed" value_type="e_option"/>
|
||||
<efx:param name="mult-auto-pipeline" value="0" value_type="e_integer"/>
|
||||
<efx:param name="mult-decomp-retime" value="0" value_type="e_option"/>
|
||||
<efx:param name="operator-sharing" value="0" value_type="e_option"/>
|
||||
<efx:param name="optimize-adder-tree" value="0" value_type="e_option"/>
|
||||
<efx:param name="mult-auto-pipeline" value="1" value_type="e_integer"/>
|
||||
<efx:param name="mult-decomp-retime" value="1" value_type="e_option"/>
|
||||
<efx:param name="operator-sharing" value="1" value_type="e_option"/>
|
||||
<efx:param name="optimize-adder-tree" value="1" value_type="e_option"/>
|
||||
<efx:param name="optimize-zero-init-rom" value="1" value_type="e_option"/>
|
||||
<efx:param name="peri-syn-instantiation" value="0" value_type="e_option"/>
|
||||
<efx:param name="peri-syn-inference" value="0" value_type="e_option"/>
|
||||
<efx:param name="ram-decomp-mode" value="0" value_type="e_option"/>
|
||||
<efx:param name="retiming" value="1" value_type="e_option"/>
|
||||
<efx:param name="retiming" value="2" value_type="e_option"/>
|
||||
<efx:param name="seq_opt" value="1" value_type="e_option"/>
|
||||
<efx:param name="seq-opt-sync-only" value="0" value_type="e_option"/>
|
||||
<efx:param name="use-logic-for-small-mem" value="64" value_type="e_integer"/>
|
||||
@@ -56,16 +58,17 @@
|
||||
<efx:param name="dsp-input-regs-packing" value="1" value_type="e_option"/>
|
||||
<efx:param name="dsp-output-regs-packing" value="1" value_type="e_option"/>
|
||||
<efx:param name="dsp-mac-packing" value="1" value_type="e_option"/>
|
||||
<efx:param name="insert-carry-skip" value="0" value_type="e_option"/>
|
||||
<efx:param name="insert-carry-skip" value="1" value_type="e_option"/>
|
||||
<efx:param name="pack-luts-to-comb4" value="0" value_type="e_option"/>
|
||||
<efx:dynparam name="asdf" value="asdf"/>
|
||||
</efx:synthesis>
|
||||
<efx:place_and_route tool_name="efx_pnr">
|
||||
<efx:param name="work_dir" value="work_pnr" value_type="e_string"/>
|
||||
<efx:param name="verbose" value="off" value_type="e_bool"/>
|
||||
<efx:param name="load_delaym" value="on" value_type="e_bool"/>
|
||||
<efx:param name="optimization_level" value="NULL" value_type="e_option"/>
|
||||
<efx:param name="optimization_level" value="TIMING_3" value_type="e_option"/>
|
||||
<efx:param name="seed" value="1" value_type="e_integer"/>
|
||||
<efx:param name="placer_effort_level" value="2" value_type="e_option"/>
|
||||
<efx:param name="placer_effort_level" value="5" value_type="e_option"/>
|
||||
<efx:param name="max_threads" value="-1" value_type="e_integer"/>
|
||||
<efx:param name="print_critical_path" value="10" value_type="e_integer"/>
|
||||
<efx:param name="classic_flow" value="off" value_type="e_noarg"/>
|
||||
|
||||
@@ -70,4 +70,6 @@ async def test_sanity(dut):
|
||||
|
||||
tb.log.info(f"tag: {tag:x}")
|
||||
|
||||
assert tag == result
|
||||
|
||||
await Timer(1, "us")
|
||||
@@ -23,13 +23,15 @@ logic [127:0] poly1305_r, poly1305_s;
|
||||
logic [129:0] accumulator, accumulator_next;
|
||||
|
||||
logic [129:0] data_one_extended;
|
||||
logic [130:0] data_post_add;
|
||||
logic [130:0] data_post_add, data_post_add_reg;
|
||||
|
||||
logic [257:0] data_post_mul, data_post_mul_reg;
|
||||
|
||||
logic phase;
|
||||
logic [257:0] modulo_stage, modulo_stage_next;
|
||||
|
||||
logic [1:0] valid_sr;
|
||||
logic [2:0] phase;
|
||||
|
||||
logic [3:0] valid_sr;
|
||||
|
||||
function logic [129:0] tkeep_expand (input [15:0] tkeep);
|
||||
tkeep_expand = '0;
|
||||
@@ -40,7 +42,7 @@ endfunction
|
||||
|
||||
// only ready in phase 0
|
||||
assign s_data_axis.tready = phase == 0;
|
||||
assign o_tag_valid = valid_sr[1];
|
||||
assign o_tag_valid = valid_sr[3];
|
||||
|
||||
always_ff @(posedge i_clk) begin
|
||||
if (i_rst) begin
|
||||
@@ -48,7 +50,10 @@ always_ff @(posedge i_clk) begin
|
||||
valid_sr <= '0;
|
||||
end
|
||||
|
||||
valid_sr <= {valid_sr[0], s_data_axis.tlast & s_data_axis.tvalid & s_data_axis.tready & ~phase};
|
||||
valid_sr <= {valid_sr[2:0], s_data_axis.tlast & s_data_axis.tvalid & s_data_axis.tready & (phase == 0)};
|
||||
data_post_add_reg <= data_post_add;
|
||||
data_post_mul_reg <= data_post_mul;
|
||||
modulo_stage <= modulo_stage_next;
|
||||
|
||||
if (i_otk_valid) begin
|
||||
poly1305_r <= i_otk[255:128] & R_MASK;
|
||||
@@ -56,11 +61,18 @@ always_ff @(posedge i_clk) begin
|
||||
end
|
||||
|
||||
if (s_data_axis.tvalid && phase == 0) begin
|
||||
data_post_mul_reg <= data_post_mul;
|
||||
phase <= '1;
|
||||
phase <= 1;
|
||||
end
|
||||
|
||||
if (phase == '1) begin
|
||||
if (phase == 1) begin
|
||||
phase <= 2;
|
||||
end
|
||||
|
||||
if (phase == 2) begin
|
||||
phase <= 3;
|
||||
end
|
||||
|
||||
if (phase == 3) begin
|
||||
accumulator <= accumulator_next;
|
||||
phase <= '0;
|
||||
end
|
||||
@@ -73,10 +85,15 @@ always_comb begin
|
||||
// phase == 0
|
||||
data_one_extended = (tkeep_expand(s_data_axis.tkeep) + 1) | {2'b0, s_data_axis.tdata};
|
||||
data_post_add = data_one_extended + accumulator;
|
||||
data_post_mul = data_post_add * poly1305_r;
|
||||
|
||||
// phase == 1
|
||||
accumulator_next = 130'(data_post_mul_reg % P130M5);
|
||||
data_post_mul = data_post_add_reg * poly1305_r;
|
||||
|
||||
// phase == 2
|
||||
modulo_stage_next = (data_post_mul_reg[257:130] * 5) + 258'(data_post_mul_reg[129:0]);
|
||||
|
||||
// phase == 3
|
||||
accumulator_next = 130'((modulo_stage[257:130] * 5) + 258'(modulo_stage[129:0]));
|
||||
end
|
||||
|
||||
assign o_tag = accumulator[127:0] + poly1305_s;
|
||||
|
||||
Reference in New Issue
Block a user