diff --git a/ChaCha20_Poly1305_64/doc/notes.md b/ChaCha20_Poly1305_64/doc/notes.md index 21d0619..47e7a49 100644 --- a/ChaCha20_Poly1305_64/doc/notes.md +++ b/ChaCha20_Poly1305_64/doc/notes.md @@ -98,4 +98,7 @@ Actualyl its 88k luts... its 512ff * 4 * 20 = 40k ff Lets just leave it for now even if its overkill. The hardware would support up to 40Gbps, and technically the FPGA has 16 lanes so could do 160Gbps in total, if -we designed a custom board for it (or 120 if we used FMC connectors). \ No newline at end of file +we designed a custom board for it (or 120 if we used FMC connectors). + +If we only use a single quarter round multiplexed between all 4, then the same +quarter round module can have 2 different blocks going through it at once. \ No newline at end of file diff --git a/ChaCha20_Poly1305_64/doc/qr_pipelining.drawio b/ChaCha20_Poly1305_64/doc/qr_pipelining.drawio new file mode 100644 index 0000000..143921e --- /dev/null +++ b/ChaCha20_Poly1305_64/doc/qr_pipelining.drawio @@ -0,0 +1,154 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ChaCha20_Poly1305_64/sim/chacha20.yaml b/ChaCha20_Poly1305_64/sim/chacha20.yaml index 86a2401..d5ef42d 100644 --- a/ChaCha20_Poly1305_64/sim/chacha20.yaml +++ b/ChaCha20_Poly1305_64/sim/chacha20.yaml @@ -4,4 +4,16 @@ tests: modules: - "chacha20_block" sources: "sources.list" + waves: True + - name: "chacha20_pipelined_round" + toplevel: "chacha20_pipelined_round" + modules: + - "chacha20_pipelined_round" + sources: "sources.list" + waves: True + - name: "chacha20_pipelined_block" + toplevel: "chacha20_pipelined_block" + modules: + - "chacha20_pipelined_block" + sources: "sources.list" waves: True \ No newline at end of file diff --git 
a/ChaCha20_Poly1305_64/sim/chacha20_pipelined_block.py b/ChaCha20_Poly1305_64/sim/chacha20_pipelined_block.py
new file mode 100644
index 0000000..0b873f6
--- /dev/null
+++ b/ChaCha20_Poly1305_64/sim/chacha20_pipelined_block.py
@@ -0,0 +1,143 @@
+"""cocotb testbench for the chacha20_pipelined_block module."""
+
+import cocotb
+
+import logging
+
+from chacha_helpers import chacha_block
+
+import struct
+
+import random
+
+
+from cocotb.clock import Clock
+from cocotb.triggers import Timer, RisingEdge, FallingEdge
+from cocotb.queue import Queue
+
+# Clock period in ns.
+CLK_PERIOD = 4
+
+# State words 0..3 as they are handed to the chacha_block() model.
+CONSTANT = [0x65787061, 0x6e642033, 0x322d6279, 0x7465206b]
+
+
+class TB:
+    """Drive block requests into the DUT and collect its results."""
+
+    def __init__(self, dut):
+        self.dut = dut
+
+        self.log = logging.getLogger("cocotb.tb")
+        self.log.setLevel(logging.INFO)
+
+        # Requests waiting to be driven onto the DUT inputs.
+        self.input_queue = Queue()
+
+        # Model results and DUT results, both in submission order.
+        self.expected_queue = Queue()
+        self.output_queue = Queue()
+
+        cocotb.start_soon(Clock(self.dut.i_clk, CLK_PERIOD, units="ns").start())
+
+        cocotb.start_soon(self.run_input())
+        cocotb.start_soon(self.run_output())
+
+    async def cycle_reset(self):
+        """Pulse the DUT reset."""
+        await self._cycle_reset(self.dut.i_rst, self.dut.i_clk)
+
+    async def _cycle_reset(self, rst, clk):
+        """Drive rst low, high, then low again for two clk edges each."""
+        rst.setimmediatevalue(0)
+        await RisingEdge(clk)
+        await RisingEdge(clk)
+        rst.value = 1
+        await RisingEdge(clk)
+        await RisingEdge(clk)
+        rst.value = 0
+        await RisingEdge(clk)
+        await RisingEdge(clk)
+
+    async def write_input(self, key, counter, nonce):
+        """Queue one request together with the model's result for it.
+
+        key is converted as a 32-byte integer, counter and nonce as 8-byte
+        integers.
+        """
+        await self.input_queue.put((key, counter, nonce))
+
+        # 16-word model input: constant, key, counter, nonce.
+        data_in = CONSTANT[:]
+        data_in.extend(struct.unpack("8I", key.to_bytes(32, "little")))
+        data_in.extend(struct.unpack("2I", counter.to_bytes(8, "little")))
+        data_in.extend(struct.unpack("2I", nonce.to_bytes(8, "little")))
+
+        data_out = chacha_block(data_in)
+
+        await self.expected_queue.put(data_out)
+
+    async def run_input(self):
+        """Drive queued requests onto the DUT input interface."""
+        while True:
+            key, counter, nonce = await self.input_queue.get()
+
+            self.dut.i_key.value = key
+            self.dut.i_counter.value = counter
+            self.dut.i_nonce.value = nonce
+            self.dut.i_ready.value = 1
+            self.dut.i_valid.value = 1
+            await RisingEdge(self.dut.i_clk)
+            # Keep the request asserted until an edge at which o_ready was
+            # high; the DUT takes no input while o_ready is low.
+            while not self.dut.o_ready.value:
+                await RisingEdge(self.dut.i_clk)
+            self.dut.i_valid.value = 0
+
+    async def run_output(self):
+        """Queue o_state as 16 words whenever o_valid is sampled high."""
+        while True:
+            await RisingEdge(self.dut.i_clk)
+            if self.dut.o_valid.value:
+                state = self.dut.o_state.value.integer
+
+                state_bytes = int.to_bytes(state, 64, "little")
+                state_words = struct.unpack("16I", state_bytes)
+
+                await self.output_queue.put(state_words)
+
+
+@cocotb.test()
+async def test_sanity(dut):
+    """Send `count` blocks through the DUT and compare with the model."""
+    tb = TB(dut)
+
+    await tb.cycle_reset()
+
+    count = 1
+
+    for i in range(count):
+        # NOTE(review): each random value is overwritten by the fixed zero
+        # on the following line, so only an all-zero key/nonce is tested.
+        key = random.randint(0, 2**256-1)
+        key = 0
+        counter = i
+        nonce = random.randint(0, 2**64-1)
+        nonce = 0
+
+        await tb.write_input(key, counter, nonce)
+
+    fail = False
+    for _ in range(count):
+        sim_vals = await tb.expected_queue.get()
+        dut_vals = await tb.output_queue.get()
+
+        # Report every mismatching word before failing.
+        for i, (sim_val, dut_val) in enumerate(zip(sim_vals, dut_vals)):
+            if sim_val != dut_val:
+                tb.log.error(f"{i}: {sim_val:x} -> {dut_val:x}")
+                fail = True
+
+    await Timer(1, "us")
+
+    assert not fail
diff --git a/ChaCha20_Poly1305_64/sim/chacha20_pipelined_round.py b/ChaCha20_Poly1305_64/sim/chacha20_pipelined_round.py
new file mode 100644
index 0000000..501b809
--- /dev/null
+++ b/ChaCha20_Poly1305_64/sim/chacha20_pipelined_round.py
@@ -0,0 +1,111 @@
+"""cocotb test for a single chacha20_pipelined_round."""
+
+import cocotb
+
+import logging
+
+from chacha_helpers import chacha_block, QR
+
+import struct
+
+import random
+
+
+from cocotb.clock import Clock
+from cocotb.triggers import Timer, RisingEdge, FallingEdge
+from cocotb.queue import Queue
+
+# Clock period in ns.
+CLK_PERIOD = 4
+
+CONSTANT = [0x65787061, 0x6e642033, 0x322d6279, 0x7465206b]
+
+# Upper bound on the clock edges to wait for the DUT result.
+MAX_WAIT_CYCLES = 1000
+
+
+class TB:
+    """Hold the DUT handle and logger, and start the clock."""
+
+    def __init__(self, dut):
+        self.dut = dut
+
+        self.log = logging.getLogger("cocotb.tb")
+        self.log.setLevel(logging.INFO)
+
+        cocotb.start_soon(Clock(self.dut.i_clk, CLK_PERIOD, units="ns").start())
+
+    async def cycle_reset(self):
+        """Pulse the DUT reset."""
+        await self._cycle_reset(self.dut.i_rst, self.dut.i_clk)
+
+    async def _cycle_reset(self, rst, clk):
+        """Drive rst low, high, then low again for two clk edges each."""
+        rst.setimmediatevalue(0)
+        await RisingEdge(clk)
+        await RisingEdge(clk)
+        rst.value = 1
+        await RisingEdge(clk)
+        await RisingEdge(clk)
+        rst.value = 0
+        await RisingEdge(clk)
+        await RisingEdge(clk)
+
+
+@cocotb.test()
+async def test_sanity(dut):
+    """Run one state through the round and compare it with QR()."""
+    tb = TB(dut)
+
+    await tb.cycle_reset()
+
+    # data_in = [0x65787061, 0x6e642033, 0x322d6279, 0x7465206b,
+    #            0, 0, 0, 0,
+    #            0, 0, 0, 0,
+    #            0, 0, 0, 0]
+
+    data = [1, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0]
+
+    # Column round, for a DUT built with IS_EVEN = 0:
+    # data[0], data[4], data[8], data[12] = QR(data[0], data[4], data[8], data[12])
+    # data[1], data[5], data[9], data[13] = QR(data[1], data[5], data[9], data[13])
+    # data[2], data[6], data[10], data[14] = QR(data[2], data[6], data[10], data[14])
+    # data[3], data[7], data[11], data[15] = QR(data[3], data[7], data[11], data[15])
+
+    # Diagonal round, matching the DUT's default IS_EVEN = 1.
+    data[0], data[5], data[10], data[15] = QR(data[0], data[5], data[10], data[15])
+    data[1], data[6], data[11], data[12] = QR(data[1], data[6], data[11], data[12])
+    data[2], data[7], data[8], data[13] = QR(data[2], data[7], data[8], data[13])
+    data[3], data[4], data[9], data[14] = QR(data[3], data[4], data[9], data[14])
+
+    for val in data:
+        tb.log.info(f"{val:x}")
+
+    # tb.log.info(data_out)
+
+    tb.dut.i_ready.value = 1
+
+    tb.dut.i_state.value = 1
+    # tb.dut.i_state.value = 0x657870616e642033322d62797465206b
+
+    # Offer the state for a single clock edge only.
+    tb.dut.i_valid.value = 1
+    await RisingEdge(tb.dut.i_clk)
+    tb.dut.i_valid.value = 0
+
+    # Wait for the result and check it against the expected words.
+    for _ in range(MAX_WAIT_CYCLES):
+        await RisingEdge(tb.dut.i_clk)
+        if tb.dut.o_valid.value:
+            break
+    else:
+        raise AssertionError("DUT never asserted o_valid")
+
+    dut_state = tb.dut.o_state.value.integer
+    dut_words = [(dut_state >> (32 * i)) & 0xFFFFFFFF for i in range(16)]
+    assert dut_words == data, "DUT round output differs from the QR() model"
+
+    await Timer(1, "us")
diff --git a/ChaCha20_Poly1305_64/src/chacha20_pipelined_block.sv b/ChaCha20_Poly1305_64/src/chacha20_pipelined_block.sv
new file mode 100644
index 0000000..223277b
--- /dev/null
+++ b/ChaCha20_Poly1305_64/src/chacha20_pipelined_block.sv
@@ -0,0 +1,167 @@
+// ChaCha20 block function built from a chain of ROUNDS round modules.
+//
+// The initial state is assembled from the constant, key, counter and nonce,
+// passed through the rounds, and then added word by word to a stored copy
+// of itself. Input and output each use a valid/ready handshake.
+module chacha20_pipelined_block #(
+    parameter KEY_SIZE = 256,
+    parameter COUNTER_SIZE = 64,
+    parameter NONCE_SIZE = 64,
+    parameter STATE_SIZE = 512,
+    parameter ROUNDS = 20,
+    parameter CONSTANT = 128'h657870616e642033322d62797465206b
+)(
+    input logic i_clk,
+    input logic i_rst,
+
+    input logic [KEY_SIZE-1:0] i_key,
+    input logic [COUNTER_SIZE-1:0] i_counter,
+    input logic [NONCE_SIZE-1:0] i_nonce,
+    input logic i_valid,
+    output logic o_ready,
+
+
+    output logic [STATE_SIZE-1:0] o_state,
+    output logic o_valid,
+    input logic i_ready
+);
+
+// each round is 8 stages
+localparam PIPE_STAGES = ROUNDS * 8;
+
+// Index n is the interface in front of round n: index 0 is the block input
+// and index ROUNDS is the output of the last round.
+logic [511:0] state [ROUNDS+1];
+logic valid[ROUNDS+1];
+logic ready[ROUNDS+1];
+
+// small fifo for storing the initial state.
+// better to store it in a memory than in flops
+logic [$clog2(PIPE_STAGES)-1:0] initial_state_wptr;
+logic [$clog2(PIPE_STAGES)-1:0] initial_state_rptr;
+logic [511:0] initial_states [PIPE_STAGES];
+
+logic [511:0] state_pre_add;
+logic pre_add_valid;
+
+logic [511:0] write_initial_state, read_initial_state;
+
+logic [511:0] original_initial_state;
+
+// An output register may load when it is empty or when the stage after it
+// takes its current contents in the same cycle.
+logic out_stage_ready;
+logic pre_add_ready;
+
+assign out_stage_ready = !o_valid || i_ready;
+assign pre_add_ready = !pre_add_valid || out_stage_ready;
+
+// The last round is held off while the output registers cannot load.
+assign ready[ROUNDS] = pre_add_ready;
+
+always_ff @(posedge i_clk) begin
+    if (i_rst) begin
+        initial_state_rptr <= '0;
+        initial_state_wptr <= '0;
+        pre_add_valid <= 1'b0;
+        o_valid <= 1'b0;
+    end else begin
+        // Only store the initial state of a request the first round accepts,
+        // so the fifo stays in step with the blocks in flight.
+        if (i_valid && o_ready) begin
+            initial_states[initial_state_wptr] <= write_initial_state;
+            if (initial_state_wptr < PIPE_STAGES-1) begin
+                initial_state_wptr <= initial_state_wptr + 1;
+            end else begin
+                initial_state_wptr <= '0;
+            end
+        end
+
+        if (pre_add_ready) begin
+            pre_add_valid <= valid[ROUNDS];
+
+            // Pop the matching initial state as the last round hands over.
+            if (valid[ROUNDS]) begin
+                read_initial_state <= initial_states[initial_state_rptr];
+                if (initial_state_rptr < PIPE_STAGES-1) begin
+                    initial_state_rptr <= initial_state_rptr + 1;
+                end else begin
+                    initial_state_rptr <= '0;
+                end
+                state_pre_add <= state[ROUNDS];
+            end
+        end
+
+        if (out_stage_ready) begin
+            o_valid <= pre_add_valid;
+
+            // We cannot just add state_pre_add and read_initial state
+            // because the addition needs to be done wordwise, with no
+            // carries between 32 bit groups.
+            for (int i = 0; i < 16; i++) begin
+                o_state[i*32 +: 32] <= state_pre_add[i*32 +: 32] + read_initial_state[i*32 +: 32];
+            end
+        end
+    end
+end
+
+always_comb begin
+    // CONSTANT is written big endian: its most significant 32 bits are
+    // state word 0.
+    for (int i = 0; i < 4; i++) begin
+        state[0][32*i +: 32] = CONSTANT[32*(3-i) +: 32];
+    end
+
+    for (int i = 0; i < 8; i++) begin
+        state[0][32*(i+4) +: 32] = i_key[32*i +: 32];
+    end
+
+    state[0][12*32 +: 32] = i_counter[0 +: 32];
+    state[0][13*32 +: 32] = i_counter[32 +: 32];
+
+    state[0][14*32 +: 32] = i_nonce[0 +: 32];
+    state[0][15*32 +: 32] = i_nonce[32 +: 32];
+
+    valid[0] = i_valid;
+
+    o_ready = ready[0];
+
+
+    write_initial_state = state[0];
+    original_initial_state = read_initial_state;
+end
+
+
+generate
+    // Two rounds per iteration: a column round (IS_EVEN = 0) followed by a
+    // diagonal round (IS_EVEN = 1).
+    for (genvar round = 0; round < ROUNDS; round+=2) begin : ROUND_LOOP
+        chacha20_pipelined_round #(.IS_EVEN(0)) chacha20_pipelined_round_inst_odd (
+            .i_clk (i_clk),
+            .i_rst (i_rst),
+
+            .i_state (state[round]),
+            .i_valid (valid[round]),
+            .o_ready (ready[round]),
+
+            .o_state (state[round+1]),
+            .o_valid (valid[round+1]),
+            .i_ready (ready[round+1])
+        );
+
+        chacha20_pipelined_round #(.IS_EVEN(1)) chacha20_pipelined_round_inst_even (
+            .i_clk (i_clk),
+            .i_rst (i_rst),
+
+            .i_state (state[round+1]),
+            .i_valid (valid[round+1]),
+            .o_ready (ready[round+1]),
+
+            .o_state (state[round+2]),
+            .o_valid (valid[round+2]),
+            .i_ready (ready[round+2])
+        );
+    end
+endgenerate
+
+endmodule
diff --git a/ChaCha20_Poly1305_64/src/chacha20_pipelined_round.sv b/ChaCha20_Poly1305_64/src/chacha20_pipelined_round.sv
new file mode 100644
index 0000000..a211a69
--- /dev/null
+++ b/ChaCha20_Poly1305_64/src/chacha20_pipelined_round.sv
@@ -0,0 +1,233 @@
+// One ChaCha20 round: four quarter rounds time-multiplexed over a single
+// chacha20_qr instance.
+//
+// With IS_EVEN = 0 the quarter rounds take the columns of the 4x4 word state,
+// with IS_EVEN = 1 its diagonals. An input FSM issues the four quarter rounds
+// and an output FSM collects their results.
+module chacha20_pipelined_round #(
+    parameter IS_EVEN = 1,
+    parameter KEY_SIZE = 256,
+    parameter COUNTER_SIZE = 64,
+    parameter NONCE_SIZE = 64,
+    parameter STATE_SIZE = 512,
+    parameter ROUNDS = 20,
+    parameter CONSTANT = 128'h657870616e642033322d62797465206b
+)(
+    input logic i_clk,
+    input logic i_rst,
+
+    input logic [STATE_SIZE-1:0] i_state,
+    input logic i_valid,
+    output logic o_ready,
+
+    output logic [STATE_SIZE-1:0] o_state,
+    output logic o_valid,
+    input logic i_ready
+);
+
+logic [31:0] state_in [16];
+logic [31:0] state_out [16];
+
+logic [31:0] qr_a_i, qr_a_o;
+logic [31:0] qr_b_i, qr_b_o;
+logic [31:0] qr_c_i, qr_c_o;
+logic [31:0] qr_d_i, qr_d_o;
+
+logic qr_i_valid, qr_o_valid;
+logic qr_o_ready, qr_i_ready;
+
+// state is shared between the in fsm and the out fsm
+enum logic [1:0] {STATE_IN, STATE_SHIFT, STATE_OUT} in_fsm_state, out_fsm_state;
+logic [1:0] in_counter;
+logic [1:0] out_counter;
+
+// Select the four words fed to the quarter round.
+`define QR_IN(a, b, c, d) \
+    qr_a_i = state_in[a]; \
+    qr_b_i = state_in[b]; \
+    qr_c_i = state_in[c]; \
+    qr_d_i = state_in[d]
+
+// Store the four quarter round results back into the same words.
+`define QR_OUT(a, b, c, d) \
+    state_out[a] <= qr_a_o; \
+    state_out[b] <= qr_b_o; \
+    state_out[c] <= qr_c_o; \
+    state_out[d] <= qr_d_o
+
+// The word set for in_counter == n has to be the one written back for
+// out_counter == n.
+always_comb begin
+    if (IS_EVEN) begin
+        case (in_counter)
+            0: begin
+                `QR_IN(0, 5, 10, 15);
+            end
+
+            1: begin
+                `QR_IN(1, 6, 11, 12);
+            end
+
+            2: begin
+                `QR_IN(2, 7, 8, 13);
+            end
+
+            3: begin
+                `QR_IN(3, 4, 9, 14);
+            end
+        endcase
+    end else begin
+        case (in_counter)
+            0: begin
+                `QR_IN(0, 4, 8, 12);
+            end
+
+            1: begin
+                `QR_IN(1, 5, 9, 13);
+            end
+
+            2: begin
+                `QR_IN(2, 6, 10, 14);
+            end
+
+            3: begin
+                `QR_IN(3, 7, 11, 15);
+            end
+        endcase
+    end
+
+    for (int i = 0; i < 16; i++) begin
+        o_state[32*i +: 32] = state_out[i];
+    end
+end
+
+chacha20_qr u_chacha20_qr (
+    .i_clk (i_clk),
+    .i_rst (i_rst),
+
+    .i_valid (qr_i_valid),
+    .o_ready (qr_o_ready),
+    .a_i (qr_a_i),
+    .b_i (qr_b_i),
+    .c_i (qr_c_i),
+    .d_i (qr_d_i),
+
+    .o_valid (qr_o_valid),
+    .i_ready (qr_i_ready),
+    .a_o (qr_a_o),
+    .b_o (qr_b_o),
+    .c_o (qr_c_o),
+    .d_o (qr_d_o)
+);
+
+assign o_ready = in_fsm_state == STATE_IN;
+assign qr_i_valid = in_fsm_state == STATE_SHIFT;
+
+assign qr_i_ready = out_fsm_state == STATE_SHIFT;
+assign o_valid = out_fsm_state == STATE_OUT;
+
+always_ff @(posedge i_clk) begin
+    if (i_rst) begin
+        in_fsm_state <= STATE_IN;
+        out_fsm_state <= STATE_SHIFT;
+
+        in_counter <= '0;
+        out_counter <= '0;
+    end else begin
+        case (in_fsm_state)
+            STATE_IN: begin
+                if (i_valid) begin
+                    // Register incoming state as unpacked 32 bit words
+                    // (nonblocking, like every other register here).
+                    for (int i = 0; i < 16; i++) begin
+                        state_in[i] <= i_state[32*i +: 32];
+                    end
+
+                    in_counter <= '0;
+                    in_fsm_state <= STATE_SHIFT;
+                end
+            end
+
+            STATE_SHIFT: begin
+                if (qr_o_ready) begin
+                    in_counter <= in_counter + 1;
+
+                    if (in_counter == 3) begin
+                        in_fsm_state <= STATE_IN;
+                    end
+                end
+            end
+
+            default: begin
+                in_fsm_state <= STATE_IN;
+            end
+        endcase
+
+
+        case (out_fsm_state)
+            STATE_SHIFT: begin
+                if (qr_o_valid) begin
+                    if (IS_EVEN) begin
+                        case (out_counter)
+                            0: begin
+                                `QR_OUT(0, 5, 10, 15);
+                            end
+
+                            1: begin
+                                `QR_OUT(1, 6, 11, 12);
+                            end
+
+                            2: begin
+                                `QR_OUT(2, 7, 8, 13);
+                            end
+
+                            3: begin
+                                `QR_OUT(3, 4, 9, 14);
+                            end
+                        endcase
+                    end else begin
+                        case (out_counter)
+                            0: begin
+                                `QR_OUT(0, 4, 8, 12);
+                            end
+
+                            1: begin
+                                `QR_OUT(1, 5, 9, 13);
+                            end
+
+                            2: begin
+                                `QR_OUT(2, 6, 10, 14);
+                            end
+
+                            3: begin
+                                `QR_OUT(3, 7, 11, 15);
+                            end
+                        endcase
+                    end
+
+                    out_counter <= out_counter + 1;
+
+                    if (out_counter == 3) begin
+                        out_fsm_state <= STATE_OUT;
+                    end
+                end
+            end
+
+            STATE_OUT: begin
+                if (i_ready) begin
+                    out_fsm_state <= STATE_SHIFT;
+                end
+            end
+
+            default: begin
+                out_fsm_state <= STATE_SHIFT;
+            end
+        endcase
+    end
+end
+
+// Keep the helper macros local to this file.
+`undef QR_IN
+`undef QR_OUT
+
+endmodule
diff --git a/ChaCha20_Poly1305_64/src/sources.list b/ChaCha20_Poly1305_64/src/sources.list
index 9c60645..80637d1 100644
--- a/ChaCha20_Poly1305_64/src/sources.list
+++ b/ChaCha20_Poly1305_64/src/sources.list
@@ -1,2 +1,4 @@
 chacha20_qr.sv
-chacha20_block.sv
\ No newline at end of file
+chacha20_block.sv
+chacha20_pipelined_round.sv
+chacha20_pipelined_block.sv
\ No newline at end of file