// Pipelined ChaCha20 quarter round.
//
// One quarter round (a += b; d ^= a; d <<<= 16; c += d; b ^= c; b <<<= 12;
// a += b; d ^= a; d <<<= 8; c += d; b ^= c; b <<<= 7) is split into eight
// registered steps: seven internal stages (index 0..6 of the *_int arrays)
// followed by the registered outputs a_o/b_o/c_o/d_o. A result therefore
// appears eight enabled clock edges after its operands were sampled.

// Rotate the 32-bit vector `x` left by the constant `n` (0 < n < 32).
// `x` must be something a part-select can be applied to (a signal or an
// array element, not an arbitrary expression), which is why the stages below
// rotate both operands and then XOR them: ROTL(a ^ d) == ROTL(a) ^ ROTL(d).
`define ROTL(x, n) {x[31-(n):0], x[31:32-(n)]}

module chacha20_qr #(
    // NOTE: not referenced inside this module; the pipeline depth is fixed by
    // the stage structure below. Kept so existing instantiations still bind.
    parameter PIPELINE_STAGES = 7
)(
    input  logic        i_clk,
    input  logic        i_rst,      // synchronous, active high

    // Upstream handshake and operands
    input  logic        i_valid,
    output logic        o_ready,
    input  logic [31:0] a_i, b_i, c_i, d_i,

    // Downstream handshake and results
    output logic        o_valid,
    input  logic        i_ready,
    output logic [31:0] a_o, b_o, c_o, d_o
);

    // Internal pipeline registers, one entry per internal stage.
    logic [31:0] a_int [7];
    logic [31:0] b_int [7];
    logic [31:0] c_int [7];
    logic [31:0] d_int [7];

    // valid_sr[k] is the valid flag travelling alongside *_int[k].
    logic [6:0] valid_sr;

    // There is an output stage which handles isolating backpressure from the
    // rest of the design from the core, so we don't need to worry about it
    // here; a single signal (i_ready) gates the whole pipeline.
    assign o_ready = i_ready;

    always_ff @(posedge i_clk) begin
        if (i_rst) begin
            // Only the valid flags need a reset; the datapath registers are
            // don't-care while their valid bit is low.
            valid_sr <= '0;
            // o_valid must be cleared together with valid_sr, otherwise it is
            // X after power-up and can keep advertising a stale result when
            // reset is asserted mid-flight.
            o_valid  <= 1'b0;
        end else begin
            // The whole pipeline advances (or stalls) as one unit.
            if (i_ready) begin
                // 1. Update A: a += b
                a_int[0] <= a_i + b_i;
                b_int[0] <= b_i;
                c_int[0] <= c_i;
                d_int[0] <= d_i;

                // 2. Update D: d = (d ^ a) <<< 16
                a_int[1] <= a_int[0];
                b_int[1] <= b_int[0];
                c_int[1] <= c_int[0];
                d_int[1] <= `ROTL(a_int[0], 16) ^ `ROTL(d_int[0], 16);

                // 3. Update C: c += d
                a_int[2] <= a_int[1];
                b_int[2] <= b_int[1];
                c_int[2] <= c_int[1] + d_int[1];
                d_int[2] <= d_int[1];

                // 4. Update B: b = (b ^ c) <<< 12
                a_int[3] <= a_int[2];
                b_int[3] <= `ROTL(b_int[2], 12) ^ `ROTL(c_int[2], 12);
                c_int[3] <= c_int[2];
                d_int[3] <= d_int[2];

                // 5. Update A: a += b
                a_int[4] <= a_int[3] + b_int[3];
                b_int[4] <= b_int[3];
                c_int[4] <= c_int[3];
                d_int[4] <= d_int[3];

                // 6. Update D: d = (d ^ a) <<< 8
                a_int[5] <= a_int[4];
                b_int[5] <= b_int[4];
                c_int[5] <= c_int[4];
                d_int[5] <= `ROTL(a_int[4], 8) ^ `ROTL(d_int[4], 8);

                // 7. Update C: c += d
                a_int[6] <= a_int[5];
                b_int[6] <= b_int[5];
                c_int[6] <= c_int[5] + d_int[5];
                d_int[6] <= d_int[5];

                // 8. Update B: b = (b ^ c) <<< 7, registered straight into
                //    the outputs.
                a_o <= a_int[6];
                b_o <= `ROTL(b_int[6], 7) ^ `ROTL(c_int[6], 7);
                c_o <= c_int[6];
                d_o <= d_int[6];

                // Simultaneously, shift the valid flags so that o_valid lines
                // up with the data registered into a_o..d_o.
                valid_sr <= {valid_sr[5:0], i_valid};
                o_valid  <= valid_sr[6];
            end
        end
    end

endmodule