From 0fa26f2030c2c5f7152883df4b5d4bc8e1b4f007 Mon Sep 17 00:00:00 2001 From: Alex Mykyta Date: Tue, 15 Feb 2022 22:31:18 -0800 Subject: [PATCH] Enhance AXI4-Lite CPU Interface to support high performance pipelined transactions --- docs/architecture.rst | 3 +- docs/cpuif/axi4lite.rst | 11 + docs/index.rst | 1 + peakrdl/regblock/addr_decode.py | 2 +- peakrdl/regblock/cpuif/axi4lite/__init__.py | 22 ++ .../regblock/cpuif/axi4lite/axi4lite_tmpl.sv | 278 ++++++++++++------ peakrdl/regblock/cpuif/base.py | 3 +- peakrdl/regblock/exporter.py | 14 +- peakrdl/regblock/module_tmpl.sv | 6 +- peakrdl/regblock/utils.py | 5 +- .../cpuifs/axi4lite/axi4lite_intf_driver.sv | 2 +- 11 files changed, 253 insertions(+), 94 deletions(-) create mode 100644 docs/cpuif/axi4lite.rst diff --git a/docs/architecture.rst b/docs/architecture.rst index 7ccc45d..8492ad9 100644 --- a/docs/architecture.rst +++ b/docs/architecture.rst @@ -51,4 +51,5 @@ optimally reduced. A second optional read response retiming register can be enabled in-line with the path back to the CPU interface layer. This can be useful if the CPU interface protocol -used has a fully combinational response path, and needs to be retimed further. +used has a fully combinational response path, and the design's complexity requires +this path to be retimed further. 
diff --git a/docs/cpuif/axi4lite.rst b/docs/cpuif/axi4lite.rst new file mode 100644 index 0000000..465626b --- /dev/null +++ b/docs/cpuif/axi4lite.rst @@ -0,0 +1,11 @@ +AMBA AXI4-Lite +============== + +TODO: Describe the following + +* List of interface signals + + * interface name & modports (link to advanced topics in case user wants to override) + * flattened equivalents + +* Download link to SV interface definition diff --git a/docs/index.rst b/docs/index.rst index ae967ac..0165210 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -47,6 +47,7 @@ Links cpuif/addressing cpuif/apb3 + cpuif/axi4lite cpuif/advanced cpuif/internal_protocol diff --git a/peakrdl/regblock/addr_decode.py b/peakrdl/regblock/addr_decode.py index 7702682..ba37bf7 100644 --- a/peakrdl/regblock/addr_decode.py +++ b/peakrdl/regblock/addr_decode.py @@ -87,7 +87,7 @@ class DecodeLogicGenerator(RDLForLoopGenerator): def enter_Reg(self, node: RegNode) -> None: - s = f"{self.addr_decode.get_access_strobe(node)} = cpuif_req & (cpuif_addr == {self._get_address_str(node)});" + s = f"{self.addr_decode.get_access_strobe(node)} = cpuif_req_masked & (cpuif_addr == {self._get_address_str(node)});" self.add_content(s) diff --git a/peakrdl/regblock/cpuif/axi4lite/__init__.py b/peakrdl/regblock/cpuif/axi4lite/__init__.py index 71af396..a882541 100644 --- a/peakrdl/regblock/cpuif/axi4lite/__init__.py +++ b/peakrdl/regblock/cpuif/axi4lite/__init__.py @@ -14,6 +14,28 @@ class AXI4Lite_Cpuif(CpuifBase): def data_width_bytes(self) -> int: return self.data_width // 8 + @property + def regblock_latency(self) -> int: + return max(self.exp.min_read_latency, self.exp.min_write_latency) + + @property + def max_outstanding(self) -> int: + """ + Best pipelined performance is when the max outstanding transactions + is the design's latency + 2. + Anything beyond that does not have any effect, aside from adding unnecessary + logic and additional buffer-bloat latency. 
+ """ + return self.regblock_latency + 2 + + @property + def resp_buffer_size(self) -> int: + """ + Response buffer size must be greater or equal to max outstanding + transactions to prevent response overrun. + """ + return self.max_outstanding + class AXI4Lite_Cpuif_flattened(AXI4Lite_Cpuif): @property diff --git a/peakrdl/regblock/cpuif/axi4lite/axi4lite_tmpl.sv b/peakrdl/regblock/cpuif/axi4lite/axi4lite_tmpl.sv index 756e15b..5ca4753 100644 --- a/peakrdl/regblock/cpuif/axi4lite/axi4lite_tmpl.sv +++ b/peakrdl/regblock/cpuif/axi4lite/axi4lite_tmpl.sv @@ -1,102 +1,216 @@ -enum logic [1:0] { - CPUIF_IDLE, - CPUIF_BRESP, - CPUIF_RRESP -} cpuif_state; - -logic cpuif_prev_was_rd; +// LATENCY = {{cpuif.regblock_latency}} +// MAX OUTSTANDING = {{cpuif.max_outstanding}} +logic [{{clog2(cpuif.max_outstanding+1)-1}}:0] axil_n_in_flight; +logic axil_prev_was_rd; +logic axil_arvalid; +logic [{{cpuif.addr_width-1}}:0] axil_araddr; +logic axil_ar_accept; +logic axil_awvalid; +logic [{{cpuif.addr_width-1}}:0] axil_awaddr; +logic axil_wvalid; +logic [{{cpuif.data_width-1}}:0] axil_wdata; +logic axil_aw_accept; +logic axil_resp_acked; always_ff {{get_always_ff_event(cpuif.reset)}} begin if({{get_resetsignal(cpuif.reset)}}) begin - cpuif_state <= CPUIF_IDLE; - cpuif_prev_was_rd <= '0; + axil_prev_was_rd <= '0; + axil_arvalid <= '0; + axil_araddr <= '0; + axil_awvalid <= '0; + axil_awaddr <= '0; + axil_wvalid <= '0; + axil_wdata <= '0; + axil_n_in_flight <= '0; + end else begin + // AR* acceptance register + if(axil_ar_accept) begin + axil_prev_was_rd <= '1; + axil_arvalid <= '0; + end + if({{cpuif.signal("arvalid")}} && {{cpuif.signal("arready")}}) begin + axil_arvalid <= '1; + axil_araddr <= {{cpuif.signal("araddr")}}; + end - cpuif_req <= '0; - cpuif_req_is_wr <= '0; - cpuif_addr <= '0; - cpuif_wr_data <= '0; + // AW* & W* acceptance registers + if(axil_aw_accept) begin + axil_prev_was_rd <= '0; + axil_awvalid <= '0; + axil_wvalid <= '0; + end + if({{cpuif.signal("awvalid")}} && 
{{cpuif.signal("awready")}}) begin + axil_awvalid <= '1; + axil_awaddr <= {{cpuif.signal("awaddr")}}; + end + if({{cpuif.signal("wvalid")}} && {{cpuif.signal("wready")}}) begin + axil_wvalid <= '1; + axil_wdata <= {{cpuif.signal("wdata")}}; + end - {{cpuif.signal("arready")}} <= '0; - {{cpuif.signal("awready")}} <= '0; - {{cpuif.signal("wready")}} <= '0; + // Keep track of in-flight transactions + if((axil_ar_accept || axil_aw_accept) && !axil_resp_acked) begin + axil_n_in_flight <= axil_n_in_flight + 1'b1; + end else if(!(axil_ar_accept || axil_aw_accept) && axil_resp_acked) begin + axil_n_in_flight <= axil_n_in_flight - 1'b1; + end + end +end + +always_comb begin + {{cpuif.signal("arready")}} = (!axil_arvalid || axil_ar_accept); + {{cpuif.signal("awready")}} = (!axil_awvalid || axil_aw_accept); + {{cpuif.signal("wready")}} = (!axil_wvalid || axil_aw_accept); +end + +// Request dispatch +always_comb begin + cpuif_wr_data = axil_wdata; + cpuif_req = '0; + cpuif_req_is_wr = '0; + cpuif_addr = '0; + axil_ar_accept = '0; + axil_aw_accept = '0; + + if(axil_n_in_flight < 'd{{cpuif.max_outstanding}}) begin + // Can safely issue more transactions without overwhelming response buffer + if(axil_arvalid && !axil_prev_was_rd) begin + cpuif_req = '1; + cpuif_req_is_wr = '0; + cpuif_addr = axil_araddr; + if(!cpuif_req_stall_rd) axil_ar_accept = '1; + end else if(axil_awvalid && axil_wvalid) begin + cpuif_req = '1; + cpuif_req_is_wr = '1; + cpuif_addr = axil_awaddr; + if(!cpuif_req_stall_wr) axil_aw_accept = '1; + end else if(axil_arvalid) begin + cpuif_req = '1; + cpuif_req_is_wr = '0; + cpuif_addr = axil_araddr; + if(!cpuif_req_stall_rd) axil_ar_accept = '1; + end + end +end + + +// AXI4-Lite Response Logic +{%- if cpuif.resp_buffer_size == 1 %} +always_ff {{get_always_ff_event(cpuif.reset)}} begin + if({{get_resetsignal(cpuif.reset)}}) begin + {{cpuif.signal("rvalid")}} <= '0; + {{cpuif.signal("rresp")}} <= '0; + {{cpuif.signal("rdata")}} <= '0; {{cpuif.signal("bvalid")}} <= 
'0; {{cpuif.signal("bresp")}} <= '0; - {{cpuif.signal("rvalid")}} <= '0; - {{cpuif.signal("rdata")}} <= '0; - {{cpuif.signal("rresp")}} <= '0; end else begin - // Load response transfers as they arrive + if({{cpuif.signal("rvalid")}} && {{cpuif.signal("rready")}}) begin + {{cpuif.signal("rvalid")}} <= '0; + end + + if({{cpuif.signal("bvalid")}} && {{cpuif.signal("bready")}}) begin + {{cpuif.signal("bvalid")}} <= '0; + end + if(cpuif_rd_ack) begin {{cpuif.signal("rvalid")}} <= '1; {{cpuif.signal("rdata")}} <= cpuif_rd_data; if(cpuif_rd_err) {{cpuif.signal("rresp")}} <= 2'b10; // SLVERR else {{cpuif.signal("rresp")}} <= 2'b00; // OKAY end + if(cpuif_wr_ack) begin {{cpuif.signal("bvalid")}} <= '1; if(cpuif_wr_err) {{cpuif.signal("bresp")}} <= 2'b10; // SLVERR else {{cpuif.signal("bresp")}} <= 2'b00; // OKAY end - - // Transaction state machine - case(cpuif_state) - CPUIF_IDLE: begin - // round-robin arbitrate between read/write requests - // Allow read if previous transfer was not a read, or no write is active - if({{cpuif.signal("arvalid")}} && (!cpuif_prev_was_rd || !{{cpuif.signal("awvalid")}} || !{{cpuif.signal("wvalid")}})) begin - cpuif_req <= '1; - cpuif_req_is_wr <= '0; - {%- if cpuif.data_width == 8 %} - cpuif_addr <= {{cpuif.signal("araddr")}}[{{cpuif.addr_width-1}}:0]; - {%- else %} - cpuif_addr <= { {{-cpuif.signal("araddr")}}[{{cpuif.addr_width-1}}:{{clog2(cpuif.data_width_bytes)}}], {{clog2(cpuif.data_width_bytes)}}'b0}; - {%- endif %} - {{cpuif.signal("arready")}} <= '1; - cpuif_state <= CPUIF_RRESP; - end else if({{cpuif.signal("awvalid")}} && {{cpuif.signal("wvalid")}}) begin - {{cpuif.signal("awready")}} <= '1; - {{cpuif.signal("wready")}} <= '1; - if({{cpuif.signal("wstrb")}} != {{"%d'b" % cpuif.data_width_bytes}}{{"1" * cpuif.data_width_bytes}}) begin - // Unaligned writes or use of byte strobes is not supported yet - {{cpuif.signal("bvalid")}} <= '1; - {{cpuif.signal("bresp")}} <= 2'b10; // SLVERR - end else begin - cpuif_req <= '1; - 
cpuif_req_is_wr <= '1; - {%- if cpuif.data_width == 8 %} - cpuif_addr <= {{cpuif.signal("awaddr")}}[{{cpuif.addr_width-1}}:0]; - {%- else %} - cpuif_addr <= { {{-cpuif.signal("awaddr")}}[{{cpuif.addr_width-1}}:{{clog2(cpuif.data_width_bytes)}}], {{clog2(cpuif.data_width_bytes)}}'b0}; - {%- endif %} - cpuif_wr_data <= {{cpuif.signal("wdata")}}; - end - cpuif_state <= CPUIF_BRESP; - end - end - - CPUIF_BRESP: begin - cpuif_req <= '0; - {{cpuif.signal("awready")}} <= '0; - {{cpuif.signal("wready")}} <= '0; - cpuif_prev_was_rd <= '0; - if({{cpuif.signal("bvalid")}} && {{cpuif.signal("bready")}}) begin - {{cpuif.signal("bvalid")}} <= '0; - cpuif_state <= CPUIF_IDLE; - end - end - - CPUIF_RRESP: begin - cpuif_req <= '0; - {{cpuif.signal("arready")}} <= '0; - cpuif_prev_was_rd <= '1; - if({{cpuif.signal("rvalid")}} && {{cpuif.signal("rready")}}) begin - {{cpuif.signal("rvalid")}} <= '0; - cpuif_state <= CPUIF_IDLE; - end - end - - default: begin - cpuif_state <= CPUIF_IDLE; - end - endcase end end + +always_comb begin + axil_resp_acked = '0; + if({{cpuif.signal("rvalid")}} && {{cpuif.signal("rready")}}) axil_resp_acked = '1; + if({{cpuif.signal("bvalid")}} && {{cpuif.signal("bready")}}) axil_resp_acked = '1; +end + +{%- else %} +struct { + logic is_wr; + logic err; + logic [{{cpuif.data_width-1}}:0] rdata; +} axil_resp_buffer[{{cpuif.resp_buffer_size}}]; +logic [{{clog2(cpuif.resp_buffer_size)}}:0] axil_resp_wptr; +logic [{{clog2(cpuif.resp_buffer_size)}}:0] axil_resp_rptr; + +always_ff {{get_always_ff_event(cpuif.reset)}} begin + if({{get_resetsignal(cpuif.reset)}}) begin + for(int i=0; i<{{cpuif.resp_buffer_size}}; i++) begin + axil_resp_buffer[i].is_wr <= '0; + axil_resp_buffer[i].err <= '0; + axil_resp_buffer[i].rdata <= '0; + end + axil_resp_wptr <= '0; + axil_resp_rptr <= '0; + end else begin + // Store responses in buffer until AXI response channel accepts them (non-blocking writes avoid same-edge read races) + if(cpuif_rd_ack || cpuif_wr_ack) begin + if(cpuif_rd_ack) begin +
axil_resp_buffer[axil_resp_wptr[{{clog2(cpuif.resp_buffer_size)-1}}:0]].is_wr <= '0; + axil_resp_buffer[axil_resp_wptr[{{clog2(cpuif.resp_buffer_size)-1}}:0]].err <= cpuif_rd_err; + axil_resp_buffer[axil_resp_wptr[{{clog2(cpuif.resp_buffer_size)-1}}:0]].rdata <= cpuif_rd_data; + + end else if(cpuif_wr_ack) begin + axil_resp_buffer[axil_resp_wptr[{{clog2(cpuif.resp_buffer_size)-1}}:0]].is_wr <= '1; + axil_resp_buffer[axil_resp_wptr[{{clog2(cpuif.resp_buffer_size)-1}}:0]].err <= cpuif_wr_err; + end + {%- if is_pow2(cpuif.resp_buffer_size) %} + axil_resp_wptr <= axil_resp_wptr + 1'b1; + {%- else %} + if(axil_resp_wptr[{{clog2(cpuif.resp_buffer_size)-1}}:0] == {{cpuif.resp_buffer_size-1}}) begin + axil_resp_wptr[{{clog2(cpuif.resp_buffer_size)-1}}:0] <= '0; + axil_resp_wptr[{{clog2(cpuif.resp_buffer_size)}}] <= ~axil_resp_wptr[{{clog2(cpuif.resp_buffer_size)}}]; + end else begin + axil_resp_wptr[{{clog2(cpuif.resp_buffer_size)-1}}:0] <= axil_resp_wptr[{{clog2(cpuif.resp_buffer_size)-1}}:0] + 1'b1; + end + {%- endif %} + end + + // Advance read pointer when acknowledged + if(axil_resp_acked) begin + {%- if is_pow2(cpuif.resp_buffer_size) %} + axil_resp_rptr <= axil_resp_rptr + 1'b1; + {%- else %} + if(axil_resp_rptr[{{clog2(cpuif.resp_buffer_size)-1}}:0] == {{cpuif.resp_buffer_size-1}}) begin + axil_resp_rptr[{{clog2(cpuif.resp_buffer_size)-1}}:0] <= '0; + axil_resp_rptr[{{clog2(cpuif.resp_buffer_size)}}] <= ~axil_resp_rptr[{{clog2(cpuif.resp_buffer_size)}}]; + end else begin + axil_resp_rptr[{{clog2(cpuif.resp_buffer_size)-1}}:0] <= axil_resp_rptr[{{clog2(cpuif.resp_buffer_size)-1}}:0] + 1'b1; + end + {%- endif %} + end + end +end + +always_comb begin + axil_resp_acked = '0; + {{cpuif.signal("bvalid")}} = '0; + {{cpuif.signal("rvalid")}} = '0; + if(axil_resp_rptr != axil_resp_wptr) begin + if(axil_resp_buffer[axil_resp_rptr[{{clog2(cpuif.resp_buffer_size)-1}}:0]].is_wr) begin + {{cpuif.signal("bvalid")}} = '1; + if({{cpuif.signal("bready")}}) axil_resp_acked = '1; + end
else begin + {{cpuif.signal("rvalid")}} = '1; + if({{cpuif.signal("rready")}}) axil_resp_acked = '1; + end + end + + {{cpuif.signal("rdata")}} = axil_resp_buffer[axil_resp_rptr[{{clog2(cpuif.resp_buffer_size)-1}}:0]].rdata; + if(axil_resp_buffer[axil_resp_rptr[{{clog2(cpuif.resp_buffer_size)-1}}:0]].err) begin + {{cpuif.signal("bresp")}} = 2'b10; + {{cpuif.signal("rresp")}} = 2'b10; + end else begin + {{cpuif.signal("bresp")}} = 2'b00; + {{cpuif.signal("rresp")}} = 2'b00; + end +end +{%- endif %} diff --git a/peakrdl/regblock/cpuif/base.py b/peakrdl/regblock/cpuif/base.py index 0418260..967ef35 100644 --- a/peakrdl/regblock/cpuif/base.py +++ b/peakrdl/regblock/cpuif/base.py @@ -1,6 +1,6 @@ from typing import TYPE_CHECKING, Optional -from ..utils import get_always_ff_event, clog2 +from ..utils import get_always_ff_event, clog2, is_pow2 if TYPE_CHECKING: from ..exporter import RegblockExporter @@ -25,6 +25,7 @@ class CpuifBase: "get_always_ff_event": lambda resetsignal : get_always_ff_event(self.exp.dereferencer, resetsignal), "get_resetsignal": self.exp.dereferencer.get_resetsignal, "clog2": clog2, + "is_pow2": is_pow2, } template = self.exp.jj_env.get_template(self.template_path) diff --git a/peakrdl/regblock/exporter.py b/peakrdl/regblock/exporter.py index 9a0d225..dbb677d 100644 --- a/peakrdl/regblock/exporter.py +++ b/peakrdl/regblock/exporter.py @@ -31,6 +31,8 @@ class RegblockExporter: self.field_logic = FieldLogic(self) self.readback = None # type: Readback self.dereferencer = Dereferencer(self) + self.min_read_latency = 0 + self.min_write_latency = 0 if user_template_dir: loader = jj.ChoiceLoader([ @@ -76,12 +78,12 @@ class RegblockExporter: if kwargs: raise TypeError("got an unexpected keyword argument '%s'" % list(kwargs.keys())[0]) - min_read_latency = 0 - min_write_latency = 0 + self.min_read_latency = 0 + self.min_write_latency = 0 if retime_read_fanin: - min_read_latency += 1 + self.min_read_latency += 1 if retime_read_response: - min_read_latency += 1 
+ self.min_read_latency += 1 # Scan the design for any unsupported features # Also collect pre-export information @@ -120,8 +122,8 @@ class RegblockExporter: "readback": self.readback, "get_always_ff_event": lambda resetsignal : get_always_ff_event(self.dereferencer, resetsignal), "retime_read_response": retime_read_response, - "min_read_latency": min_read_latency, - "min_write_latency": min_write_latency, + "min_read_latency": self.min_read_latency, + "min_write_latency": self.min_write_latency, } # Write out design diff --git a/peakrdl/regblock/module_tmpl.sv b/peakrdl/regblock/module_tmpl.sv index b72cd31..5166a74 100644 --- a/peakrdl/regblock/module_tmpl.sv +++ b/peakrdl/regblock/module_tmpl.sv @@ -36,10 +36,12 @@ module {{module_name}} ( {{cpuif.get_implementation()|indent}} + logic cpuif_req_masked; {% if min_read_latency == min_write_latency %} // Read & write latencies are balanced. Stalls not required assign cpuif_req_stall_rd = '0; assign cpuif_req_stall_wr = '0; + assign cpuif_req_masked = cpuif_req; {%- elif min_read_latency > min_write_latency %} // Read latency > write latency. May need to delay next write that follows a read logic [{{min_read_latency - min_write_latency - 1}}:0] cpuif_req_stall_sr; @@ -54,6 +56,7 @@ module {{module_name}} ( end assign cpuif_req_stall_rd = '0; assign cpuif_req_stall_wr = cpuif_req_stall_sr[0]; + assign cpuif_req_masked = cpuif_req & !(cpuif_req_is_wr & cpuif_req_stall_wr); {%- else %} // Write latency > read latency. 
May need to delay next read that follows a write logic [{{min_write_latency - min_read_latency - 1}}:0] cpuif_req_stall_sr; @@ -68,6 +71,7 @@ module {{module_name}} ( end assign cpuif_req_stall_rd = cpuif_req_stall_sr[0]; assign cpuif_req_stall_wr = '0; + assign cpuif_req_masked = cpuif_req & !(!cpuif_req_is_wr & cpuif_req_stall_rd); {%- endif %} //-------------------------------------------------------------------------- @@ -84,7 +88,7 @@ module {{module_name}} ( end // Pass down signals to next stage - assign decoded_req = cpuif_req; + assign decoded_req = cpuif_req_masked; assign decoded_req_is_wr = cpuif_req_is_wr; assign decoded_wr_data = cpuif_wr_data; diff --git a/peakrdl/regblock/utils.py b/peakrdl/regblock/utils.py index c550bdd..b2a72e4 100644 --- a/peakrdl/regblock/utils.py +++ b/peakrdl/regblock/utils.py @@ -32,4 +32,7 @@ def get_always_ff_event(dereferencer: 'Dereferencer', resetsignal: 'Optional[Sig return "@(posedge clk)" def clog2(n: int) -> int: - return n.bit_length() - 1 + return (n-1).bit_length() + +def is_pow2(x: int) -> bool: + return (x > 0) and ((x & (x - 1)) == 0) diff --git a/test/lib/cpuifs/axi4lite/axi4lite_intf_driver.sv b/test/lib/cpuifs/axi4lite/axi4lite_intf_driver.sv index 837f595..a05f4f5 100644 --- a/test/lib/cpuifs/axi4lite/axi4lite_intf_driver.sv +++ b/test/lib/cpuifs/axi4lite/axi4lite_intf_driver.sv @@ -136,7 +136,7 @@ interface axi4lite_intf_driver #( begin txn_b_mutex.get(); @cb; - while(cb.BREADY !== 1'b1 && cb.BVALID !== 1'b1) @(cb); + while(!(cb.BREADY === 1'b1 && cb.BVALID === 1'b1)) @(cb); assert(!$isunknown(cb.BRESP)) else $error("Read from 0x%0x returned X's on BRESP", addr); txn_b_mutex.put(); end