diff --git a/docs/architecture.rst b/docs/architecture.rst index daa88f8..4a0b74c 100644 --- a/docs/architecture.rst +++ b/docs/architecture.rst @@ -38,18 +38,15 @@ This section also assigns any hardware interface outputs. Readback -------- -The readback layer aggregates and reduces all readable registers into a single -read response. During a read operation, the same address decode strobes are used -to select the active register that is being accessed. -This allows for a simple OR-reduction operation to be used to compute the read -data response. +The readback layer aggregates and MUXes all readable registers into a single +read response. For designs with a large number of software-readable registers, an optional fanin re-timing stage can be enabled. This stage is automatically inserted at a balanced point in the read-data reduction so that fanin and logic-levels are optimally reduced. -.. figure:: diagrams/readback.png +.. figure:: diagrams/rt-readback-fanin.png :width: 65% :align: center diff --git a/docs/dev_notes/Alpha-Beta Versioning b/docs/dev_notes/Alpha-Beta Versioning deleted file mode 100644 index c8a089d..0000000 --- a/docs/dev_notes/Alpha-Beta Versioning +++ /dev/null @@ -1,10 +0,0 @@ -Holy smokes this is complicated - -Keep this exporter in Alpha/Beta for a while -Add some text in the readme or somewhere: - - No guarantees of correctness! This is always true with open source software, - but even more here! - Be sure to do your own validation before using this in production. - - Alpha means the implementation may change drastically! - Unlike official sem-ver, I am not making any guarantees on compatibility - - I need your help! Validating, finding edge cases, etc... 
diff --git a/docs/dev_notes/template-layers/5-readback-mux b/docs/dev_notes/template-layers/5-readback-mux index fa53102..7e9702c 100644 --- a/docs/dev_notes/template-layers/5-readback-mux +++ b/docs/dev_notes/template-layers/5-readback-mux @@ -1,35 +1,84 @@ -------------------------------------------------------------------------------- Readback mux layer -------------------------------------------------------------------------------- +Use a large always_comb block + many if statements that select the read data +based on the cpuif address. +Loops are handled the same way as address decode. -Implementation: - - Big always_comb block - - Initialize default rd_data value - - Lotsa if statements that operate on reg strb to assign rd_data - - Merges all fields together into reg - - pulls value from storage element struct, or input struct - - Provision for optional flop stage? +Other options that were considered: + - Flat case statement + con: Difficult to represent arrays. Essentially requires unrolling + con: complicates retiming strategies + con: Representing a range (required for externals) is cumbersome. Possible with stacked casez wildcards. + - AND field data with strobe, then massive OR reduce + This was the strategy prior to v1.3, but turned out to infer more overhead + than originally anticipated + - Assigning data to a flat register array, then directly indexing via address + con: Would work fine, but scales poorly for sparse regblocks. + Namely, simulators would likely allocate memory for the entire array + - Assign to a flat array that is packed sequentially, then directly indexing using a derived packed index + Concern that for sparse regfiles, the translation of addr --> packed index + becomes a nontrivial logic function -Mux Strategy: - Flat case statement: - -- Cant parameterize - + better performance? 
+Pros: + - Scales well for arrays since loops can be used + - Externals work well, as address ranges can be compared + - Synthesis results show more efficient logic inference - Flat 1-hot array then OR reduce: - - Create a bus-wide flat array - eg: 32-bits x N readable registers - - Assign each element: - the readback value of each register - ... masked by the register's access strobe - - I could also stuff an extra bit into the array that denotes the read is valid - A missed read will OR reduce down to a 0 - - Finally, OR reduce all the elements in the array down to a flat 32-bit bus - - Retiming the large OR fanin can be done by chopping up the array into stages - for 2 stages, sqrt(N) gives each stage's fanin size. Round to favor - more fanin on 2nd stage - 3 stages uses cube-root. etc... - - This has the benefit of re-using the address decode logic. - synth can choose to replicate logic if fanout is bad +Example: + logic [7:0] out; + always_comb begin + out = '0; + for(int i=0; i<64; i++) begin + if(i == addr) out = data[i]; + end + end + + +How to implement retiming: + Ideally this would partition the design into several equal sub-regions, but + with loop structures, this is pretty difficult.. + What if instead, it is partitioned into equal address ranges? + + First stage compares the lower-half of the address bits. + Values are assigned to the appropriate output "bin" + + logic [7:0] out[8]; + always_comb begin + for(int i=0; i<8; i++) out[i] = '0; + + for(int i=0; i<64; i++) begin + automatic bit [5:0] this_addr = i; + + if(this_addr[2:0] == addr[2:0]) out[this_addr[5:3]] = data[i]; + end + end + + (not showing retiming ff for `out` and `addr`) + The second stage muxes down the resulting bins using the high address bits. 
+ If the user up-sizes the address bits, need to check the upper bits to prevent aliasing + Assuming min address bit range is [5:0], but it was padded up to [8:0], do the following: + + logic [7:0] rd_data; + always_comb begin + if(addr[8:6] != '0) begin + // Invalid read range + rd_data = '0; + end else begin + rd_data = out[addr[5:3]]; + end + end + +Retiming with external blocks + One minor downside is the above scheme does not work well for external blocks + that span a range of addresses. Depending on the range, it may span multiple + retiming bins which complicates how this would be assigned cleanly. + This would be complicated even further with arrays of externals since the + span of bins could change depending on the iteration. + + Since externals can already be retimed, and large fanin of external blocks + is likely less of a concern, implement these as a separate readback mux on + the side that does not get retimed at all. WARNING: @@ -42,8 +91,14 @@ WARNING: Forwards response strobe back up to cpu interface layer -TODO: - Dont forget about alias registers here -TODO: - Does the endinness the user sets matter anywhere? 
+Variables: + From decode: + decoded_addr + decoded_req + decoded_req_is_wr + + Response: + readback_done + readback_err + readback_data diff --git a/docs/diagrams/diagrams.odg b/docs/diagrams/diagrams.odg index 5b42eb6..dd6fab7 100644 Binary files a/docs/diagrams/diagrams.odg and b/docs/diagrams/diagrams.odg differ diff --git a/docs/diagrams/readback.png b/docs/diagrams/readback.png deleted file mode 100644 index 9508650..0000000 Binary files a/docs/diagrams/readback.png and /dev/null differ diff --git a/docs/diagrams/rt-readback-fanin.drawio b/docs/diagrams/rt-readback-fanin.drawio new file mode 100644 index 0000000..3d591d8 --- /dev/null +++ b/docs/diagrams/rt-readback-fanin.drawio @@ -0,0 +1,242 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/diagrams/rt-readback-fanin.png b/docs/diagrams/rt-readback-fanin.png new file mode 100644 index 0000000..e05530b Binary files /dev/null and b/docs/diagrams/rt-readback-fanin.png differ diff --git a/pyproject.toml b/pyproject.toml index dd15049..951b88b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ name = "peakrdl-regblock" dynamic = ["version"] requires-python = ">=3.7" dependencies = [ - "systemrdl-compiler ~= 1.31", + "systemrdl-compiler ~= 1.32", "Jinja2 >= 2.11", ] diff --git a/src/peakrdl_regblock/addr_decode.py b/src/peakrdl_regblock/addr_decode.py index 7617a44..7f79613 100644 --- a/src/peakrdl_regblock/addr_decode.py +++ b/src/peakrdl_regblock/addr_decode.py @@ -133,8 +133,9 @@ class DecodeLogicGenerator(RDLForLoopGenerator): 
self._array_stride_stack = [] # type: List[int] def _add_addressablenode_decoding_flags(self, node: 'AddressableNode') -> None: - addr_str = self._get_address_str(node) - addr_decoding_str = f"cpuif_req_masked & (cpuif_addr >= {addr_str}) & (cpuif_addr <= {addr_str} + {SVInt(node.size - 1, self.addr_decode.exp.ds.addr_width)})" + addr_lo = self._get_address_str(node) + addr_hi = f"{addr_lo} + {SVInt(node.size - 1, self.addr_decode.exp.ds.addr_width)}" + addr_decoding_str = f"cpuif_req_masked & (cpuif_addr >= {addr_lo}) & (cpuif_addr <= {addr_hi})" rhs = addr_decoding_str rhs_valid_addr = addr_decoding_str if isinstance(node, MemNode): diff --git a/src/peakrdl_regblock/exporter.py b/src/peakrdl_regblock/exporter.py index ba4d2d6..c6e068e 100644 --- a/src/peakrdl_regblock/exporter.py +++ b/src/peakrdl_regblock/exporter.py @@ -165,12 +165,6 @@ class RegblockExporter: # Validate that there are no unsupported constructs DesignValidator(self).do_validate() - # Compute readback implementation early. - # Readback has the capability to disable retiming if the fanin is tiny. 
- # This affects the rest of the design's implementation, and must be known - # before any other templates are rendered - readback_implementation = self.readback.get_implementation() - # Build Jinja template context context = { "cpuif": self.cpuif, @@ -184,7 +178,7 @@ class RegblockExporter: "default_resetsignal_name": self.dereferencer.default_resetsignal_name, "address_decode": self.address_decode, "field_logic": self.field_logic, - "readback_implementation": readback_implementation, + "readback_implementation": self.readback.get_implementation(), "ext_write_acks": ext_write_acks, "ext_read_acks": ext_read_acks, "parity": parity, @@ -319,6 +313,10 @@ class DesignState: ) self.cpuif_data_width = 32 + # Also, to avoid silly edge cases, disable read fanin retiming since + # it has little benefit anyways + self.retime_read_fanin = False + #------------------------ # Min address width encloses the total size AND at least 1 useful address bit self.addr_width = max(clog2(self.top_node.size), clog2(self.cpuif_data_width//8) + 1) @@ -328,6 +326,15 @@ class DesignState: msg.fatal(f"User-specified address width shall be greater than or equal to {self.addr_width}.") self.addr_width = user_addr_width + if self.retime_read_fanin: + # Check if address width is sufficient to even bother with read fanin retiming + data_width_bytes = self.cpuif_data_width // 8 + unused_low_addr_bits = clog2(data_width_bytes) + relevant_addr_width = self.addr_width - unused_low_addr_bits + if relevant_addr_width < 2: + # Unable to partition the address space. 
Disable retiming + self.retime_read_fanin = False + @property def min_read_latency(self) -> int: n = 0 diff --git a/src/peakrdl_regblock/module_tmpl.sv b/src/peakrdl_regblock/module_tmpl.sv index 1c15395..5ec9666 100644 --- a/src/peakrdl_regblock/module_tmpl.sv +++ b/src/peakrdl_regblock/module_tmpl.sv @@ -30,24 +30,7 @@ module {{ds.module_name}} logic cpuif_req_masked; {%- if ds.has_external_addressable %} - logic external_req; logic external_pending; - logic external_wr_ack; - logic external_rd_ack; - always_ff {{get_always_ff_event(cpuif.reset)}} begin - if({{get_resetsignal(cpuif.reset)}}) begin - external_pending <= '0; - end else begin - if(external_req & ~external_wr_ack & ~external_rd_ack) external_pending <= '1; - else if(external_wr_ack | external_rd_ack) external_pending <= '0; - `ifndef SYNTHESIS - assert_bad_ext_wr_ack: assert(!external_wr_ack || (external_pending | external_req)) - else $error("An external wr_ack strobe was asserted when no external request was active"); - assert_bad_ext_rd_ack: assert(!external_rd_ack || (external_pending | external_req)) - else $error("An external rd_ack strobe was asserted when no external request was active"); - `endif - end - end {%- endif %} {% if ds.min_read_latency == ds.min_write_latency %} // Read & write latencies are balanced. 
Stalls not required @@ -109,11 +92,9 @@ module {{ds.module_name}} decoded_reg_strb_t decoded_reg_strb; logic decoded_err; {%- if ds.has_external_addressable %} - logic decoded_strb_is_external; + logic decoded_req_is_external; {% endif %} -{%- if ds.has_external_block %} logic [{{cpuif.addr_width-1}}:0] decoded_addr; -{% endif %} logic decoded_req; logic decoded_req_is_wr; logic [{{cpuif.data_width-1}}:0] decoded_wr_data; @@ -147,15 +128,31 @@ module {{ds.module_name}} decoded_err = '0; {%- endif %} {%- if ds.has_external_addressable %} - decoded_strb_is_external = is_external; - external_req = is_external; + decoded_req_is_external = is_external; {%- endif %} end +{%- if ds.has_external_addressable %} + logic external_wr_ack; + logic external_rd_ack; + always_ff {{get_always_ff_event(cpuif.reset)}} begin + if({{get_resetsignal(cpuif.reset)}}) begin + external_pending <= '0; + end else begin + if(decoded_req_is_external & ~external_wr_ack & ~external_rd_ack) external_pending <= '1; + else if(external_wr_ack | external_rd_ack) external_pending <= '0; + `ifndef SYNTHESIS + assert_bad_ext_wr_ack: assert(!external_wr_ack || (external_pending | decoded_req_is_external)) + else $error("An external wr_ack strobe was asserted when no external request was active"); + assert_bad_ext_rd_ack: assert(!external_rd_ack || (external_pending | decoded_req_is_external)) + else $error("An external rd_ack strobe was asserted when no external request was active"); + `endif + end + end +{%- endif %} + // Pass down signals to next stage -{%- if ds.has_external_block %} assign decoded_addr = cpuif_addr; -{% endif %} assign decoded_req = cpuif_req_masked; assign decoded_req_is_wr = cpuif_req_is_wr; assign decoded_wr_data = cpuif_wr_data; @@ -223,7 +220,7 @@ module {{ds.module_name}} {{ext_write_acks.get_implementation()|indent(8)}} external_wr_ack = wr_ack; end - assign cpuif_wr_ack = external_wr_ack | (decoded_req & decoded_req_is_wr & ~decoded_strb_is_external); + assign cpuif_wr_ack = 
external_wr_ack | (decoded_req & decoded_req_is_wr & ~decoded_req_is_external); {%- else %} assign cpuif_wr_ack = decoded_req & decoded_req_is_wr; {%- endif %} @@ -262,10 +259,26 @@ module {{ds.module_name}} {%- endif %} {%- endif %} + logic [{{cpuif.addr_width-1}}:0] rd_mux_addr; +{%- if ds.has_external_addressable %} + logic [{{cpuif.addr_width-1}}:0] pending_rd_addr; + // Hold read mux address to guarantee it is stable throughout any external accesses + always_ff {{get_always_ff_event(cpuif.reset)}} begin + if({{get_resetsignal(cpuif.reset)}}) begin + pending_rd_addr <= '0; + end else begin + if(decoded_req) pending_rd_addr <= decoded_addr; + end + end + assign rd_mux_addr = decoded_req ? decoded_addr : pending_rd_addr; +{%- else %} + assign rd_mux_addr = decoded_addr; +{%- endif %} + logic readback_err; logic readback_done; logic [{{cpuif.data_width-1}}:0] readback_data; -{{readback_implementation|indent}} + {{readback_implementation|indent}} {% if ds.retime_read_response %} always_ff {{get_always_ff_event(cpuif.reset)}} begin if({{get_resetsignal(cpuif.reset)}}) begin diff --git a/src/peakrdl_regblock/readback/__init__.py b/src/peakrdl_regblock/readback/__init__.py index dafb1e0..9e3117a 100644 --- a/src/peakrdl_regblock/readback/__init__.py +++ b/src/peakrdl_regblock/readback/__init__.py @@ -1,72 +1 @@ -from typing import TYPE_CHECKING -import math - -from .generators import ReadbackAssignmentGenerator - -if TYPE_CHECKING: - from ..exporter import RegblockExporter, DesignState - from systemrdl.node import AddrmapNode - -class Readback: - def __init__(self, exp:'RegblockExporter'): - self.exp = exp - - @property - def ds(self) -> 'DesignState': - return self.exp.ds - - @property - def top_node(self) -> 'AddrmapNode': - return self.exp.ds.top_node - - def get_implementation(self) -> str: - gen = ReadbackAssignmentGenerator(self.exp) - array_assignments = gen.get_content(self.top_node) - array_size = gen.current_offset - - # Enabling the fanin stage doesnt make 
sense if readback fanin is - # small. This also avoids pesky corner cases - if array_size < 4: - self.ds.retime_read_fanin = False - - context = { - "array_assignments" : array_assignments, - "array_size" : array_size, - 'get_always_ff_event': self.exp.dereferencer.get_always_ff_event, - 'get_resetsignal': self.exp.dereferencer.get_resetsignal, - "cpuif": self.exp.cpuif, - "ds": self.ds, - } - - if self.ds.retime_read_fanin: - # If adding a fanin pipeline stage, goal is to try to - # split the fanin path in the middle so that fanin into the stage - # and the following are roughly balanced. - fanin_target = math.sqrt(array_size) - - # Size of fanin group to consume per fanin element - fanin_stride = math.floor(fanin_target) - - # Number of array elements to reduce to. - # Round up to an extra element in case there is some residual - fanin_array_size = math.ceil(array_size / fanin_stride) - - # leftovers are handled in an extra array element - fanin_residual_stride = array_size % fanin_stride - - if fanin_residual_stride != 0: - # If there is a partial fanin element, reduce the number of - # loops performed in the bulk fanin stage - fanin_loop_iter = fanin_array_size - 1 - else: - fanin_loop_iter = fanin_array_size - - context['fanin_stride'] = fanin_stride - context['fanin_array_size'] = fanin_array_size - context['fanin_residual_stride'] = fanin_residual_stride - context['fanin_loop_iter'] = fanin_loop_iter - - template = self.exp.jj_env.get_template( - "readback/templates/readback.sv" - ) - return template.render(context) +from .readback import Readback diff --git a/src/peakrdl_regblock/readback/generators.py b/src/peakrdl_regblock/readback/generators.py deleted file mode 100644 index 87f2969..0000000 --- a/src/peakrdl_regblock/readback/generators.py +++ /dev/null @@ -1,381 +0,0 @@ -from typing import TYPE_CHECKING, List - -from systemrdl.node import RegNode, AddressableNode -from systemrdl.walker import WalkerAction - -from ..forloop_generator import 
RDLForLoopGenerator, LoopBody - -from ..utils import do_bitswap, do_slice - -if TYPE_CHECKING: - from ..exporter import RegblockExporter - -class ReadbackLoopBody(LoopBody): - def __init__(self, dim: int, iterator: str, i_type: str) -> None: - super().__init__(dim, iterator, i_type) - self.n_regs = 0 - - def __str__(self) -> str: - # replace $i#sz token when stringifying - s = super().__str__() - token = f"${self.iterator}sz" - s = s.replace(token, str(self.n_regs)) - return s - -class ReadbackAssignmentGenerator(RDLForLoopGenerator): - i_type = "genvar" - loop_body_cls = ReadbackLoopBody - - def __init__(self, exp:'RegblockExporter') -> None: - super().__init__() - self.exp = exp - - # The readback array collects all possible readback values into a flat - # array. The array width is equal to the CPUIF bus width. Each entry in - # the array represents an aligned read access. - self.current_offset = 0 - self.start_offset_stack = [] # type: List[int] - self.dim_stack = [] # type: List[int] - - @property - def current_offset_str(self) -> str: - """ - Derive a string that represents the current offset being assigned. - This consists of: - - The current integer offset - - multiplied index of any enclosing loop - - The integer offset from "current_offset" is static and is monotonically - incremented as more register assignments are processed. - - The component of the offset from loops is added by multiplying the current - loop index by the loop size. - Since the loop's size is not known at this time, it is emitted as a - placeholder token like: $i0sz, $i1sz, $i2sz, etc - These tokens can be replaced once the loop body has been completed and the - size of its contents is known. 
- """ - offset_parts = [] - for i in range(self._loop_level): - offset_parts.append(f"i{i} * $i{i}sz") - offset_parts.append(str(self.current_offset)) - return " + ".join(offset_parts) - - def push_loop(self, dim: int) -> None: - super().push_loop(dim) - self.start_offset_stack.append(self.current_offset) - self.dim_stack.append(dim) - - def pop_loop(self) -> None: - start_offset = self.start_offset_stack.pop() - dim = self.dim_stack.pop() - - # Number of registers enclosed in this loop - n_regs = self.current_offset - start_offset - self.current_loop.n_regs = n_regs # type: ignore - - super().pop_loop() - - # Advance current scope's offset to account for loop's contents - self.current_offset = start_offset + n_regs * dim - - - def enter_AddressableComponent(self, node: 'AddressableNode') -> WalkerAction: - super().enter_AddressableComponent(node) - - if node.external and not isinstance(node, RegNode): - # External block - strb = self.exp.hwif.get_external_rd_ack(node) - data = self.exp.hwif.get_external_rd_data(node) - self.add_content(f"assign readback_array[{self.current_offset_str}] = {strb} ? {data} : '0;") - self.current_offset += 1 - return WalkerAction.SkipDescendants - - return WalkerAction.Continue - - def enter_Reg(self, node: RegNode) -> WalkerAction: - if not node.has_sw_readable: - return WalkerAction.SkipDescendants - - if node.external: - self.process_external_reg(node) - return WalkerAction.SkipDescendants - - accesswidth = node.get_property('accesswidth') - regwidth = node.get_property('regwidth') - rbuf = node.get_property('buffer_reads') - if rbuf: - trigger = node.get_property('rbuffer_trigger') - is_own_trigger = (isinstance(trigger, RegNode) and trigger == node) - if is_own_trigger: - if accesswidth < regwidth: - self.process_buffered_reg_with_bypass(node, regwidth, accesswidth) - else: - # bypass cancels out. 
Behaves like a normal reg - self.process_reg(node) - else: - self.process_buffered_reg(node, regwidth, accesswidth) - elif accesswidth < regwidth: - self.process_wide_reg(node, accesswidth) - else: - self.process_reg(node) - - return WalkerAction.SkipDescendants - - def process_external_reg(self, node: RegNode) -> None: - strb = self.exp.hwif.get_external_rd_ack(node) - data = self.exp.hwif.get_external_rd_data(node) - regwidth = node.get_property('regwidth') - if regwidth < self.exp.cpuif.data_width: - self.add_content(f"assign readback_array[{self.current_offset_str}][{self.exp.cpuif.data_width-1}:{regwidth}] = '0;") - self.add_content(f"assign readback_array[{self.current_offset_str}][{regwidth-1}:0] = {strb} ? {data} : '0;") - else: - self.add_content(f"assign readback_array[{self.current_offset_str}] = {strb} ? {data} : '0;") - - self.current_offset += 1 - - def process_reg(self, node: RegNode) -> None: - current_bit = 0 - rd_strb = f"({self.exp.dereferencer.get_access_strobe(node)} && !decoded_req_is_wr)" - # Fields are sorted by ascending low bit - for field in node.fields(): - if not field.is_sw_readable: - continue - - # insert reserved assignment before this field if needed - if field.low != current_bit: - self.add_content(f"assign readback_array[{self.current_offset_str}][{field.low-1}:{current_bit}] = '0;") - - value = self.exp.dereferencer.get_value(field) - if field.msb < field.lsb: - # Field gets bitswapped since it is in [low:high] orientation - value = do_bitswap(value) - - self.add_content(f"assign readback_array[{self.current_offset_str}][{field.high}:{field.low}] = {rd_strb} ? 
{value} : '0;") - - current_bit = field.high + 1 - - # Insert final reserved assignment if needed - bus_width = self.exp.cpuif.data_width - if current_bit < bus_width: - self.add_content(f"assign readback_array[{self.current_offset_str}][{bus_width-1}:{current_bit}] = '0;") - - self.current_offset += 1 - - - def process_buffered_reg(self, node: RegNode, regwidth: int, accesswidth: int) -> None: - rbuf = self.exp.read_buffering.get_rbuf_data(node) - - if accesswidth < regwidth: - # Is wide reg - n_subwords = regwidth // accesswidth - astrb = self.exp.dereferencer.get_access_strobe(node, reduce_substrobes=False) - for i in range(n_subwords): - rd_strb = f"({astrb}[{i}] && !decoded_req_is_wr)" - bslice = f"[{(i + 1) * accesswidth - 1}:{i*accesswidth}]" - self.add_content(f"assign readback_array[{self.current_offset_str}] = {rd_strb} ? {rbuf}{bslice} : '0;") - self.current_offset += 1 - - else: - # Is regular reg - rd_strb = f"({self.exp.dereferencer.get_access_strobe(node)} && !decoded_req_is_wr)" - self.add_content(f"assign readback_array[{self.current_offset_str}][{regwidth-1}:0] = {rd_strb} ? {rbuf} : '0;") - - bus_width = self.exp.cpuif.data_width - if regwidth < bus_width: - self.add_content(f"assign readback_array[{self.current_offset_str}][{bus_width-1}:{regwidth}] = '0;") - - self.current_offset += 1 - - - def process_buffered_reg_with_bypass(self, node: RegNode, regwidth: int, accesswidth: int) -> None: - """ - Special case for a buffered register when the register is its own trigger. - First sub-word shall bypass the read buffer and assign directly. - Subsequent subwords assign from the buffer. 
- Caller guarantees this is a wide reg - """ - astrb = self.exp.dereferencer.get_access_strobe(node, reduce_substrobes=False) - - # Generate assignments for first sub-word - bidx = 0 - rd_strb = f"({astrb}[0] && !decoded_req_is_wr)" - for field in node.fields(): - if not field.is_sw_readable: - continue - - if field.low >= accesswidth: - # field is not in this subword. - break - - if bidx < field.low: - # insert padding before - self.add_content(f"assign readback_array[{self.current_offset_str}][{field.low - 1}:{bidx}] = '0;") - - if field.high >= accesswidth: - # field gets truncated - r_low = field.low - r_high = accesswidth - 1 - f_low = 0 - f_high = accesswidth - 1 - field.low - - if field.msb < field.lsb: - # Field gets bitswapped since it is in [low:high] orientation - # Mirror the low/high indexes - f_low = field.width - 1 - f_low - f_high = field.width - 1 - f_high - f_low, f_high = f_high, f_low - value = do_bitswap(do_slice(self.exp.dereferencer.get_value(field), f_high, f_low)) - else: - value = do_slice(self.exp.dereferencer.get_value(field), f_high, f_low) - - self.add_content(f"assign readback_array[{self.current_offset_str}][{r_high}:{r_low}] = {rd_strb} ? {value} : '0;") - bidx = accesswidth - else: - # field fits in subword - value = self.exp.dereferencer.get_value(field) - if field.msb < field.lsb: - # Field gets bitswapped since it is in [low:high] orientation - value = do_bitswap(value) - self.add_content(f"assign readback_array[{self.current_offset_str}][{field.high}:{field.low}] = {rd_strb} ? 
{value} : '0;") - bidx = field.high + 1 - - # pad up remainder of subword - if bidx < accesswidth: - self.add_content(f"assign readback_array[{self.current_offset_str}][{accesswidth-1}:{bidx}] = '0;") - self.current_offset += 1 - - # Assign remainder of subwords from read buffer - n_subwords = regwidth // accesswidth - rbuf = self.exp.read_buffering.get_rbuf_data(node) - for i in range(1, n_subwords): - rd_strb = f"({astrb}[{i}] && !decoded_req_is_wr)" - bslice = f"[{(i + 1) * accesswidth - 1}:{i*accesswidth}]" - self.add_content(f"assign readback_array[{self.current_offset_str}] = {rd_strb} ? {rbuf}{bslice} : '0;") - self.current_offset += 1 - - def process_wide_reg(self, node: RegNode, accesswidth: int) -> None: - bus_width = self.exp.cpuif.data_width - - subword_idx = 0 - current_bit = 0 # Bit-offset within the wide register - access_strb = self.exp.dereferencer.get_access_strobe(node, reduce_substrobes=False) - # Fields are sorted by ascending low bit - for field in node.fields(): - if not field.is_sw_readable: - continue - - # insert zero assignment before this field if needed - if field.low >= accesswidth*(subword_idx+1): - # field does not start in this subword - if current_bit > accesswidth * subword_idx: - # current subword had content. 
Assign remainder - low = current_bit % accesswidth - high = bus_width - 1 - self.add_content(f"assign readback_array[{self.current_offset_str}][{high}:{low}] = '0;") - self.current_offset += 1 - - # Advance to subword that contains the start of the field - subword_idx = field.low // accesswidth - current_bit = accesswidth * subword_idx - - if current_bit != field.low: - # assign zero up to start of this field - low = current_bit % accesswidth - high = (field.low % accesswidth) - 1 - self.add_content(f"assign readback_array[{self.current_offset_str}][{high}:{low}] = '0;") - current_bit = field.low - - - # Assign field - # loop until the entire field's assignments have been generated - field_pos = field.low - while current_bit <= field.high: - # Assign the field - rd_strb = f"({access_strb}[{subword_idx}] && !decoded_req_is_wr)" - if (field_pos == field.low) and (field.high < accesswidth*(subword_idx+1)): - # entire field fits into this subword - low = field.low - accesswidth * subword_idx - high = field.high - accesswidth * subword_idx - - value = self.exp.dereferencer.get_value(field) - if field.msb < field.lsb: - # Field gets bitswapped since it is in [low:high] orientation - value = do_bitswap(value) - - self.add_content(f"assign readback_array[{self.current_offset_str}][{high}:{low}] = {rd_strb} ? 
{value} : '0;") - - current_bit = field.high + 1 - - if current_bit == accesswidth*(subword_idx+1): - # Field ends at the subword boundary - subword_idx += 1 - self.current_offset += 1 - elif field.high >= accesswidth*(subword_idx+1): - # only a subset of the field can fit into this subword - # high end gets truncated - - # assignment slice - r_low = field_pos - accesswidth * subword_idx - r_high = accesswidth - 1 - - # field slice - f_low = field_pos - field.low - f_high = accesswidth * (subword_idx + 1) - 1 - field.low - - if field.msb < field.lsb: - # Field gets bitswapped since it is in [low:high] orientation - # Mirror the low/high indexes - f_low = field.width - 1 - f_low - f_high = field.width - 1 - f_high - f_low, f_high = f_high, f_low - - value = do_bitswap(do_slice(self.exp.dereferencer.get_value(field), f_high, f_low)) - else: - value = do_slice(self.exp.dereferencer.get_value(field), f_high, f_low) - - self.add_content(f"assign readback_array[{self.current_offset_str}][{r_high}:{r_low}] = {rd_strb} ? {value} : '0;") - - # advance to the next subword - subword_idx += 1 - current_bit = accesswidth * subword_idx - field_pos = current_bit - self.current_offset += 1 - else: - # only a subset of the field can fit into this subword - # finish field - - # assignment slice - r_low = field_pos - accesswidth * subword_idx - r_high = field.high - accesswidth * subword_idx - - # field slice - f_low = field_pos - field.low - f_high = field.high - field.low - - if field.msb < field.lsb: - # Field gets bitswapped since it is in [low:high] orientation - # Mirror the low/high indexes - f_low = field.width - 1 - f_low - f_high = field.width - 1 - f_high - f_low, f_high = f_high, f_low - - value = do_bitswap(do_slice(self.exp.dereferencer.get_value(field), f_high, f_low)) - else: - value = do_slice(self.exp.dereferencer.get_value(field), f_high, f_low) - - self.add_content(f"assign readback_array[{self.current_offset_str}][{r_high}:{r_low}] = {rd_strb} ? 
{value} : '0;") - - current_bit = field.high + 1 - if current_bit == accesswidth*(subword_idx+1): - # Field ends at the subword boundary - subword_idx += 1 - self.current_offset += 1 - - # insert zero assignment after the last field if needed - if current_bit > accesswidth * subword_idx: - # current subword had content. Assign remainder - low = current_bit % accesswidth - high = bus_width - 1 - self.add_content(f"assign readback_array[{self.current_offset_str}][{high}:{low}] = '0;") - self.current_offset += 1 diff --git a/src/peakrdl_regblock/readback/readback.py b/src/peakrdl_regblock/readback/readback.py new file mode 100644 index 0000000..b36ac0b --- /dev/null +++ b/src/peakrdl_regblock/readback/readback.py @@ -0,0 +1,101 @@ +from typing import TYPE_CHECKING + +from .readback_mux_generator import ReadbackMuxGenerator, RetimedReadbackMuxGenerator, RetimedExtBlockReadbackMuxGenerator +from ..utils import clog2 + +if TYPE_CHECKING: + from ..exporter import RegblockExporter, DesignState + +class Readback: + def __init__(self, exp:'RegblockExporter'): + self.exp = exp + + @property + def ds(self) -> 'DesignState': + return self.exp.ds + + def get_implementation(self) -> str: + if self.ds.retime_read_fanin: + return self.get_2stage_implementation() + else: + # No retiming + return self.get_1stage_implementation() + + + def get_empty_implementation(self) -> str: + """ + Readback implementation when there are no readable registers + """ + context = { + "ds": self.ds, + } + template = self.exp.jj_env.get_template( + "readback/templates/empty_readback.sv" + ) + return template.render(context) + + + def get_1stage_implementation(self) -> str: + """ + Implements readback without any retiming + """ + gen = ReadbackMuxGenerator(self.exp) + mux_impl = gen.get_content(self.ds.top_node) + + if not mux_impl: + # Design has no readable registers. 
+ return self.get_empty_implementation() + + context = { + "readback_mux": mux_impl, + "cpuif": self.exp.cpuif, + "ds": self.ds, + } + template = self.exp.jj_env.get_template( + "readback/templates/readback_no_rt.sv" + ) + + return template.render(context) + + + def get_2stage_implementation(self) -> str: + """ + Implements readback that is retimed to 2 stages + """ + # Split the decode to happen in two stages, using low address bits first + # then high address bits. + # Split in the middle of the "relevant" address bits - the ones that + # actually contribute to addressing in the regblock + unused_low_addr_bits = clog2(self.exp.cpuif.data_width_bytes) + relevant_addr_width = self.ds.addr_width - unused_low_addr_bits + low_addr_width = (relevant_addr_width // 2) + unused_low_addr_bits + high_addr_width = self.ds.addr_width - low_addr_width + + mux_gen = RetimedReadbackMuxGenerator(self.exp) + mux_impl = mux_gen.get_content(self.ds.top_node) + + if not mux_impl: + # Design has no readable addresses. 
+ return self.get_empty_implementation() + + if self.ds.has_external_block: + ext_mux_gen = RetimedExtBlockReadbackMuxGenerator(self.exp) + ext_mux_impl = ext_mux_gen.get_content(self.ds.top_node) + else: + ext_mux_impl = None + + context = { + "readback_mux": mux_impl, + "ext_block_readback_mux": ext_mux_impl, + "cpuif": self.exp.cpuif, + "ds": self.ds, + "low_addr_width": low_addr_width, + "high_addr_width": high_addr_width, + 'get_always_ff_event': self.exp.dereferencer.get_always_ff_event, + 'get_resetsignal': self.exp.dereferencer.get_resetsignal, + } + template = self.exp.jj_env.get_template( + "readback/templates/readback_with_rt.sv" + ) + + return template.render(context) diff --git a/src/peakrdl_regblock/readback/readback_mux_generator.py b/src/peakrdl_regblock/readback/readback_mux_generator.py new file mode 100644 index 0000000..34d7891 --- /dev/null +++ b/src/peakrdl_regblock/readback/readback_mux_generator.py @@ -0,0 +1,361 @@ +from typing import TYPE_CHECKING, List, Sequence, Optional + +from systemrdl.node import RegNode, AddressableNode, FieldNode +from systemrdl.walker import WalkerAction + +from ..forloop_generator import RDLForLoopGenerator +from ..utils import SVInt, do_bitswap, do_slice + +if TYPE_CHECKING: + from ..exporter import DesignState, RegblockExporter + +class ReadbackMuxGenerator(RDLForLoopGenerator): + def __init__(self, exp: 'RegblockExporter') -> None: + super().__init__() + + self.exp = exp + + # List of address strides for each dimension + self._array_stride_stack: List[int] = [] + + @property + def ds(self) -> 'DesignState': + return self.exp.ds + + + def enter_AddressableComponent(self, node: AddressableNode) -> Optional[WalkerAction]: + super().enter_AddressableComponent(node) + + if node.array_dimensions: + assert node.array_stride is not None + # Collect strides for each array dimension + current_stride = node.array_stride + strides = [] + for dim in reversed(node.array_dimensions): + strides.append(current_stride) + 
current_stride *= dim + strides.reverse() + self._array_stride_stack.extend(strides) + + if node.external and not isinstance(node, RegNode): + # Is an external block + self.process_external_block(node) + return WalkerAction.SkipDescendants + + return WalkerAction.Continue + + + def process_external_block(self, node: AddressableNode) -> None: + addr_lo = self._get_address_str(node) + addr_hi = f"{addr_lo} + {SVInt(node.size - 1, self.exp.ds.addr_width)}" + self.add_content(f"if((rd_mux_addr >= {addr_lo}) && (rd_mux_addr <= {addr_hi})) begin") + data = self.exp.hwif.get_external_rd_data(node) + self.add_content(f" readback_data_var = {data};") + self.add_content("end") + + + def enter_Reg(self, node: RegNode) -> WalkerAction: + fields = node.fields(sw_readable_only=True) + if not fields: + # Reg has no readable fields + return WalkerAction.SkipDescendants + + if node.external: + self.process_external_reg(node) + return WalkerAction.SkipDescendants + + accesswidth = node.get_property('accesswidth') + regwidth = node.get_property('regwidth') + rbuf = node.get_property('buffer_reads') + + if rbuf: + trigger = node.get_property('rbuffer_trigger') + is_own_trigger = (isinstance(trigger, RegNode) and trigger == node) + if is_own_trigger: + if accesswidth < regwidth: + self.process_wide_buffered_reg_with_bypass(node, fields, regwidth, accesswidth) + else: + # bypass cancels out. 
Behaves like a normal reg + self.process_reg(node, fields) + else: + self.process_buffered_reg(node, regwidth, accesswidth) + elif accesswidth < regwidth: + self.process_wide_reg(node, fields, regwidth, accesswidth) + else: + self.process_reg(node, fields) + + return WalkerAction.SkipDescendants + + + def _get_address_str(self, node: AddressableNode, subword_offset: int=0) -> str: + expr_width = self.ds.addr_width + a = str(SVInt( + node.raw_absolute_address - self.ds.top_node.raw_absolute_address + subword_offset, + expr_width + )) + for i, stride in enumerate(self._array_stride_stack): + a += f" + ({expr_width})'(i{i}) * {SVInt(stride, expr_width)}" + return a + + + def get_addr_compare_conditional(self, addr: str) -> str: + return f"rd_mux_addr == {addr}" + + def get_readback_data_var(self, addr: str) -> str: + return "readback_data_var" + + def process_external_reg(self, node: RegNode) -> None: + accesswidth = node.get_property('accesswidth') + regwidth = node.get_property('regwidth') + data = self.exp.hwif.get_external_rd_data(node) + + if regwidth > accesswidth: + # Is wide reg. + # The retiming scheme requires singular address comparisons rather than + # ranges. 
To support this, unroll the subwords + n_subwords = regwidth // accesswidth + subword_stride = accesswidth // 8 + for subword_idx in range(n_subwords): + addr = self._get_address_str(node, subword_offset=subword_idx*subword_stride) + conditional = self.get_addr_compare_conditional(addr) + var = self.get_readback_data_var(addr) + self.add_content(f"if({conditional}) begin") + self.add_content(f" {var} = {data};") + self.add_content("end") + else: + addr = self._get_address_str(node) + conditional = self.get_addr_compare_conditional(addr) + var = self.get_readback_data_var(addr) + self.add_content(f"if({conditional}) begin") + if regwidth < self.exp.cpuif.data_width: + self.add_content(f" {var}[{regwidth-1}:0] = {data};") + else: + self.add_content(f" {var} = {data};") + self.add_content("end") + + + def process_reg(self, node: RegNode, fields: Sequence[FieldNode]) -> None: + """ + Process a regular register + """ + addr = self._get_address_str(node) + conditional = self.get_addr_compare_conditional(addr) + var = self.get_readback_data_var(addr) + self.add_content(f"if({conditional}) begin") + + for field in fields: + value = self.exp.dereferencer.get_value(field) + if field.msb < field.lsb: + # Field gets bitswapped since it is in [low:high] orientation + value = do_bitswap(value) + + if field.width == 1: + self.add_content(f" {var}[{field.low}] = {value};") + else: + self.add_content(f" {var}[{field.high}:{field.low}] = {value};") + + self.add_content("end") + + + def process_buffered_reg(self, node: RegNode, regwidth: int, accesswidth: int) -> None: + """ + Process a register which is fully buffered + """ + rbuf = self.exp.read_buffering.get_rbuf_data(node) + + if accesswidth < regwidth: + # Is wide reg + n_subwords = regwidth // accesswidth + subword_stride = accesswidth // 8 + for subword_idx in range(n_subwords): + addr = self._get_address_str(node, subword_offset=subword_idx*subword_stride) + conditional = self.get_addr_compare_conditional(addr) + var = 
self.get_readback_data_var(addr) + bslice = f"[{(subword_idx + 1) * accesswidth - 1}:{subword_idx*accesswidth}]" + self.add_content(f"if({conditional}) begin") + self.add_content(f" {var} = {rbuf}{bslice};") + self.add_content("end") + else: + # Is regular reg + addr = self._get_address_str(node) + conditional = self.get_addr_compare_conditional(addr) + var = self.get_readback_data_var(addr) + self.add_content(f"if({conditional}) begin") + self.add_content(f" {var}[{regwidth-1}:0] = {rbuf};") + self.add_content("end") + + + def process_wide_buffered_reg_with_bypass(self, node: RegNode, fields: Sequence[FieldNode], regwidth: int, accesswidth: int) -> None: + """ + Special case for a wide buffered register where the register is its own + trigger. + + First sub-word shall bypass the read buffer and assign directly. + Subsequent subwords assign from the buffer. + """ + + # Generate assignments for first sub-word + subword_assignments = self.get_wide_reg_subword_assignments(node, fields, regwidth, accesswidth) + if subword_assignments[0]: + addr = self._get_address_str(node, subword_offset=0) + conditional = self.get_addr_compare_conditional(addr) + self.add_content(f"if({conditional}) begin") + for assignment in subword_assignments[0]: + self.add_content(" " + assignment) + self.add_content("end") + + # Assign remainder of subwords from read buffer + n_subwords = regwidth // accesswidth + subword_stride = accesswidth // 8 + rbuf = self.exp.read_buffering.get_rbuf_data(node) + for subword_idx in range(1, n_subwords): + addr = self._get_address_str(node, subword_offset=subword_idx*subword_stride) + bslice = f"[{(subword_idx + 1) * accesswidth - 1}:{subword_idx*accesswidth}]" + conditional = self.get_addr_compare_conditional(addr) + var = self.get_readback_data_var(addr) + self.add_content(f"if({conditional}) begin") + self.add_content(f" {var} = {rbuf}{bslice};") + self.add_content("end") + + + def get_wide_reg_subword_assignments(self, node: RegNode, fields: 
Sequence[FieldNode], regwidth: int, accesswidth: int) -> List[List[str]]: + """ + Get a list of assignments for each subword + + Returns a 2d array where the first dimension indicates the subword index. + The next dimension is the list of assignments + """ + n_subwords = regwidth // accesswidth + subword_stride = accesswidth // 8 + subword_assignments: List[List[str]] = [[] for _ in range(n_subwords)] + + # Fields are sorted by ascending low bit + for field in fields: + subword_idx = field.low // accesswidth + + if field.high < accesswidth * (subword_idx + 1): + # entire field fits into this subword + low = field.low - accesswidth * subword_idx + high = field.high - accesswidth * subword_idx + + value = self.exp.dereferencer.get_value(field) + if field.msb < field.lsb: + # Field gets bitswapped since it is in [low:high] orientation + value = do_bitswap(value) + + addr = self._get_address_str(node, subword_offset=subword_idx*subword_stride) + var = self.get_readback_data_var(addr) + subword_assignments[subword_idx].append(f"{var}[{high}:{low}] = {value};") + + else: + # Field spans multiple sub-words + # loop through subword indexes until the entire field has been assigned + while field.high >= accesswidth * subword_idx: + # Allowable field window for this subword + subword_low = accesswidth * subword_idx + subword_high = subword_low + accesswidth - 1 + + # field slice (relative to reg) + f_low = max(subword_low, field.low) + f_high = min(subword_high, field.high) + + # assignment slice + r_low = f_low - accesswidth * subword_idx + r_high = f_high - accesswidth * subword_idx + + # Adjust to be relative to field + f_low -= field.low + f_high -= field.low + + if field.msb < field.lsb: + # Field gets bitswapped since it is in [low:high] orientation + # Mirror the low/high indexes + f_low = field.width - 1 - f_low + f_high = field.width - 1 - f_high + f_low, f_high = f_high, f_low + + value = do_bitswap(do_slice(self.exp.dereferencer.get_value(field), f_high, f_low)) + 
else: + value = do_slice(self.exp.dereferencer.get_value(field), f_high, f_low) + + addr = self._get_address_str(node, subword_offset=subword_idx*subword_stride) + var = self.get_readback_data_var(addr) + subword_assignments[subword_idx].append(f"{var}[{r_high}:{r_low}] = {value};") + + # advance to the next subword + subword_idx += 1 + + return subword_assignments + + + def process_wide_reg(self, node: RegNode, fields: Sequence[FieldNode], regwidth: int, accesswidth: int) -> None: + """ + Process a register whose accesswidth < regwidth + """ + subword_assignments = self.get_wide_reg_subword_assignments(node, fields, regwidth, accesswidth) + + # Add generated content, wrapped in the address conditional + subword_stride = accesswidth // 8 + for subword_idx, assignments in enumerate(subword_assignments): + if not assignments: + continue + addr = self._get_address_str(node, subword_offset=subword_idx*subword_stride) + conditional = self.get_addr_compare_conditional(addr) + self.add_content(f"if({conditional}) begin") + for assignment in assignments: + self.add_content(" " + assignment) + self.add_content("end") + + + def exit_AddressableComponent(self, node: AddressableNode) -> None: + super().exit_AddressableComponent(node) + + if not node.array_dimensions: + return + + for _ in node.array_dimensions: + self._array_stride_stack.pop() + + +class RetimedReadbackMuxGenerator(ReadbackMuxGenerator): + """ + Alternate variant that is dedicated to building the 1st decode stage + """ + + def process_external_block(self, node: AddressableNode) -> None: + # Do nothing. 
External blocks are handled in a completely separate readback mux + pass + + def get_addr_compare_conditional(self, addr: str) -> str: + # In the pipelined variant, compare the low-bits of both sides + return f"ad_low(rd_mux_addr) == ad_low({addr})" + + def get_readback_data_var(self, addr: str) -> str: + # In the pipelined variant, assign to the bin indexed by the high bits of addr + return f"readback_data_var[ad_hi({addr})]" + + +class RetimedExtBlockReadbackMuxGenerator(ReadbackMuxGenerator): + """ + When retiming is enabled, external blocks are implemented as a separate + readback mux that is not retimed using a partitioned address. + + This is because the address partitioning scheme used for individual register + addresses does not work cleanly for address ranges. (not possible to cleanly + map readback of a range to high-address data bins) + + Since the non-retimed mux generator already implements external ranges, + re-use it and suppress generation of register logic. + """ + + def enter_Reg(self, node: RegNode) -> WalkerAction: + return WalkerAction.SkipDescendants + + def process_external_block(self, node: AddressableNode) -> None: + addr_lo = self._get_address_str(node) + addr_hi = f"{addr_lo} + {SVInt(node.size - 1, self.exp.ds.addr_width)}" + self.add_content(f"if((rd_mux_addr >= {addr_lo}) && (rd_mux_addr <= {addr_hi})) begin") + data = self.exp.hwif.get_external_rd_data(node) + self.add_content(f" readback_data_var = {data};") + self.add_content(" is_external_block_var = 1'b1;") + self.add_content("end") diff --git a/src/peakrdl_regblock/readback/templates/empty_readback.sv b/src/peakrdl_regblock/readback/templates/empty_readback.sv new file mode 100644 index 0000000..f493b83 --- /dev/null +++ b/src/peakrdl_regblock/readback/templates/empty_readback.sv @@ -0,0 +1,7 @@ +assign readback_done = decoded_req & ~decoded_req_is_wr; +assign readback_data = '0; +{%- if ds.err_if_bad_addr or ds.err_if_bad_rw %} +assign readback_err = decoded_err; +{%- else %}
+assign readback_err = '0; +{%- endif %} diff --git a/src/peakrdl_regblock/readback/templates/readback.sv b/src/peakrdl_regblock/readback/templates/readback.sv deleted file mode 100644 index 08ed492..0000000 --- a/src/peakrdl_regblock/readback/templates/readback.sv +++ /dev/null @@ -1,94 +0,0 @@ -{% if array_assignments is not none %} -// Assign readback values to a flattened array -logic [{{cpuif.data_width-1}}:0] readback_array[{{array_size}}]; -{{array_assignments}} - - -{%- if ds.retime_read_fanin %} - -// fanin stage -logic [{{cpuif.data_width-1}}:0] readback_array_c[{{fanin_array_size}}]; -for(genvar g=0; g<{{fanin_loop_iter}}; g++) begin - always_comb begin - automatic logic [{{cpuif.data_width-1}}:0] readback_data_var; - readback_data_var = '0; - for(int i=g*{{fanin_stride}}; i<((g+1)*{{fanin_stride}}); i++) readback_data_var |= readback_array[i]; - readback_array_c[g] = readback_data_var; - end -end -{%- if fanin_residual_stride == 1 %} -assign readback_array_c[{{fanin_array_size-1}}] = readback_array[{{array_size-1}}]; -{%- elif fanin_residual_stride > 1 %} -always_comb begin - automatic logic [{{cpuif.data_width-1}}:0] readback_data_var; - readback_data_var = '0; - for(int i={{(fanin_array_size-1) * fanin_stride}}; i<{{array_size}}; i++) readback_data_var |= readback_array[i]; - readback_array_c[{{fanin_array_size-1}}] = readback_data_var; -end -{%- endif %} - -logic [{{cpuif.data_width-1}}:0] readback_array_r[{{fanin_array_size}}]; -logic readback_done_r; -logic readback_err_r; -always_ff {{get_always_ff_event(cpuif.reset)}} begin - if({{get_resetsignal(cpuif.reset)}}) begin - for(int i=0; i<{{fanin_array_size}}; i++) readback_array_r[i] <= '0; - readback_done_r <= '0; - readback_err_r <= '0; - end else begin - readback_array_r <= readback_array_c; - readback_err_r <= decoded_err; - {%- if ds.has_external_addressable %} - readback_done_r <= decoded_req & ~decoded_req_is_wr & ~decoded_strb_is_external; - {%- else %} - readback_done_r <= decoded_req & 
~decoded_req_is_wr; - {%- endif %} - end -end - -// Reduce the array -always_comb begin - automatic logic [{{cpuif.data_width-1}}:0] readback_data_var; - readback_done = readback_done_r; - {%- if ds.err_if_bad_addr or ds.err_if_bad_rw %} - readback_err = readback_err_r; - {%- else %} - readback_err = '0; - {%- endif %} - readback_data_var = '0; - for(int i=0; i<{{fanin_array_size}}; i++) readback_data_var |= readback_array_r[i]; - readback_data = readback_data_var; -end - -{%- else %} - -// Reduce the array -always_comb begin - automatic logic [{{cpuif.data_width-1}}:0] readback_data_var; - {%- if ds.has_external_addressable %} - readback_done = decoded_req & ~decoded_req_is_wr & ~decoded_strb_is_external; - {%- else %} - readback_done = decoded_req & ~decoded_req_is_wr; - {%- endif %} - {%- if ds.err_if_bad_addr or ds.err_if_bad_rw %} - readback_err = decoded_err; - {%- else %} - readback_err = '0; - {%- endif %} - readback_data_var = '0; - for(int i=0; i<{{array_size}}; i++) readback_data_var |= readback_array[i]; - readback_data = readback_data_var; -end -{%- endif %} - - - -{%- else %} -assign readback_done = decoded_req & ~decoded_req_is_wr; -assign readback_data = '0; -{%- if ds.err_if_bad_addr or ds.err_if_bad_rw %} -assign readback_err = decoded_err; -{%- else %} -assign readback_err = '0; -{%- endif %} -{% endif %} diff --git a/src/peakrdl_regblock/readback/templates/readback_no_rt.sv b/src/peakrdl_regblock/readback/templates/readback_no_rt.sv new file mode 100644 index 0000000..02d2cc2 --- /dev/null +++ b/src/peakrdl_regblock/readback/templates/readback_no_rt.sv @@ -0,0 +1,17 @@ +always_comb begin + automatic logic [{{cpuif.data_width-1}}:0] readback_data_var; + readback_data_var = '0; + {{readback_mux|indent}} + readback_data = readback_data_var; + + {%- if ds.has_external_addressable %} + readback_done = decoded_req & ~decoded_req_is_wr & ~decoded_req_is_external; + {%- else %} + readback_done = decoded_req & ~decoded_req_is_wr; + {%- endif %} + {%- if 
ds.err_if_bad_addr or ds.err_if_bad_rw %} + readback_err = decoded_err; + {%- else %} + readback_err = '0; + {%- endif %} +end diff --git a/src/peakrdl_regblock/readback/templates/readback_with_rt.sv b/src/peakrdl_regblock/readback/templates/readback_with_rt.sv new file mode 100644 index 0000000..28fe077 --- /dev/null +++ b/src/peakrdl_regblock/readback/templates/readback_with_rt.sv @@ -0,0 +1,82 @@ +function automatic bit [{{low_addr_width-1}}:0] ad_low(bit [{{ds.addr_width-1}}:0] addr); + return addr[{{low_addr_width-1}}:0]; +endfunction +function automatic bit [{{high_addr_width-1}}:0] ad_hi(bit [{{ds.addr_width-1}}:0] addr); + return addr[{{ds.addr_width-1}}:{{low_addr_width}}]; +endfunction + +// readback stage 1 +logic [{{cpuif.data_width-1}}:0] readback_data_rt_c[{{2 ** high_addr_width}}]; +always_comb begin + automatic logic [{{cpuif.data_width-1}}:0] readback_data_var[{{2 ** high_addr_width}}]; + for(int i=0; i<{{2 ** high_addr_width}}; i++) readback_data_var[i] = '0; + {{readback_mux|indent}} + readback_data_rt_c = readback_data_var; +end + +logic [{{cpuif.data_width-1}}:0] readback_data_rt[{{2 ** high_addr_width}}]; +logic readback_done_rt; +logic readback_err_rt; +logic [{{ds.addr_width-1}}:0] readback_addr_rt; +always_ff {{get_always_ff_event(cpuif.reset)}} begin + if({{get_resetsignal(cpuif.reset)}}) begin + for(int i=0; i<{{2 ** high_addr_width}}; i++) readback_data_rt[i] <= '0; + readback_done_rt <= '0; + readback_err_rt <= '0; + readback_addr_rt <= '0; + end else begin + readback_data_rt <= readback_data_rt_c; + readback_err_rt <= decoded_err; + {%- if ds.has_external_addressable %} + readback_done_rt <= decoded_req & ~decoded_req_is_wr & ~decoded_req_is_external; + {%- else %} + readback_done_rt <= decoded_req & ~decoded_req_is_wr; + {%- endif %} + readback_addr_rt <= rd_mux_addr; + end +end + +{% if ds.has_external_block %} +logic [{{cpuif.data_width-1}}:0] readback_ext_block_data_rt_c; +logic readback_is_ext_block_c; +always_comb begin + 
automatic logic [{{cpuif.data_width-1}}:0] readback_data_var; + automatic logic is_external_block_var; + readback_data_var = '0; + is_external_block_var = '0; + {{ext_block_readback_mux|indent}} + readback_ext_block_data_rt_c = readback_data_var; + readback_is_ext_block_c = is_external_block_var; +end + +logic [{{cpuif.data_width-1}}:0] readback_ext_block_data_rt; +logic readback_is_ext_block; +always_ff {{get_always_ff_event(cpuif.reset)}} begin + if({{get_resetsignal(cpuif.reset)}}) begin + readback_ext_block_data_rt <= '0; + readback_is_ext_block <= '0; + end else begin + readback_ext_block_data_rt <= readback_ext_block_data_rt_c; + readback_is_ext_block <= readback_is_ext_block_c; + end +end +{% endif %} + +// readback stage 2 +always_comb begin + {%- if ds.has_external_block %} + if(readback_is_ext_block) begin + readback_data = readback_ext_block_data_rt; + end else begin + readback_data = readback_data_rt[readback_addr_rt[{{ds.addr_width-1}}:{{low_addr_width}}]]; + end + {%- else %} + readback_data = readback_data_rt[readback_addr_rt[{{ds.addr_width-1}}:{{low_addr_width}}]]; + {%- endif %} + readback_done = readback_done_rt; + {%- if ds.err_if_bad_addr or ds.err_if_bad_rw %} + readback_err = readback_err_rt; + {%- else %} + readback_err = '0; + {%- endif %} +end diff --git a/tests/README.md b/tests/README.md index c20ca4e..58d32f5 100644 --- a/tests/README.md +++ b/tests/README.md @@ -6,10 +6,10 @@ Testcases require an installation of the Questa simulator, and for `vlog` & `vsim` commands to be visible via the PATH environment variable. -*Questa - Intel FPGA Starter Edition* can be downloaded for free from Intel: -* Go to https://www.intel.com/content/www/us/en/collections/products/fpga/software/downloads.html?edition=pro&q=questa&s=Relevancy -* Select latest version of Questa -* Download Questa files. 
+*Questa-Altera FPGA and Starter Edition* can be downloaded for free from Altera: +* Go to https://www.altera.com/downloads +* Select "Simulation Tools" +* Download Questa * Install * Be sure to choose "Starter Edition" for the free version. * Create an account on https://licensing.intel.com @@ -18,7 +18,7 @@ commands to be visible via the PATH environment variable. * Go to https://licensing.intel.com/psg/s/sales-signup-evaluationlicenses * Generate a free *Starter Edition* license file for Questa * Easiest to use a *fixed* license using your NIC ID (MAC address of your network card via `ifconfig`) -* Download the license file and point the `LM_LICENSE_FILE` environment variable to the folder which contains it. +* Download the license file and point the `LM_LICENSE_FILE` environment variable to the folder which contains it. In newer versions of Questa, use the `SALT_LICENSE_SERVER` environment variable instead. * (optional) Delete Intel libraries to save some disk space * Delete `/questa_fse/intel` * Edit `/questa_fse/modelsim.ini` and remove lines that reference the `intel` libraries diff --git a/tests/test_only_external_blocks/__init__.py b/tests/test_only_external_blocks/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_only_external_blocks/regblock.rdl b/tests/test_only_external_blocks/regblock.rdl new file mode 100644 index 0000000..be6e664 --- /dev/null +++ b/tests/test_only_external_blocks/regblock.rdl @@ -0,0 +1,11 @@ +addrmap top { + mem ext_mem #( + longint SIZE = 0x100 + ) { + memwidth = 32; + mementries = SIZE / 4; + }; + + external ext_mem #(.SIZE(0x10)) mem1 @ 0x0000; + external ext_mem #(.SIZE(0x90)) mem2 @ 0x0200; +}; diff --git a/tests/test_only_external_blocks/tb_template.sv b/tests/test_only_external_blocks/tb_template.sv new file mode 100644 index 0000000..bf8c2cb --- /dev/null +++ b/tests/test_only_external_blocks/tb_template.sv @@ -0,0 +1,115 @@ +{% extends "lib/tb_base.sv" %} + + + +{%- block dut_support %} + {% 
sv_line_anchor %} + + external_block #( + .ADDR_WIDTH($clog2('h10)) + ) mem1_inst ( + .clk(clk), + .rst(rst), + + .req(hwif_out.mem1.req), + .req_is_wr(hwif_out.mem1.req_is_wr), + .addr(hwif_out.mem1.addr), + .wr_data(hwif_out.mem1.wr_data), + .wr_biten(hwif_out.mem1.wr_biten), + .rd_ack(hwif_in.mem1.rd_ack), + .rd_data(hwif_in.mem1.rd_data), + .wr_ack(hwif_in.mem1.wr_ack) + ); + + external_block #( + .ADDR_WIDTH($clog2('h90)) + ) mem2_inst ( + .clk(clk), + .rst(rst), + + .req(hwif_out.mem2.req), + .req_is_wr(hwif_out.mem2.req_is_wr), + .addr(hwif_out.mem2.addr), + .wr_data(hwif_out.mem2.wr_data), + .wr_biten(hwif_out.mem2.wr_biten), + .rd_ack(hwif_in.mem2.rd_ack), + .rd_data(hwif_in.mem2.rd_data), + .wr_ack(hwif_in.mem2.wr_ack) + ); + +{%- endblock %} + + + +{% block seq %} + {% sv_line_anchor %} + ##1; + cb.rst <= '0; + ##1; + + //-------------------------------------------------------------------------- + // Simple read/write tests + //-------------------------------------------------------------------------- + // mem1 + repeat(32) begin + logic [31:0] x; + int unsigned addr; + x = $urandom(); + addr = 'h0; + addr += $urandom_range(('h10 / 4) - 1) * 4; + cpuif.write(addr, x); + cpuif.assert_read(addr, x); + end + + // mem2 + repeat(32) begin + logic [31:0] x; + int unsigned addr; + x = $urandom(); + addr = 'h200; + addr += $urandom_range(('h90 / 4) - 1) * 4; + cpuif.write(addr, x); + cpuif.assert_read(addr, x); + end + + //-------------------------------------------------------------------------- + // Pipelined access + //-------------------------------------------------------------------------- + // init array with unique known value + for(int i=0; i<('h10 / 4); i++) begin + cpuif.write('h0 + i*4, 'h1000 + i); + end + for(int i=0; i<('h90 / 4); i++) begin + cpuif.write('h200 + i*4, 'h3000 + i); + end + + // random pipelined read/writes + repeat(256) begin + fork + begin + int i; + logic [31:0] x; + int unsigned addr; + case($urandom_range(1)) + 0: begin + i = 
$urandom_range(('h10 / 4) - 1); + x = 'h1000 + i; + addr = 'h0 + i*4; + end + 1: begin + i = $urandom_range(('h90 / 4) - 1); + x = 'h3000 + i; + addr = 'h200 + i*4; + end + endcase + + case($urandom_range(1)) + 0: cpuif.write(addr, x); + 1: cpuif.assert_read(addr, x); + endcase + end + join_none + end + wait fork; + +{% endblock %} diff --git a/tests/test_only_external_blocks/testcase.py b/tests/test_only_external_blocks/testcase.py new file mode 100644 index 0000000..2486293 --- /dev/null +++ b/tests/test_only_external_blocks/testcase.py @@ -0,0 +1,29 @@ +from parameterized import parameterized_class + +from ..lib.cpuifs.apb4 import APB4 +from ..lib.cpuifs.axi4lite import AXI4Lite +from ..lib.cpuifs.passthrough import Passthrough +from ..lib.sim_testcase import SimTestCase +from ..lib.test_params import get_permutation_class_name, get_permutations + + +@parameterized_class(get_permutations({ + "cpuif": [ + APB4(), + Passthrough(), + ], + "retime_read_fanin": [True, False], + "retime_read_response": [True, False], + "retime_external": [True, False], +}), class_name_func=get_permutation_class_name) +class Test(SimTestCase): + extra_tb_files = [ + "../lib/external_reg.sv", + "../lib/external_block.sv", + ] + init_hwif_in = False + clocking_hwif_in = False + timeout_clk_cycles = 30000 + + def test_dut(self): + self.run_test()