diff --git a/docs/architecture.rst b/docs/architecture.rst
index daa88f8..4a0b74c 100644
--- a/docs/architecture.rst
+++ b/docs/architecture.rst
@@ -38,18 +38,15 @@ This section also assigns any hardware interface outputs.
Readback
--------
-The readback layer aggregates and reduces all readable registers into a single
-read response. During a read operation, the same address decode strobes are used
-to select the active register that is being accessed.
-This allows for a simple OR-reduction operation to be used to compute the read
-data response.
+The readback layer aggregates and MUXes all readable registers into a single
+read response.
For designs with a large number of software-readable registers, an optional
fanin re-timing stage can be enabled. This stage is automatically inserted at a
balanced point in the read-data reduction so that fanin and logic-levels are
optimally reduced.
-.. figure:: diagrams/readback.png
+.. figure:: diagrams/rt-readback-fanin.png
:width: 65%
:align: center
diff --git a/docs/dev_notes/Alpha-Beta Versioning b/docs/dev_notes/Alpha-Beta Versioning
deleted file mode 100644
index c8a089d..0000000
--- a/docs/dev_notes/Alpha-Beta Versioning
+++ /dev/null
@@ -1,10 +0,0 @@
-Holy smokes this is complicated
-
-Keep this exporter in Alpha/Beta for a while
-Add some text in the readme or somewhere:
- - No guarantees of correctness! This is always true with open source software,
- but even more here!
- Be sure to do your own validation before using this in production.
- - Alpha means the implementation may change drastically!
- Unlike official sem-ver, I am not making any guarantees on compatibility
- - I need your help! Validating, finding edge cases, etc...
diff --git a/docs/dev_notes/template-layers/5-readback-mux b/docs/dev_notes/template-layers/5-readback-mux
index fa53102..7e9702c 100644
--- a/docs/dev_notes/template-layers/5-readback-mux
+++ b/docs/dev_notes/template-layers/5-readback-mux
@@ -1,35 +1,84 @@
--------------------------------------------------------------------------------
Readback mux layer
--------------------------------------------------------------------------------
+Use a large always_comb block + many if statements that select the read data
+based on the cpuif address.
+Loops are handled the same way as address decode.
-Implementation:
- - Big always_comb block
- - Initialize default rd_data value
- - Lotsa if statements that operate on reg strb to assign rd_data
- - Merges all fields together into reg
- - pulls value from storage element struct, or input struct
- - Provision for optional flop stage?
+Other options that were considered:
+ - Flat case statement
+ con: Difficult to represent arrays. Essentially requires unrolling
+ con: complicates retiming strategies
+ con: Representing a range (required for externals) is cumbersome. Possible with stacked casez wildcards.
+ - AND field data with strobe, then massive OR reduce
+ This was the strategy prior to v1.3, but turned out to infer more overhead
+ than originally anticipated
+ - Assigning data to a flat register array, then directly indexing via address
+ con: Would work fine, but scales poorly for sparse regblocks.
+ Namely, simulators would likely allocate memory for the entire array
+ - Assign to a flat array that is packed sequentially, then directly index it using a derived packed index
+ Concern that for sparse regfiles, the translation of addr --> packed index
+ becomes a nontrivial logic function
-Mux Strategy:
- Flat case statement:
- -- Cant parameterize
- + better performance?
+Pros:
+ - Scales well for arrays since loops can be used
+ - Externals work well, as address ranges can be compared
+ - Synthesis results show more efficient logic inference
- Flat 1-hot array then OR reduce:
- - Create a bus-wide flat array
- eg: 32-bits x N readable registers
- - Assign each element:
- the readback value of each register
- ... masked by the register's access strobe
- - I could also stuff an extra bit into the array that denotes the read is valid
- A missed read will OR reduce down to a 0
- - Finally, OR reduce all the elements in the array down to a flat 32-bit bus
- - Retiming the large OR fanin can be done by chopping up the array into stages
- for 2 stages, sqrt(N) gives each stage's fanin size. Round to favor
- more fanin on 2nd stage
- 3 stages uses cube-root. etc...
- - This has the benefit of re-using the address decode logic.
- synth can choose to replicate logic if fanout is bad
+Example:
+ logic [7:0] out;
+ always_comb begin
+ out = '0;
+ for(int i=0; i<64; i++) begin
+ if(i == addr) out = data[i];
+ end
+ end
+
+
+How to implement retiming:
+ Ideally this would partition the design into several equal sub-regions, but
+ with loop structures, this is pretty difficult.
+ What if instead, it is partitioned into equal address ranges?
+
+ First stage compares the lower-half of the address bits.
+ Values are assigned to the appropriate output "bin"
+
+ logic [7:0] out[8];
+ always_comb begin
+ for(int i=0; i<8; i++) out[i] = '0;
+
+ for(int i=0; i<64; i++) begin
+ automatic bit [5:0] this_addr = i;
+
+ if(this_addr[2:0] == addr[2:0]) out[this_addr[5:3]] = data[i];
+ end
+ end
+
+ (not showing retiming ff for `out` and `addr`)
+ The second stage muxes down the resulting bins using the high address bits.
+ If the user up-sizes the address bits, the upper bits need to be checked to prevent aliasing.
+ Assuming min address bit range is [5:0], but it was padded up to [8:0], do the following:
+
+ logic [7:0] rd_data;
+ always_comb begin
+ if(addr[8:6] != '0) begin
+ // Invalid read range
+ rd_data = '0;
+ end else begin
+ rd_data = out[addr[5:3]];
+ end
+ end
+
+Retiming with external blocks
+ One minor downside is that the above scheme does not work well for external blocks
+ that span a range of addresses. Depending on the range, a block may span multiple
+ retiming bins, which complicates how it would be assigned cleanly.
+ This would be complicated even further with arrays of externals since the
+ span of bins could change depending on the iteration.
+
+ Since externals can already be retimed, and large fanin of external blocks
+ is likely less of a concern, implement these as a separate readback mux on
+ the side that does not get retimed at all.
WARNING:
@@ -42,8 +91,14 @@ WARNING:
Forwards response strobe back up to cpu interface layer
-TODO:
- Dont forget about alias registers here
-TODO:
- Does the endinness the user sets matter anywhere?
+Variables:
+ From decode:
+ decoded_addr
+ decoded_req
+ decoded_req_is_wr
+
+ Response:
+ readback_done
+ readback_err
+ readback_data
diff --git a/docs/diagrams/diagrams.odg b/docs/diagrams/diagrams.odg
index 5b42eb6..dd6fab7 100644
Binary files a/docs/diagrams/diagrams.odg and b/docs/diagrams/diagrams.odg differ
diff --git a/docs/diagrams/readback.png b/docs/diagrams/readback.png
deleted file mode 100644
index 9508650..0000000
Binary files a/docs/diagrams/readback.png and /dev/null differ
diff --git a/docs/diagrams/rt-readback-fanin.drawio b/docs/diagrams/rt-readback-fanin.drawio
new file mode 100644
index 0000000..3d591d8
--- /dev/null
+++ b/docs/diagrams/rt-readback-fanin.drawio
@@ -0,0 +1,242 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/docs/diagrams/rt-readback-fanin.png b/docs/diagrams/rt-readback-fanin.png
new file mode 100644
index 0000000..e05530b
Binary files /dev/null and b/docs/diagrams/rt-readback-fanin.png differ
diff --git a/pyproject.toml b/pyproject.toml
index dd15049..951b88b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,7 +7,7 @@ name = "peakrdl-regblock"
dynamic = ["version"]
requires-python = ">=3.7"
dependencies = [
- "systemrdl-compiler ~= 1.31",
+ "systemrdl-compiler ~= 1.32",
"Jinja2 >= 2.11",
]
diff --git a/src/peakrdl_regblock/addr_decode.py b/src/peakrdl_regblock/addr_decode.py
index 7617a44..7f79613 100644
--- a/src/peakrdl_regblock/addr_decode.py
+++ b/src/peakrdl_regblock/addr_decode.py
@@ -133,8 +133,9 @@ class DecodeLogicGenerator(RDLForLoopGenerator):
self._array_stride_stack = [] # type: List[int]
def _add_addressablenode_decoding_flags(self, node: 'AddressableNode') -> None:
- addr_str = self._get_address_str(node)
- addr_decoding_str = f"cpuif_req_masked & (cpuif_addr >= {addr_str}) & (cpuif_addr <= {addr_str} + {SVInt(node.size - 1, self.addr_decode.exp.ds.addr_width)})"
+ addr_lo = self._get_address_str(node)
+ addr_hi = f"{addr_lo} + {SVInt(node.size - 1, self.addr_decode.exp.ds.addr_width)}"
+ addr_decoding_str = f"cpuif_req_masked & (cpuif_addr >= {addr_lo}) & (cpuif_addr <= {addr_hi})"
rhs = addr_decoding_str
rhs_valid_addr = addr_decoding_str
if isinstance(node, MemNode):
diff --git a/src/peakrdl_regblock/exporter.py b/src/peakrdl_regblock/exporter.py
index ba4d2d6..c6e068e 100644
--- a/src/peakrdl_regblock/exporter.py
+++ b/src/peakrdl_regblock/exporter.py
@@ -165,12 +165,6 @@ class RegblockExporter:
# Validate that there are no unsupported constructs
DesignValidator(self).do_validate()
- # Compute readback implementation early.
- # Readback has the capability to disable retiming if the fanin is tiny.
- # This affects the rest of the design's implementation, and must be known
- # before any other templates are rendered
- readback_implementation = self.readback.get_implementation()
-
# Build Jinja template context
context = {
"cpuif": self.cpuif,
@@ -184,7 +178,7 @@ class RegblockExporter:
"default_resetsignal_name": self.dereferencer.default_resetsignal_name,
"address_decode": self.address_decode,
"field_logic": self.field_logic,
- "readback_implementation": readback_implementation,
+ "readback_implementation": self.readback.get_implementation(),
"ext_write_acks": ext_write_acks,
"ext_read_acks": ext_read_acks,
"parity": parity,
@@ -319,6 +313,10 @@ class DesignState:
)
self.cpuif_data_width = 32
+ # Also, to avoid silly edge cases, disable read fanin retiming since
+ # it has little benefit anyway
+ self.retime_read_fanin = False
+
#------------------------
# Min address width encloses the total size AND at least 1 useful address bit
self.addr_width = max(clog2(self.top_node.size), clog2(self.cpuif_data_width//8) + 1)
@@ -328,6 +326,15 @@ class DesignState:
msg.fatal(f"User-specified address width shall be greater than or equal to {self.addr_width}.")
self.addr_width = user_addr_width
+ if self.retime_read_fanin:
+ # Check if address width is sufficient to even bother with read fanin retiming
+ data_width_bytes = self.cpuif_data_width // 8
+ unused_low_addr_bits = clog2(data_width_bytes)
+ relevant_addr_width = self.addr_width - unused_low_addr_bits
+ if relevant_addr_width < 2:
+ # Unable to partition the address space. Disable retiming
+ self.retime_read_fanin = False
+
@property
def min_read_latency(self) -> int:
n = 0
diff --git a/src/peakrdl_regblock/module_tmpl.sv b/src/peakrdl_regblock/module_tmpl.sv
index 1c15395..5ec9666 100644
--- a/src/peakrdl_regblock/module_tmpl.sv
+++ b/src/peakrdl_regblock/module_tmpl.sv
@@ -30,24 +30,7 @@ module {{ds.module_name}}
logic cpuif_req_masked;
{%- if ds.has_external_addressable %}
- logic external_req;
logic external_pending;
- logic external_wr_ack;
- logic external_rd_ack;
- always_ff {{get_always_ff_event(cpuif.reset)}} begin
- if({{get_resetsignal(cpuif.reset)}}) begin
- external_pending <= '0;
- end else begin
- if(external_req & ~external_wr_ack & ~external_rd_ack) external_pending <= '1;
- else if(external_wr_ack | external_rd_ack) external_pending <= '0;
- `ifndef SYNTHESIS
- assert_bad_ext_wr_ack: assert(!external_wr_ack || (external_pending | external_req))
- else $error("An external wr_ack strobe was asserted when no external request was active");
- assert_bad_ext_rd_ack: assert(!external_rd_ack || (external_pending | external_req))
- else $error("An external rd_ack strobe was asserted when no external request was active");
- `endif
- end
- end
{%- endif %}
{% if ds.min_read_latency == ds.min_write_latency %}
// Read & write latencies are balanced. Stalls not required
@@ -109,11 +92,9 @@ module {{ds.module_name}}
decoded_reg_strb_t decoded_reg_strb;
logic decoded_err;
{%- if ds.has_external_addressable %}
- logic decoded_strb_is_external;
+ logic decoded_req_is_external;
{% endif %}
-{%- if ds.has_external_block %}
logic [{{cpuif.addr_width-1}}:0] decoded_addr;
-{% endif %}
logic decoded_req;
logic decoded_req_is_wr;
logic [{{cpuif.data_width-1}}:0] decoded_wr_data;
@@ -147,15 +128,31 @@ module {{ds.module_name}}
decoded_err = '0;
{%- endif %}
{%- if ds.has_external_addressable %}
- decoded_strb_is_external = is_external;
- external_req = is_external;
+ decoded_req_is_external = is_external;
{%- endif %}
end
+{%- if ds.has_external_addressable %}
+ logic external_wr_ack;
+ logic external_rd_ack;
+ always_ff {{get_always_ff_event(cpuif.reset)}} begin
+ if({{get_resetsignal(cpuif.reset)}}) begin
+ external_pending <= '0;
+ end else begin
+ if(decoded_req_is_external & ~external_wr_ack & ~external_rd_ack) external_pending <= '1;
+ else if(external_wr_ack | external_rd_ack) external_pending <= '0;
+ `ifndef SYNTHESIS
+ assert_bad_ext_wr_ack: assert(!external_wr_ack || (external_pending | decoded_req_is_external))
+ else $error("An external wr_ack strobe was asserted when no external request was active");
+ assert_bad_ext_rd_ack: assert(!external_rd_ack || (external_pending | decoded_req_is_external))
+ else $error("An external rd_ack strobe was asserted when no external request was active");
+ `endif
+ end
+ end
+{%- endif %}
+
// Pass down signals to next stage
-{%- if ds.has_external_block %}
assign decoded_addr = cpuif_addr;
-{% endif %}
assign decoded_req = cpuif_req_masked;
assign decoded_req_is_wr = cpuif_req_is_wr;
assign decoded_wr_data = cpuif_wr_data;
@@ -223,7 +220,7 @@ module {{ds.module_name}}
{{ext_write_acks.get_implementation()|indent(8)}}
external_wr_ack = wr_ack;
end
- assign cpuif_wr_ack = external_wr_ack | (decoded_req & decoded_req_is_wr & ~decoded_strb_is_external);
+ assign cpuif_wr_ack = external_wr_ack | (decoded_req & decoded_req_is_wr & ~decoded_req_is_external);
{%- else %}
assign cpuif_wr_ack = decoded_req & decoded_req_is_wr;
{%- endif %}
@@ -262,10 +259,26 @@ module {{ds.module_name}}
{%- endif %}
{%- endif %}
+ logic [{{cpuif.addr_width-1}}:0] rd_mux_addr;
+{%- if ds.has_external_addressable %}
+ logic [{{cpuif.addr_width-1}}:0] pending_rd_addr;
+ // Hold read mux address to guarantee it is stable throughout any external accesses
+ always_ff {{get_always_ff_event(cpuif.reset)}} begin
+ if({{get_resetsignal(cpuif.reset)}}) begin
+ pending_rd_addr <= '0;
+ end else begin
+ if(decoded_req) pending_rd_addr <= decoded_addr;
+ end
+ end
+ assign rd_mux_addr = decoded_req ? decoded_addr : pending_rd_addr;
+{%- else %}
+ assign rd_mux_addr = decoded_addr;
+{%- endif %}
+
logic readback_err;
logic readback_done;
logic [{{cpuif.data_width-1}}:0] readback_data;
-{{readback_implementation|indent}}
+ {{readback_implementation|indent}}
{% if ds.retime_read_response %}
always_ff {{get_always_ff_event(cpuif.reset)}} begin
if({{get_resetsignal(cpuif.reset)}}) begin
diff --git a/src/peakrdl_regblock/readback/__init__.py b/src/peakrdl_regblock/readback/__init__.py
index dafb1e0..9e3117a 100644
--- a/src/peakrdl_regblock/readback/__init__.py
+++ b/src/peakrdl_regblock/readback/__init__.py
@@ -1,72 +1 @@
-from typing import TYPE_CHECKING
-import math
-
-from .generators import ReadbackAssignmentGenerator
-
-if TYPE_CHECKING:
- from ..exporter import RegblockExporter, DesignState
- from systemrdl.node import AddrmapNode
-
-class Readback:
- def __init__(self, exp:'RegblockExporter'):
- self.exp = exp
-
- @property
- def ds(self) -> 'DesignState':
- return self.exp.ds
-
- @property
- def top_node(self) -> 'AddrmapNode':
- return self.exp.ds.top_node
-
- def get_implementation(self) -> str:
- gen = ReadbackAssignmentGenerator(self.exp)
- array_assignments = gen.get_content(self.top_node)
- array_size = gen.current_offset
-
- # Enabling the fanin stage doesnt make sense if readback fanin is
- # small. This also avoids pesky corner cases
- if array_size < 4:
- self.ds.retime_read_fanin = False
-
- context = {
- "array_assignments" : array_assignments,
- "array_size" : array_size,
- 'get_always_ff_event': self.exp.dereferencer.get_always_ff_event,
- 'get_resetsignal': self.exp.dereferencer.get_resetsignal,
- "cpuif": self.exp.cpuif,
- "ds": self.ds,
- }
-
- if self.ds.retime_read_fanin:
- # If adding a fanin pipeline stage, goal is to try to
- # split the fanin path in the middle so that fanin into the stage
- # and the following are roughly balanced.
- fanin_target = math.sqrt(array_size)
-
- # Size of fanin group to consume per fanin element
- fanin_stride = math.floor(fanin_target)
-
- # Number of array elements to reduce to.
- # Round up to an extra element in case there is some residual
- fanin_array_size = math.ceil(array_size / fanin_stride)
-
- # leftovers are handled in an extra array element
- fanin_residual_stride = array_size % fanin_stride
-
- if fanin_residual_stride != 0:
- # If there is a partial fanin element, reduce the number of
- # loops performed in the bulk fanin stage
- fanin_loop_iter = fanin_array_size - 1
- else:
- fanin_loop_iter = fanin_array_size
-
- context['fanin_stride'] = fanin_stride
- context['fanin_array_size'] = fanin_array_size
- context['fanin_residual_stride'] = fanin_residual_stride
- context['fanin_loop_iter'] = fanin_loop_iter
-
- template = self.exp.jj_env.get_template(
- "readback/templates/readback.sv"
- )
- return template.render(context)
+from .readback import Readback
diff --git a/src/peakrdl_regblock/readback/generators.py b/src/peakrdl_regblock/readback/generators.py
deleted file mode 100644
index 87f2969..0000000
--- a/src/peakrdl_regblock/readback/generators.py
+++ /dev/null
@@ -1,381 +0,0 @@
-from typing import TYPE_CHECKING, List
-
-from systemrdl.node import RegNode, AddressableNode
-from systemrdl.walker import WalkerAction
-
-from ..forloop_generator import RDLForLoopGenerator, LoopBody
-
-from ..utils import do_bitswap, do_slice
-
-if TYPE_CHECKING:
- from ..exporter import RegblockExporter
-
-class ReadbackLoopBody(LoopBody):
- def __init__(self, dim: int, iterator: str, i_type: str) -> None:
- super().__init__(dim, iterator, i_type)
- self.n_regs = 0
-
- def __str__(self) -> str:
- # replace $i#sz token when stringifying
- s = super().__str__()
- token = f"${self.iterator}sz"
- s = s.replace(token, str(self.n_regs))
- return s
-
-class ReadbackAssignmentGenerator(RDLForLoopGenerator):
- i_type = "genvar"
- loop_body_cls = ReadbackLoopBody
-
- def __init__(self, exp:'RegblockExporter') -> None:
- super().__init__()
- self.exp = exp
-
- # The readback array collects all possible readback values into a flat
- # array. The array width is equal to the CPUIF bus width. Each entry in
- # the array represents an aligned read access.
- self.current_offset = 0
- self.start_offset_stack = [] # type: List[int]
- self.dim_stack = [] # type: List[int]
-
- @property
- def current_offset_str(self) -> str:
- """
- Derive a string that represents the current offset being assigned.
- This consists of:
- - The current integer offset
- - multiplied index of any enclosing loop
-
- The integer offset from "current_offset" is static and is monotonically
- incremented as more register assignments are processed.
-
- The component of the offset from loops is added by multiplying the current
- loop index by the loop size.
- Since the loop's size is not known at this time, it is emitted as a
- placeholder token like: $i0sz, $i1sz, $i2sz, etc
- These tokens can be replaced once the loop body has been completed and the
- size of its contents is known.
- """
- offset_parts = []
- for i in range(self._loop_level):
- offset_parts.append(f"i{i} * $i{i}sz")
- offset_parts.append(str(self.current_offset))
- return " + ".join(offset_parts)
-
- def push_loop(self, dim: int) -> None:
- super().push_loop(dim)
- self.start_offset_stack.append(self.current_offset)
- self.dim_stack.append(dim)
-
- def pop_loop(self) -> None:
- start_offset = self.start_offset_stack.pop()
- dim = self.dim_stack.pop()
-
- # Number of registers enclosed in this loop
- n_regs = self.current_offset - start_offset
- self.current_loop.n_regs = n_regs # type: ignore
-
- super().pop_loop()
-
- # Advance current scope's offset to account for loop's contents
- self.current_offset = start_offset + n_regs * dim
-
-
- def enter_AddressableComponent(self, node: 'AddressableNode') -> WalkerAction:
- super().enter_AddressableComponent(node)
-
- if node.external and not isinstance(node, RegNode):
- # External block
- strb = self.exp.hwif.get_external_rd_ack(node)
- data = self.exp.hwif.get_external_rd_data(node)
- self.add_content(f"assign readback_array[{self.current_offset_str}] = {strb} ? {data} : '0;")
- self.current_offset += 1
- return WalkerAction.SkipDescendants
-
- return WalkerAction.Continue
-
- def enter_Reg(self, node: RegNode) -> WalkerAction:
- if not node.has_sw_readable:
- return WalkerAction.SkipDescendants
-
- if node.external:
- self.process_external_reg(node)
- return WalkerAction.SkipDescendants
-
- accesswidth = node.get_property('accesswidth')
- regwidth = node.get_property('regwidth')
- rbuf = node.get_property('buffer_reads')
- if rbuf:
- trigger = node.get_property('rbuffer_trigger')
- is_own_trigger = (isinstance(trigger, RegNode) and trigger == node)
- if is_own_trigger:
- if accesswidth < regwidth:
- self.process_buffered_reg_with_bypass(node, regwidth, accesswidth)
- else:
- # bypass cancels out. Behaves like a normal reg
- self.process_reg(node)
- else:
- self.process_buffered_reg(node, regwidth, accesswidth)
- elif accesswidth < regwidth:
- self.process_wide_reg(node, accesswidth)
- else:
- self.process_reg(node)
-
- return WalkerAction.SkipDescendants
-
- def process_external_reg(self, node: RegNode) -> None:
- strb = self.exp.hwif.get_external_rd_ack(node)
- data = self.exp.hwif.get_external_rd_data(node)
- regwidth = node.get_property('regwidth')
- if regwidth < self.exp.cpuif.data_width:
- self.add_content(f"assign readback_array[{self.current_offset_str}][{self.exp.cpuif.data_width-1}:{regwidth}] = '0;")
- self.add_content(f"assign readback_array[{self.current_offset_str}][{regwidth-1}:0] = {strb} ? {data} : '0;")
- else:
- self.add_content(f"assign readback_array[{self.current_offset_str}] = {strb} ? {data} : '0;")
-
- self.current_offset += 1
-
- def process_reg(self, node: RegNode) -> None:
- current_bit = 0
- rd_strb = f"({self.exp.dereferencer.get_access_strobe(node)} && !decoded_req_is_wr)"
- # Fields are sorted by ascending low bit
- for field in node.fields():
- if not field.is_sw_readable:
- continue
-
- # insert reserved assignment before this field if needed
- if field.low != current_bit:
- self.add_content(f"assign readback_array[{self.current_offset_str}][{field.low-1}:{current_bit}] = '0;")
-
- value = self.exp.dereferencer.get_value(field)
- if field.msb < field.lsb:
- # Field gets bitswapped since it is in [low:high] orientation
- value = do_bitswap(value)
-
- self.add_content(f"assign readback_array[{self.current_offset_str}][{field.high}:{field.low}] = {rd_strb} ? {value} : '0;")
-
- current_bit = field.high + 1
-
- # Insert final reserved assignment if needed
- bus_width = self.exp.cpuif.data_width
- if current_bit < bus_width:
- self.add_content(f"assign readback_array[{self.current_offset_str}][{bus_width-1}:{current_bit}] = '0;")
-
- self.current_offset += 1
-
-
- def process_buffered_reg(self, node: RegNode, regwidth: int, accesswidth: int) -> None:
- rbuf = self.exp.read_buffering.get_rbuf_data(node)
-
- if accesswidth < regwidth:
- # Is wide reg
- n_subwords = regwidth // accesswidth
- astrb = self.exp.dereferencer.get_access_strobe(node, reduce_substrobes=False)
- for i in range(n_subwords):
- rd_strb = f"({astrb}[{i}] && !decoded_req_is_wr)"
- bslice = f"[{(i + 1) * accesswidth - 1}:{i*accesswidth}]"
- self.add_content(f"assign readback_array[{self.current_offset_str}] = {rd_strb} ? {rbuf}{bslice} : '0;")
- self.current_offset += 1
-
- else:
- # Is regular reg
- rd_strb = f"({self.exp.dereferencer.get_access_strobe(node)} && !decoded_req_is_wr)"
- self.add_content(f"assign readback_array[{self.current_offset_str}][{regwidth-1}:0] = {rd_strb} ? {rbuf} : '0;")
-
- bus_width = self.exp.cpuif.data_width
- if regwidth < bus_width:
- self.add_content(f"assign readback_array[{self.current_offset_str}][{bus_width-1}:{regwidth}] = '0;")
-
- self.current_offset += 1
-
-
- def process_buffered_reg_with_bypass(self, node: RegNode, regwidth: int, accesswidth: int) -> None:
- """
- Special case for a buffered register when the register is its own trigger.
- First sub-word shall bypass the read buffer and assign directly.
- Subsequent subwords assign from the buffer.
- Caller guarantees this is a wide reg
- """
- astrb = self.exp.dereferencer.get_access_strobe(node, reduce_substrobes=False)
-
- # Generate assignments for first sub-word
- bidx = 0
- rd_strb = f"({astrb}[0] && !decoded_req_is_wr)"
- for field in node.fields():
- if not field.is_sw_readable:
- continue
-
- if field.low >= accesswidth:
- # field is not in this subword.
- break
-
- if bidx < field.low:
- # insert padding before
- self.add_content(f"assign readback_array[{self.current_offset_str}][{field.low - 1}:{bidx}] = '0;")
-
- if field.high >= accesswidth:
- # field gets truncated
- r_low = field.low
- r_high = accesswidth - 1
- f_low = 0
- f_high = accesswidth - 1 - field.low
-
- if field.msb < field.lsb:
- # Field gets bitswapped since it is in [low:high] orientation
- # Mirror the low/high indexes
- f_low = field.width - 1 - f_low
- f_high = field.width - 1 - f_high
- f_low, f_high = f_high, f_low
- value = do_bitswap(do_slice(self.exp.dereferencer.get_value(field), f_high, f_low))
- else:
- value = do_slice(self.exp.dereferencer.get_value(field), f_high, f_low)
-
- self.add_content(f"assign readback_array[{self.current_offset_str}][{r_high}:{r_low}] = {rd_strb} ? {value} : '0;")
- bidx = accesswidth
- else:
- # field fits in subword
- value = self.exp.dereferencer.get_value(field)
- if field.msb < field.lsb:
- # Field gets bitswapped since it is in [low:high] orientation
- value = do_bitswap(value)
- self.add_content(f"assign readback_array[{self.current_offset_str}][{field.high}:{field.low}] = {rd_strb} ? {value} : '0;")
- bidx = field.high + 1
-
- # pad up remainder of subword
- if bidx < accesswidth:
- self.add_content(f"assign readback_array[{self.current_offset_str}][{accesswidth-1}:{bidx}] = '0;")
- self.current_offset += 1
-
- # Assign remainder of subwords from read buffer
- n_subwords = regwidth // accesswidth
- rbuf = self.exp.read_buffering.get_rbuf_data(node)
- for i in range(1, n_subwords):
- rd_strb = f"({astrb}[{i}] && !decoded_req_is_wr)"
- bslice = f"[{(i + 1) * accesswidth - 1}:{i*accesswidth}]"
- self.add_content(f"assign readback_array[{self.current_offset_str}] = {rd_strb} ? {rbuf}{bslice} : '0;")
- self.current_offset += 1
-
- def process_wide_reg(self, node: RegNode, accesswidth: int) -> None:
- bus_width = self.exp.cpuif.data_width
-
- subword_idx = 0
- current_bit = 0 # Bit-offset within the wide register
- access_strb = self.exp.dereferencer.get_access_strobe(node, reduce_substrobes=False)
- # Fields are sorted by ascending low bit
- for field in node.fields():
- if not field.is_sw_readable:
- continue
-
- # insert zero assignment before this field if needed
- if field.low >= accesswidth*(subword_idx+1):
- # field does not start in this subword
- if current_bit > accesswidth * subword_idx:
- # current subword had content. Assign remainder
- low = current_bit % accesswidth
- high = bus_width - 1
- self.add_content(f"assign readback_array[{self.current_offset_str}][{high}:{low}] = '0;")
- self.current_offset += 1
-
- # Advance to subword that contains the start of the field
- subword_idx = field.low // accesswidth
- current_bit = accesswidth * subword_idx
-
- if current_bit != field.low:
- # assign zero up to start of this field
- low = current_bit % accesswidth
- high = (field.low % accesswidth) - 1
- self.add_content(f"assign readback_array[{self.current_offset_str}][{high}:{low}] = '0;")
- current_bit = field.low
-
-
- # Assign field
- # loop until the entire field's assignments have been generated
- field_pos = field.low
- while current_bit <= field.high:
- # Assign the field
- rd_strb = f"({access_strb}[{subword_idx}] && !decoded_req_is_wr)"
- if (field_pos == field.low) and (field.high < accesswidth*(subword_idx+1)):
- # entire field fits into this subword
- low = field.low - accesswidth * subword_idx
- high = field.high - accesswidth * subword_idx
-
- value = self.exp.dereferencer.get_value(field)
- if field.msb < field.lsb:
- # Field gets bitswapped since it is in [low:high] orientation
- value = do_bitswap(value)
-
- self.add_content(f"assign readback_array[{self.current_offset_str}][{high}:{low}] = {rd_strb} ? {value} : '0;")
-
- current_bit = field.high + 1
-
- if current_bit == accesswidth*(subword_idx+1):
- # Field ends at the subword boundary
- subword_idx += 1
- self.current_offset += 1
- elif field.high >= accesswidth*(subword_idx+1):
- # only a subset of the field can fit into this subword
- # high end gets truncated
-
- # assignment slice
- r_low = field_pos - accesswidth * subword_idx
- r_high = accesswidth - 1
-
- # field slice
- f_low = field_pos - field.low
- f_high = accesswidth * (subword_idx + 1) - 1 - field.low
-
- if field.msb < field.lsb:
- # Field gets bitswapped since it is in [low:high] orientation
- # Mirror the low/high indexes
- f_low = field.width - 1 - f_low
- f_high = field.width - 1 - f_high
- f_low, f_high = f_high, f_low
-
- value = do_bitswap(do_slice(self.exp.dereferencer.get_value(field), f_high, f_low))
- else:
- value = do_slice(self.exp.dereferencer.get_value(field), f_high, f_low)
-
- self.add_content(f"assign readback_array[{self.current_offset_str}][{r_high}:{r_low}] = {rd_strb} ? {value} : '0;")
-
- # advance to the next subword
- subword_idx += 1
- current_bit = accesswidth * subword_idx
- field_pos = current_bit
- self.current_offset += 1
- else:
- # only a subset of the field can fit into this subword
- # finish field
-
- # assignment slice
- r_low = field_pos - accesswidth * subword_idx
- r_high = field.high - accesswidth * subword_idx
-
- # field slice
- f_low = field_pos - field.low
- f_high = field.high - field.low
-
- if field.msb < field.lsb:
- # Field gets bitswapped since it is in [low:high] orientation
- # Mirror the low/high indexes
- f_low = field.width - 1 - f_low
- f_high = field.width - 1 - f_high
- f_low, f_high = f_high, f_low
-
- value = do_bitswap(do_slice(self.exp.dereferencer.get_value(field), f_high, f_low))
- else:
- value = do_slice(self.exp.dereferencer.get_value(field), f_high, f_low)
-
- self.add_content(f"assign readback_array[{self.current_offset_str}][{r_high}:{r_low}] = {rd_strb} ? {value} : '0;")
-
- current_bit = field.high + 1
- if current_bit == accesswidth*(subword_idx+1):
- # Field ends at the subword boundary
- subword_idx += 1
- self.current_offset += 1
-
- # insert zero assignment after the last field if needed
- if current_bit > accesswidth * subword_idx:
- # current subword had content. Assign remainder
- low = current_bit % accesswidth
- high = bus_width - 1
- self.add_content(f"assign readback_array[{self.current_offset_str}][{high}:{low}] = '0;")
- self.current_offset += 1
diff --git a/src/peakrdl_regblock/readback/readback.py b/src/peakrdl_regblock/readback/readback.py
new file mode 100644
index 0000000..b36ac0b
--- /dev/null
+++ b/src/peakrdl_regblock/readback/readback.py
@@ -0,0 +1,101 @@
+from typing import TYPE_CHECKING
+
+from .readback_mux_generator import ReadbackMuxGenerator, RetimedReadbackMuxGenerator, RetimedExtBlockReadbackMuxGenerator
+from ..utils import clog2
+
+if TYPE_CHECKING:
+ from ..exporter import RegblockExporter, DesignState
+
+class Readback:
+ def __init__(self, exp:'RegblockExporter'):
+ self.exp = exp
+
+ @property
+ def ds(self) -> 'DesignState':
+ return self.exp.ds
+
+ def get_implementation(self) -> str:
+ if self.ds.retime_read_fanin:
+ return self.get_2stage_implementation()
+ else:
+ # No retiming
+ return self.get_1stage_implementation()
+
+
+ def get_empty_implementation(self) -> str:
+ """
+ Readback implementation when there are no readable registers
+ """
+ context = {
+ "ds": self.ds,
+ }
+ template = self.exp.jj_env.get_template(
+ "readback/templates/empty_readback.sv"
+ )
+ return template.render(context)
+
+
+ def get_1stage_implementation(self) -> str:
+ """
+ Implements readback without any retiming
+ """
+ gen = ReadbackMuxGenerator(self.exp)
+ mux_impl = gen.get_content(self.ds.top_node)
+
+ if not mux_impl:
+ # Design has no readable registers.
+ return self.get_empty_implementation()
+
+ context = {
+ "readback_mux": mux_impl,
+ "cpuif": self.exp.cpuif,
+ "ds": self.ds,
+ }
+ template = self.exp.jj_env.get_template(
+ "readback/templates/readback_no_rt.sv"
+ )
+
+ return template.render(context)
+
+
+ def get_2stage_implementation(self) -> str:
+ """
+ Implements readback that is retimed to 2 stages
+ """
+ # Split the decode to happen in two stages, using low address bits first
+ # then high address bits.
+ # Split in the middle of the "relevant" address bits - the ones that
+ # actually contribute to addressing in the regblock
+ unused_low_addr_bits = clog2(self.exp.cpuif.data_width_bytes)
+ relevant_addr_width = self.ds.addr_width - unused_low_addr_bits
+ low_addr_width = (relevant_addr_width // 2) + unused_low_addr_bits
+ high_addr_width = self.ds.addr_width - low_addr_width
+
+ mux_gen = RetimedReadbackMuxGenerator(self.exp)
+ mux_impl = mux_gen.get_content(self.ds.top_node)
+
+ if not mux_impl:
+ # Design has no readable addresses.
+ return self.get_empty_implementation()
+
+ if self.ds.has_external_block:
+ ext_mux_gen = RetimedExtBlockReadbackMuxGenerator(self.exp)
+ ext_mux_impl = ext_mux_gen.get_content(self.ds.top_node)
+ else:
+ ext_mux_impl = None
+
+ context = {
+ "readback_mux": mux_impl,
+ "ext_block_readback_mux": ext_mux_impl,
+ "cpuif": self.exp.cpuif,
+ "ds": self.ds,
+ "low_addr_width": low_addr_width,
+ "high_addr_width": high_addr_width,
+ 'get_always_ff_event': self.exp.dereferencer.get_always_ff_event,
+ 'get_resetsignal': self.exp.dereferencer.get_resetsignal,
+ }
+ template = self.exp.jj_env.get_template(
+ "readback/templates/readback_with_rt.sv"
+ )
+
+ return template.render(context)
diff --git a/src/peakrdl_regblock/readback/readback_mux_generator.py b/src/peakrdl_regblock/readback/readback_mux_generator.py
new file mode 100644
index 0000000..34d7891
--- /dev/null
+++ b/src/peakrdl_regblock/readback/readback_mux_generator.py
@@ -0,0 +1,361 @@
+from typing import TYPE_CHECKING, List, Sequence, Optional
+
+from systemrdl.node import RegNode, AddressableNode, FieldNode
+from systemrdl.walker import WalkerAction
+
+from ..forloop_generator import RDLForLoopGenerator
+from ..utils import SVInt, do_bitswap, do_slice
+
+if TYPE_CHECKING:
+ from ..exporter import DesignState, RegblockExporter
+
+class ReadbackMuxGenerator(RDLForLoopGenerator):
+ def __init__(self, exp: 'RegblockExporter') -> None:
+ super().__init__()
+
+ self.exp = exp
+
+ # List of address strides for each dimension
+ self._array_stride_stack: List[int] = []
+
+ @property
+ def ds(self) -> 'DesignState':
+ return self.exp.ds
+
+
+ def enter_AddressableComponent(self, node: AddressableNode) -> Optional[WalkerAction]:
+ super().enter_AddressableComponent(node)
+
+ if node.array_dimensions:
+ assert node.array_stride is not None
+ # Collect strides for each array dimension
+ current_stride = node.array_stride
+ strides = []
+ for dim in reversed(node.array_dimensions):
+ strides.append(current_stride)
+ current_stride *= dim
+ strides.reverse()
+ self._array_stride_stack.extend(strides)
+
+ if node.external and not isinstance(node, RegNode):
+ # Is an external block
+ self.process_external_block(node)
+ return WalkerAction.SkipDescendants
+
+ return WalkerAction.Continue
+
+
+ def process_external_block(self, node: AddressableNode) -> None:
+ addr_lo = self._get_address_str(node)
+ addr_hi = f"{addr_lo} + {SVInt(node.size - 1, self.exp.ds.addr_width)}"
+ self.add_content(f"if((rd_mux_addr >= {addr_lo}) && (rd_mux_addr <= {addr_hi})) begin")
+ data = self.exp.hwif.get_external_rd_data(node)
+ self.add_content(f" readback_data_var = {data};")
+ self.add_content("end")
+
+
+ def enter_Reg(self, node: RegNode) -> WalkerAction:
+ fields = node.fields(sw_readable_only=True)
+ if not fields:
+ # Reg has no readable fields
+ return WalkerAction.SkipDescendants
+
+ if node.external:
+ self.process_external_reg(node)
+ return WalkerAction.SkipDescendants
+
+ accesswidth = node.get_property('accesswidth')
+ regwidth = node.get_property('regwidth')
+ rbuf = node.get_property('buffer_reads')
+
+ if rbuf:
+ trigger = node.get_property('rbuffer_trigger')
+ is_own_trigger = (isinstance(trigger, RegNode) and trigger == node)
+ if is_own_trigger:
+ if accesswidth < regwidth:
+ self.process_wide_buffered_reg_with_bypass(node, fields, regwidth, accesswidth)
+ else:
+ # bypass cancels out. Behaves like a normal reg
+ self.process_reg(node, fields)
+ else:
+ self.process_buffered_reg(node, regwidth, accesswidth)
+ elif accesswidth < regwidth:
+ self.process_wide_reg(node, fields, regwidth, accesswidth)
+ else:
+ self.process_reg(node, fields)
+
+ return WalkerAction.SkipDescendants
+
+
+ def _get_address_str(self, node: AddressableNode, subword_offset: int=0) -> str:
+ expr_width = self.ds.addr_width
+ a = str(SVInt(
+ node.raw_absolute_address - self.ds.top_node.raw_absolute_address + subword_offset,
+ expr_width
+ ))
+ for i, stride in enumerate(self._array_stride_stack):
+ a += f" + ({expr_width})'(i{i}) * {SVInt(stride, expr_width)}"
+ return a
+
+
+ def get_addr_compare_conditional(self, addr: str) -> str:
+ return f"rd_mux_addr == {addr}"
+
+ def get_readback_data_var(self, addr: str) -> str:
+ return "readback_data_var"
+
+ def process_external_reg(self, node: RegNode) -> None:
+ accesswidth = node.get_property('accesswidth')
+ regwidth = node.get_property('regwidth')
+ data = self.exp.hwif.get_external_rd_data(node)
+
+ if regwidth > accesswidth:
+ # Is wide reg.
+ # The retiming scheme requires singular address comparisons rather than
+ # ranges. To support this, unroll the subwords
+ n_subwords = regwidth // accesswidth
+ subword_stride = accesswidth // 8
+ for subword_idx in range(n_subwords):
+ addr = self._get_address_str(node, subword_offset=subword_idx*subword_stride)
+ conditional = self.get_addr_compare_conditional(addr)
+ var = self.get_readback_data_var(addr)
+ self.add_content(f"if({conditional}) begin")
+ self.add_content(f" {var} = {data};")
+ self.add_content("end")
+ else:
+ addr = self._get_address_str(node)
+ conditional = self.get_addr_compare_conditional(addr)
+ var = self.get_readback_data_var(addr)
+ self.add_content(f"if({conditional}) begin")
+ if regwidth < self.exp.cpuif.data_width:
+ self.add_content(f" {var}[{regwidth-1}:0] = {data};")
+ else:
+ self.add_content(f" {var} = {data};")
+ self.add_content("end")
+
+
+ def process_reg(self, node: RegNode, fields: Sequence[FieldNode]) -> None:
+ """
+ Process a regular register
+ """
+ addr = self._get_address_str(node)
+ conditional = self.get_addr_compare_conditional(addr)
+ var = self.get_readback_data_var(addr)
+ self.add_content(f"if({conditional}) begin")
+
+ for field in fields:
+ value = self.exp.dereferencer.get_value(field)
+ if field.msb < field.lsb:
+ # Field gets bitswapped since it is in [low:high] orientation
+ value = do_bitswap(value)
+
+ if field.width == 1:
+ self.add_content(f" {var}[{field.low}] = {value};")
+ else:
+ self.add_content(f" {var}[{field.high}:{field.low}] = {value};")
+
+ self.add_content("end")
+
+
+ def process_buffered_reg(self, node: RegNode, regwidth: int, accesswidth: int) -> None:
+ """
+ Process a register which is fully buffered
+ """
+ rbuf = self.exp.read_buffering.get_rbuf_data(node)
+
+ if accesswidth < regwidth:
+ # Is wide reg
+ n_subwords = regwidth // accesswidth
+ subword_stride = accesswidth // 8
+ for subword_idx in range(n_subwords):
+ addr = self._get_address_str(node, subword_offset=subword_idx*subword_stride)
+ conditional = self.get_addr_compare_conditional(addr)
+ var = self.get_readback_data_var(addr)
+ bslice = f"[{(subword_idx + 1) * accesswidth - 1}:{subword_idx*accesswidth}]"
+ self.add_content(f"if({conditional}) begin")
+ self.add_content(f" {var} = {rbuf}{bslice};")
+ self.add_content("end")
+ else:
+ # Is regular reg
+ addr = self._get_address_str(node)
+ conditional = self.get_addr_compare_conditional(addr)
+ var = self.get_readback_data_var(addr)
+ self.add_content(f"if({conditional}) begin")
+ self.add_content(f" {var}[{regwidth-1}:0] = {rbuf};")
+ self.add_content("end")
+
+
+ def process_wide_buffered_reg_with_bypass(self, node: RegNode, fields: Sequence[FieldNode], regwidth: int, accesswidth: int) -> None:
+ """
+ Special case for a wide buffered register where the register is its own
+ trigger.
+
+ First sub-word shall bypass the read buffer and assign directly.
+ Subsequent subwords assign from the buffer.
+ """
+
+ # Generate assignments for first sub-word
+ subword_assignments = self.get_wide_reg_subword_assignments(node, fields, regwidth, accesswidth)
+ if subword_assignments[0]:
+ addr = self._get_address_str(node, subword_offset=0)
+ conditional = self.get_addr_compare_conditional(addr)
+ self.add_content(f"if({conditional}) begin")
+ for assignment in subword_assignments[0]:
+ self.add_content(" " + assignment)
+ self.add_content("end")
+
+ # Assign remainder of subwords from read buffer
+ n_subwords = regwidth // accesswidth
+ subword_stride = accesswidth // 8
+ rbuf = self.exp.read_buffering.get_rbuf_data(node)
+ for subword_idx in range(1, n_subwords):
+ addr = self._get_address_str(node, subword_offset=subword_idx*subword_stride)
+ bslice = f"[{(subword_idx + 1) * accesswidth - 1}:{subword_idx*accesswidth}]"
+ conditional = self.get_addr_compare_conditional(addr)
+ var = self.get_readback_data_var(addr)
+ self.add_content(f"if({conditional}) begin")
+ self.add_content(f" {var} = {rbuf}{bslice};")
+ self.add_content("end")
+
+
+ def get_wide_reg_subword_assignments(self, node: RegNode, fields: Sequence[FieldNode], regwidth: int, accesswidth: int) -> List[List[str]]:
+ """
+ Get a list of assignments for each subword
+
+ Returns a 2d array where the first dimension indicates the subword index.
+ The next dimension is the list of assignments
+ """
+ n_subwords = regwidth // accesswidth
+ subword_stride = accesswidth // 8
+ subword_assignments: List[List[str]] = [[] for _ in range(n_subwords)]
+
+ # Fields are sorted by ascending low bit
+ for field in fields:
+ subword_idx = field.low // accesswidth
+
+ if field.high < accesswidth * (subword_idx + 1):
+ # entire field fits into this subword
+ low = field.low - accesswidth * subword_idx
+ high = field.high - accesswidth * subword_idx
+
+ value = self.exp.dereferencer.get_value(field)
+ if field.msb < field.lsb:
+ # Field gets bitswapped since it is in [low:high] orientation
+ value = do_bitswap(value)
+
+ addr = self._get_address_str(node, subword_offset=subword_idx*subword_stride)
+ var = self.get_readback_data_var(addr)
+ subword_assignments[subword_idx].append(f"{var}[{high}:{low}] = {value};")
+
+ else:
+ # Field spans multiple sub-words
+ # loop through subword indexes until the entire field has been assigned
+ while field.high >= accesswidth * subword_idx:
+ # Allowable field window for this subword
+ subword_low = accesswidth * subword_idx
+ subword_high = subword_low + accesswidth - 1
+
+ # field slice (relative to reg)
+ f_low = max(subword_low, field.low)
+ f_high = min(subword_high, field.high)
+
+ # assignment slice
+ r_low = f_low - accesswidth * subword_idx
+ r_high = f_high - accesswidth * subword_idx
+
+ # Adjust to be relative to field
+ f_low -= field.low
+ f_high -= field.low
+
+ if field.msb < field.lsb:
+ # Field gets bitswapped since it is in [low:high] orientation
+ # Mirror the low/high indexes
+ f_low = field.width - 1 - f_low
+ f_high = field.width - 1 - f_high
+ f_low, f_high = f_high, f_low
+
+ value = do_bitswap(do_slice(self.exp.dereferencer.get_value(field), f_high, f_low))
+ else:
+ value = do_slice(self.exp.dereferencer.get_value(field), f_high, f_low)
+
+ addr = self._get_address_str(node, subword_offset=subword_idx*subword_stride)
+ var = self.get_readback_data_var(addr)
+ subword_assignments[subword_idx].append(f"{var}[{r_high}:{r_low}] = {value};")
+
+ # advance to the next subword
+ subword_idx += 1
+
+ return subword_assignments
+
+
+ def process_wide_reg(self, node: RegNode, fields: Sequence[FieldNode], regwidth: int, accesswidth: int) -> None:
+ """
+ Process a register whose accesswidth < regwidth
+ """
+ subword_assignments = self.get_wide_reg_subword_assignments(node, fields, regwidth, accesswidth)
+
+ # Add generated content, wrapped in the address conditional
+ subword_stride = accesswidth // 8
+ for subword_idx, assignments in enumerate(subword_assignments):
+ if not assignments:
+ continue
+ addr = self._get_address_str(node, subword_offset=subword_idx*subword_stride)
+ conditional = self.get_addr_compare_conditional(addr)
+ self.add_content(f"if({conditional}) begin")
+ for assignment in assignments:
+ self.add_content(" " + assignment)
+ self.add_content("end")
+
+
+ def exit_AddressableComponent(self, node: AddressableNode) -> None:
+ super().exit_AddressableComponent(node)
+
+ if not node.array_dimensions:
+ return
+
+ for _ in node.array_dimensions:
+ self._array_stride_stack.pop()
+
+
+class RetimedReadbackMuxGenerator(ReadbackMuxGenerator):
+ """
+ Alternate variant that is dedicated to building the 1st decode stage
+ """
+
+ def process_external_block(self, node: AddressableNode) -> None:
+ # Do nothing. External blocks are handled in a completely separate readback mux
+ pass
+
+ def get_addr_compare_conditional(self, addr: str) -> str:
+ # In the pipelined variant, compare the low-bits of both sides
+ return f"ad_low(rd_mux_addr) == ad_low({addr})"
+
+ def get_readback_data_var(self, addr: str) -> str:
+ # In the pipelined variant, assign to the bin indexed by the high bits of addr
+ return f"readback_data_var[ad_hi({addr})]"
+
+
+class RetimedExtBlockReadbackMuxGenerator(ReadbackMuxGenerator):
+ """
+ When retiming is enabled, external blocks are implemented as a separate
+ readback mux that is not retimed using a partitioned address.
+
+ This is because the address partitioning scheme used for individual register
+ addresses does not work cleanly for address ranges. (not possible to cleanly
+ map readback of a range to high-address data bins)
+
+ Since the non-retimed mux generator already implements external ranges,
+ re-use it and suppress generation of register logic.
+ """
+
+ def enter_Reg(self, node: RegNode) -> WalkerAction:
+ return WalkerAction.SkipDescendants
+
+ def process_external_block(self, node: AddressableNode) -> None:
+ addr_lo = self._get_address_str(node)
+ addr_hi = f"{addr_lo} + {SVInt(node.size - 1, self.exp.ds.addr_width)}"
+ self.add_content(f"if((rd_mux_addr >= {addr_lo}) && (rd_mux_addr <= {addr_hi})) begin")
+ data = self.exp.hwif.get_external_rd_data(node)
+ self.add_content(f" readback_data_var = {data};")
+ self.add_content(" is_external_block_var = 1'b1;")
+ self.add_content("end")
diff --git a/src/peakrdl_regblock/readback/templates/empty_readback.sv b/src/peakrdl_regblock/readback/templates/empty_readback.sv
new file mode 100644
index 0000000..f493b83
--- /dev/null
+++ b/src/peakrdl_regblock/readback/templates/empty_readback.sv
@@ -0,0 +1,7 @@
+assign readback_done = decoded_req & ~decoded_req_is_wr;
+assign readback_data = '0;
+{%- if ds.err_if_bad_addr or ds.err_if_bad_rw %}
+assign readback_err = decoded_err;
+{%- else %}
+assign readback_err = '0;
+{%- endif %}
diff --git a/src/peakrdl_regblock/readback/templates/readback.sv b/src/peakrdl_regblock/readback/templates/readback.sv
deleted file mode 100644
index 08ed492..0000000
--- a/src/peakrdl_regblock/readback/templates/readback.sv
+++ /dev/null
@@ -1,94 +0,0 @@
-{% if array_assignments is not none %}
-// Assign readback values to a flattened array
-logic [{{cpuif.data_width-1}}:0] readback_array[{{array_size}}];
-{{array_assignments}}
-
-
-{%- if ds.retime_read_fanin %}
-
-// fanin stage
-logic [{{cpuif.data_width-1}}:0] readback_array_c[{{fanin_array_size}}];
-for(genvar g=0; g<{{fanin_loop_iter}}; g++) begin
- always_comb begin
- automatic logic [{{cpuif.data_width-1}}:0] readback_data_var;
- readback_data_var = '0;
- for(int i=g*{{fanin_stride}}; i<((g+1)*{{fanin_stride}}); i++) readback_data_var |= readback_array[i];
- readback_array_c[g] = readback_data_var;
- end
-end
-{%- if fanin_residual_stride == 1 %}
-assign readback_array_c[{{fanin_array_size-1}}] = readback_array[{{array_size-1}}];
-{%- elif fanin_residual_stride > 1 %}
-always_comb begin
- automatic logic [{{cpuif.data_width-1}}:0] readback_data_var;
- readback_data_var = '0;
- for(int i={{(fanin_array_size-1) * fanin_stride}}; i<{{array_size}}; i++) readback_data_var |= readback_array[i];
- readback_array_c[{{fanin_array_size-1}}] = readback_data_var;
-end
-{%- endif %}
-
-logic [{{cpuif.data_width-1}}:0] readback_array_r[{{fanin_array_size}}];
-logic readback_done_r;
-logic readback_err_r;
-always_ff {{get_always_ff_event(cpuif.reset)}} begin
- if({{get_resetsignal(cpuif.reset)}}) begin
- for(int i=0; i<{{fanin_array_size}}; i++) readback_array_r[i] <= '0;
- readback_done_r <= '0;
- readback_err_r <= '0;
- end else begin
- readback_array_r <= readback_array_c;
- readback_err_r <= decoded_err;
- {%- if ds.has_external_addressable %}
- readback_done_r <= decoded_req & ~decoded_req_is_wr & ~decoded_strb_is_external;
- {%- else %}
- readback_done_r <= decoded_req & ~decoded_req_is_wr;
- {%- endif %}
- end
-end
-
-// Reduce the array
-always_comb begin
- automatic logic [{{cpuif.data_width-1}}:0] readback_data_var;
- readback_done = readback_done_r;
- {%- if ds.err_if_bad_addr or ds.err_if_bad_rw %}
- readback_err = readback_err_r;
- {%- else %}
- readback_err = '0;
- {%- endif %}
- readback_data_var = '0;
- for(int i=0; i<{{fanin_array_size}}; i++) readback_data_var |= readback_array_r[i];
- readback_data = readback_data_var;
-end
-
-{%- else %}
-
-// Reduce the array
-always_comb begin
- automatic logic [{{cpuif.data_width-1}}:0] readback_data_var;
- {%- if ds.has_external_addressable %}
- readback_done = decoded_req & ~decoded_req_is_wr & ~decoded_strb_is_external;
- {%- else %}
- readback_done = decoded_req & ~decoded_req_is_wr;
- {%- endif %}
- {%- if ds.err_if_bad_addr or ds.err_if_bad_rw %}
- readback_err = decoded_err;
- {%- else %}
- readback_err = '0;
- {%- endif %}
- readback_data_var = '0;
- for(int i=0; i<{{array_size}}; i++) readback_data_var |= readback_array[i];
- readback_data = readback_data_var;
-end
-{%- endif %}
-
-
-
-{%- else %}
-assign readback_done = decoded_req & ~decoded_req_is_wr;
-assign readback_data = '0;
-{%- if ds.err_if_bad_addr or ds.err_if_bad_rw %}
-assign readback_err = decoded_err;
-{%- else %}
-assign readback_err = '0;
-{%- endif %}
-{% endif %}
diff --git a/src/peakrdl_regblock/readback/templates/readback_no_rt.sv b/src/peakrdl_regblock/readback/templates/readback_no_rt.sv
new file mode 100644
index 0000000..02d2cc2
--- /dev/null
+++ b/src/peakrdl_regblock/readback/templates/readback_no_rt.sv
@@ -0,0 +1,17 @@
+always_comb begin
+ automatic logic [{{cpuif.data_width-1}}:0] readback_data_var;
+ readback_data_var = '0;
+ {{readback_mux|indent}}
+ readback_data = readback_data_var;
+
+ {%- if ds.has_external_addressable %}
+ readback_done = decoded_req & ~decoded_req_is_wr & ~decoded_req_is_external;
+ {%- else %}
+ readback_done = decoded_req & ~decoded_req_is_wr;
+ {%- endif %}
+ {%- if ds.err_if_bad_addr or ds.err_if_bad_rw %}
+ readback_err = decoded_err;
+ {%- else %}
+ readback_err = '0;
+ {%- endif %}
+end
diff --git a/src/peakrdl_regblock/readback/templates/readback_with_rt.sv b/src/peakrdl_regblock/readback/templates/readback_with_rt.sv
new file mode 100644
index 0000000..28fe077
--- /dev/null
+++ b/src/peakrdl_regblock/readback/templates/readback_with_rt.sv
@@ -0,0 +1,82 @@
+function automatic bit [{{low_addr_width-1}}:0] ad_low(bit [{{ds.addr_width-1}}:0] addr);
+ return addr[{{low_addr_width-1}}:0];
+endfunction
+function automatic bit [{{high_addr_width-1}}:0] ad_hi(bit [{{ds.addr_width-1}}:0] addr);
+ return addr[{{ds.addr_width-1}}:{{low_addr_width}}];
+endfunction
+
+// readback stage 1
+logic [{{cpuif.data_width-1}}:0] readback_data_rt_c[{{2 ** high_addr_width}}];
+always_comb begin
+ automatic logic [{{cpuif.data_width-1}}:0] readback_data_var[{{2 ** high_addr_width}}];
+ for(int i=0; i<{{2 ** high_addr_width}}; i++) readback_data_var[i] = '0;
+ {{readback_mux|indent}}
+ readback_data_rt_c = readback_data_var;
+end
+
+logic [{{cpuif.data_width-1}}:0] readback_data_rt[{{2 ** high_addr_width}}];
+logic readback_done_rt;
+logic readback_err_rt;
+logic [{{ds.addr_width-1}}:0] readback_addr_rt;
+always_ff {{get_always_ff_event(cpuif.reset)}} begin
+ if({{get_resetsignal(cpuif.reset)}}) begin
+ for(int i=0; i<{{2 ** high_addr_width}}; i++) readback_data_rt[i] <= '0;
+ readback_done_rt <= '0;
+ readback_err_rt <= '0;
+ readback_addr_rt <= '0;
+ end else begin
+ readback_data_rt <= readback_data_rt_c;
+ readback_err_rt <= decoded_err;
+ {%- if ds.has_external_addressable %}
+ readback_done_rt <= decoded_req & ~decoded_req_is_wr & ~decoded_req_is_external;
+ {%- else %}
+ readback_done_rt <= decoded_req & ~decoded_req_is_wr;
+ {%- endif %}
+ readback_addr_rt <= rd_mux_addr;
+ end
+end
+
+{% if ds.has_external_block %}
+logic [{{cpuif.data_width-1}}:0] readback_ext_block_data_rt_c;
+logic readback_is_ext_block_c;
+always_comb begin
+ automatic logic [{{cpuif.data_width-1}}:0] readback_data_var;
+ automatic logic is_external_block_var;
+ readback_data_var = '0;
+ is_external_block_var = '0;
+ {{ext_block_readback_mux|indent}}
+ readback_ext_block_data_rt_c = readback_data_var;
+ readback_is_ext_block_c = is_external_block_var;
+end
+
+logic [{{cpuif.data_width-1}}:0] readback_ext_block_data_rt;
+logic readback_is_ext_block;
+always_ff {{get_always_ff_event(cpuif.reset)}} begin
+ if({{get_resetsignal(cpuif.reset)}}) begin
+ readback_ext_block_data_rt <= '0;
+ readback_is_ext_block <= '0;
+ end else begin
+ readback_ext_block_data_rt <= readback_ext_block_data_rt_c;
+ readback_is_ext_block <= readback_is_ext_block_c;
+ end
+end
+{% endif %}
+
+// readback stage 2
+always_comb begin
+ {%- if ds.has_external_block %}
+ if(readback_is_ext_block) begin
+ readback_data = readback_ext_block_data_rt;
+ end else begin
+ readback_data = readback_data_rt[readback_addr_rt[{{ds.addr_width-1}}:{{low_addr_width}}]];
+ end
+ {%- else %}
+ readback_data = readback_data_rt[readback_addr_rt[{{ds.addr_width-1}}:{{low_addr_width}}]];
+ {%- endif %}
+ readback_done = readback_done_rt;
+ {%- if ds.err_if_bad_addr or ds.err_if_bad_rw %}
+ readback_err = readback_err_rt;
+ {%- else %}
+ readback_err = '0;
+ {%- endif %}
+end
diff --git a/tests/README.md b/tests/README.md
index c20ca4e..58d32f5 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -6,10 +6,10 @@
Testcases require an installation of the Questa simulator, and for `vlog` & `vsim`
commands to be visible via the PATH environment variable.
-*Questa - Intel FPGA Starter Edition* can be downloaded for free from Intel:
-* Go to https://www.intel.com/content/www/us/en/collections/products/fpga/software/downloads.html?edition=pro&q=questa&s=Relevancy
-* Select latest version of Questa
-* Download Questa files.
+*Questa-Altera FPGA and Starter Edition* can be downloaded for free from Altera:
+* Go to https://www.altera.com/downloads
+* Select "Simulation Tools"
+* Download Questa
* Install
* Be sure to choose "Starter Edition" for the free version.
* Create an account on https://licensing.intel.com
@@ -18,7 +18,7 @@ commands to be visible via the PATH environment variable.
* Go to https://licensing.intel.com/psg/s/sales-signup-evaluationlicenses
* Generate a free *Starter Edition* license file for Questa
* Easiest to use a *fixed* license using your NIC ID (MAC address of your network card via `ifconfig`)
-* Download the license file and point the `LM_LICENSE_FILE` environment variable to the folder which contains it.
+* Download the license file and point the `LM_LICENSE_FILE` environment variable to the folder which contains it. In newer versions of Questa, use the `SALT_LICENSE_SERVER` environment variable instead.
* (optional) Delete Intel libraries to save some disk space
* Delete `/questa_fse/intel`
* Edit `/questa_fse/modelsim.ini` and remove lines that reference the `intel` libraries
diff --git a/tests/test_only_external_blocks/__init__.py b/tests/test_only_external_blocks/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_only_external_blocks/regblock.rdl b/tests/test_only_external_blocks/regblock.rdl
new file mode 100644
index 0000000..be6e664
--- /dev/null
+++ b/tests/test_only_external_blocks/regblock.rdl
@@ -0,0 +1,11 @@
+addrmap top {
+ mem ext_mem #(
+ longint SIZE = 0x100
+ ) {
+ memwidth = 32;
+ mementries = SIZE / 4;
+ };
+
+ external ext_mem #(.SIZE(0x10)) mem1 @ 0x0000;
+ external ext_mem #(.SIZE(0x90)) mem2 @ 0x0200;
+};
diff --git a/tests/test_only_external_blocks/tb_template.sv b/tests/test_only_external_blocks/tb_template.sv
new file mode 100644
index 0000000..bf8c2cb
--- /dev/null
+++ b/tests/test_only_external_blocks/tb_template.sv
@@ -0,0 +1,115 @@
+{% extends "lib/tb_base.sv" %}
+
+
+
+{%- block dut_support %}
+ {% sv_line_anchor %}
+
+ external_block #(
+ .ADDR_WIDTH($clog2('h10))
+ ) mem1_inst (
+ .clk(clk),
+ .rst(rst),
+
+ .req(hwif_out.mem1.req),
+ .req_is_wr(hwif_out.mem1.req_is_wr),
+ .addr(hwif_out.mem1.addr),
+ .wr_data(hwif_out.mem1.wr_data),
+ .wr_biten(hwif_out.mem1.wr_biten),
+ .rd_ack(hwif_in.mem1.rd_ack),
+ .rd_data(hwif_in.mem1.rd_data),
+ .wr_ack(hwif_in.mem1.wr_ack)
+ );
+
+ external_block #(
+ .ADDR_WIDTH($clog2('h90))
+ ) mem2_inst (
+ .clk(clk),
+ .rst(rst),
+
+ .req(hwif_out.mem2.req),
+ .req_is_wr(hwif_out.mem2.req_is_wr),
+ .addr(hwif_out.mem2.addr),
+ .wr_data(hwif_out.mem2.wr_data),
+ .wr_biten(hwif_out.mem2.wr_biten),
+ .rd_ack(hwif_in.mem2.rd_ack),
+ .rd_data(hwif_in.mem2.rd_data),
+ .wr_ack(hwif_in.mem2.wr_ack)
+ );
+
+{%- endblock %}
+
+
+
+{% block seq %}
+ {% sv_line_anchor %}
+ ##1;
+ cb.rst <= '0;
+ ##1;
+
+ //--------------------------------------------------------------------------
+ // Simple read/write tests
+ //--------------------------------------------------------------------------
+ // mem1
+ repeat(32) begin
+ logic [31:0] x;
+ int unsigned addr;
+ x = $urandom();
+ addr = 'h0;
+ addr += $urandom_range(('h10 / 4) - 1) * 4;
+ cpuif.write(addr, x);
+ cpuif.assert_read(addr, x);
+ end
+
+ // mem2
+ repeat(32) begin
+ logic [31:0] x;
+ int unsigned addr;
+ x = $urandom();
+ addr = 'h200;
+ addr += $urandom_range(('h90 / 4) - 1) * 4;
+ cpuif.write(addr, x);
+ cpuif.assert_read(addr, x);
+ end
+
+ //--------------------------------------------------------------------------
+ // Pipelined access
+ //--------------------------------------------------------------------------
+ // init arrays with unique known values
+ for(int i=0; i<('h10 / 4); i++) begin
+ cpuif.write('h0 + i*4, 'h1000 + i);
+ end
+ for(int i=0; i<('h90 / 4); i++) begin
+ cpuif.write('h200 + i*4, 'h3000 + i);
+ end
+
+ // random pipelined read/writes
+ repeat(256) begin
+ fork
+ begin
+ int i;
+ logic [31:0] x;
+ int unsigned addr;
+ case($urandom_range(1))
+ 0: begin
+ i = $urandom_range(('h10 / 4) - 1);
+ x = 'h1000 + i;
+ addr = 'h0 + i*4;
+ end
+ 1: begin
+ i = $urandom_range(('h90 / 4) - 1);
+ x = 'h3000 + i;
+ addr = 'h200 + i*4;
+ end
+ endcase
+
+ case($urandom_range(1))
+ 0: cpuif.write(addr, x);
+ 1: cpuif.assert_read(addr, x);
+ endcase
+ end
+ join_none
+ end
+ wait fork;
+
+{% endblock %}
diff --git a/tests/test_only_external_blocks/testcase.py b/tests/test_only_external_blocks/testcase.py
new file mode 100644
index 0000000..2486293
--- /dev/null
+++ b/tests/test_only_external_blocks/testcase.py
@@ -0,0 +1,29 @@
+from parameterized import parameterized_class
+
+from ..lib.cpuifs.apb4 import APB4
+from ..lib.cpuifs.axi4lite import AXI4Lite
+from ..lib.cpuifs.passthrough import Passthrough
+from ..lib.sim_testcase import SimTestCase
+from ..lib.test_params import get_permutation_class_name, get_permutations
+
+
+@parameterized_class(get_permutations({
+ "cpuif": [
+ APB4(),
+ Passthrough(),
+ ],
+ "retime_read_fanin": [True, False],
+ "retime_read_response": [True, False],
+ "retime_external": [True, False],
+}), class_name_func=get_permutation_class_name)
+class Test(SimTestCase):
+ extra_tb_files = [
+ "../lib/external_reg.sv",
+ "../lib/external_block.sv",
+ ]
+ init_hwif_in = False
+ clocking_hwif_in = False
+ timeout_clk_cycles = 30000
+
+ def test_dut(self):
+ self.run_test()