From b991d33e9691db96c1b910747e249a4e59b4bcc0 Mon Sep 17 00:00:00 2001
From: Mikael Henriksson <mike.zx@hotmail.com>
Date: Wed, 15 Mar 2023 15:47:54 +0100
Subject: [PATCH] codegen: add synchronous write address generation to memory
 based HDL generation

---
 b_asic/codegen/vhdl/architecture.py | 189 ++++++++++++++++++----------
 b_asic/codegen/vhdl/common.py       | 153 +++++++++++++++-------
 b_asic/resources.py                 |   6 +
 test/test_resources.py              |   2 +-
 4 files changed, 236 insertions(+), 114 deletions(-)

diff --git a/b_asic/codegen/vhdl/architecture.py b/b_asic/codegen/vhdl/architecture.py
index eba4624d..80c933b9 100644
--- a/b_asic/codegen/vhdl/architecture.py
+++ b/b_asic/codegen/vhdl/architecture.py
@@ -2,16 +2,12 @@
 Module for code generation of VHDL architectures.
 """
 from io import TextIOWrapper
-from typing import Dict, Optional, Set, cast
+from typing import Set, cast
 
 from b_asic.codegen import vhdl
 from b_asic.codegen.vhdl import VHDL_TAB
 from b_asic.process import MemoryVariable, PlainMemoryVariable
-from b_asic.resources import (
-    ProcessCollection,
-    _ForwardBackwardEntry,
-    _ForwardBackwardTable,
-)
+from b_asic.resources import ProcessCollection, _ForwardBackwardTable
 
 
 def write_memory_based_storage(
@@ -22,6 +18,7 @@ def write_memory_based_storage(
     read_ports: int,
     write_ports: int,
     total_ports: int,
+    input_sync: bool = True,
 ):
     """
     Generate the VHDL architecture for a memory based architecture from a process collection of memory variables.
@@ -44,6 +41,10 @@ def write_memory_based_storage(
         Number of write ports.
     total_ports : int
         Total concurrent memory accesses possible.
+    input_sync : bool, default: True
+        Add registers to the input signals (enable signal and data input signals).
+        Adding registers to the inputs allows pipelining of address generation (which is added automatically).
+        For large interleavers, this can improve timing significantly.
     """
 
     # Code settings
@@ -67,7 +68,9 @@ def write_memory_based_storage(
     vhdl.common.write_type_decl(
         f, 'mem_type', 'array(0 to MEM_DEPTH-1) of std_logic_vector(MEM_WL-1 downto 0)'
     )
-    vhdl.common.write_signal_decl(f, 'memory', 'mem_type', name_pad=14)
+    vhdl.common.write_signal_decl(
+        f, name='memory', type='mem_type', name_pad=14, vivado_ram_style='distributed'
+    )
     for i in range(read_ports):
         vhdl.common.write_signal_decl(
             f, f'read_port_{i}', 'std_logic_vector(MEM_WL-1 downto 0)', name_pad=14
@@ -86,40 +89,63 @@ def write_memory_based_storage(
         vhdl.common.write_signal_decl(f, f'write_en_{i}', 'std_logic', name_pad=14)
 
     # Schedule time counter
-    f.write('\n')
-    f.write(f'{VHDL_TAB}-- Schedule counter\n')
+    f.write(f'\n{VHDL_TAB}-- Schedule counter\n')
     vhdl.common.write_signal_decl(
         f,
         name='schedule_cnt',
         type=f'integer range 0 to {schedule_time}-1',
         name_pad=14,
     )
-    f.write('\n')
+
+    # Input sync signals
+    if input_sync:
+        f.write(f'\n{VHDL_TAB}-- Input synchronization\n')
+        for i in range(read_ports):
+            vhdl.common.write_signal_decl(
+                f, f'p_{i}_in_sync', 'std_logic_vector(WL-1 downto 0)', name_pad=14
+            )
 
     #
     # Architecture body begin
     #
-    f.write(f'begin\n\n')
+    f.write(f'\nbegin\n\n')
     f.write(f'{VHDL_TAB}-- Schedule counter\n')
     vhdl.common.write_synchronous_process(
         f=f,
         name='schedule_cnt_proc',
         clk='clk',
-        indent=len(1 * VHDL_TAB),
         body=(
-            f'{0*VHDL_TAB}if en = \'1\' then\n'
-            f'{1*VHDL_TAB}if schedule_cnt = {schedule_time}-1 then\n'
-            f'{2*VHDL_TAB}schedule_cnt <= 0;\n'
-            f'{1*VHDL_TAB}else\n'
-            f'{2*VHDL_TAB}schedule_cnt <= schedule_cnt + 1;\n'
+            f'{0*VHDL_TAB}if rst = \'1\' then\n'
+            f'{1*VHDL_TAB}schedule_cnt <= 0;\n'
+            f'{0*VHDL_TAB}else\n'
+            f'{1*VHDL_TAB}if en = \'1\' then\n'
+            f'{2*VHDL_TAB}if schedule_cnt = {schedule_time-1} then\n'
+            f'{3*VHDL_TAB}schedule_cnt <= 0;\n'
+            f'{2*VHDL_TAB}else\n'
+            f'{3*VHDL_TAB}schedule_cnt <= schedule_cnt + 1;\n'
+            f'{2*VHDL_TAB}end if;\n'
             f'{1*VHDL_TAB}end if;\n'
             f'{0*VHDL_TAB}end if;\n'
         ),
     )
 
+    if input_sync:
+        f.write(f'\n{VHDL_TAB}-- Input synchronization\n')
+        vhdl.common.write_synchronous_process_prologue(
+            f=f,
+            name='input_sync_proc',
+            clk='clk',
+        )
+        for i in range(read_ports):
+            f.write(f'{3*VHDL_TAB}p_{i}_in_sync <= p_{i}_in;\n')
+        vhdl.common.write_synchronous_process_epilogue(
+            f=f,
+            name='input_sync_proc',
+            clk='clk',
+        )
+
     # Infer memory
-    f.write('\n')
-    f.write(f'{VHDL_TAB}-- Memory\n')
+    f.write(f'\n{VHDL_TAB}-- Memory\n')
     vhdl.common.write_asynchronous_read_memory(
         f=f,
         clk='clk',
@@ -134,73 +160,96 @@ def write_memory_based_storage(
         },
     )
 
-    f.write(f'\n{VHDL_TAB}-- Memory writes\n')
-    f.write(f'{VHDL_TAB}process(schedule_cnt)\n')
-    f.write(f'{VHDL_TAB}begin\n')
-
-    f.write(f'{2*VHDL_TAB}case schedule_cnt is\n')
+    # Write address generation
+    f.write(f'\n{VHDL_TAB}-- Memory write address generation\n')
+    if input_sync:
+        vhdl.common.write_synchronous_process_prologue(
+            f, clk="clk", name="mem_write_address_proc"
+        )
+    else:
+        vhdl.common.write_process_prologue(
+            f, sensitivity_list="schedule_cnt", name="mem_write_address_proc"
+        )
+    f.write(f'{3*VHDL_TAB}case schedule_cnt is\n')
     for i, collection in enumerate(assignment):
         for mv in collection:
             mv = cast(MemoryVariable, mv)
             if mv.execution_time:
-                f.write(f'{3*VHDL_TAB}-- {mv!r}\n')
-                f.write(f'{3*VHDL_TAB}when {mv.start_time} =>\n')
-                f.write(f'{4*VHDL_TAB}write_adr_0 <= {i};\n')
-                f.write(f'{4*VHDL_TAB}write_en_0 <= \'1\';\n')
-    f.write(f'{3*VHDL_TAB}when others =>\n')
-    f.write(f'{4*VHDL_TAB}write_adr_0 <= 0;\n')
-    f.write(f'{4*VHDL_TAB}write_en_0 <= \'0\';\n')
-    f.write(f'{2*VHDL_TAB}end case;\n')
-
-    f.write(f'{1*VHDL_TAB}end process;\n')
-
-    f.write(f'\n{VHDL_TAB}-- Memory reads\n')
-    f.write(f'{VHDL_TAB}process(schedule_cnt)\n')
-    f.write(f'{VHDL_TAB}begin\n')
+                f.write(f'{4*VHDL_TAB}-- {mv!r}\n')
+                f.write(f'{4*VHDL_TAB}when {(mv.start_time) % schedule_time} =>\n')
+                f.write(f'{5*VHDL_TAB}write_adr_0 <= {i};\n')
+                f.write(f'{5*VHDL_TAB}write_en_0 <= \'1\';\n')
+    f.write(f'{4*VHDL_TAB}when others =>\n')
+    f.write(f'{5*VHDL_TAB}write_adr_0 <= 0;\n')
+    f.write(f'{5*VHDL_TAB}write_en_0 <= \'0\';\n')
+    f.write(f'{3*VHDL_TAB}end case;\n')
+    if input_sync:
+        vhdl.common.write_synchronous_process_epilogue(
+            f, clk="clk", name="mem_write_address_proc"
+        )
+    else:
+        vhdl.common.write_process_epilogue(
+            f, sensitivity_list="clk", name="mem_write_address_proc"
+        )
 
-    f.write(f'{2*VHDL_TAB}case schedule_cnt is\n')
+    # Read address generation
+    f.write(f'\n{VHDL_TAB}-- Memory read address generation\n')
+    vhdl.common.write_synchronous_process_prologue(
+        f, clk="clk", name="mem_read_address_proc"
+    )
+    f.write(f'{3*VHDL_TAB}case schedule_cnt is\n')
     for i, collection in enumerate(assignment):
         for mv in collection:
             mv = cast(PlainMemoryVariable, mv)
-            f.write(f'{3*VHDL_TAB}-- {mv!r}\n')
+            f.write(f'{4*VHDL_TAB}-- {mv!r}\n')
             for read_time in mv.reads.values():
                 f.write(
-                    f'{3*VHDL_TAB}when'
-                    f' {(mv.start_time + read_time) % schedule_time} =>\n'
+                    f'{4*VHDL_TAB}when'
+                    f' {(mv.start_time+read_time-int(not(input_sync))) % schedule_time} =>\n'
                 )
-                f.write(f'{4*VHDL_TAB}read_adr_0 <= {i};\n')
-                f.write(f'{4*VHDL_TAB}read_en_0 <= \'1\';\n')
-    f.write(f'{3*VHDL_TAB}when others =>\n')
-    f.write(f'{4*VHDL_TAB}read_adr_0 <= 0;\n')
-    f.write(f'{4*VHDL_TAB}read_en_0 <= \'0\';\n')
-    f.write(f'{2*VHDL_TAB}end case;\n')
-    f.write(f'{1*VHDL_TAB}end process;\n\n')
+                f.write(f'{5*VHDL_TAB}read_adr_0 <= {i};\n')
+                f.write(f'{5*VHDL_TAB}read_en_0 <= \'1\';\n')
+    f.write(f'{4*VHDL_TAB}when others =>\n')
+    f.write(f'{5*VHDL_TAB}read_adr_0 <= 0;\n')
+    f.write(f'{5*VHDL_TAB}read_en_0 <= \'0\';\n')
+    f.write(f'{3*VHDL_TAB}end case;\n')
+    vhdl.common.write_synchronous_process_epilogue(
+        f, clk="clk", name="mem_read_address_proc"
+    )
 
-    f.write(f'{1*VHDL_TAB}-- Input and output assignment\n')
-    f.write(f'{1*VHDL_TAB}write_port_0 <= p_0_in;\n')
+    f.write(f'\n{1*VHDL_TAB}-- Input and output assignment\n')
+    if input_sync:
+        f.write(f'{1*VHDL_TAB}write_port_0 <= p_0_in_sync;\n')
+    else:
+        f.write(f'{1*VHDL_TAB}write_port_0 <= p_0_in;\n')
     p_zero_exec = filter(
         lambda p: p.execution_time == 0, (p for pc in assignment for p in pc)
     )
     vhdl.common.write_synchronous_process_prologue(
         f,
         clk='clk',
-        indent=len(VHDL_TAB),
         name='output_reg_proc',
     )
     f.write(f'{3*VHDL_TAB}case schedule_cnt is\n')
     for p in p_zero_exec:
-        f.write(f'{4*VHDL_TAB}when {p.start_time} => p_0_out <= p_0_in;\n')
+        if input_sync:
+            f.write(
+                f'{4*VHDL_TAB}when {(p.start_time+1)%schedule_time} => p_0_out <='
+                ' p_0_in_sync;\n'
+            )
+        else:
+            f.write(
+                f'{4*VHDL_TAB}when {(p.start_time)%schedule_time} => p_0_out <='
+                ' p_0_in;\n'
+            )
     f.write(f'{4*VHDL_TAB}when others => p_0_out <= read_port_0;\n')
     f.write(f'{3*VHDL_TAB}end case;\n')
     vhdl.common.write_synchronous_process_epilogue(
         f,
         clk='clk',
-        indent=len(VHDL_TAB),
         name='output_reg_proc',
     )
-
-    f.write('\n')
-    f.write(f'end architecture {architecture_name};')
+    f.write(f'\nend architecture {architecture_name};')
 
 
 def write_register_based_storage(
@@ -231,10 +280,9 @@ def write_register_based_storage(
         name_pad=14,
         default_value='0',
     )
-    f.write('\n')
 
     # Shift register
-    f.write(f'{VHDL_TAB}-- Shift register\n')
+    f.write(f'\n{VHDL_TAB}-- Shift register\n')
     vhdl.common.write_type_decl(
         f,
         name='shift_reg_type',
@@ -247,6 +295,16 @@ def write_register_based_storage(
         name_pad=14,
     )
 
+    # Output mux selector
+    f.write(f'\n{VHDL_TAB}-- Output mux select signal\n')
+    output_regs = {entry.outputs_from for entry in forward_backward_table.table}
+    vhdl.common.write_signal_decl(
+        f,
+        name='out_mux_sel',
+        type=f'integer range 0 to {len(output_regs)-1}',
+        name_pad=14,
+    )
+
     #
     # Architecture body begin
     #
@@ -257,7 +315,6 @@ def write_register_based_storage(
         f=f,
         name='schedule_cnt_proc',
         clk='clk',
-        indent=len(1 * VHDL_TAB),
         body=(
             f'{0*VHDL_TAB}if en = \'1\' then\n'
             f'{1*VHDL_TAB}if schedule_cnt = {schedule_time}-1 then\n'
@@ -269,15 +326,13 @@ def write_register_based_storage(
         ),
     )
 
+    # Shift register multiplexer logic
     f.write(f'\n{VHDL_TAB}-- Multiplexers for shift register\n')
     vhdl.common.write_synchronous_process_prologue(
         f,
         clk='clk',
         name='shift_reg_proc',
-        indent=len(VHDL_TAB),
     )
-
-    # Default for all register
     f.write(f'{3*VHDL_TAB}-- Default case\n')
     f.write(f'{3*VHDL_TAB}shift_reg(0) <= p_0_in;\n')
     for reg_idx in range(1, reg_cnt):
@@ -296,17 +351,17 @@ def write_register_based_storage(
         f,
         clk='clk',
         name='shift_reg_proc',
-        indent=len(VHDL_TAB),
     )
 
+    # Output multiplexer logic
     f.write(f'\n{VHDL_TAB}-- Output muliplexer\n')
+    f.write(f'\n{VHDL_TAB}-- {output_regs}\n')
+    f.write(f'\n{VHDL_TAB}-- { list(range(len(output_regs))) }\n')
     vhdl.common.write_synchronous_process_prologue(
         f,
         clk='clk',
         name='out_mux_proc',
-        indent=len(VHDL_TAB),
     )
-
     f.write(f'{3*VHDL_TAB}-- Default case\n')
     f.write(f'{3*VHDL_TAB}p_0_out <= shift_reg({reg_cnt-1});\n')
     f.write(f'{3*VHDL_TAB}case schedule_cnt is\n')
@@ -322,12 +377,10 @@ def write_register_based_storage(
                     )
     f.write(f'{4*VHDL_TAB}when others => null;\n')
     f.write(f'{3*VHDL_TAB}end case;\n')
-
     vhdl.common.write_synchronous_process_epilogue(
         f,
         clk='clk',
         name='out_mux_proc',
-        indent=len(VHDL_TAB),
     )
 
     f.write(f'end architecture {architecture_name};')
diff --git a/b_asic/codegen/vhdl/common.py b/b_asic/codegen/vhdl/common.py
index 632fd959..3248aaab 100644
--- a/b_asic/codegen/vhdl/common.py
+++ b/b_asic/codegen/vhdl/common.py
@@ -66,6 +66,8 @@ def write_signal_decl(
     type: str,
     default_value: Optional[str] = None,
     name_pad: Optional[int] = None,
+    vivado_ram_style: Optional[str] = None,
+    quartus_ram_style: Optional[str] = None,
 ):
     """
     Create a VHDL signal declaration: ::
@@ -80,10 +82,16 @@ def write_signal_decl(
         Signal name.
     type : str
         Signal type.
-    default_value : str, optional
+    default_value : string, optional
         An optional default value to the signal.
     name_pad : int, optional
         An optional left padding value applied to the name.
+    vivado_ram_style : string, optional
+        An optional Xilinx Vivado RAM style attribute to apply to this signal declaration.
+        If set, exactly one of: "block", "distributed", "registers", "ultra", "mixed" or "auto".
+    quartus_ram_style : string, optional
+        An optional Quartus Prime RAM style attribute to apply to this signal declaration.
+        If set, exactly one of: "M4K", "M9K", "M10K", "M20K", "M144K", "MLAB" or "logic".
     """
     # Spacing of VHDL signals declaration always with a single tab
     name_pad = 0 if name_pad is None else name_pad
@@ -91,6 +99,18 @@ def write_signal_decl(
     if default_value is not None:
         f.write(f' := {default_value}')
     f.write(f';\n')
+    if vivado_ram_style:
+        f.write(f'{VHDL_TAB}attribute ram_style : string;\n')
+        f.write(
+            f'{VHDL_TAB}attribute ram_style of {name} : signal is'
+            f' "{vivado_ram_style}";\n'
+        )
+    if quartus_ram_style:
+        f.write(f'{VHDL_TAB}attribute ramstyle : string;\n')
+        f.write(
+            f'{VHDL_TAB}attribute ramstyle of {name} : signal is'
+            f' "{quartus_ram_style}";\n'
+        )
 
 
 def write_constant_decl(
@@ -141,42 +161,63 @@ def write_type_decl(
     f.write(f'{VHDL_TAB}type {name} is {alias};\n')
 
 
-def write_synchronous_process(
+def write_process_prologue(
     f: TextIOWrapper,
-    clk: str,
-    body: str,
-    indent: Optional[int] = 0,
+    sensitivity_list: str,
+    indent: str = VHDL_TAB,
     name: Optional[str] = None,
 ):
     """
-    Write a regular VHDL synchronous process with a single clock object in the sensitivity list triggering
-    a rising edge block by some body of VHDL code.
+    Write only the prologue of a regular VHDL process with a user provided sensitivity list.
+    This function should almost always be followed by a call to write_process_epilogue.
 
     Parameters
     ----------
     f : :class:`io.TextIOWrapper`
-        The TextIOWrapper to write the VHDL code onto.
-    clk : str
-        Name of the clock.
-    body : str
-        Body of the `if rising_edge(clk) then` block.
-    indent : Optional[int]
-        Indent this process block with `indent` columns
+        The TextIOWrapper to write the VHDL code onto.
+    sensitivity_list : str
+        Content of the process sensitivity list.
+    indent : str, default: 1*VHDL_TAB
+        Indentation used in the process. This string is applied to the first written line of all output.
     name : Optional[str]
-        An optional name for the process
+        An optional name for the process.
     """
-    space = '' if indent is None else ' ' * indent
-    write_synchronous_process_prologue(f, clk, indent, name)
-    for line in body.split('\n'):
-        if len(line):
-            f.write(f'{space}{2*VHDL_TAB}{line}\n')
-    write_synchronous_process_epilogue(f, clk, indent, name)
+    if name is not None:
+        f.write(f'{indent}{name}: process({sensitivity_list})\n')
+    else:
+        f.write(f'{indent}process({sensitivity_list})\n')
+    f.write(f'{indent}begin\n')
+
+
+def write_process_epilogue(
+    f: TextIOWrapper,
+    sensitivity_list: Optional[str] = None,
+    indent: str = VHDL_TAB,
+    name: Optional[str] = None,
+):
+    """
+    Parameters
+    ----------
+    f : :class:`io.TextIOWrapper`
+        The TextIOWrapper to write the VHDL code onto.
+    sensitivity_list : str
+        Content of the process sensitivity list. Not needed when writing the epilogue.
+    indent : str, default: 1*VHDL_TAB
+        Indentation used in the process. This string is applied to the first written line of all output.
+    name : Optional[str]
+        An optional name of the ending process.
+    """
+    _ = sensitivity_list
+    f.write(f'{indent}end process')
+    if name is not None:
+        f.write(' ' + name)
+    f.write(';\n')
 
 
 def write_synchronous_process_prologue(
     f: TextIOWrapper,
     clk: str,
-    indent: Optional[int] = 0,
+    indent: str = VHDL_TAB,
     name: Optional[str] = None,
 ):
     """
@@ -190,24 +231,19 @@ def write_synchronous_process_prologue(
         The TextIOWrapper to write the VHDL code onto.
     clk : str
         Name of the clock.
-    indent : Optional[int]
-        Indent this process block with `indent` columns
+    indent : str, default: VHDL_TAB
+        Indentation used in the process. This string is applied to the first written line of all output.
     name : Optional[str]
-        An optional name for the process
+        An optional name for the process.
     """
-    space = '' if indent is None else ' ' * indent
-    if name is not None:
-        f.write(f'{space}{name}: process({clk})\n')
-    else:
-        f.write(f'{space}process({clk})\n')
-    f.write(f'{space}begin\n')
-    f.write(f'{space}{VHDL_TAB}if rising_edge(clk) then\n')
+    write_process_prologue(f, sensitivity_list=clk, indent=indent, name=name)
+    f.write(f'{indent}{VHDL_TAB}if rising_edge(clk) then\n')
 
 
 def write_synchronous_process_epilogue(
     f: TextIOWrapper,
     clk: Optional[str],
-    indent: Optional[int] = 0,
+    indent: str = VHDL_TAB,
     name: Optional[str] = None,
 ):
     """
@@ -221,18 +257,45 @@ def write_synchronous_process_epilogue(
         The TextIOWrapper to write the VHDL code onto.
     clk : str
         Name of the clock.
-    indent : Optional[int]
+    indent : str, default: VHDL_TAB
         Indent this process block with `indent` columns
     name : Optional[str]
         An optional name for the process
     """
     _ = clk
-    space = '' if indent is None else ' ' * indent
-    f.write(f'{space}{VHDL_TAB}end if;\n')
-    f.write(f'{space}end process')
-    if name is not None:
-        f.write(' ' + name)
-    f.write(';\n')
+    f.write(f'{indent}{VHDL_TAB}end if;\n')
+    write_process_epilogue(f, sensitivity_list=clk, indent=indent, name=name)
+
+
+def write_synchronous_process(
+    f: TextIOWrapper,
+    clk: str,
+    body: str,
+    indent: str = VHDL_TAB,
+    name: Optional[str] = None,
+):
+    """
+    Write a regular VHDL synchronous process with a single clock object in the sensitivity list triggering
+    a rising edge block by some body of VHDL code.
+
+    Parameters
+    ----------
+    f : :class:`io.TextIOWrapper`
+        The TextIOWrapper to write the VHDL code onto.
+    clk : str
+        Name of the clock.
+    body : str
+        Body of the `if rising_edge(clk) then` block.
+    indent : str, default: VHDL_TAB
+        Indentation string prepended to every line written for this process block.
+    name : Optional[str]
+        An optional name for the process
+    """
+    write_synchronous_process_prologue(f, clk, indent, name)
+    for line in body.split('\n'):
+        if len(line):
+            f.write(f'{indent}{2*VHDL_TAB}{line}\n')
+    write_synchronous_process_epilogue(f, clk, indent, name)
 
 
 def write_synchronous_memory(
@@ -260,7 +323,7 @@ def write_synchronous_memory(
     """
     assert len(read_ports) >= 1
     assert len(write_ports) >= 1
-    write_synchronous_process_prologue(f, clk=clk, name=name, indent=len(VHDL_TAB))
+    write_synchronous_process_prologue(f, clk=clk, name=name)
     for read_name, address, re in read_ports:
         f.write(f'{3*VHDL_TAB}if {re} = \'1\' then\n')
         f.write(f'{4*VHDL_TAB}{read_name} <= memory({address});\n')
@@ -269,7 +332,7 @@ def write_synchronous_memory(
         f.write(f'{3*VHDL_TAB}if {we} = \'1\' then\n')
         f.write(f'{4*VHDL_TAB}memory({address}) <= {write_name};\n')
         f.write(f'{3*VHDL_TAB}end if;\n')
-    write_synchronous_process_epilogue(f, clk=clk, name=name, indent=len(VHDL_TAB))
+    write_synchronous_process_epilogue(f, clk=clk, name=name)
 
 
 def write_asynchronous_read_memory(
@@ -280,7 +343,7 @@ def write_asynchronous_read_memory(
     name: Optional[str] = None,
 ):
     """
-    Infer a VHDL synchronous reads and writes.
+    Infer a VHDL memory with synchronous writes and asynchronous reads.
 
     Parameters
     ----------
@@ -297,11 +360,11 @@ def write_asynchronous_read_memory(
     """
     assert len(read_ports) >= 1
     assert len(write_ports) >= 1
-    write_synchronous_process_prologue(f, clk=clk, name=name, indent=len(VHDL_TAB))
+    write_synchronous_process_prologue(f, clk=clk, name=name)
     for write_name, address, we in write_ports:
         f.write(f'{3*VHDL_TAB}if {we} = \'1\' then\n')
         f.write(f'{4*VHDL_TAB}memory({address}) <= {write_name};\n')
         f.write(f'{3*VHDL_TAB}end if;\n')
-    write_synchronous_process_epilogue(f, clk=clk, name=name, indent=len(VHDL_TAB))
+    write_synchronous_process_epilogue(f, clk=clk, name=name)
     for read_name, address, _ in read_ports:
         f.write(f'{1*VHDL_TAB}{read_name} <= memory({address});\n')
diff --git a/b_asic/resources.py b/b_asic/resources.py
index 51520e66..69ff6ccf 100644
--- a/b_asic/resources.py
+++ b/b_asic/resources.py
@@ -931,6 +931,7 @@ class ProcessCollection:
         read_ports: int = 1,
         write_ports: int = 1,
         total_ports: int = 2,
+        input_sync: bool = True,
     ):
         """
         Generate VHDL code for memory based storage of processes (MemoryVariables).
@@ -960,6 +961,10 @@ class ProcessCollection:
         total_ports : int, default: 2
             The total number of ports used when splitting process collection based on
             memory variable access.
+        input_sync : bool, default: True
+            Add registers to the input signals (enable signal and data input signals).
+            Adding registers to the inputs allows pipelining of address generation (which is added automatically).
+            For large interleavers, this can improve timing significantly.
         """
         # Check that this is a ProcessCollection of (Plain)MemoryVariables
         is_memory_variable = all(
@@ -1024,6 +1029,7 @@ class ProcessCollection:
                 read_ports=read_ports,
                 write_ports=write_ports,
                 total_ports=total_ports,
+                input_sync=input_sync,
             )
 
     def generate_register_based_storage_vhdl(
diff --git a/test/test_resources.py b/test/test_resources.py
index 581274b8..582761e9 100644
--- a/test/test_resources.py
+++ b/test/test_resources.py
@@ -62,7 +62,7 @@ class TestProcessCollectionPlainMemoryVariable:
     def test_generate_register_based_vhdl(self):
         for rows in [2, 3, 4, 5, 7]:
             generate_matrix_transposer(
-                rows, min_lifetime=1
+                rows, min_lifetime=0
             ).generate_register_based_storage_vhdl(
                 filename=f'b_asic/codegen/testbench/streaming_matrix_transposition_register_{rows}x{rows}.vhdl',
                 entity_name=f'streaming_matrix_transposition_register_{rows}x{rows}',
-- 
GitLab