Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Commits on Source (13)
......@@ -764,19 +764,75 @@ of :class:`~b_asic.architecture.ProcessingElement`
d_out[i][v] += 1
return [dict(d) for d in d_in], [dict(d) for d in d_out]
def resource_from_name(self, name: str):
def resource_from_name(self, name: str) -> Resource:
"""
Get :class:`Resource` based on name.
Parameters
----------
name : str
Name of the resource.
Returns
-------
:class:`Resource`
"""
re = {p.entity_name: p for p in chain(self.memories, self.processing_elements)}
return re[name]
def remove_resource(
self,
resource: Union[str, Resource],
) -> None:
"""
Remove an empty :class:`Resource` from the architecture.
Parameters
----------
resource : :class:`b_asic.architecture.Resource` or str
The resource or the resource name to remove.
"""
if isinstance(resource, str):
resource = self.resource_from_name(resource)
if resource.collection:
raise ValueError("Resource must be empty")
if resource in self.memories:
self.memories.remove(resource)
elif resource in self.processing_elements:
self.processing_elements.remove(resource)
else:
raise ValueError('Resource not in architecture')
def assign_resources(self, heuristic: str = "left_edge") -> None:
"""
Convenience method to assign all resources in the architecture.
Parameters
----------
heuristic : str, default: "left_edge"
The heuristic to use.
See Also
--------
Memory.assign
ProcessingElement.assign
"""
for resource in chain(self.memories, self.processing_elements):
resource.assign(heuristic=heuristic)
def move_process(
self,
proc: Union[str, Process],
re_from: Union[str, Resource],
re_to: Union[str, Resource],
assign: bool = False,
):
) -> None:
"""
Move a :class:`b_asic.process.Process` from one resource to another.
Move a :class:`b_asic.process.Process` from one :class:`Resource` to another.
Both the resource moved from and the resource moved to will become unassigned
after a process has been moved, unless *assign* is set to True.
......
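A short usage sketch of these new editing methods (the resource and process names here are placeholders, not taken from a specific example in this change):

    # `arch` is assumed to be an existing b_asic.architecture.Architecture.
    mem = arch.resource_from_name("memory2")            # look up a Resource by entity name
    arch.move_process("cmul0.0", "memory2", "memory1")  # move a memory variable between memories
    arch.assign_resources(heuristic="left_edge")        # re-run assignment for all resources
    if not mem.collection:                              # an emptied resource can be dropped
        arch.remove_resource(mem)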
......@@ -10,9 +10,10 @@ use ieee.numeric_std.all;
entity streaming_matrix_transposition_tester is
generic(
WL : integer;
ROWS : integer;
COLS : integer
WL : integer;
ROWS : integer;
COLS : integer;
ENABLE_DEL_CC : integer := 0 -- CCs after enable to start feeding the circuit
);
port(
clk, rst, en : out std_logic;
......@@ -40,21 +41,36 @@ begin
-- Input generation
input_gen_proc: process begin
wait until en = '1';
for i in 0 to ROWS*COLS-1 loop
wait for ENABLE_DEL_CC*10 ns;
for i in 0 to 4*ROWS*COLS-1 loop
wait until clk = '0';
input <= std_logic_vector(to_unsigned(i, input'length));
end loop;
wait;
end process;
-- Timeout test
timeout_test_proc: process begin
wait until en = '1';
wait for 1 ms;
report "Timeout failure: 1 ms passed after enable=1" severity failure;
end process;
-- Output testing
output_test_proc: process begin
wait until en = '1';
wait until output = std_logic_vector(to_unsigned(0, output'length));
for col in 0 to COLS-1 loop
for row in 0 to ROWS-1 loop
wait until clk = '0';
check(output = std_logic_vector(to_unsigned(row*COLS + col, output'length)));
for i in 0 to 3 loop
for col in 0 to COLS-1 loop
for row in 0 to ROWS-1 loop
wait until clk = '0';
check(
output =
std_logic_vector(
to_unsigned(i*ROWS*COLS + row*COLS + col, output'length)
)
);
end loop;
end loop;
end loop;
done <= true;
......@@ -63,6 +79,48 @@ begin
end architecture behav;
----------------------------------------------------------------------------------------
--- TEST INSTANCES ---
----------------------------------------------------------------------------------------
--
-- 2x2 memory based matrix transposition
--
library ieee, vunit_lib;
context vunit_lib.vunit_context;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
entity streaming_matrix_transposition_memory_2x2_tb is
generic (
runner_cfg : string; -- VUnit python pipe
tb_path : string -- Absolute path to this testbench
);
end entity streaming_matrix_transposition_memory_2x2_tb;
architecture behav of streaming_matrix_transposition_memory_2x2_tb is
constant WL : integer := 16;
signal done : boolean;
signal input, output : std_logic_vector(WL-1 downto 0);
signal clk, rst, en : std_logic;
begin
-- VUnit test runner
process begin
test_runner_setup(runner, runner_cfg);
wait until done = true;
test_runner_cleanup(runner);
end process;
-- Run the test baby!
dut : entity work.streaming_matrix_transposition_memory_2x2
generic map(WL=>WL) port map(clk, rst, en, input, output);
tb : entity work.streaming_matrix_transposition_tester
generic map (WL=>WL, ROWS=>2, COLS=>2) port map(clk, rst, en, input, output, done);
end architecture behav;
--
-- 3x3 memory based matrix transposition
--
......@@ -101,21 +159,21 @@ begin
end architecture behav;
--
-- 4x8 memory based matrix transposition
-- 4x4 memory based matrix transposition
--
library ieee, vunit_lib;
context vunit_lib.vunit_context;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
entity streaming_matrix_transposition_memory_4x8_tb is
entity streaming_matrix_transposition_memory_4x4_tb is
generic (
runner_cfg : string; -- VUnit python pipe
tb_path : string -- Absolute path to this testbench
);
end entity streaming_matrix_transposition_memory_4x8_tb;
end entity streaming_matrix_transposition_memory_4x4_tb;
architecture behav of streaming_matrix_transposition_memory_4x8_tb is
architecture behav of streaming_matrix_transposition_memory_4x4_tb is
constant WL : integer := 16;
signal done : boolean;
signal input, output : std_logic_vector(WL-1 downto 0);
......@@ -130,13 +188,51 @@ begin
end process;
-- Run the test baby!
dut : entity work.streaming_matrix_transposition_memory_4x8
dut : entity work.streaming_matrix_transposition_memory_4x4
generic map(WL=>WL) port map(clk, rst, en, input, output);
tb : entity work.streaming_matrix_transposition_tester
generic map (WL=>WL, ROWS=>4, COLS=>8) port map(clk, rst, en, input, output, done);
generic map (WL=>WL, ROWS=>4, COLS=>4, ENABLE_DEL_CC=>1)
port map(clk, rst, en, input, output, done);
end architecture behav;
--
-- 5x5 memory based matrix transposition
--
library ieee, vunit_lib;
context vunit_lib.vunit_context;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
entity streaming_matrix_transposition_memory_5x5_tb is
generic (
runner_cfg : string; -- VUnit python pipe
tb_path : string -- Absolute path to this testbench
);
end entity streaming_matrix_transposition_memory_5x5_tb;
architecture behav of streaming_matrix_transposition_memory_5x5_tb is
constant WL : integer := 16;
signal done : boolean;
signal input, output : std_logic_vector(WL-1 downto 0);
signal clk, rst, en : std_logic;
begin
-- VUnit test runner
process begin
test_runner_setup(runner, runner_cfg);
wait until done = true;
test_runner_cleanup(runner);
end process;
-- Run the test baby!
dut : entity work.streaming_matrix_transposition_memory_5x5
generic map(WL=>WL) port map(clk, rst, en, input, output);
tb : entity work.streaming_matrix_transposition_tester
generic map (WL=>WL, ROWS=>5, COLS=>5, ENABLE_DEL_CC=>2)
port map(clk, rst, en, input, output, done);
end architecture behav;
--
-- 7x7 memory based matrix transposition
......@@ -171,27 +267,28 @@ begin
dut : entity work.streaming_matrix_transposition_memory_7x7
generic map(WL=>WL) port map(clk, rst, en, input, output);
tb : entity work.streaming_matrix_transposition_tester
generic map (WL=>WL, ROWS=>7, COLS=>7) port map(clk, rst, en, input, output, done);
generic map (WL=>WL, ROWS=>7, COLS=>7, ENABLE_DEL_CC=>3)
port map(clk, rst, en, input, output, done);
end architecture behav;
--
-- 7x7 register based matrix transposition
-- 4x8 memory based matrix transposition
--
library ieee, vunit_lib;
context vunit_lib.vunit_context;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
entity streaming_matrix_transposition_register_7x7_tb is
entity streaming_matrix_transposition_memory_4x8_tb is
generic (
runner_cfg : string; -- VUnit python pipe
tb_path : string -- Absolute path to this testbench
);
end entity streaming_matrix_transposition_register_7x7_tb;
end entity streaming_matrix_transposition_memory_4x8_tb;
architecture behav of streaming_matrix_transposition_register_7x7_tb is
architecture behav of streaming_matrix_transposition_memory_4x8_tb is
constant WL : integer := 16;
signal done : boolean;
signal input, output : std_logic_vector(WL-1 downto 0);
......@@ -206,29 +303,30 @@ begin
end process;
-- Run the test baby!
dut : entity work.streaming_matrix_transposition_register_7x7
dut : entity work.streaming_matrix_transposition_memory_4x8
generic map(WL=>WL) port map(clk, rst, en, input, output);
tb : entity work.streaming_matrix_transposition_tester
generic map (WL=>WL, ROWS=>7, COLS=>7) port map(clk, rst, en, input, output, done);
generic map (WL=>WL, ROWS=>4, COLS=>8, ENABLE_DEL_CC=>2)
port map(clk, rst, en, input, output, done);
end architecture behav;
--
-- 5x5 register based matrix transposition
-- 2x2 register based matrix transposition
--
library ieee, vunit_lib;
context vunit_lib.vunit_context;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
entity streaming_matrix_transposition_register_5x5_tb is
entity streaming_matrix_transposition_register_2x2_tb is
generic (
runner_cfg : string; -- VUnit python pipe
tb_path : string -- Absolute path to this testbench
);
end entity streaming_matrix_transposition_register_5x5_tb;
end entity streaming_matrix_transposition_register_2x2_tb;
architecture behav of streaming_matrix_transposition_register_5x5_tb is
architecture behav of streaming_matrix_transposition_register_2x2_tb is
constant WL : integer := 16;
signal done : boolean;
signal input, output : std_logic_vector(WL-1 downto 0);
......@@ -243,10 +341,47 @@ begin
end process;
-- Run the test baby!
dut : entity work.streaming_matrix_transposition_register_5x5
dut : entity work.streaming_matrix_transposition_register_2x2
generic map(WL=>WL) port map(clk, rst, en, input, output);
tb : entity work.streaming_matrix_transposition_tester
generic map (WL=>WL, ROWS=>5, COLS=>5) port map(clk, rst, en, input, output, done);
generic map (WL=>WL, ROWS=>2, COLS=>2) port map(clk, rst, en, input, output, done);
end architecture behav;
--
-- 3x3 register based matrix transposition
--
library ieee, vunit_lib;
context vunit_lib.vunit_context;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
entity streaming_matrix_transposition_register_3x3_tb is
generic (
runner_cfg : string; -- VUnit python pipe
tb_path : string -- Absolute path to this testbench
);
end entity streaming_matrix_transposition_register_3x3_tb;
architecture behav of streaming_matrix_transposition_register_3x3_tb is
constant WL : integer := 16;
signal done : boolean;
signal input, output : std_logic_vector(WL-1 downto 0);
signal clk, rst, en : std_logic;
begin
-- VUnit test runner
process begin
test_runner_setup(runner, runner_cfg);
wait until done = true;
test_runner_cleanup(runner);
end process;
-- Run the test baby!
dut : entity work.streaming_matrix_transposition_register_3x3
generic map(WL=>WL) port map(clk, rst, en, input, output);
tb : entity work.streaming_matrix_transposition_tester
generic map (WL=>WL, ROWS=>3, COLS=>3) port map(clk, rst, en, input, output, done);
end architecture behav;
......@@ -287,23 +422,22 @@ begin
end architecture behav;
--
-- 3x3 register based matrix transposition
-- 5x5 register based matrix transposition
--
library ieee, vunit_lib;
context vunit_lib.vunit_context;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
entity streaming_matrix_transposition_register_3x3_tb is
entity streaming_matrix_transposition_register_5x5_tb is
generic (
runner_cfg : string; -- VUnit python pipe
tb_path : string -- Absolute path to this testbench
);
end entity streaming_matrix_transposition_register_3x3_tb;
end entity streaming_matrix_transposition_register_5x5_tb;
architecture behav of streaming_matrix_transposition_register_3x3_tb is
architecture behav of streaming_matrix_transposition_register_5x5_tb is
constant WL : integer := 16;
signal done : boolean;
signal input, output : std_logic_vector(WL-1 downto 0);
......@@ -318,29 +452,29 @@ begin
end process;
-- Run the test baby!
dut : entity work.streaming_matrix_transposition_register_3x3
dut : entity work.streaming_matrix_transposition_register_5x5
generic map(WL=>WL) port map(clk, rst, en, input, output);
tb : entity work.streaming_matrix_transposition_tester
generic map (WL=>WL, ROWS=>3, COLS=>3) port map(clk, rst, en, input, output, done);
generic map (WL=>WL, ROWS=>5, COLS=>5) port map(clk, rst, en, input, output, done);
end architecture behav;
--
-- 2x2 register based matrix transposition
-- 7x7 register based matrix transposition
--
library ieee, vunit_lib;
context vunit_lib.vunit_context;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
entity streaming_matrix_transposition_register_2x2_tb is
entity streaming_matrix_transposition_register_7x7_tb is
generic (
runner_cfg : string; -- VUnit python pipe
tb_path : string -- Absolute path to this testbench
);
end entity streaming_matrix_transposition_register_2x2_tb;
end entity streaming_matrix_transposition_register_7x7_tb;
architecture behav of streaming_matrix_transposition_register_2x2_tb is
architecture behav of streaming_matrix_transposition_register_7x7_tb is
constant WL : integer := 16;
signal done : boolean;
signal input, output : std_logic_vector(WL-1 downto 0);
......@@ -355,14 +489,13 @@ begin
end process;
-- Run the test baby!
dut : entity work.streaming_matrix_transposition_register_2x2
dut : entity work.streaming_matrix_transposition_register_7x7
generic map(WL=>WL) port map(clk, rst, en, input, output);
tb : entity work.streaming_matrix_transposition_tester
generic map (WL=>WL, ROWS=>2, COLS=>2) port map(clk, rst, en, input, output, done);
generic map (WL=>WL, ROWS=>7, COLS=>7) port map(clk, rst, en, input, output, done);
end architecture behav;
--
-- 4x8 register based matrix transposition
--
......
This diff is collapsed.
......@@ -133,6 +133,17 @@ def signal_declaration(
)
def alias_declaration(
f: TextIO,
name: str,
signal_type: str,
value: Optional[str] = None,
name_pad: Optional[int] = None,
):
name_pad = name_pad or 0
write(f, 1, f'alias {name:<{name_pad}} : {signal_type} is {value};')
def constant_declaration(
f: TextIO,
name: str,
......
......@@ -43,7 +43,7 @@ def memory_based_storage(
write_lines(
f,
[
(0, '-- Clock, synchronous reset and enable signals'),
(2, '-- Clock, synchronous reset and enable signals'),
(2, 'clk : in std_logic;'),
(2, 'rst : in std_logic;'),
(2, 'en : in std_logic;'),
......@@ -53,9 +53,9 @@ def memory_based_storage(
# Write the input port specification
f.write(f'{2*VHDL_TAB}-- Memory port I/O\n')
read_ports: set[Port] = set(
read_ports: set[Port] = {
read_port for mv in collection for read_port in mv.read_ports
) # type: ignore
} # type: ignore
for idx, read_port in enumerate(read_ports):
port_name = read_port if isinstance(read_port, int) else read_port.name
port_name = 'p_' + str(port_name) + '_in'
......
......@@ -73,7 +73,6 @@ class Constant(AbstractOperation):
def __str__(self) -> str:
return f"{self.value}"
class Addition(AbstractOperation):
"""
Binary addition operation.
......@@ -1263,3 +1262,45 @@ class Shift(AbstractOperation):
if not isinstance(value, int):
raise TypeError("value must be an int")
self.set_param("value", value)
class Sink(AbstractOperation):
r"""
Sink operation.
Used for ignoring the output from another operation to avoid dangling output nodes.
Parameters
==========
name : Name, optional
Operation name.
"""
_execution_time = 0
is_linear = True
def __init__(self, name: Name = ""):
"""Construct a Sink operation."""
super().__init__(
input_count=1,
output_count=0,
name=name,
latency_offsets={"in0": 0},
)
@classmethod
def type_name(cls) -> TypeName:
return TypeName("sink")
def evaluate(self):
raise NotImplementedError
@property
def latency(self) -> int:
return self.latency_offsets["in0"]
def __repr__(self) -> str:
return "Sink()"
def __str__(self) -> str:
return "sink"
......@@ -30,9 +30,17 @@ class Process:
self._name = name
def __lt__(self, other):
return self._start_time < other.start_time or (
self._start_time == other.start_time
and self.execution_time > other.execution_time
return (
self._start_time < other.start_time
or (
self._start_time == other.start_time
and self.execution_time > other.execution_time
)
or ( # Sorting on name to possibly get deterministic behavior
self._start_time == other.start_time
and self.execution_time == other.execution_time
and self._name < other.name
)
)
@property
......
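A small sketch of the effect of the extra tie-break, assuming PlainMemoryVariable accepts an optional name argument (the reads dict is illustrative):

    from b_asic.process import PlainMemoryVariable

    # Identical start and execution times used to compare as unordered; the
    # name tie-break now makes sorted() deterministic.
    a = PlainMemoryVariable(0, 0, {0: 3}, name="a")
    b = PlainMemoryVariable(0, 0, {0: 3}, name="b")
    assert sorted([b, a]) == [a, b]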
import io
import re
from collections import Counter
from collections import Counter, defaultdict
from functools import reduce
from math import log2
from typing import Dict, Iterable, List, Optional, Tuple, TypeVar, Union
import matplotlib.pyplot as plt
......@@ -97,9 +98,10 @@ def draw_exclusion_graph_coloring(
color_dict: Dict[Process, int],
ax: Optional[Axes] = None,
color_list: Optional[Union[List[str], List[Tuple[float, float, float]]]] = None,
**kwargs,
) -> None:
"""
Helper function for drawing a colored exclusion graphs.
Helper function for drawing colored exclusion graphs.
Example usage:
......@@ -131,6 +133,8 @@ def draw_exclusion_graph_coloring(
A Matplotlib :class:`~matplotlib.axes.Axes` object to draw the exclusion graph.
color_list : iterable of color, optional
A list of colors in Matplotlib format.
**kwargs : Any
Keyword arguments passed on to :func:`networkx.draw_networkx`.
Returns
-------
......@@ -165,6 +169,7 @@ def draw_exclusion_graph_coloring(
node_color=node_color_list,
ax=ax,
pos=nx.spring_layout(exclusion_graph, seed=1),
**kwargs,
)
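The functional change here is only that extra keyword arguments are forwarded to networkx. A hedged sketch, assuming `graph` is an exclusion graph of Process nodes obtained from a ProcessCollection (how it is built is not shown in this hunk):

    import matplotlib.pyplot as plt
    import networkx as nx
    from b_asic.resources import draw_exclusion_graph_coloring

    coloring = nx.coloring.greedy_color(graph, strategy="saturation_largest_first")
    fig, ax = plt.subplots()
    # node_size and font_size are passed straight through to networkx.draw_networkx.
    draw_exclusion_graph_coloring(graph, coloring, ax=ax, node_size=600, font_size=8)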
......@@ -497,6 +502,17 @@ class ProcessCollection:
else:
self.collection.remove(process)
def __contains__(self, process: Process) -> bool:
"""
Test if a process is part of this ProcessCollection.
Parameters
----------
process : :class:`~b_asic.process.Process`
The process to test.
"""
return process in self.collection
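Membership testing now works with Python's `in` operator; a minimal sketch mirroring the new test_contains further down in this diff:

    from b_asic.process import PlainMemoryVariable
    from b_asic.resources import ProcessCollection

    collection = ProcessCollection([], schedule_time=10, cyclic=True)
    m = PlainMemoryVariable(0, 0, {0: 3})
    assert m not in collection
    collection.add_process(m)
    assert m in collection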
def plot(
self,
ax: Optional[Axes] = None,
......@@ -848,7 +864,7 @@ class ProcessCollection:
def split_on_execution_time(
self,
heuristic: str = "graph_color",
heuristic: str = "left_edge",
coloring_strategy: str = "saturation_largest_first",
) -> List["ProcessCollection"]:
"""
......@@ -886,7 +902,7 @@ class ProcessCollection:
def split_on_ports(
self,
heuristic: str = "graph_color",
heuristic: str = "left_edge",
read_ports: Optional[int] = None,
write_ports: Optional[int] = None,
total_ports: Optional[int] = None,
......@@ -903,7 +919,7 @@ class ProcessCollection:
Valid options are:
* "graph_color"
* "..."
* "left_edge"
read_ports : int, optional
The number of read ports used when splitting process collection based on
......@@ -926,9 +942,106 @@ class ProcessCollection:
)
if heuristic == "graph_color":
return self._split_ports_graph_color(read_ports, write_ports, total_ports)
elif heuristic == "left_edge":
return self.split_ports_sequentially(
read_ports,
write_ports,
total_ports,
sequence=sorted(self),
)
else:
raise ValueError("Invalid heuristic provided.")
def split_ports_sequentially(
self,
read_ports: int,
write_ports: int,
total_ports: int,
sequence: List[Process],
) -> List["ProcessCollection"]:
"""
Split this collection into multiple new collections by sequentially assigning
processes in the order of `sequence`.
This method takes the processes from `sequence`, in order, and assigns them to
multiple new `ProcessCollection` objects based on port collisions, in a first-come
first-served manner. The first `Process` in `sequence` is assigned first, and
the last `Process` in `sequence` is assigned last.
Parameters
----------
read_ports : int
The number of read ports used when splitting process collection based on
memory variable access.
write_ports : int
The number of write ports used when splitting process collection based on
memory variable access.
total_ports : int
The total number of ports used when splitting process collection based on
memory variable access.
sequence : list of `Process`
A list of the processes used to determine the order in which processes are
assigned.
Returns
-------
list of `ProcessCollection`
A list of new ProcessCollection objects containing the split processes.
"""
def ports_collide(proc: Process, collection: ProcessCollection):
"""
Predicate testing whether inserting process `proc` into `collection` results in
colliding ports, given the `read_ports`, `write_ports`, and `total_ports` limits.
"""
# Test the number of concurrent write accesses
collection_writes = defaultdict(int, collection.write_port_accesses())
if collection_writes[proc.start_time] >= write_ports:
return True
# Test the number of concurrent read accesses
collection_reads = defaultdict(int, collection.read_port_accesses())
for proc_read_time in proc.read_times:
if collection_reads[proc_read_time % self.schedule_time] >= read_ports:
return True
# Test the number of total accesses
collection_total_accesses = defaultdict(
int, Counter(collection_writes) + Counter(collection_reads)
)
for access_time in [proc.start_time, *proc.read_times]:
if collection_total_accesses[access_time] >= total_ports:
return True
# No collision detected
return False
# Make sure that the processes in `sequence` and `self` are equal
if set(self.collection) != set(sequence):
raise KeyError("processes in `sequence` must be equal to processes in self")
collections: List[ProcessCollection] = []
for process in sequence:
process_added = False
for collection in collections:
if not ports_collide(process, collection):
collection.add_process(process)
process_added = True
break
if not process_added:
# Stuff the process in a new collection
collections.append(
ProcessCollection(
[process],
schedule_time=self.schedule_time,
cyclic=self._cyclic,
)
)
# Return the list of created ProcessCollections
return collections
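A minimal usage sketch of the sequential split, assuming PlainMemoryVariable's positional arguments (write time, write port, reads) follow the usage in the tests below; the numbers are illustrative:

    from b_asic.process import PlainMemoryVariable
    from b_asic.resources import ProcessCollection

    # Three variables all written at time 0 cannot share a single write port,
    # so the first-come first-served split places them in separate collections.
    procs = [PlainMemoryVariable(0, 0, {0: 2}) for _ in range(3)]
    collection = ProcessCollection(procs, schedule_time=5, cyclic=True)
    split = collection.split_ports_sequentially(
        read_ports=1, write_ports=1, total_ports=2, sequence=sorted(collection)
    )
    assert len(split) == 3  # equivalently: split_on_ports(heuristic="left_edge", ...)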
def _split_ports_graph_color(
self,
read_ports: int,
......@@ -1142,7 +1255,10 @@ class ProcessCollection:
read_ports: int = 1,
write_ports: int = 1,
total_ports: int = 2,
*,
input_sync: bool = True,
adr_mux_size: Optional[int] = None,
adr_pipe_depth: Optional[int] = None,
):
"""
Generate VHDL code for memory based storage of processes (MemoryVariables).
......@@ -1177,6 +1293,13 @@ class ProcessCollection:
Adding registers to the inputs allows pipelining of address generation
(which is added automatically). For large interleavers, this can improve
timing significantly.
adr_mux_size : int, optional
Size of multiplexer if using address generation pipelining. Set to `None`
for no multiplexer pipelining. If any other value than `None`, `input_sync`
must also be set.
adr_pipe_depth : int, optional
Depth of address generation pipelining. Set to `None` for no address generation
pipelining. If any other value than `None`, `input_sync` must also be set.
"""
# Check that entity name is a valid VHDL identifier
if not is_valid_vhdl_identifier(entity_name):
......@@ -1232,6 +1355,34 @@ class ProcessCollection:
' generate HDL for this ProcessCollection'
)
# Sanitize the address logic pipeline settings
if adr_mux_size is not None and adr_pipe_depth is not None:
if adr_mux_size < 1:
raise ValueError(
f'adr_mux_size={adr_mux_size} needs to be greater than zero'
)
if adr_pipe_depth < 0:
raise ValueError(
f'adr_pipe_depth={adr_pipe_depth} needs to be non-negative'
)
if not input_sync:
raise ValueError('input_sync needs to be set to use address pipelining')
if not log2(adr_mux_size).is_integer():
raise ValueError(
f'adr_mux_size={adr_mux_size} needs to be an integer power of two'
)
if adr_mux_size**adr_pipe_depth > assignment[0].schedule_time:
raise ValueError(
f'adr_mux_size={adr_mux_size}, adr_pipe_depth={adr_pipe_depth} => '
'more multiplexer inputs than schedule_time='
f'{assignment[0].schedule_time}'
)
else:
if adr_mux_size is not None or adr_pipe_depth is not None:
raise ValueError(
'both or none of adr_mux_size and adr_pipe_depth need to be set'
)
with open(filename, 'w') as f:
from b_asic.codegen.vhdl import architecture, common, entity
......@@ -1249,6 +1400,8 @@ class ProcessCollection:
write_ports=write_ports,
total_ports=total_ports,
input_sync=input_sync,
adr_mux_size=1 if adr_mux_size is None else adr_mux_size,
adr_pipe_depth=0 if adr_pipe_depth is None else adr_pipe_depth,
)
def split_on_length(
......
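A hedged sketch of generating memory-based storage with the new address-pipelining parameters, following the updated test further down; the generate_matrix_transposer import path and the output file name are assumptions:

    from b_asic.research.interleaver import generate_matrix_transposer

    collection = generate_matrix_transposer(rows=4, cols=4, min_lifetime=0)
    assignment = collection.split_on_execution_time(heuristic="graph_color")
    # adr_mux_size and adr_pipe_depth must be given together and require
    # input_sync (the default); here: 4-input muxes, one pipeline stage.
    collection.generate_memory_based_storage_vhdl(
        filename="streaming_matrix_transposition_memory_4x4.vhdl",
        entity_name="streaming_matrix_transposition_memory_4x4",
        assignment=assignment,
        word_length=16,
        adr_mux_size=4,
        adr_pipe_depth=1,
    )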
......@@ -181,3 +181,23 @@ arch = Architecture(
)
arch
# %%
# Move memory variables to optimize architecture
arch.move_process('addsub2.0', 'memory3', 'memory2')
arch.move_process('bfly2.0', 'memory2', 'memory3')
arch.move_process('cmul2.0', 'memory1', 'memory0')
arch.move_process('bfly3.0', 'memory0', 'memory1')
arch.move_process('cmul3.0', 'memory4', 'memory0')
arch.assign_resources()
# %%
# Memory 4 is now empty, so remove it.
arch.remove_resource('memory4')
for memory in arch.memories:
memory.show_content(title=f"Improved {memory.entity_name}")
arch
......@@ -98,43 +98,3 @@ arch = Architecture(
# %%
# The architecture can be rendered in enriched shells.
arch
# %%
# To reduce the amount of interconnect, the ``cmul2.0`` variable can be moved from
# ``memory0`` to ``memory2``. In this way, ``memory0`` only gets variables from the
# adder and an input multiplexer can be avoided. The memories must be assigned again as
# the contents have changed.
arch.move_process('cmul2.0', 'memory0', 'memory2')
memories[0].assign()
memories[2].assign()
memories[0].show_content("New assigned memory0")
memories[2].show_content("New assigned memory2")
# %%
# Looking at the architecture it is clear that there is now only one input to
# ``memory0``, so no input multiplexer is required.
arch
# %%
# It is of course also possible to move ``add3.0`` to ``memory2`` to save one memory
# cell. It is possible to pass ``assign=True`` to perform assignment after moving.
arch.move_process('add3.0', 'memory0', 'memory2', assign=True)
memories[0].show_content("New assigned memory0")
memories[2].show_content("New assigned memory2")
# %%
# However, this comes at the expense of an additional input to ``memory2``.
arch
# %%
# Finally, by noting that ``cmul0.0`` is the only variable from ``memory1`` going to
# ``in0`` of ``adder``, another multiplexer can be reduced by:
arch.move_process('cmul0.0', 'memory1', 'memory2', assign=True)
memories[1].show_content("New assigned memory1")
memories[2].show_content("New assigned memory2")
# %%
# Leading to
arch
......@@ -57,6 +57,7 @@ sfg.set_execution_time_of_type(AddSub.type_name(), 1)
schedule = Schedule(sfg, cyclic=True)
schedule.show()
# %%
# Reschedule to only use one AddSub and one ConstantMultiplication per time unit
schedule.set_schedule_time(10)
schedule.move_operation('out0', 11)
......@@ -88,6 +89,7 @@ schedule.move_operation('addsub2', -1)
schedule.move_operation('addsub4', -4)
schedule.show()
# %%
# Extract memory variables and operation executions
operations = schedule.get_operations()
adders = operations.get_by_type_name(AddSub.type_name())
......@@ -123,7 +125,8 @@ for i, mem in enumerate(mem_vars_set):
memory.assign("left_edge")
memory.show_content(title=f"Assigned {memory.entity_name}")
# %%
# Create architecture
arch = Architecture(
{addsub, multiplier, pe_in, pe_out}, memories, direct_interconnects=direct
)
......@@ -131,7 +134,7 @@ arch = Architecture(
arch
# %%
# Move memory variables
# Move memory variables to reduce the size of memory1
arch.move_process('addsub1.0', memories[2], memories[1])
arch.move_process('addsub3.0', memories[1], memories[2], assign=True)
memories[1].assign()
......
......@@ -157,10 +157,7 @@ def test_architecture(schedule_direct_form_iir_lp_filter: Schedule):
# Graph representation
# Parts are non-deterministic, but this first part seems OK
s = (
'digraph {\n\tnode [shape=box]\n\tsplines=spline\n\tsubgraph'
' cluster_memories'
)
s = 'digraph {\n\tnode [shape=box]\n\tsplines=spline\n\tsubgraph cluster_memories'
assert architecture._digraph().source.startswith(s)
s = 'digraph {\n\tnode [shape=box]\n\tsplines=spline\n\tMEM0'
assert architecture._digraph(cluster=False).source.startswith(s)
......@@ -229,9 +226,9 @@ def test_move_process(schedule_direct_form_iir_lp_filter: Schedule):
architecture.move_process('in0.0', memories[1], memories[0])
assert memories[0].collection.from_name('in0.0')
assert processing_elements[1].collection.from_name('add0')
architecture.move_process('add0', processing_elements[1], processing_elements[0])
assert processing_elements[0].collection.from_name('add0')
architecture.move_process('add0', processing_elements[0], processing_elements[1])
assert processing_elements[1].collection.from_name('add0')
# Processes leave the resources they have moved from
with pytest.raises(KeyError):
......@@ -239,7 +236,7 @@ def test_move_process(schedule_direct_form_iir_lp_filter: Schedule):
with pytest.raises(KeyError):
memories[1].collection.from_name('in0.0')
with pytest.raises(KeyError):
processing_elements[1].collection.from_name('add0')
processing_elements[0].collection.from_name('add0')
# Processes can only be moved when the source and destination process-types match
with pytest.raises(TypeError, match="cmul3.0 not of type"):
......
......@@ -20,6 +20,8 @@ from b_asic import (
SquareRoot,
Subtraction,
SymmetricTwoportAdaptor,
Sink,
SFG,
)
......@@ -404,3 +406,16 @@ class TestDepends:
bfly1 = Butterfly()
assert set(bfly1.inputs_required_for_output(0)) == {0, 1}
assert set(bfly1.inputs_required_for_output(1)) == {0, 1}
class TestSink:
def test_create_sfg_with_sink(self):
bfly = Butterfly()
sfg = bfly.to_sfg()
s = Sink()
sfg1 = sfg.replace_operation(s, "out0")
sfg2 = SFG(sfg1.input_operations, sfg1.output_operations[1:])
assert sfg2.output_count == 1
assert sfg2.input_count == 2
assert sfg.evaluate_output(1, [0,1]) == sfg2.evaluate_output(0, [0,1])
import re
import matplotlib.pyplot as plt
import pytest
import matplotlib.testing.decorators
import pytest
from b_asic.core_operations import ConstantMultiplication
from b_asic.process import PlainMemoryVariable
......@@ -14,25 +14,66 @@ from b_asic.resources import ProcessCollection, _ForwardBackwardTable
class TestProcessCollectionPlainMemoryVariable:
@matplotlib.testing.decorators.image_comparison(['test_draw_process_collection.png'])
@matplotlib.testing.decorators.image_comparison(
['test_draw_process_collection.png']
)
def test_draw_process_collection(self, simple_collection):
fig, ax = plt.subplots()
simple_collection.plot(ax=ax, show_markers=False)
return fig
@matplotlib.testing.decorators.image_comparison(['test_draw_matrix_transposer_4.png'])
@matplotlib.testing.decorators.image_comparison(
['test_draw_matrix_transposer_4.png']
)
def test_draw_matrix_transposer_4(self):
fig, ax = plt.subplots()
generate_matrix_transposer(4).plot(ax=ax) # type: ignore
return fig
def test_split_memory_variable(self, simple_collection: ProcessCollection):
def test_split_memory_variable_graph_color(
self, simple_collection: ProcessCollection
):
collection_split = simple_collection.split_on_ports(
heuristic="graph_color", read_ports=1, write_ports=1, total_ports=2
)
assert len(collection_split) == 3
@matplotlib.testing.decorators.image_comparison(['test_left_edge_cell_assignment.png'])
def test_contains(self):
collection = ProcessCollection([], schedule_time=10, cyclic=True)
m1 = PlainMemoryVariable(0, 0, {0: 3})
assert m1 not in collection
collection.add_process(m1)
assert m1 in collection
collection.remove_process(m1)
assert m1 not in collection
def test_split_sequence_raises(self, simple_collection: ProcessCollection):
with pytest.raises(KeyError, match="processes in `sequence` must be"):
simple_collection.split_ports_sequentially(
read_ports=1, write_ports=1, total_ports=2, sequence=[]
)
def test_split_memory_variable_left_edge(
self, simple_collection: ProcessCollection
):
split = simple_collection.split_on_ports(
heuristic="left_edge", read_ports=1, write_ports=1, total_ports=2
)
assert len(split) == 3
split = simple_collection.split_on_ports(
heuristic="left_edge", read_ports=1, write_ports=2, total_ports=2
)
assert len(split) == 3
split = simple_collection.split_on_ports(
heuristic="left_edge", read_ports=2, write_ports=2, total_ports=2
)
assert len(split) == 2
@matplotlib.testing.decorators.image_comparison(
['test_left_edge_cell_assignment.png']
)
def test_left_edge_cell_assignment(self, simple_collection: ProcessCollection):
fig, ax = plt.subplots(1, 2)
assignment = list(simple_collection._left_edge_assignment())
......@@ -45,23 +86,39 @@ class TestProcessCollectionPlainMemoryVariable:
collection = generate_matrix_transposer(4, min_lifetime=5)
assignment_left_edge = collection._left_edge_assignment()
assignment_graph_color = collection.split_on_execution_time(
coloring_strategy='saturation_largest_first'
heuristic="graph_color", coloring_strategy='saturation_largest_first'
)
assert len(assignment_left_edge) == 18
assert len(assignment_graph_color) == 16
def test_generate_memory_based_vhdl(self):
for rows in [2, 3, 4, 5, 7]:
collection = generate_matrix_transposer(rows, min_lifetime=0)
# fmt: off
variants = [
# rows , cols , #mux , #pipe #
# ------------------------------ #
( 2 , 2 , None , None ),
( 3 , 3 , 1 , 0 ),
( 4 , 4 , 4 , 1 ),
( 5 , 5 , 4 , 2 ),
( 7 , 7 , 4 , 3 ),
( 4 , 8 , 2 , 2 ),
]
# fmt: on
for rows, cols, mux_size, pipe_depth in variants:
collection = generate_matrix_transposer(
rows=rows, cols=cols, min_lifetime=0
)
assignment = collection.split_on_execution_time(heuristic="graph_color")
collection.generate_memory_based_storage_vhdl(
filename=(
'b_asic/codegen/testbench/'
f'streaming_matrix_transposition_memory_{rows}x{rows}.vhdl'
f'streaming_matrix_transposition_memory_{rows}x{cols}.vhdl'
),
entity_name=f'streaming_matrix_transposition_memory_{rows}x{rows}',
entity_name=f'streaming_matrix_transposition_memory_{rows}x{cols}',
assignment=assignment,
word_length=16,
adr_mux_size=mux_size,
adr_pipe_depth=pipe_depth,
)
def test_generate_register_based_vhdl(self):
......@@ -79,16 +136,6 @@ class TestProcessCollectionPlainMemoryVariable:
def test_rectangular_matrix_transposition(self):
collection = generate_matrix_transposer(rows=4, cols=8, min_lifetime=2)
assignment = collection.split_on_execution_time(heuristic="graph_color")
collection.generate_memory_based_storage_vhdl(
filename=(
'b_asic/codegen/testbench/streaming_matrix_transposition_memory_'
'4x8.vhdl'
),
entity_name='streaming_matrix_transposition_memory_4x8',
assignment=assignment,
word_length=16,
)
collection.generate_register_based_storage_vhdl(
filename=(
'b_asic/codegen/testbench/streaming_matrix_transposition_register_'
......@@ -158,7 +205,9 @@ class TestProcessCollectionPlainMemoryVariable:
assert len(simple_collection) == 7
assert new_proc not in simple_collection
@matplotlib.testing.decorators.image_comparison(['test_max_min_lifetime_bar_plot.png'])
@matplotlib.testing.decorators.image_comparison(
['test_max_min_lifetime_bar_plot.png']
)
def test_max_min_lifetime_bar_plot(self):
fig, ax = plt.subplots()
collection = ProcessCollection(
......