From d7c9391e36102588e1b5cc9b46d132633c9e4267 Mon Sep 17 00:00:00 2001
From: Michal Danilowicz <mdaniowi@amd.com>
Date: Mon, 16 Sep 2024 16:44:54 +0000
Subject: [PATCH 01/15] [Feature] Timeout template added

---
 src/finn/custom_op/fpgadataflow/hlsbackend.py | 14 ++++++
 src/finn/custom_op/fpgadataflow/templates.py  | 45 +++++++++++++++++++
 2 files changed, 59 insertions(+)
diff --git a/src/finn/custom_op/fpgadataflow/hlsbackend.py b/src/finn/custom_op/fpgadataflow/hlsbackend.py
index d8210fd684..c03a9029db 100644
--- a/src/finn/custom_op/fpgadataflow/hlsbackend.py
+++ b/src/finn/custom_op/fpgadataflow/hlsbackend.py
@@ -474,3 +474,17 @@ def get_ap_int_max_w(self):
         ret = max([instream, outstream])
         assert ret <= 8191, "AP_INT_MAX_W=%d is larger than allowed maximum of 8191" % ret
         return ret
+
+    def timeout_value(self):
+        """Set timeout value for HLS functions defined for one clock cycle"""
+        self.code_gen_dict["$TIMEOUT_VALUE$"] = ["100"]
+
+    def timeout_condition(self):
+        """Set timeout condition for HLS functions defined for one clock cycle"""
+        self.code_gen_dict["$TIMEOUT_CONDITION$"] = ["out_{}.empty()".format(self.hls_sname())]
+
+    def timeout_read_stream(self):
+        """Set reading output stream procedure for HLS functions defined for one clock cycle"""
+        self.code_gen_dict["$TIMEOUT_READ_STREAM$"] = [
+            "debug_out_{} << out_{}.read();".format(self.hls_sname(), self.hls_sname())
+        ]
diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index 3d89a0ab23..7ef74118ec 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -58,6 +58,51 @@
 
 """
 
+# template for single node execution with timeout (for single clock hls operations)
+docompute_template_timeout = """
+#define AP_INT_MAX_W $AP_INT_MAX_W$
+#include "cnpy.h"
+#include "npy2apintstream.hpp"
+#include "npy2vectorstream.hpp"
+#include <vector>
+#include "bnn-library.h"
+
+// includes for network parameters
+$GLOBALS$
+
+// defines for network parameters
+$DEFINES$
+
+int main(){
+$PRAGMAS$
+
+$STREAMDECLARATIONS$
+
+$READNPYDATA$
+
+unsigned timeout = 0;
+while(timeout < $TIMEOUT_VALUE$){
+
+$DOCOMPUTE$
+
+if($TIMEOUT_CONDITION$){
+timeout++;
+}
+
+else{
+$TIMEOUT_READ_STREAM$
+timeout = 0;
+}
+}
+
+$DATAOUTSTREAM$
+
+$SAVEASCNPY$
+
+}
+
+"""
+
 # templates for single node ip generation
 
 # cpp file

From 6da0ce4d10db86f2eea3bb513164c752401956d8 Mon Sep 17 00:00:00 2001
From: mdaniowi <mdaniowi@amd.com>
Date: Fri, 20 Sep 2024 16:02:40 +0100
Subject: [PATCH 02/15] [Feature] npy2vectorstream.hpp include added to
 docompute_template

---
 src/finn/custom_op/fpgadataflow/templates.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index 7ef74118ec..d2100a7516 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -32,6 +32,7 @@
 #define AP_INT_MAX_W $AP_INT_MAX_W$
 #include "cnpy.h"
 #include "npy2apintstream.hpp"
+#include "npy2vectorstream.hpp"
 #include <vector>
 #include "bnn-library.h"
 

From 39a2efef2fca5f356b9d32017227f1a044a0a0da Mon Sep 17 00:00:00 2001
From: Michal Danilowicz <mdaniowi@amd.com>
Date: Mon, 16 Sep 2024 15:01:54 +0000
Subject: [PATCH 03/15] [Feature] Moving operations past Split added,
 MoveIdenticalOpPastJoinOp refactored and derived from by
 MoveTransposePastJoinAdd, MoveMulPastJoinAdd, MoveAddPastJoinAdd,
 MoveTransposePastJoinConcat, MoveAffinePastJoinConcat

---
 src/finn/transformation/streamline/reorder.py | 399 ++++++++++++++++--
 .../test_move_identical_op_past_join_add.py   | 150 +++++++
 ...test_move_identical_op_past_join_concat.py | 183 ++++++++
 .../test_move_identical_op_past_join_op.py    | 114 -----
 .../test_move_identical_op_past_split.py      | 145 +++++++
 5 files changed, 839 insertions(+), 152 deletions(-)
 create mode 100644 tests/transformation/streamline/test_move_identical_op_past_join_add.py
 create mode 100644 tests/transformation/streamline/test_move_identical_op_past_join_concat.py
 delete mode 100644 tests/transformation/streamline/test_move_identical_op_past_join_op.py
 create mode 100644 tests/transformation/streamline/test_move_identical_op_past_split.py

diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py
index 9a7e9d0723..33751cb4d8 100644
--- a/src/finn/transformation/streamline/reorder.py
+++ b/src/finn/transformation/streamline/reorder.py
@@ -518,7 +518,9 @@ def apply(self, model):
 
 
 class MoveLinearPastEltwiseAdd(Transformation):
-    """Move linear operations (mul, add) past elementwise add operations where possible.
+    """
+    DEPRECATED, use MoveAddPastJoinAdd() and MoveMulPastJoinAdd()
+    Move linear operations (mul, add) past elementwise add operations where possible.
     Specifically,matches and transforms the following patterns:
     (x*C) + (y*C) -> (x + y) * C
     (x+A) + (y+B) -> (x + y) + (A + B)
@@ -918,6 +920,121 @@ def __init__(self):
         super().__init__(["Transpose"])
 
 
+def permute_shape(shape, perm):
+    new_shape = np.zeros(len(shape))
+    for i, p in enumerate(perm):
+        new_shape[i] = shape[p]
+    return [int(el) for el in new_shape]
+
+
+class MoveScalarLinearPastSplit(Transformation):
+    """
+    Move scalar Mul and Add nodes past channel split operation.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.ops_to_move = ["Mul", "Add"]
+        self.fork_ops = ["Split"]
+
+    def apply(self, model):
+        """Replace scalar-op -> Split by Split -> one copy of the scalar op per output."""
+        graph = model.graph
+        graph_modified = False
+        node_ind = 0
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type in self.fork_ops:
+                producer = model.find_producer(n.input[0])
+                if producer is not None and producer.op_type in self.ops_to_move:
+                    # the producer is removed below, so it must not feed any other consumer
+                    if model.is_fork_node(producer):
+                        continue
+                    linear_param = model.get_initializer(producer.input[1])
+                    # Check if single input
+                    if len(producer.input) != 2 or linear_param is None:
+                        continue
+                    # Check if scalar
+                    if np.prod(linear_param.shape) != 1:
+                        continue
+                    split_outputs = n.output
+                    for split_output_idx, old_split_output in enumerate(split_outputs):
+                        new_mul_node = deepcopy(producer)
+                        new_split_output = model.make_new_valueinfo_name()
+                        model.set_tensor_datatype(
+                            new_split_output, model.get_tensor_datatype(producer.input[0])
+                        )
+                        model.set_tensor_shape(
+                            new_split_output, model.get_tensor_shape(old_split_output)
+                        )
+                        n.output[split_output_idx] = new_split_output
+                        new_mul_node.input[0] = new_split_output
+                        new_mul_node.output[0] = old_split_output
+                        graph.node.insert(node_ind, new_mul_node)
+                        node_ind += 1
+
+                    # remove the mul node
+                    n.input[0] = producer.input[0]
+                    graph.node.remove(producer)
+                    graph_modified = True
+
+        if graph_modified:
+            model = model.transform(SortGraph(), make_deepcopy=False, cleanup=False)
+
+        return (model, graph_modified)
+
+
+class MoveTransposePastSplit(Transformation):
+    """Move a Transpose node past a following Split and remap the split axis accordingly."""
+
+    def __init__(self):
+        super().__init__()
+        self.ops_to_move = ["Transpose"]
+        self.fork_ops = ["Split"]
+
+    def apply(self, model):
+        graph = model.graph
+        graph_modified = False
+        node_ind = 0
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type in self.fork_ops:
+                producer = model.find_producer(n.input[0])
+                if producer is not None and producer.op_type in self.ops_to_move:
+                    # the producer is removed below, so it must not feed any other consumer
+                    if model.is_fork_node(producer):
+                        continue
+                    initial_perm = get_by_name(producer.attribute, "perm").ints
+                    reverse_perm = np.argsort(initial_perm)
+                    split_outputs = n.output
+                    for split_output_idx, old_split_output in enumerate(split_outputs):
+                        new_trans_node = deepcopy(producer)
+                        new_split_output = model.make_new_valueinfo_name()
+                        old_split_output_shape = model.get_tensor_shape(old_split_output)
+                        model.set_tensor_datatype(
+                            new_split_output, model.get_tensor_datatype(producer.input[0])
+                        )
+                        model.set_tensor_shape(
+                            new_split_output, permute_shape(old_split_output_shape, reverse_perm)
+                        )
+                        n.output[split_output_idx] = new_split_output
+                        new_trans_node.input[0] = new_split_output
+                        new_trans_node.output[0] = old_split_output
+                        graph.node.insert(node_ind, new_trans_node)
+                        node_ind += 1
+                    # remove the transpose node and change the split axis
+                    old_split_axis = get_by_name(n.attribute, "axis").i
+                    get_by_name(n.attribute, "axis").i = initial_perm[old_split_axis]
+                    n.input[0] = producer.input[0]
+                    graph.node.remove(producer)
+                    graph_modified = True
+
+        if graph_modified:
+            model = model.transform(SortGraph(), make_deepcopy=False, cleanup=False)
+
+        return (model, graph_modified)
+
+
 class MoveMaxPoolPastMultiThreshold(Transformation):
     """Move MaxPool nodes past MultiThreshold nodes on linear segments of the graph."""
 
@@ -1188,13 +1305,8 @@ def apply(self, model):
 
 class MoveIdenticalOpPastJoinOp(Transformation):
     """
-    Move identical operations on different branches past the common join node.
-    This transformation assumes that the identical operations only change the
-    data layout. For linear operations, see the transformation MoveLinearPastEltwiseAdd.
-    Specifically, this transformation matches and transforms the following patterns:
-    f(x) + f(y) -> f(x + y)
-    where f(.) is currently only supporting 'Transpose', and an 'Add' node is
-    the join node.
+    Move multiple identical operations on different branches past the common join node.
+    It assumes the shape to be preserved by the join op in the default move_node() method
     """
 
     def __init__(self, identical_op_list, join_node_list):
@@ -1202,52 +1314,77 @@ def __init__(self, identical_op_list, join_node_list):
         self.ops_to_move = identical_op_list
         self.join_node_op = join_node_list
 
-    def move_node(self, model, n, prod0, prod1):
-        # Found! move one of the identical_ops to output, remove the other one
-        identical_op0_in0 = prod0.input[0]
-        identical_op1_in0 = prod1.input[0]
-        add_in0 = n.input[0]
-        add_out = n.output[0]
+    def move_node(self, model, n, producers):
+        """
+        Should be overwritten for some operations
+
+        Returns:
+            bool: whether moving the node was successful
+        """
+        identical_ops_inputs = [p.input[0] for p in producers]
+        # join_in0 = n.input[0]
+        join_out = n.output[0]
 
-        # Rewire
-        n.input[0] = identical_op0_in0
-        n.input[1] = identical_op1_in0
+        # Rewire join op inputs
+        for i in range(len(n.input)):
+            n.input[i] = identical_ops_inputs[i]
 
         # Output tensor of the join node must have the same shape as
         # its input tensor (original shape is preserved)
-        new_shape = model.get_tensor_shape(identical_op0_in0)
+        new_join_output = model.make_new_valueinfo_name()
+        new_shape = model.get_tensor_shape(identical_ops_inputs[0])
+        new_layout = model.get_tensor_layout(identical_ops_inputs[0])
 
         # Set new tensor shape
-        model.set_tensor_shape(tensor_name=add_in0, tensor_shape=new_shape)
-
-        n.output[0] = add_in0
-        prod0.input[0] = add_in0
-        prod0.output[0] = add_out
-
-        model.graph.node.remove(prod1)
+        model.set_tensor_shape(new_join_output, new_shape)
+        if new_layout:
+            model.set_tensor_layout(new_join_output, new_layout)
+
+        # Rewire join op outputs (reuse the first join input tensor)
+        n.output[0] = new_join_output
+        producers[0].input[0] = new_join_output
+        producers[0].output[0] = join_out
+
+        for prod in producers[1:]:
+            model.graph.node.remove(prod)
+
+        return True
+
+    def are_producers_identical(self, model, producers):
+        """
+        Compare the producers by op_type only.
+
+        Subclasses should override this method to add further checks;
+        `model` is not used by this base implementation.
+        """
+        distinct_op_types = {prod.op_type for prod in producers}
+        # zero or one distinct op_type means no producer differs from the first
+        return len(distinct_op_types) <= 1
 
     def apply(self, model):
         graph = model.graph
         graph_modified = False
         for n in graph.node:
             if n.op_type in self.join_node_op and model.is_join_node(n):
-                in0 = n.input[0]
-                in1 = n.input[1]
-                if in0 is None or in1 is None:
+                inputs = n.input
+                if None in inputs:
                     continue
 
-                prod0 = model.find_producer(in0)
-                prod1 = model.find_producer(in1)
-                # Checks if the join node is preceded by
-                # two different, but identical operations
-                if prod0 == prod1:
+                producers = [model.find_producer(inp) for inp in inputs]
+                if None in producers or producers[0].op_type not in self.ops_to_move:
+                    continue  # an input without producer (e.g. graph input) cannot be moved
+                identical_ops = self.are_producers_identical(model, producers)
+                if not identical_ops:
+                    warnings.warn("Producers not identical, skipping")
                     continue
 
-                identical_op = prod0.op_type == prod1.op_type
-
-                if identical_op and prod0.op_type in self.ops_to_move:
-                    self.move_node(model, n, prod0, prod1)
-                    graph_modified = True
+                # check for producers that are fork nodes (need to fork them before our transform)
+                for prod in producers:
+                    if model.is_fork_node(prod) and not model.is_join_node(prod):
+                        model = model.transform(MoveOpPastFork(self.ops_to_move))
+                        # topology modified, "ask" ModelWrapper to apply this transform again
+                        return (model, True)
+                graph_modified |= bool(self.move_node(model, n, producers))  # keep earlier True
 
         if graph_modified:
             model = model.transform(SortGraph(), make_deepcopy=False, cleanup=False)
@@ -1258,3 +1395,189 @@ def apply(self, model):
 class MoveTransposePastJoinAdd(MoveIdenticalOpPastJoinOp):
     def __init__(self):
         super().__init__(["Transpose"], ["Add"])
+
+    def are_producers_identical(self, model, producers):
+        """Return True only if all producers share the op_type and the same `perm` attribute."""
+        if not super().are_producers_identical(model, producers):
+            return False
+        first_perm = list(get_by_name(producers[0].attribute, "perm").ints)
+        return all(
+            list(get_by_name(prod.attribute, "perm").ints) == first_perm for prod in producers
+        )
+
+
+class MoveMulPastJoinAdd(MoveIdenticalOpPastJoinOp):
+    """Move Mul nodes with equal constant factors past a joining Add: x*C + y*C -> (x+y)*C"""
+
+    def __init__(self):
+        super().__init__(["Mul"], ["Add"])
+
+    def are_producers_identical(self, model, producers):
+        if not super().are_producers_identical(model, producers):
+            return False
+        first_mul = model.get_initializer(producers[0].input[1])
+        # np.array_equal gives one bool (False for None/shape mismatch); `!=` is elementwise
+        return first_mul is not None and all(
+            np.array_equal(first_mul, model.get_initializer(p.input[1])) for p in producers[1:]
+        )
+
+
+class MoveAddPastJoinAdd(MoveIdenticalOpPastJoinOp):
+    """Move constant Add nodes past a joining Add: (x+A) + (y+B) -> (x+y) + (A+B)"""
+
+    def __init__(self):
+        super().__init__(["Add"], ["Add"])
+
+    def are_producers_identical(self, model, producers):
+        if not super().are_producers_identical(model, producers):
+            return False
+        # every producer must add a constant, the constants themselves may differ
+        return all(model.get_initializer(p.input[1]) is not None for p in producers)
+
+    def move_node(self, model, n, producers):
+        """
+        Fold the constants of all producers into the first producer, then use the
+        base move_node method to move it past the join node (and delete the rest)
+        """
+        add_inits = [model.get_initializer(producer.input[1]) for producer in producers]
+        # axis=0 adds the constants elementwise; without it np.sum also reduces
+        # over the tensor axes and collapses non-scalar constants to a single value
+        new_init = np.sum(add_inits, axis=0)
+        model.set_initializer(producers[0].input[1], new_init)
+        return super().move_node(model, n, producers)
+
+
+class MoveTransposePastJoinConcat(MoveIdenticalOpPastJoinOp):
+    """Move Transpose nodes that share one permutation past a joining Concat."""
+
+    def __init__(self):
+        super().__init__(["Transpose"], ["Concat"])
+
+    def are_producers_identical(self, model, producers):
+        if not super().are_producers_identical(model, producers):
+            return False
+        first_perm = list(get_by_name(producers[0].attribute, "perm").ints)
+        return all(
+            list(get_by_name(prod.attribute, "perm").ints) == first_perm for prod in producers
+        )
+
+    def move_node(self, model, n, producers):
+        """Feed the un-transposed tensors to the Concat and keep one Transpose behind it."""
+        trans_inputs = [prod.input[0] for prod in producers]
+        concat_out = n.output[0]
+        # Rewire concat inputs
+        for i in range(len(n.input)):
+            n.input[i] = trans_inputs[i]
+
+        new_concat_out = model.make_new_valueinfo_name()
+        # reverse the permutation of the concat output
+        transpose_perm = get_by_name(producers[0].attribute, "perm").ints
+        reverse_perm = np.argsort(transpose_perm)
+        new_concat_out_shape = permute_shape(model.get_tensor_shape(concat_out), reverse_perm)
+        new_concat_out_layout = model.get_tensor_layout(trans_inputs[0])
+        # Set tensor layout and shape of the new concatenation output
+        model.set_tensor_shape(new_concat_out, new_concat_out_shape)
+        if new_concat_out_layout:
+            model.set_tensor_layout(new_concat_out, new_concat_out_layout)
+        # Change concatenation axis
+        old_concat_axis = get_by_name(n.attribute, "axis").i
+        get_by_name(n.attribute, "axis").i = transpose_perm[old_concat_axis]
+
+        # Rewire concat output
+        n.output[0] = new_concat_out
+        producers[0].input[0] = new_concat_out
+        producers[0].output[0] = concat_out
+
+        for prod in producers[1:]:
+            model.graph.node.remove(prod)
+        return True
+
+
+class MoveAffinePastJoinConcat(MoveIdenticalOpPastJoinOp):
+    """
+    Applies to identical scalar ops, or to channelwise ops along the concatenation axis
+    """
+
+    def __init__(self, linear_ops=["Mul", "Add"]):
+        super().__init__(linear_ops, ["Concat"])
+
+    def are_producers_identical_scalar_ops(self, model, producers):
+        """Return True if all producer parameters are single-element and equal in value."""
+        params = [model.get_initializer(producer.input[1]) for producer in producers]
+        for param in params:
+            # size first: `!=` may not broadcast for channelwise params of different length
+            if np.prod(param.shape) != 1 or (params[0] != param).any():
+                return False
+        return True
+
+    def are_producers_channelwise_ops(self, channel_dim, model, producers):
+        """Return True if each producer parameter has num_channels entries along channel_dim."""
+        for producer in producers:
+            num_channels = model.get_tensor_shape(producer.input[0])[channel_dim]
+            param_shape = model.get_initializer(producer.input[1]).shape
+            # range check first: channel_dim may not exist in a lower-rank parameter
+            if (
+                not -len(param_shape) <= channel_dim < len(param_shape)
+                or param_shape[channel_dim] != num_channels
+            ):
+                return False
+        return True
+
+    def move_node(self, model, n, producers):
+        """Move the op past the Concat; return False (graph untouched) if not applicable."""
+        # check if single input
+        for producer in producers:
+            producer_init = model.get_initializer(producer.input[1])
+            if len(producer.input) != 2 or producer_init is None:
+                warnings.warn("Producer found that is not single-input, skipping")
+                return False
+
+        # decide if producers are identical scalar ops or channelwise ops
+        channelwise_op = False
+        identical_scalar_op = self.are_producers_identical_scalar_ops(model, producers)
+        if not identical_scalar_op:
+            channel_dim = get_by_name(n.attribute, "axis").i
+            channelwise_op = self.are_producers_channelwise_ops(channel_dim, model, producers)
+            if not channelwise_op:
+                warnings.warn(
+                    "Producers are neither identical scalar ops nor channelwise ops, skipping"
+                )
+                return False
+
+        # Rewire concat inputs
+        producers_inputs = [prod.input[0] for prod in producers]
+        concat_out = n.output[0]
+        for i in range(len(n.input)):
+            n.input[i] = producers_inputs[i]
+        # Set tensor layout and shape of the new concatenation output
+        new_concat_out = model.make_new_valueinfo_name()
+        new_concat_out_layout = model.get_tensor_layout(producers_inputs[0])
+        model.set_tensor_shape(new_concat_out, model.get_tensor_shape(concat_out))
+        if new_concat_out_layout:
+            model.set_tensor_layout(new_concat_out, new_concat_out_layout)
+        model.set_tensor_datatype(new_concat_out, model.get_tensor_datatype(producers_inputs[0]))
+
+        if channelwise_op:
+            # concatenate op params of producers into one mul tensor
+            producers_params = [model.get_initializer(prod.input[1]) for prod in producers]
+            new_mul_tensor = np.concatenate(producers_params, axis=channel_dim)
+            model.set_initializer(producers[0].input[1], new_mul_tensor)
+
+        # Rewire concat output
+        n.output[0] = new_concat_out
+        producers[0].input[0] = new_concat_out
+        producers[0].output[0] = concat_out
+
+        for prod in producers[1:]:
+            model.graph.node.remove(prod)
+        return True
+
+
+class MoveMulPastJoinConcat(MoveAffinePastJoinConcat):
+    def __init__(self):
+        super().__init__(["Mul"])
+
+
+class MoveAddPastJoinConcat(MoveAffinePastJoinConcat):
+    def __init__(self):
+        super().__init__(["Add"])
diff --git a/tests/transformation/streamline/test_move_identical_op_past_join_add.py b/tests/transformation/streamline/test_move_identical_op_past_join_add.py
new file mode 100644
index 0000000000..7226d31589
--- /dev/null
+++ b/tests/transformation/streamline/test_move_identical_op_past_join_add.py
@@ -0,0 +1,150 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import pytest
+
+import numpy as np
+from onnx import TensorProto
+from onnx import helper as oh
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
+
+import finn.core.onnx_exec as oxe
+from finn.transformation.streamline.reorder import (
+    MoveAddPastJoinAdd,
+    MoveMulPastJoinAdd,
+    MoveTransposePastJoinAdd,
+)
+
+
+def create_add_model(identical_op):
+    perm = None
+    if "Transpose" in identical_op:
+        perm = identical_op.split("_")[1]
+        identical_op = identical_op.split("_")[0]
+        perm = [int(char) for char in perm]
+    if perm == [0, 2, 3, 1]:
+        in_shape = [1, 64, 10, 9]
+        out_shape = [1, 10, 9, 64]
+    elif perm == [0, 3, 1, 2]:
+        in_shape = [1, 10, 9, 64]
+        out_shape = [1, 64, 10, 9]
+    else:
+        in_shape = [1, 64, 10, 9]
+        out_shape = in_shape
+    op_value = 1.5
+
+    op1_node = oh.make_node(identical_op, inputs=["in1"], outputs=["op1_out"])
+
+    op2_node = oh.make_node(identical_op, inputs=["in2"], outputs=["op2_out"])
+
+    if identical_op == "Transpose":
+        new_attr = oh.make_attribute("perm", perm)
+        op1_node.attribute.append(new_attr)
+        op2_node.attribute.append(new_attr)
+    elif identical_op == "Mul" or identical_op == "Add":
+        op1_init = oh.make_tensor_value_info("op1_param", TensorProto.FLOAT, [1])
+        op2_init = oh.make_tensor_value_info("op2_param", TensorProto.FLOAT, [1])
+        op1_node.input.append(op1_init.name)
+        op2_node.input.append(op2_init.name)
+
+    add_node = oh.make_node("Add", inputs=["op1_out", "op2_out"], outputs=["out_join1"])
+
+    in1 = oh.make_tensor_value_info("in1", TensorProto.FLOAT, in_shape)
+    in2 = oh.make_tensor_value_info("in2", TensorProto.FLOAT, in_shape)
+    op1_out = oh.make_tensor_value_info("op1_out", TensorProto.FLOAT, out_shape)
+    op2_out = oh.make_tensor_value_info("op2_out", TensorProto.FLOAT, out_shape)
+    out_join1 = oh.make_tensor_value_info("out_join1", TensorProto.FLOAT, out_shape)
+
+    graph = oh.make_graph(
+        nodes=[op1_node, op2_node, add_node],
+        name="test_graph",
+        inputs=[in1, in2],
+        outputs=[out_join1],
+        value_info=[
+            op1_out,
+            op2_out,
+        ],
+    )
+
+    onnx_model = qonnx_make_model(graph, producer_name="test_model")
+    model = ModelWrapper(onnx_model)
+    if identical_op == "Mul" or identical_op == "Add":
+        model.set_initializer("op1_param", np.array(op_value).astype(np.float32))
+        model.set_initializer("op2_param", np.array(op_value).astype(np.float32))
+
+    return model
+
+
+transform_dict = {
+    "Transpose_0231": MoveTransposePastJoinAdd(),
+    "Transpose_0312": MoveTransposePastJoinAdd(),
+    "Mul": MoveMulPastJoinAdd(),
+    "Add": MoveAddPastJoinAdd(),
+}
+
+
+@pytest.mark.streamline
+# Permutation of transpose node
+@pytest.mark.parametrize("identical_op", ["Transpose_0231", "Transpose_0312", "Mul", "Add"])
+def test_move_identical_op_past_join_op(identical_op):
+    model = create_add_model(identical_op)
+    # build_dir = os.environ["FINN_BUILD_DIR"]
+    # model.save(join(build_dir, "add_pytest_model_{}.onnx".format(identical_op)))
+
+    # Create input data
+    input0_tensor_name = model.graph.input[0].name
+    input1_tensor_name = model.graph.input[1].name
+
+    # Note: it is assumed that both tensors have the same shape and data type
+    input_shape = model.get_tensor_shape(input0_tensor_name)
+    input_dtype = model.get_tensor_datatype(input0_tensor_name)
+    input_val = gen_finn_dt_tensor(input_dtype, input_shape)
+    input_dict = {}
+    input_dict[input0_tensor_name] = input_val
+    input_dict[input1_tensor_name] = input_val
+
+    model_transformed = model.transform(transform_dict[identical_op])
+    # model_transformed.save(join(build_dir, "add_pytest_model_{}_trans.onnx".format(identical_op)))
+
+    assert oxe.compare_execution(model, model_transformed, input_dict)
+
+    # Check if order changed
+    node0_optype_model = model.find_consumers(model.graph.input[0].name)[0].op_type
+    node1_optype_model = model.find_consumers(model.graph.input[1].name)[0].op_type
+    node0_optype_model_transformed = model_transformed.find_consumers(
+        model_transformed.graph.input[0].name
+    )[0].op_type
+    node1_optype_model_transformed = model_transformed.find_consumers(
+        model_transformed.graph.input[1].name
+    )[0].op_type
+    last_node_optype_model_transformed = model_transformed.find_producer(
+        model_transformed.graph.output[0].name
+    ).op_type
+    assert node0_optype_model == last_node_optype_model_transformed
+    assert node1_optype_model == last_node_optype_model_transformed
+    assert node0_optype_model_transformed == node1_optype_model_transformed == "Add"
diff --git a/tests/transformation/streamline/test_move_identical_op_past_join_concat.py b/tests/transformation/streamline/test_move_identical_op_past_join_concat.py
new file mode 100644
index 0000000000..2dcf90d10a
--- /dev/null
+++ b/tests/transformation/streamline/test_move_identical_op_past_join_concat.py
@@ -0,0 +1,183 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import pytest
+
+import numpy as np
+import os
+from onnx import TensorProto
+from onnx import helper as oh
+from os.path import join
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
+
+import finn.core.onnx_exec as oxe
+from finn.transformation.streamline.reorder import (
+    MoveAddPastJoinConcat,
+    MoveMulPastJoinConcat,
+    MoveTransposePastJoinConcat,
+)
+
+
+def create_concat_model(identical_op):  # two identical_op branches joined by one Concat
+    perm = None  # stays None unless identical_op is a "Transpose_<digits>" id
+    channelwise = False
+    if "Transpose" in identical_op:
+        perm = identical_op.split("_")[1]  # digit string after "_", e.g. "0231"
+        identical_op = identical_op.split("_")[0]  # keep only the ONNX op type
+        perm = [int(char) for char in perm]  # one int per digit character
+    if "channelwise" in identical_op:
+        channelwise = True
+        identical_op = identical_op.split("_")[0]  # drop the "_channelwise" suffix
+    if perm == [0, 2, 3, 1]:  # axis 1 moves to the last axis; Concat on axis 3
+        in_shape1 = [1, 64, 10, 9]
+        in_shape2 = [1, 32, 10, 9]
+        out_shape1 = [1, 10, 9, 64]
+        out_shape2 = [1, 10, 9, 32]
+        out_join_shape = [1, 10, 9, 96]
+        concat_axis = 3
+    elif perm == [0, 3, 1, 2]:  # last axis moves to axis 1; Concat on axis 1
+        in_shape1 = [1, 10, 9, 64]
+        in_shape2 = [1, 10, 9, 32]
+        out_shape1 = [1, 64, 10, 9]
+        out_shape2 = [1, 32, 10, 9]
+        out_join_shape = [1, 96, 10, 9]
+        concat_axis = 1
+    else:  # no recognised perm (perm is None for Mul / Add ids): shapes unchanged
+        in_shape1 = [1, 64, 10, 9]
+        in_shape2 = [1, 32, 10, 9]
+        out_shape1 = in_shape1
+        out_shape2 = in_shape2
+        out_join_shape = [1, 96, 10, 9]
+        concat_axis = 1
+        if channelwise:  # one entry per position on axis 1, different constant per branch
+            op1_param_shape = [1, 64, 1, 1]
+            op2_param_shape = [1, 32, 1, 1]
+            op1_param = np.ones((1, 64, 1, 1)) * 2
+            op2_param = np.ones((1, 32, 1, 1)) * 3
+        else:  # both branches share the same scalar value
+            op1_param_shape = [1]
+            op2_param_shape = [1]
+            op1_param = 1.5
+            op2_param = 1.5
+
+    op1_node = oh.make_node(identical_op, inputs=["in1"], outputs=["op1_out"])
+
+    op2_node = oh.make_node(identical_op, inputs=["in2"], outputs=["op2_out"])
+
+    if identical_op == "Transpose":
+        new_attr = oh.make_attribute("perm", perm)  # same perm for both branches
+        op1_node.attribute.append(new_attr)
+        op2_node.attribute.append(new_attr)
+    elif identical_op == "Mul" or identical_op == "Add":  # parameter is the second input
+        op1_init = oh.make_tensor_value_info("op1_param", TensorProto.FLOAT, op1_param_shape)
+        op2_init = oh.make_tensor_value_info("op2_param", TensorProto.FLOAT, op2_param_shape)
+        op1_node.input.append(op1_init.name)  # value info is used only for its name
+        op2_node.input.append(op2_init.name)
+
+    concat_node = oh.make_node(
+        "Concat", inputs=["op1_out", "op2_out"], outputs=["out_join1"], axis=concat_axis
+    )
+
+    in1 = oh.make_tensor_value_info("in1", TensorProto.FLOAT, in_shape1)
+    in2 = oh.make_tensor_value_info("in2", TensorProto.FLOAT, in_shape2)
+    op1_out = oh.make_tensor_value_info("op1_out", TensorProto.FLOAT, out_shape1)
+    op2_out = oh.make_tensor_value_info("op2_out", TensorProto.FLOAT, out_shape2)
+    out_join1 = oh.make_tensor_value_info("out_join1", TensorProto.FLOAT, out_join_shape)
+
+    graph = oh.make_graph(
+        nodes=[op1_node, op2_node, concat_node],
+        name="test_graph",
+        inputs=[in1, in2],
+        outputs=[out_join1],
+        value_info=[
+            op1_out,
+            op2_out,
+        ],
+    )
+
+    onnx_model = qonnx_make_model(graph, producer_name="test_model")
+    model = ModelWrapper(onnx_model)
+    if identical_op == "Mul" or identical_op == "Add":  # store parameters as initializers
+        model.set_initializer("op1_param", np.array(op1_param).astype(np.float32))
+        model.set_initializer("op2_param", np.array(op2_param).astype(np.float32))
+
+    return model
+
+
+transform_dict = {  # parametrized test id -> transformation instance to apply
+    "Transpose_0231": MoveTransposePastJoinConcat(),
+    "Transpose_0312": MoveTransposePastJoinConcat(),
+    "Mul": MoveMulPastJoinConcat(),
+    "Mul_channelwise": MoveMulPastJoinConcat(),
+    "Add": MoveAddPastJoinConcat(),
+    "Add_channelwise": MoveAddPastJoinConcat(),
+}
+
+
+@pytest.mark.streamline
+# Op variant placed on both branches ahead of the Concat (Transpose ids carry the perm digits)
+@pytest.mark.parametrize(
+    "identical_op",
+    ["Transpose_0231", "Transpose_0312", "Mul", "Add", "Mul_channelwise", "Add_channelwise"],
+)
+def test_move_identical_op_past_join_concat(identical_op):
+    model = create_concat_model(identical_op)
+    build_dir = os.environ["FINN_BUILD_DIR"]  # KeyError if the variable is unset
+    model.save(join(build_dir, "concat_pytest_model_{}.onnx".format(identical_op)))
+
+    # Create input data
+    input0_tensor_name = model.graph.input[0].name
+    input1_tensor_name = model.graph.input[1].name
+
+    # Note: each input gets its own random tensor matching its own shape and data type
+    input_dict = {}
+    input_dict[input0_tensor_name] = gen_finn_dt_tensor(
+        model.get_tensor_datatype(input0_tensor_name), model.get_tensor_shape(input0_tensor_name)
+    )
+    input_dict[input1_tensor_name] = gen_finn_dt_tensor(
+        model.get_tensor_datatype(input1_tensor_name), model.get_tensor_shape(input1_tensor_name)
+    )
+
+    model_transformed = model.transform(transform_dict[identical_op])
+    model_transformed.save(
+        join(build_dir, "concat_pytest_model_{}_trans.onnx".format(identical_op))
+    )
+
+    assert oxe.compare_execution(model, model_transformed, input_dict)
+
+    # Check if order changed: the first consumer of each input must have a new op type
+    node0_input0_model = model.find_consumers(model.graph.input[0].name)[0].op_type
+    node1_input1_model = model.find_consumers(model.graph.input[1].name)[0].op_type
+    node0_input0_model_transformed = model_transformed.find_consumers(
+        model_transformed.graph.input[0].name
+    )[0].op_type
+    node1_input1_model_transformed = model_transformed.find_consumers(
+        model_transformed.graph.input[1].name
+    )[0].op_type
+    assert node0_input0_model != node0_input0_model_transformed
+    assert node1_input1_model != node1_input1_model_transformed
diff --git a/tests/transformation/streamline/test_move_identical_op_past_join_op.py b/tests/transformation/streamline/test_move_identical_op_past_join_op.py
deleted file mode 100644
index dd83681fc2..0000000000
--- a/tests/transformation/streamline/test_move_identical_op_past_join_op.py
+++ /dev/null
@@ -1,114 +0,0 @@
-# Copyright (c) 2020, Xilinx
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice, this
-#   list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-#
-# * Neither the name of FINN nor the names of its
-#   contributors may be used to endorse or promote products derived from
-#   this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-import pytest
-
-from onnx import TensorProto
-from onnx import helper as oh
-from qonnx.core.modelwrapper import ModelWrapper
-from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
-
-import finn.core.onnx_exec as oxe
-from finn.transformation.streamline.reorder import MoveTransposePastJoinAdd
-
-
-def create_model(perm):
-    if perm == [0, 3, 1, 2]:
-        in_shape = [1, 128, 1, 256]
-        out_shape = [1, 256, 128, 1]
-    if perm == [0, 2, 3, 1]:
-        in_shape = [1, 256, 128, 1]
-        out_shape = [1, 128, 1, 256]
-
-    Transpose1_node = oh.make_node(
-        "Transpose", inputs=["in_transpose1"], outputs=["out_transpose1"], perm=perm
-    )
-
-    Transpose2_node = oh.make_node(
-        "Transpose", inputs=["in_transpose2"], outputs=["out_transpose2"], perm=perm
-    )
-
-    Join1_node = oh.make_node(
-        "Add", inputs=["out_transpose1", "out_transpose2"], outputs=["out_join1"]
-    )
-
-    in_transpose1 = oh.make_tensor_value_info("in_transpose1", TensorProto.FLOAT, in_shape)
-    in_transpose2 = oh.make_tensor_value_info("in_transpose2", TensorProto.FLOAT, in_shape)
-    out_transpose1 = oh.make_tensor_value_info("out_transpose1", TensorProto.FLOAT, out_shape)
-    out_transpose2 = oh.make_tensor_value_info("out_transpose2", TensorProto.FLOAT, out_shape)
-    out_join1 = oh.make_tensor_value_info("out_join1", TensorProto.FLOAT, out_shape)
-
-    graph = oh.make_graph(
-        nodes=[Transpose1_node, Transpose2_node, Join1_node],
-        name="test_graph",
-        inputs=[in_transpose1, in_transpose2],
-        outputs=[out_join1],
-        value_info=[
-            out_transpose1,
-            out_transpose2,
-        ],
-    )
-
-    onnx_model = qonnx_make_model(graph, producer_name="test_model")
-    model = ModelWrapper(onnx_model)
-
-    return model
-
-
-@pytest.mark.streamline
-# Permutation of transpose node
-@pytest.mark.parametrize("perm", [[0, 3, 1, 2], [0, 2, 3, 1]])
-def test_move_identical_op_past_join_op(perm):
-    model = create_model(perm)
-
-    # Create input data
-    input0_tensor_name = model.graph.input[0].name
-    input1_tensor_name = model.graph.input[1].name
-
-    # Note: it is assumed that both tensors have the same shape and data type
-    input_shape = model.get_tensor_shape(input0_tensor_name)
-    input_dtype = model.get_tensor_datatype(input0_tensor_name)
-    input_val = gen_finn_dt_tensor(input_dtype, input_shape)
-    input_dict = {}
-    input_dict[input0_tensor_name] = input_val
-    input_dict[input1_tensor_name] = input_val
-
-    model_transformed = model.transform(MoveTransposePastJoinAdd())
-
-    assert oxe.compare_execution(model, model_transformed, input_dict)
-
-    # Check if order changed
-    node0_input0_model = model.find_consumers(model.graph.input[0].name)[0].op_type
-    node1_input1_model = model.find_consumers(model.graph.input[1].name)[0].op_type
-    node0_input0_model_transformed = model_transformed.find_consumers(
-        model_transformed.graph.input[0].name
-    )[0].op_type
-    node1_input1_model_transformed = model_transformed.find_consumers(
-        model_transformed.graph.input[1].name
-    )[0].op_type
-    assert node0_input0_model != node0_input0_model_transformed
-    assert node1_input1_model != node1_input1_model_transformed
diff --git a/tests/transformation/streamline/test_move_identical_op_past_split.py b/tests/transformation/streamline/test_move_identical_op_past_split.py
new file mode 100644
index 0000000000..a104f179be
--- /dev/null
+++ b/tests/transformation/streamline/test_move_identical_op_past_split.py
@@ -0,0 +1,145 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import pytest
+
+import numpy as np
+from onnx import TensorProto
+from onnx import helper as oh
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.transformation.general import GiveUniqueNodeNames
+from qonnx.util.basic import gen_finn_dt_tensor
+
+import finn.core.onnx_exec as oxe
+from finn.transformation.streamline.reorder import (
+    MoveScalarLinearPastSplit,
+    MoveTransposePastSplit,
+)
+
+
+def create_split_model(identical_op):  # a single identical_op feeding a two-output Split
+    perm = None  # stays None unless identical_op is a "Transpose_<digits>" id
+    if "Transpose" in identical_op:
+        perm = identical_op.split("_")[1]  # digit string after "_", e.g. "0231"
+        identical_op = identical_op.split("_")[0]  # keep only the ONNX op type
+        perm = [int(char) for char in perm]  # one int per digit character
+    if perm == [0, 2, 3, 1]:  # axis 1 moves to the last axis; Split on axis 3
+        in_shape = [1, 96, 10, 9]
+        out_shape = [1, 10, 9, 96]
+        out1_split_shape = [1, 10, 9, 32]
+        out2_split_shape = [1, 10, 9, 64]
+        split_axis = 3
+    elif perm == [0, 3, 1, 2]:  # last axis moves to axis 1; Split on axis 1
+        in_shape = [1, 10, 9, 96]
+        out_shape = [1, 96, 10, 9]
+        out1_split_shape = [1, 32, 10, 9]
+        out2_split_shape = [1, 64, 10, 9]
+        split_axis = 1
+    else:  # no recognised perm (perm is None for Mul / Add ids): shape unchanged
+        in_shape = [1, 96, 10, 9]
+        out_shape = in_shape
+        out1_split_shape = [1, 32, 10, 9]
+        out2_split_shape = [1, 64, 10, 9]
+        split_axis = 1
+    op_value = 1.5  # scalar parameter, only used when the op is Mul or Add
+    split = [32, 64]  # sizes of the two Split outputs along split_axis
+
+    op_node = oh.make_node(identical_op, inputs=["in1"], outputs=["op_out"])
+
+    if identical_op == "Transpose":
+        new_attr = oh.make_attribute("perm", perm)
+        op_node.attribute.append(new_attr)
+    elif identical_op == "Mul" or identical_op == "Add":  # parameter is the second input
+        op_init = oh.make_tensor_value_info("op_param", TensorProto.FLOAT, [1])
+        op_node.input.append(op_init.name)  # value info is used only for its name
+
+    in1 = oh.make_tensor_value_info("in1", TensorProto.FLOAT, in_shape)
+    op_out = oh.make_tensor_value_info("op_out", TensorProto.FLOAT, out_shape)
+    out1_split = oh.make_tensor_value_info("out1_split", TensorProto.FLOAT, out1_split_shape)
+    out2_split = oh.make_tensor_value_info("out2_split", TensorProto.FLOAT, out2_split_shape)
+    split_init = oh.make_tensor_value_info("split", TensorProto.INT64, [2])  # name only
+
+    split_node = oh.make_node(
+        "Split", [op_out.name, split_init.name], [out1_split.name, out2_split.name], axis=split_axis
+    )
+
+    graph = oh.make_graph(
+        nodes=[op_node, split_node],
+        name="test_graph",
+        inputs=[in1],
+        outputs=[out1_split, out2_split],
+        value_info=[op_out],
+    )
+
+    model = oh.make_model(graph)  # NOTE(review): concat test uses qonnx_make_model instead
+    model = ModelWrapper(model)
+    model.set_initializer(split_init.name, np.array(split, dtype=np.int64))
+    if identical_op == "Mul" or identical_op == "Add":  # store parameter as initializer
+        model.set_initializer(op_init.name, np.array(op_value).astype(np.float32))
+    model = model.transform(GiveUniqueNodeNames())
+
+    return model
+
+
+transform_dict = {  # parametrized test id -> transformation instance to apply
+    "Transpose_0231": MoveTransposePastSplit(),
+    "Transpose_0312": MoveTransposePastSplit(),
+    "Mul": MoveScalarLinearPastSplit(),
+    "Add": MoveScalarLinearPastSplit(),
+}
+
+
+@pytest.mark.streamline
+# Op placed ahead of the Split (Transpose ids carry the permutation digits)
+@pytest.mark.parametrize("identical_op", ["Transpose_0231", "Transpose_0312", "Mul", "Add"])
+def test_move_identical_op_past_split(identical_op):
+    model = create_split_model(identical_op)
+    # build_dir = os.environ["FINN_BUILD_DIR"]
+    # model.save(join(build_dir, "split_pytest_model_{}.onnx".format(identical_op)))
+
+    # Create input data
+    input0_tensor_name = model.graph.input[0].name
+
+    # Note: the model has a single input; its tensor follows that input's shape and data type
+    input_dict = {}
+    input_dict[input0_tensor_name] = gen_finn_dt_tensor(
+        model.get_tensor_datatype(input0_tensor_name), model.get_tensor_shape(input0_tensor_name)
+    )
+
+    model_transformed = model.transform(transform_dict[identical_op])
+    # model_transformed.save(
+    #     join(build_dir, "split_pytest_model_{}_trans.onnx".format(identical_op))
+    # )
+
+    assert oxe.compare_execution(model, model_transformed, input_dict)
+
+    # Check if order changed: the first consumer of the input must have a new op type
+    node0_input0_model = model.find_consumers(model.graph.input[0].name)[0].op_type
+    node0_input0_model_transformed = model_transformed.find_consumers(
+        model_transformed.graph.input[0].name
+    )[0].op_type
+    assert node0_input0_model != node0_input0_model_transformed

From 51a9199858166673893f8d7bea0d3f8805769232 Mon Sep 17 00:00:00 2001
From: Michal Danilowicz <mdaniowi@amd.com>
Date: Wed, 18 Sep 2024 14:50:31 +0000
Subject: [PATCH 04/15] [Deprecated] MoveLinearPastEltwiseAdd() removed from
 the codebase

---
 src/finn/transformation/streamline/reorder.py |  81 --------
 .../streamline/test_linear_past_eltwise.py    | 192 ------------------
 2 files changed, 273 deletions(-)
 delete mode 100644 tests/transformation/streamline/test_linear_past_eltwise.py

diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py
index 33751cb4d8..8688145453 100644
--- a/src/finn/transformation/streamline/reorder.py
+++ b/src/finn/transformation/streamline/reorder.py
@@ -517,87 +517,6 @@ def apply(self, model):
         return (model, graph_modified)
 
 
-class MoveLinearPastEltwiseAdd(Transformation):
-    """
-    DEPRECATED, use MoveAddPastJoinAdd() and MoveMulPastJoinAdd()
-    Move linear operations (mul, add) past elementwise add operations where possible.
-    Specifically,matches and transforms the following patterns:
-    (x*C) + (y*C) -> (x + y) * C
-    (x+A) + (y+B) -> (x + y) + (A + B)
-    where x and y are dynamic inputs, A, B, C are constant tensors (in general).
-    """
-
-    def move_node(self, graph, n, prod0, prod1, node_ind):
-        # found! move one of the muls to output, remove the other one
-        lin0_in0 = prod0.input[0]
-        lin1_in0 = prod1.input[0]
-        in0 = n.input[0]
-        out = n.output[0]
-        # TODO: check shapes don't change through scalar mul or add
-        # connect the eltwise add inputs to mul inputs
-        n.input[0] = lin0_in0
-        n.input[1] = lin1_in0
-        # connect mul0 output to eltwise add output
-        prod0.output[0] = out
-        # connect the input of mul0 and output of eltwise add together
-        n.output[0] = in0
-        prod0.input[0] = in0
-        # move prod0 node past eltwise add node, and remove prod1
-        graph.node.remove(prod1)
-        graph.node.remove(prod0)
-        graph.node.insert(node_ind - 2, prod0)
-
-    def apply(self, model):
-        graph = model.graph
-        node_ind = 0
-        graph_modified = False
-        nodes = [n for n in graph.node]
-        for n in nodes:
-            node_ind += 1
-            if n.op_type == "Add":
-                # check for tensors on both inputs (eltwise add)
-                # scalar add has an initializer on one input
-                in0 = n.input[0]
-                in1 = n.input[1]
-                if in0 is None or in1 is None:
-                    continue
-                A = model.get_initializer(in0)
-                B = model.get_initializer(in1)
-                if A is not None or B is not None:
-                    continue
-                # check for mul with same initializer on both inputs
-                prod0 = model.find_producer(in0)
-                prod1 = model.find_producer(in1)
-                # Also check case when both branches are empty and come
-                # from the same node: (prod0 == prod1)
-                # Other transform should handle that
-                if prod0 is None or prod1 is None or (prod0 == prod1):
-                    continue
-                if len(prod0.input) < 2 or len(prod1.input) < 2:
-                    continue
-                init0 = model.get_initializer(prod0.input[1])
-                init1 = model.get_initializer(prod1.input[1])
-                # if either initializer is None, skip
-                if init0 is None or init1 is None:
-                    continue
-                if prod0.op_type == "Mul" and prod1.op_type == "Mul":
-                    if np.array_equal(init0, init1):
-                        self.move_node(graph, n, prod0, prod1, node_ind)
-                        node_ind -= 1
-                        graph_modified = True
-                elif prod0.op_type == "Add" and prod1.op_type == "Add":
-                    init = init0 + init1
-                    # update initializer of prod0, which we'll move
-                    model.set_initializer(prod0.input[1], init)
-                    self.move_node(graph, n, prod0, prod1, node_ind)
-                    node_ind -= 1
-                    graph_modified = True
-                else:
-                    continue
-        model = model.transform(InferShapes())
-        return (model, graph_modified)
-
-
 class MoveScalarLinearPastInvariants(Transformation):
     """Move scalar linear operations (mul, add) past functions which are invariant
     to them. Specifically, matches and transforms the following patterns:
diff --git a/tests/transformation/streamline/test_linear_past_eltwise.py b/tests/transformation/streamline/test_linear_past_eltwise.py
deleted file mode 100644
index 70fc395652..0000000000
--- a/tests/transformation/streamline/test_linear_past_eltwise.py
+++ /dev/null
@@ -1,192 +0,0 @@
-# Copyright (c) 2020, Xilinx
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice, this
-#   list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-#
-# * Neither the name of FINN nor the names of its
-#   contributors may be used to endorse or promote products derived from
-#   this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import pytest
-
-import numpy as np
-import os
-from onnx import TensorProto, helper
-from qonnx.core.modelwrapper import ModelWrapper
-from qonnx.transformation.fold_constants import FoldConstants
-from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
-from qonnx.transformation.infer_shapes import InferShapes
-from qonnx.util.basic import qonnx_make_model
-
-import finn.core.onnx_exec as oxe
-from finn.transformation.streamline.reorder import MoveLinearPastEltwiseAdd
-
-export_onnx_path = "test_linear_past_eltwise.onnx"
-np_default_dtype = np.float32
-
-# construct a synthetic graph to test:
-# topk insertion, topk conversion to hls, add conversion to hls
-# graph should just be a sum
-
-
-def make_model(shape):
-    inp1 = helper.make_tensor_value_info("inp1", TensorProto.FLOAT, shape)
-    inp2 = helper.make_tensor_value_info("inp2", TensorProto.FLOAT, shape)
-    inp1_add = helper.make_tensor_value_info("inp1_add", TensorProto.FLOAT, shape)
-    inp1_add_ct = helper.make_tensor_value_info("inp1_add_ct", TensorProto.FLOAT, [1])
-    inp2_add = helper.make_tensor_value_info("inp2_add", TensorProto.FLOAT, shape)
-    inp2_add_ct = helper.make_tensor_value_info("inp2_add_ct", TensorProto.FLOAT, [1])
-    inp1_mul = helper.make_tensor_value_info("inp1_mul", TensorProto.FLOAT, shape)
-    inp1_mul_ct = helper.make_tensor_value_info("inp1_mul_ct", TensorProto.FLOAT, [1])
-    inp2_mul = helper.make_tensor_value_info("inp2_mul", TensorProto.FLOAT, shape)
-    inp2_mul_ct = helper.make_tensor_value_info("inp2_mul_ct", TensorProto.FLOAT, [1])
-    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, shape)
-
-    add1_node = helper.make_node("Add", [inp1.name, inp1_add_ct.name], [inp1_add.name])
-    add2_node = helper.make_node("Add", [inp2.name, inp2_add_ct.name], [inp2_add.name])
-    mul1_node = helper.make_node("Mul", [inp1_add.name, inp1_mul_ct.name], [inp1_mul.name])
-    mul2_node = helper.make_node("Mul", [inp2_add.name, inp2_mul_ct.name], [inp2_mul.name])
-    eltwise_add_node = helper.make_node("Add", [inp1_mul.name, inp2_mul.name], [outp.name])
-    graph = helper.make_graph(
-        nodes=[add1_node, add2_node, mul1_node, mul2_node, eltwise_add_node],
-        name="graph",
-        inputs=[inp1, inp2],
-        outputs=[outp],
-    )
-
-    model = qonnx_make_model(graph, producer_name="add-model")
-    model = ModelWrapper(model)
-
-    # set initializers for scalar add/mul nodes
-    model.set_initializer(add1_node.input[1], np.array([7.0], dtype=np_default_dtype))
-    model.set_initializer(add2_node.input[1], np.array([8.0], dtype=np_default_dtype))
-    model.set_initializer(mul1_node.input[1], np.array([3.0], dtype=np_default_dtype))
-    model.set_initializer(mul2_node.input[1], np.array([3.0], dtype=np_default_dtype))
-
-    return model
-
-
-@pytest.mark.streamline
-# channels
-@pytest.mark.parametrize("ch", [64])
-# ifmdim
-@pytest.mark.parametrize("ifmdim", [-1, 7])
-def test_linear_past_eltwise_add(ch, ifmdim):
-    # generate test vectors of correct shape
-    if ifmdim == -1:
-        input_tensor_shape = (1, ch)
-    else:
-        input_tensor_shape = (1, ch, ifmdim, ifmdim)
-
-    model = make_model(input_tensor_shape)
-    model.save(export_onnx_path)
-    model = ModelWrapper(export_onnx_path)
-    model = model.transform(InferShapes())
-    model = model.transform(FoldConstants())
-    model = model.transform(GiveUniqueNodeNames())
-    model = model.transform(GiveReadableTensorNames())
-
-    x1 = np.random.randn(*input_tensor_shape).astype(np.float32)
-    x2 = np.random.randn(*input_tensor_shape).astype(np.float32)
-
-    # generate expected value from streamlined net
-    input_dict = {model.graph.input[0].name: x1, model.graph.input[1].name: x2}
-
-    output_dict = oxe.execute_onnx(model, input_dict, True)
-    produced_sum = output_dict[model.graph.output[0].name]
-    expected_sum = 3.0 * ((x1 + x2) + 15.0)
-    assert np.isclose(expected_sum, produced_sum, atol=1e-3).all()
-    assert len(model.get_nodes_by_op_type("Add")) == 3
-    assert len(model.get_nodes_by_op_type("Mul")) == 2
-
-    model = model.transform(MoveLinearPastEltwiseAdd())
-
-    # verify again, to check we didnt break anything
-    output_dict = oxe.execute_onnx(model, input_dict, True)
-    produced_sum = output_dict[model.graph.output[0].name]
-    assert np.isclose(expected_sum, produced_sum, atol=1e-3).all()
-    assert len(model.get_nodes_by_op_type("Add")) == 2
-    assert len(model.get_nodes_by_op_type("Mul")) == 1
-
-    os.remove(export_onnx_path)
-
-
-@pytest.mark.streamline
-@pytest.mark.parametrize("ch", [64, 1])
-# ifmdim
-@pytest.mark.parametrize("ifmdim", [-1, 7])
-def test_linear_past_eltwise_add_multiple_forks(ch, ifmdim):
-    # generate test vectors of correct shape
-    if ifmdim == -1:
-        input_shape = (1, ch)
-    else:
-        input_shape = (1, ch, ifmdim, ifmdim)
-
-    top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape)
-    top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, input_shape)
-
-    num_of_params = 6
-    value_info = []
-    for i in range(num_of_params):
-        value_info += [helper.make_tensor_value_info("p" + str(i), TensorProto.FLOAT, input_shape)]
-
-    modelproto = qonnx_make_model(
-        helper.make_graph(
-            name="test",
-            inputs=[top_in],
-            outputs=[top_out],
-            value_info=value_info,
-            nodes=[
-                helper.make_node("Add", ["top_in", "p0"], ["fork1"]),
-                helper.make_node("Mul", ["fork1", "p1"], ["t2"]),
-                helper.make_node("Mul", ["fork1", "p2"], ["t3"]),
-                helper.make_node("Add", ["t2", "t3"], ["t4"]),
-                helper.make_node("Mul", ["t4", "p3"], ["fork2"]),
-                helper.make_node("Add", ["fork2", "p4"], ["t5"]),
-                helper.make_node("Add", ["fork2", "p5"], ["t6"]),
-                helper.make_node("Add", ["t5", "t6"], ["top_out"]),
-            ],
-        )
-    )
-    model = ModelWrapper(modelproto)
-    model = model.transform(InferShapes())
-
-    np.random.seed(0)
-    for i in range(num_of_params):
-        model.set_initializer("p" + str(i), np.random.rand(*input_shape).astype(np.float32))
-
-    # need equal mults:
-    model.set_initializer("p2", model.get_initializer("p1"))
-
-    # Transform
-    new_model = model.transform(MoveLinearPastEltwiseAdd())
-    inp_dict = {"top_in": np.random.rand(*input_shape).astype(np.float32)}
-
-    # Test
-    assert oxe.compare_execution(model, new_model, inp_dict)
-    assert new_model.graph.node[0].op_type == "Add"
-    assert new_model.graph.node[1].op_type == "Add"
-    assert new_model.graph.node[2].op_type == "Mul"
-    assert new_model.graph.node[3].op_type == "Mul"
-    assert new_model.graph.node[4].op_type == "Add"
-    assert new_model.graph.node[5].op_type == "Add"
-    assert len(new_model.graph.node) == 6

From c92b919d0919e31206754e51c588f5bd474ecf7b Mon Sep 17 00:00:00 2001
From: Michal Danilowicz <mdaniowi@amd.com>
Date: Mon, 16 Sep 2024 15:36:25 +0000
Subject: [PATCH 05/15] [Feature] The Concat op code is not hardcoded in the
 compiler anymore and it now accepts different datatypes of inputs. It uses
 the new implementation from finn-hlslib

---
 src/finn/custom_op/fpgadataflow/concat.py     |  71 ++++++--
 .../custom_op/fpgadataflow/hls/concat_hls.py  | 166 ++++++++++--------
 2 files changed, 144 insertions(+), 93 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/concat.py b/src/finn/custom_op/fpgadataflow/concat.py
index 210b6b7fdd..214c5a4bd5 100644
--- a/src/finn/custom_op/fpgadataflow/concat.py
+++ b/src/finn/custom_op/fpgadataflow/concat.py
@@ -27,7 +27,9 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import math
 import numpy as np
+import warnings
 from qonnx.core.datatype import DataType
 from qonnx.util.basic import roundup_to_integer_multiple
 
@@ -36,17 +38,18 @@
 
 class StreamingConcat(HWCustomOp):
     """Abstraction layer for HW implementation of Concat.
-    Only supports concatenating along the last axis."""
+    Only supports concatenating along the last (channel) axis."""
 
     def __init__(self, onnx_node, **kwargs):
         super().__init__(onnx_node, **kwargs)
 
     def get_nodeattr_types(self):
         my_attrs = {
+            "SIMD": ("i", True, 0),
             # number of elements from each stream to concat
-            "ElemsPerStream": ("ints", True, []),
-            # FINN DataTypes for inputs; output datatype inferred from input
-            "inputDataType": ("s", True, ""),
+            "ChannelsPerStream": ("ints", True, []),
+            # FINN DataTypes for inputs; output datatype inferred from inputs
+            "inputDataTypes": ("strings", True, [""]),
             # number of input vectors for non-concat axes, examples:
             # [1] is a single vector (like a FC layer with batch=1)
             # [4] is four vectors (like a FC layer with batch=4)
@@ -57,21 +60,24 @@ def get_nodeattr_types(self):
         return my_attrs
 
     def get_n_inputs(self):
-        return len(self.get_nodeattr("ElemsPerStream"))
+        return len(self.get_nodeattr("ChannelsPerStream"))
 
     def get_total_elems(self):
-        elems_per_stream = self.get_nodeattr("ElemsPerStream")
+        elems_per_stream = self.get_nodeattr("ChannelsPerStream")
         return int(np.sum(elems_per_stream))
 
     def get_normal_input_shape(self, ind=0):
-        elems_per_stream = self.get_nodeattr("ElemsPerStream")
+        elems_per_stream = self.get_nodeattr("ChannelsPerStream")
         elems = elems_per_stream[ind]
         vecs = list(self.get_nodeattr("numInputVectors"))
         ishape = tuple(vecs + [elems])
         return ishape
 
     def get_folded_input_shape(self, ind=0):
-        return self.get_normal_input_shape(ind)
+        simd = self.get_nodeattr("SIMD")
+        folds = self.get_nodeattr("ChannelsPerStream")[ind] // simd
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        return tuple(vecs + [folds, simd])
 
     def get_normal_output_shape(self, ind=0):
         total_elems = self.get_total_elems()
@@ -79,7 +85,11 @@ def get_normal_output_shape(self, ind=0):
         return tuple(vecs + [total_elems])
 
     def get_folded_output_shape(self, ind=0):
-        return self.get_normal_output_shape()
+        total_elems = self.get_total_elems()
+        simd = self.get_nodeattr("SIMD")
+        folds = total_elems // simd
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        return tuple(vecs + [folds, simd])
 
     def make_shape_compatible_op(self, model):
         # check all input shapes
@@ -94,7 +104,16 @@ def infer_node_datatype(self, model):
         # check all input datatypes
         for i, inp in enumerate(self.onnx_node.input):
             idt = model.get_tensor_datatype(inp)
-            assert idt == self.get_input_datatype()
+            if idt != self.get_input_datatype(i):
+                warn_str = "inputDataType changing for %s: %s -> %s " % (
+                    self.onnx_node.name,
+                    str(self.get_input_datatype(i)),
+                    str(idt),
+                )
+                warnings.warn(warn_str)
+                old_datatypes_attr = self.get_nodeattr("inputDataTypes")
+                old_datatypes_attr[i] = idt.name
+                self.set_nodeattr("inputDataTypes", old_datatypes_attr)
         odt = self.get_output_datatype()
         model.set_tensor_datatype(self.onnx_node.output[0], odt)
 
@@ -103,21 +122,37 @@ def verify_node(self):
 
     def get_input_datatype(self, ind=0):
         # input dt identical for all inputs
-        return DataType[self.get_nodeattr("inputDataType")]
+        return DataType[self.get_nodeattr("inputDataTypes")[ind]]
 
     def get_output_datatype(self, ind=0):
-        return self.get_input_datatype()
+        # infer output datatype from declared inputDataTypes
+        min_input = 0
+        max_input = 0
+        for i in range(len(self.get_nodeattr("inputDataTypes"))):
+            idt = self.get_input_datatype(i)
+            if idt.min() < min_input:
+                min_input = idt.min()
+            if idt.max() > max_input:
+                max_input = idt.max()
+        # if the input range is non-negative (min_input >= 0), then max_input <= 2^P - 1
+        if min_input >= 0:
+            out_bit_width = math.ceil(np.log2(max_input + 1))
+            odt = DataType[f"UINT{out_bit_width}"]
+        # if the input range is signed, then min_input >= -2^{P-1} and max_input <=
+        # 2^{P-1} - 1, which means 2^{P-1} >= max(-min_input, 1 + max_input)
+        else:
+            max_abs_input = max(-min_input, 1 + max_input)
+            out_bit_width = math.ceil(np.log2(max_abs_input) + 1)
+            odt = DataType[f"INT{out_bit_width}"]
+        return odt
 
     def get_instream_width(self, ind=0):
-        elems_per_stream = self.get_nodeattr("ElemsPerStream")
-        elems = elems_per_stream[ind]
-        ibits = self.get_input_datatype().bitwidth()
-        return elems * ibits
+        ibits = self.get_input_datatype(ind).bitwidth()
+        return ibits * self.get_nodeattr("SIMD")
 
     def get_outstream_width(self, ind=0):
         obits = self.get_output_datatype().bitwidth()
-        total_elems = self.get_total_elems()
-        out_width = total_elems * obits
+        out_width = obits * self.get_nodeattr("SIMD")
         return out_width
 
     def get_number_output_values(self):
diff --git a/src/finn/custom_op/fpgadataflow/hls/concat_hls.py b/src/finn/custom_op/fpgadataflow/hls/concat_hls.py
index 008fa9cee8..641581a12d 100644
--- a/src/finn/custom_op/fpgadataflow/hls/concat_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/concat_hls.py
@@ -30,6 +30,7 @@
 import numpy as np
 import os
 
+from finn.custom_op.fpgadataflow import templates
 from finn.custom_op.fpgadataflow.concat import StreamingConcat
 from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
@@ -48,47 +49,6 @@ def get_nodeattr_types(self):
         my_attrs.update(HLSBackend.get_nodeattr_types(self))
         return my_attrs
 
-    def generate_params(self, model, path):
-        elems_per_stream = self.get_nodeattr("ElemsPerStream")
-        inp_streams = []
-        commands = []
-        idt = self.get_input_datatype()
-        total_elems = self.get_total_elems()
-        total_bw = idt.bitwidth() * total_elems
-        for i, elems in enumerate(elems_per_stream):
-            bw = idt.bitwidth() * elems
-            inp_stream = "hls::stream<ap_uint<%d> > &in%d" % (bw, i)
-            inp_streams.append(inp_stream)
-            cmd = "in%d.read()" % i
-            commands.append(cmd)
-        out_stream = "hls::stream<ap_uint<%d> > &out" % (total_bw)
-        inp_streams.append(out_stream)
-
-        impl_hls_code = []
-        impl_hls_code.append("void StreamingConcat(")
-        impl_hls_code.append(",".join(inp_streams))
-        impl_hls_code.append(", unsigned int numReps) {")
-        impl_hls_code.append("for(unsigned int i = 0; i < numReps; i++) {")
-        impl_hls_code.append("#pragma HLS PIPELINE II=1")
-        impl_hls_code.append("ap_uint<%d> out_elem;" % total_bw)
-        # FIXME: the order of streams for concatenation works out differently
-        # for cppsim vs rtlsim, addressed via reversing the order of commands
-        # for now
-        impl_hls_code.append("#ifdef __SYNTHESIS__")
-        impl_hls_code.append("out_elem = (" + ",".join(commands[::-1]) + ");")
-        impl_hls_code.append("#else")
-        impl_hls_code.append("out_elem = (" + ",".join(commands) + ");")
-        impl_hls_code.append("#endif")
-        impl_hls_code.append("out.write(out_elem);")
-        impl_hls_code.append("}")
-        impl_hls_code.append("}")
-        impl_hls_code = "\n".join(impl_hls_code)
-
-        impl_filename = "{}/concat_impl.hpp".format(path)
-        f_impl = open(impl_filename, "w")
-        f_impl.write(impl_hls_code)
-        f_impl.close()
-
     def execute_node(self, context, graph):
         mode = self.get_nodeattr("exec_mode")
         node = self.onnx_node
@@ -96,8 +56,7 @@ def execute_node(self, context, graph):
         ishapes = [self.get_normal_input_shape(x) for x in range(n_inps)]
         folded_ishapes = [self.get_folded_input_shape(x) for x in range(n_inps)]
         exp_oshape = self.get_normal_output_shape()
-        folded_oshape = self.get_folded_output_shape()
-        export_idt = self.get_input_datatype()
+        export_idts = [self.get_input_datatype(i) for i in range(n_inps)]
 
         if mode == "cppsim":
             code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
@@ -127,8 +86,10 @@ def execute_node(self, context, graph):
             # load output npy file
             super().npy_to_dynamic_output(context)
             assert (
-                context[node.output[0]].shape == folded_oshape
-            ), "cppsim did not produce expected folded output shape"
+                context[node.output[0]].shape == exp_oshape
+            ), "cppsim did not produce expected folded output shape. Got: {}, expected: {}".format(
+                context[node.output[0]].shape, exp_oshape
+            )
             context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape)
         elif mode == "rtlsim":
             sim = self.get_rtlsim()
@@ -137,7 +98,7 @@ def execute_node(self, context, graph):
                 nbits = self.get_instream_width(i)
                 rtlsim_inp = npy_to_rtlsim_input(
                     "%s/input_%d.npy" % (code_gen_dir, i),
-                    export_idt,
+                    export_idts[i],
                     nbits,
                     reverse_inner=True,
                 )
@@ -177,33 +138,54 @@ def execute_node(self, context, graph):
             context[node.output[0]].shape == exp_oshape
         ), """Output shape doesn't match expected shape."""
 
+    def code_generation_cppsim(self, model):
+        """Generates c++ code for simulation (cppsim)."""
+        node = self.onnx_node
+        path = self.get_nodeattr("code_gen_dir_cppsim")
+        self.code_gen_dict["$AP_INT_MAX_W$"] = [str(self.get_ap_int_max_w())]
+        self.generate_params(model, path)
+        self.global_includes()
+        self.defines("cppsim")
+        self.read_npy_data()
+        self.strm_decl()
+        self.pragmas()
+        self.docompute()
+        self.dataoutstrm()
+        self.save_as_npy()
+        self.timeout_value()
+        self.timeout_condition()
+        self.timeout_read_stream()
+
+        template = templates.docompute_template_timeout
+
+        for key in self.code_gen_dict:
+            # transform list into long string separated by '\n'
+            code_gen_line = "\n".join(self.code_gen_dict[key])
+            template = template.replace(key, code_gen_line)
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        f = open(os.path.join(code_gen_dir, "execute_{}.cpp".format(node.op_type)), "w")
+        f.write(template)
+        f.close()
+        self.code_gen_dict.clear()
+
     def global_includes(self):
-        self.code_gen_dict["$GLOBALS$"] = ['#include "concat_impl.hpp"']
+        self.code_gen_dict["$GLOBALS$"] = ['#include "concat.hpp"']
 
     def defines(self, var):
-        num_reps = self.get_nodeattr("numInputVectors")
-        num_reps = np.prod(num_reps)
-        self.code_gen_dict["$DEFINES$"] = ["#define NumReps %d" % num_reps]
+        self.code_gen_dict["$DEFINES$"] = ["#define SIMD {}".format(self.get_nodeattr("SIMD"))]
 
     def read_npy_data(self):
         n_inputs = self.get_n_inputs()
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
         npy_type = "float"
         self.code_gen_dict["$READNPYDATA$"] = []
-        idt = self.get_input_datatype()
-        idt_bw = idt.bitwidth()
-        elem_hls_type = idt.get_hls_datatype_str()
-        elem_bits = idt_bw
         for i in range(n_inputs):
-            packed_bits = self.get_instream_width(i)
-            packed_hls_type = "ap_uint<%d>" % packed_bits
+            input_elem_hls_type = self.get_input_datatype(i).get_hls_datatype_str()
             npy_in = "%s/input_%d.npy" % (code_gen_dir, i)
             self.code_gen_dict["$READNPYDATA$"].append(
-                'npy2apintstream<%s, %s, %d, %s>("%s", in%d_%s);'
+                'npy2vectorstream<%s, %s, SIMD>("%s", in%d_%s);'
                 % (
-                    packed_hls_type,
-                    elem_hls_type,
-                    elem_bits,
+                    input_elem_hls_type,
                     npy_type,
                     npy_in,
                     i,
@@ -215,41 +197,70 @@ def strm_decl(self):
         self.code_gen_dict["$STREAMDECLARATIONS$"] = []
         n_inputs = self.get_n_inputs()
         for i in range(n_inputs):
-            packed_bits = self.get_instream_width(i)
-            packed_hls_type = "ap_uint<%d>" % packed_bits
+            input_elem_hls_type = self.get_input_datatype(i).get_hls_datatype_str()
             stream_name = "in%d_%s" % (i, self.hls_sname())
             self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-                'hls::stream<%s> %s ("%s");' % (packed_hls_type, stream_name, stream_name)
+                'hls::stream<hls::vector<%s, SIMD>> %s ("%s");'
+                % (input_elem_hls_type, stream_name, stream_name)
+            )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<hls::vector<{}, SIMD>> out_{} ("out_{}");'.format(
+                self.get_output_datatype().get_hls_datatype_str(),
+                self.hls_sname(),
+                self.hls_sname(),
             )
+        )
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<ap_uint<{}>> out_{} ("out_{}");'.format(
-                self.get_outstream_width(), self.hls_sname(), self.hls_sname()
+            'hls::stream<hls::vector<{}, SIMD>> debug_out_{} ("debug_out_{}");'.format(
+                self.get_output_datatype().get_hls_datatype_str(),
+                self.hls_sname(),
+                self.hls_sname(),
             )
         )
 
     def docompute(self):
         self.code_gen_dict["$DOCOMPUTE$"] = []
         n_inputs = self.get_n_inputs()
+        input_folds = [str(self.get_folded_input_shape(i)[-2]) for i in range(n_inputs)]
         in_streams = []
         for i in range(n_inputs):
             in_streams.append("in%d_%s" % (i, self.hls_sname()))
-        in_stream_names = ",".join(in_streams)
-        comp_call = "StreamingConcat(%s, out_%s, NumReps);" % (
-            in_stream_names,
-            self.hls_sname(),
+        in_stream_names = ", ".join(in_streams)
+        in_stream_folds = ", ".join(input_folds)
+        comp_call = "StreamingConcat<{}>(out_{}, {});".format(
+            in_stream_folds, self.hls_sname(), in_stream_names
         )
         self.code_gen_dict["$DOCOMPUTE$"] = [comp_call]
 
+    def dataoutstrm(self):
+        npy_type = "float"
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        oshape = self.get_folded_output_shape()
+        oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
+        npy_out = "%s/output.npy" % code_gen_dir
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            'vectorstream2npy<%s, %s, SIMD>(debug_out_%s, %s, "%s");'
+            % (
+                self.get_output_datatype().get_hls_datatype_str(),
+                npy_type,
+                self.hls_sname(),
+                oshape_cpp_str,
+                npy_out,
+            )
+        ]
+
     def blackboxfunction(self):
         n_inputs = self.get_n_inputs()
         in_streams = []
         for i in range(n_inputs):
-            iwidth = self.get_instream_width(i)
-            in_streams.append("hls::stream<ap_uint<%d>> &in%d_%s" % (iwidth, i, self.hls_sname()))
-        in_streams = ",".join(in_streams)
-        total_width = self.get_input_datatype().bitwidth() * self.get_total_elems()
-        out_stream = "hls::stream<ap_uint<%d>> &out_%s" % (
-            total_width,
+            input_elem_hls_type = self.get_input_datatype(i).get_hls_datatype_str()
+            in_streams.append(
+                "hls::stream<hls::vector<%s, SIMD>> &in%d_%s"
+                % (input_elem_hls_type, i, self.hls_sname())
+            )
+        in_streams = ", ".join(in_streams)
+        out_stream = "hls::stream<hls::vector<%s, SIMD>> &out_%s" % (
+            self.get_output_datatype().get_hls_datatype_str(),
             self.hls_sname(),
         )
         blackbox_hls = "void %s(%s, %s)" % (self.onnx_node.name, in_streams, out_stream)
@@ -264,4 +275,9 @@ def pragmas(self):
         self.code_gen_dict["$PRAGMAS$"].append(
             "#pragma HLS INTERFACE axis port=out_" + self.hls_sname()
         )
+        for i in range(n_inputs):
+            pragmas.append(
+                "#pragma HLS aggregate variable=in%d_%s compact=bit" % (i, self.hls_sname())
+            )
+        pragmas.append("#pragma HLS aggregate variable=out_%s compact=bit" % self.hls_sname())
         self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return")

From d185219640282c97f7a144c7a6a0294177202f87 Mon Sep 17 00:00:00 2001
From: Michal Danilowicz <mdaniowi@amd.com>
Date: Mon, 16 Sep 2024 16:03:23 +0000
Subject: [PATCH 06/15] [Feature] InferConcatLayer transformation now accepts
 different datatypes among inputs and sets the SIMD parameter

---
 .../fpgadataflow/convert_to_hw_layers.py      | 28 +++++++++++--------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
index b02bc89db8..121a5484af 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
@@ -1211,21 +1211,24 @@ def apply(self, model):
                 if (axis != -1) and (axis != last_axis):
                     continue
                 # check datatype coherence
-                dt0 = model.get_tensor_datatype(node.input[0])
-                if dt0 is None:
-                    continue
-                dt_coherent = all([model.get_tensor_datatype(x) == dt0 for x in node.input])
-                if not dt_coherent:
+                if any([model.get_tensor_datatype(x) is None for x in node.input]):
+                    warnings.warn(
+                        "Inputs with undefined datatype detected, skipping InferConcatLayer()"
+                    )
                     continue
                 # skip conversion if any inputs are static
-                all_static = all([model.get_initializer(x) is None for x in node.input])
-                if not all_static:
+                any_static = any([model.get_initializer(x) is not None for x in node.input])
+                if any_static:
                     continue
                 # skip conversion if inputs are not integers
-                if not dt0.is_integer():
+                all_integer = all([model.get_tensor_datatype(x).is_integer() for x in node.input])
+                if not all_integer:
+                    warnings.warn(
+                        "Inputs with non-integer datatype detected, skipping InferConcatLayer()"
+                    )
                     continue
                 # ready for conversion
-                elems_per_stream = [model.get_tensor_shape(x)[-1] for x in node.input]
+                channels_per_stream = [model.get_tensor_shape(x)[-1] for x in node.input]
                 inp_vec = list(model.get_tensor_shape(node.input[0])[:-1])
                 new_node = helper.make_node(
                     "StreamingConcat",
@@ -1233,9 +1236,10 @@ def apply(self, model):
                     node.output,
                     domain="finn.custom_op.fpgadataflow",
                     backend="fpgadataflow",
-                    name="Concat_" + node.name,
-                    ElemsPerStream=elems_per_stream,
-                    inputDataType=dt0.name,
+                    name="StreamingConcat_" + node.name,
+                    SIMD=1,
+                    ChannelsPerStream=channels_per_stream,
+                    inputDataTypes=[model.get_tensor_datatype(x).name for x in node.input],
                     numInputVectors=inp_vec,
                     inFIFODepths=[2] * len(node.input),
                 )

From c8d36fb51ad8580bbd002c454ab3a478da3ac817 Mon Sep 17 00:00:00 2001
From: Michal Danilowicz <mdaniowi@amd.com>
Date: Mon, 16 Sep 2024 16:11:14 +0000
Subject: [PATCH 07/15] [Feature] test_fpgadataflow_concat.py test case checks
 different datatypes among inputs

---
 .../fpgadataflow/test_fpgadataflow_concat.py  | 26 ++++++++++++-------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/tests/fpgadataflow/test_fpgadataflow_concat.py b/tests/fpgadataflow/test_fpgadataflow_concat.py
index 25c738d049..719d61905f 100644
--- a/tests/fpgadataflow/test_fpgadataflow_concat.py
+++ b/tests/fpgadataflow/test_fpgadataflow_concat.py
@@ -52,7 +52,7 @@
 from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
 
 
-def make_concat_model(i_shapes, idt):
+def make_concat_model(i_shapes, idts):
     class ConcatModel(nn.Module):
         def forward(self, *args):
             return torch.cat(args, -1)
@@ -67,20 +67,25 @@ def forward(self, *args):
     torch.onnx.export(torch_model, input_t, model_bytes, opset_version=11)
     model = onnx.ModelProto.FromString(model_bytes.getvalue())
     model = ModelWrapper(model)
-    for inp in model.graph.input:
+    for inp, idt in zip(model.graph.input, idts):
         model.set_tensor_datatype(inp.name, idt)
     return model
 
 
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
-@pytest.mark.parametrize("idt", [DataType["INT4"]])
+# input datatypes and expected inferred out datatype
+@pytest.mark.parametrize(
+    "test_idts", [([DataType["INT3"], DataType["UINT4"], DataType["UINT6"]], DataType["INT7"])]
+)
 @pytest.mark.fpgadataflow
 @pytest.mark.vivado
 @pytest.mark.slow
-def test_fpgadataflow_concat(exec_mode, idt):
+def test_fpgadataflow_concat(exec_mode, test_idts):
+    idts = test_idts[0]
+    exp_odt = test_idts[1]
     i_shapes = [(1, 2, 4), (1, 2, 6), (1, 2, 1)]
-    i_data = [gen_finn_dt_tensor(idt, x) for x in i_shapes]
-    model = make_concat_model(i_shapes, idt)
+    i_data = [gen_finn_dt_tensor(idt, x) for x, idt in zip(i_shapes, idts)]
+    model = make_concat_model(i_shapes, idts)
     assert len(i_shapes) == len(model.graph.input)
     assert len(model.graph.output) == 1
     exp_oshape = list(i_shapes[0][:-1]) + [sum(x[-1] for x in i_shapes)]
@@ -96,6 +101,7 @@ def test_fpgadataflow_concat(exec_mode, idt):
     model = model.transform(InferConcatLayer())
     assert model.graph.node[0].op_type == "StreamingConcat"
     assert model.graph.node[0].domain == "finn.custom_op.fpgadataflow"
+    assert model.get_tensor_datatype(model.graph.output[0].name) == exp_odt
     ret = execute_onnx(model, inp_dict)
     assert (ret[oname] == exp_out).all()
     model = model.transform(SpecializeLayers("xc7z020clg400-1"))
@@ -120,12 +126,13 @@ def test_fpgadataflow_concat(exec_mode, idt):
 @pytest.mark.vivado
 @pytest.mark.slow
 def test_fpgadataflow_concat_stitchedip():
-    idt = DataType["INT4"]
+    idts = [DataType["INT3"], DataType["UINT4"], DataType["UINT6"]]
+    exp_odt = DataType["INT7"]
     fpga_part = "xc7z020clg400-1"
     clk_ns = 10
     i_shapes = [(1, 2, 4), (1, 2, 6), (1, 2, 1)]
-    i_data = [gen_finn_dt_tensor(idt, x) for x in i_shapes]
-    model = make_concat_model(i_shapes, idt)
+    i_data = [gen_finn_dt_tensor(idt, x) for x, idt in zip(i_shapes, idts)]
+    model = make_concat_model(i_shapes, idts)
     assert len(i_shapes) == len(model.graph.input)
     assert len(model.graph.output) == 1
     exp_oshape = list(i_shapes[0][:-1]) + [sum(x[-1] for x in i_shapes)]
@@ -141,6 +148,7 @@ def test_fpgadataflow_concat_stitchedip():
     model = model.transform(InferConcatLayer())
     assert model.graph.node[0].op_type == "StreamingConcat"
     assert model.graph.node[0].domain == "finn.custom_op.fpgadataflow"
+    assert model.get_tensor_datatype(model.graph.output[0].name) == exp_odt
     model = model.transform(SpecializeLayers(fpga_part))
     assert model.graph.node[0].op_type == "StreamingConcat_hls"
     assert model.graph.node[0].domain == "finn.custom_op.fpgadataflow.hls"

From 8f87454c45c688496d6e4e1650229e81e8417867 Mon Sep 17 00:00:00 2001
From: Michal Danilowicz <mdaniowi@amd.com>
Date: Mon, 16 Sep 2024 17:05:34 +0000
Subject: [PATCH 08/15] [Feature] New Split custom_op added

---
 src/finn/custom_op/fpgadataflow/__init__.py   |   2 +
 .../custom_op/fpgadataflow/hls/__init__.py    |   2 +
 .../custom_op/fpgadataflow/hls/split_hls.py   | 278 ++++++++++++++++++
 src/finn/custom_op/fpgadataflow/split.py      | 164 +++++++++++
 4 files changed, 446 insertions(+)
 create mode 100644 src/finn/custom_op/fpgadataflow/hls/split_hls.py
 create mode 100644 src/finn/custom_op/fpgadataflow/split.py

diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index aed2ab7fe1..6f48bc6308 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -42,6 +42,7 @@
 from finn.custom_op.fpgadataflow.lookup import Lookup
 from finn.custom_op.fpgadataflow.matrixvectoractivation import MVAU
 from finn.custom_op.fpgadataflow.pool import Pool
+from finn.custom_op.fpgadataflow.split import StreamingSplit
 from finn.custom_op.fpgadataflow.streamingdataflowpartition import (
     StreamingDataflowPartition,
 )
@@ -77,6 +78,7 @@
 custom_op["Lookup"] = Lookup
 custom_op["Pool"] = Pool
 custom_op["StreamingConcat"] = StreamingConcat
+custom_op["StreamingSplit"] = StreamingSplit
 custom_op["StreamingDataWidthConverter"] = StreamingDataWidthConverter
 custom_op["StreamingEltwise"] = StreamingEltwise
 custom_op["StreamingMaxPool"] = StreamingMaxPool
diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py
index 405c47a08d..e5b24413eb 100644
--- a/src/finn/custom_op/fpgadataflow/hls/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py
@@ -43,6 +43,7 @@
 from finn.custom_op.fpgadataflow.hls.lookup_hls import Lookup_hls
 from finn.custom_op.fpgadataflow.hls.matrixvectoractivation_hls import MVAU_hls
 from finn.custom_op.fpgadataflow.hls.pool_hls import Pool_hls
+from finn.custom_op.fpgadataflow.hls.split_hls import StreamingSplit_hls
 from finn.custom_op.fpgadataflow.hls.streamingdatawidthconverter_hls import (
     StreamingDataWidthConverter_hls,
 )
@@ -71,6 +72,7 @@
 custom_op["Lookup_hls"] = Lookup_hls
 custom_op["Pool_hls"] = Pool_hls
 custom_op["StreamingConcat_hls"] = StreamingConcat_hls
+custom_op["StreamingSplit_hls"] = StreamingSplit_hls
 custom_op["StreamingEltwise_hls"] = StreamingEltwise_hls
 custom_op["StreamingDataWidthConverter_hls"] = StreamingDataWidthConverter_hls
 custom_op["StreamingMaxPool_hls"] = StreamingMaxPool_hls
diff --git a/src/finn/custom_op/fpgadataflow/hls/split_hls.py b/src/finn/custom_op/fpgadataflow/hls/split_hls.py
new file mode 100644
index 0000000000..d6f9d43f51
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/hls/split_hls.py
@@ -0,0 +1,278 @@
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+import os
+
+from finn.custom_op.fpgadataflow import templates
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+from finn.custom_op.fpgadataflow.split import StreamingSplit
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+
+class StreamingSplit_hls(StreamingSplit, HLSBackend):
+    """Streaming split node with dynamically generated HLS.
+    Only supports splitting along the last axis."""
+
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)
+
+    def get_nodeattr_types(self):
+        my_attrs = {}
+        my_attrs.update(StreamingSplit.get_nodeattr_types(self))
+        my_attrs.update(HLSBackend.get_nodeattr_types(self))
+        return my_attrs
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+        ishape = self.get_normal_input_shape()
+        folded_ishape = self.get_folded_input_shape()
+        n_outputs = self.get_n_outputs()
+        exp_oshapes = [self.get_normal_output_shape(i) for i in range(len(node.output))]
+        export_idt = self.get_input_datatype()
+
+        if mode == "cppsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        inp = context[node.input[0]]
+        assert str(inp.dtype) == "float32", "Input datatype is not float32"
+        assert inp.shape == ishape, "Input shape mismatch for " + node.input[0]
+        # reshape input into folded form
+        inp = inp.reshape(folded_ishape)
+        # make copy before saving array
+        reshaped_input = inp.copy()
+        np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
+
+        if mode == "cppsim":
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_outputs(context, ["output_%d.npy" % i for i in range(n_outputs)])
+            for i in range(n_outputs):
+                assert (
+                    context[node.output[i]].shape == exp_oshapes[i]
+                ), "cppsim did not produce expected folded output shape: {}, expected: {}".format(
+                    context[node.output[i]].shape, exp_oshapes[i]
+                )
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            io_dict = {"inputs": {}, "outputs": {}}
+
+            nbits = self.get_instream_width()
+            rtlsim_inp = npy_to_rtlsim_input(
+                "%s/input_0.npy" % code_gen_dir,
+                export_idt,
+                nbits,
+                # reverse_inner=True,
+            )
+            io_dict["inputs"]["in0"] = rtlsim_inp
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+
+            for i in range(n_outputs):
+                io_dict["outputs"]["out_arr_%d" % i] = []
+            self.rtlsim_multi_io(sim, io_dict, sname="_")
+            odt = self.get_output_datatype()
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            for i in range(n_outputs):
+                out_npy_path = "%s/output_%d.npy" % (code_gen_dir, i)
+                out_shape = self.get_folded_output_shape(i)
+                rtlsim_output_to_npy(
+                    io_dict["outputs"]["out_arr_%d" % i],
+                    out_npy_path,
+                    odt,
+                    out_shape,
+                    packed_bits,
+                    target_bits,
+                    # reverse_inner=True,
+                )
+                # load and reshape output
+                output = np.load(out_npy_path)
+                output = np.asarray([output], dtype=np.float32).reshape(*exp_oshapes[i])
+                context[node.output[i]] = output
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        for i in range(n_outputs):
+            assert (
+                context[node.output[i]].shape == exp_oshapes[i]
+            ), "cppsim did not produce expected folded output shape. Got: {}, expected: {}".format(
+                context[node.output[i]].shape, exp_oshapes[i]
+            )
+
+    def code_generation_cppsim(self, model):
+        """Generates c++ code for simulation (cppsim)."""
+        node = self.onnx_node
+        path = self.get_nodeattr("code_gen_dir_cppsim")
+        self.code_gen_dict["$AP_INT_MAX_W$"] = [str(self.get_ap_int_max_w())]
+        self.generate_params(model, path)
+        self.global_includes()
+        self.defines("cppsim")
+        self.read_npy_data()
+        self.strm_decl()
+        self.pragmas()
+        self.docompute()
+        self.dataoutstrm()
+        self.save_as_npy()
+        self.timeout_value()
+        self.timeout_condition()
+        self.timeout_read_stream()
+
+        template = templates.docompute_template_timeout
+
+        for key in self.code_gen_dict:
+            # transform list into long string separated by '\n'
+            code_gen_line = "\n".join(self.code_gen_dict[key])
+            template = template.replace(key, code_gen_line)
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        f = open(os.path.join(code_gen_dir, "execute_{}.cpp".format(node.op_type)), "w")
+        f.write(template)
+        f.close()
+        self.code_gen_dict.clear()
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = ['#include "split.hpp"']
+
+    def defines(self, var):
+        self.code_gen_dict["$DEFINES$"] = ["#define NUM_OUTPUTS " + str(self.get_n_outputs())]
+
+    def read_npy_data(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        npy_type = "float"
+        self.code_gen_dict["$READNPYDATA$"] = []
+        simd = self.get_nodeattr("SIMD")
+        input_elem_hls_type = self.get_input_datatype().get_hls_datatype_str()
+        npy_in = "%s/input_0.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2vectorstream<%s, %s, %d>("%s", in0);'
+            % (input_elem_hls_type, npy_type, simd, npy_in)
+        )
+
+    def strm_decl(self):
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        simd = self.get_nodeattr("SIMD")
+        input_elem_hls_type = self.get_input_datatype().get_hls_datatype_str()
+        stream_name = "in0"
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<hls::vector<%s, %d>> %s ("%s");'
+            % (input_elem_hls_type, simd, stream_name, stream_name)
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            "hls::stream<hls::vector<{}, {}>> out_arr[NUM_OUTPUTS];".format(
+                self.get_output_datatype().get_hls_datatype_str(), simd
+            )
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            "hls::stream<hls::vector<{}, {}>> debug_out_arr[NUM_OUTPUTS];".format(
+                self.get_output_datatype().get_hls_datatype_str(), simd
+            )
+        )
+
+    def docompute(self):
+        self.code_gen_dict["$DOCOMPUTE$"] = []
+        n_outputs = self.get_n_outputs()
+        output_folds = [str(self.get_folded_output_shape(i)[-2]) for i in range(n_outputs)]
+        out_stream_folds = ", ".join(output_folds)
+        comp_call = "StreamingSplit<{}>(in0, out_arr);".format(out_stream_folds)
+        self.code_gen_dict["$DOCOMPUTE$"] = [comp_call]
+
+    def dataoutstrm(self):
+        npy_type = "float"
+        simd = self.get_nodeattr("SIMD")
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        n_outputs = self.get_n_outputs()
+        self.code_gen_dict["$DATAOUTSTREAM$"] = []
+        for i in range(n_outputs):
+            oshape = self.get_folded_output_shape(i)
+            oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
+            npy_out = "%s/output_%d.npy" % (code_gen_dir, i)
+            self.code_gen_dict["$DATAOUTSTREAM$"].append(
+                'vectorstream2npy<%s, %s, %d>(debug_out_arr[%d], %s, "%s");'
+                % (
+                    self.get_output_datatype(i).get_hls_datatype_str(),
+                    npy_type,
+                    simd,
+                    i,
+                    oshape_cpp_str,
+                    npy_out,
+                )
+            )
+
+    def blackboxfunction(self):
+        input_elem_hls_type = self.get_input_datatype().get_hls_datatype_str()
+        simd = self.get_nodeattr("SIMD")
+        in_stream = "hls::stream<hls::vector<%s, %d>> &in0" % (input_elem_hls_type, simd)
+        out_streams = "hls::stream<hls::vector<%s, %d>> (&out_arr)[NUM_OUTPUTS]" % (
+            input_elem_hls_type,
+            simd,
+        )
+        blackbox_hls = "void %s(%s, %s)" % (self.onnx_node.name, in_stream, out_streams)
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [blackbox_hls]
+
+    def pragmas(self):
+        pragmas = []
+        pragmas.append("#pragma HLS INTERFACE axis port=in0")
+        for i in range(self.get_n_outputs()):
+            pragmas.append("#pragma HLS INTERFACE axis port=out_arr[%d]" % i)
+        pragmas.append("#pragma HLS INTERFACE ap_ctrl_none port=return")
+        pragmas.append("#pragma HLS aggregate variable=in0 compact=bit")
+        for i in range(self.get_n_outputs()):
+            pragmas.append("#pragma HLS aggregate variable=out_arr[%d] compact=bit" % i)
+        self.code_gen_dict["$PRAGMAS$"] = pragmas
+
+    def timeout_condition(self):
+        condition = []
+        for i in range(self.get_n_outputs()):
+            condition.append("out_arr[{}].empty()".format(i))
+        condition = " && ".join(condition)
+        self.code_gen_dict["$TIMEOUT_CONDITION$"] = [condition]
+
+    def timeout_read_stream(self):
+        read_stream_command = """
+for(int i = 0; i < NUM_OUTPUTS; i++){
+    if(!out_arr[i].empty())
+         debug_out_arr[i] << out_arr[i].read();
+}
+"""
+        self.code_gen_dict["$TIMEOUT_READ_STREAM$"] = [read_stream_command]
diff --git a/src/finn/custom_op/fpgadataflow/split.py b/src/finn/custom_op/fpgadataflow/split.py
new file mode 100644
index 0000000000..e6ec551bc4
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/split.py
@@ -0,0 +1,164 @@
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+import warnings
+from onnx import helper
+from qonnx.core.datatype import DataType
+from qonnx.util.basic import roundup_to_integer_multiple
+
+from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
+
+
+class StreamingSplit(HWCustomOp):
+    """Abstraction layer for HW implementation of Split.
+    Only supports splitting along the last (channel) axis."""
+
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            "SIMD": ("i", True, 0),
+            # number of elements in each output stream
+            "ChannelsPerStream": ("ints", True, []),
+            # FINN DataTypes for input; output datatypes inferred from input
+            "inputDataType": ("s", True, ""),
+            # number of input vectors for non-split axes, examples:
+            # [1] is a single vector (like a FC layer with batch=1)
+            # [4] is four vectors (like a FC layer with batch=4)
+            # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
+            "numInputVectors": ("ints", False, [1]),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def get_n_outputs(self):
+        return len(self.get_nodeattr("ChannelsPerStream"))
+
+    def get_total_elems(self):
+        elems_per_stream = self.get_nodeattr("ChannelsPerStream")
+        return int(np.sum(elems_per_stream))
+
+    def get_normal_input_shape(self, ind=0):
+        total_elems = self.get_total_elems()
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        ishape = tuple(vecs + [total_elems])
+        return ishape
+
+    def get_folded_input_shape(self, ind=0):
+        simd = self.get_nodeattr("SIMD")
+        folds = self.get_total_elems() // simd
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        return tuple(vecs + [folds, simd])
+
+    def get_normal_output_shape(self, ind=0):
+        elems = self.get_nodeattr("ChannelsPerStream")[ind]
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        return tuple(vecs + [elems])
+
+    def get_folded_output_shape(self, ind=0):
+        elems = self.get_nodeattr("ChannelsPerStream")[ind]
+        simd = self.get_nodeattr("SIMD")
+        folds = elems // simd
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        return tuple(vecs + [folds, simd])
+
+    def make_shape_compatible_op(self, model):
+        # check input shape
+        exp_ishape = self.get_normal_input_shape()
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert ishape == exp_ishape, "Unexpected input shape"
+
+        assert len(self.onnx_node.output) == self.get_n_outputs(), "Unexpected number of outputs"
+        ret = helper.make_node("Split", self.onnx_node.input, self.onnx_node.output, axis=-1)
+        return ret
+
+    def infer_node_datatype(self, model):
+        # check input datatype
+        inp = self.onnx_node.input[0]
+        idt = model.get_tensor_datatype(inp)
+        if idt != self.get_input_datatype():
+            warn_str = "inputDataType changing for %s: %s -> %s " % (
+                self.onnx_node.name,
+                str(self.get_input_datatype()),
+                str(idt),
+            )
+            warnings.warn(warn_str)
+            self.set_nodeattr("inputDataType", idt.name)
+        odt = self.get_output_datatype()
+        for out in self.onnx_node.output:
+            model.set_tensor_datatype(out, odt)
+
+    def verify_node(self):
+        pass
+
+    def get_input_datatype(self, ind=0):
+        return DataType[self.get_nodeattr("inputDataType")]
+
+    def get_output_datatype(self, ind=0):
+        # all output datatypes are the same as the input datatype
+        return self.get_input_datatype()
+
+    def get_instream_width(self, ind=0):
+        ibits = self.get_input_datatype().bitwidth()
+        return ibits * self.get_nodeattr("SIMD")
+
+    def get_outstream_width(self, ind=0):
+        obits = self.get_output_datatype().bitwidth()
+        out_width = obits * self.get_nodeattr("SIMD")
+        return out_width
+
+    def get_number_output_values(self):
+        num_output_values = 0
+        for i in range(self.get_n_outputs()):
+            num_output_values += np.prod(self.get_folded_output_shape(i)[:-1])
+        return num_output_values
+
+    def get_exp_cycles(self):
+        return np.prod(self.get_folded_input_shape()[:-1])
+
+    def execute_node(self, context, graph):
+        node = self.onnx_node
+        split = self.get_nodeattr("ChannelsPerStream")
+        np_split_param = np.cumsum(split[:-1])
+        np_result = np.split(context[node.input[0]], np_split_param, axis=-1)
+        for i, out in enumerate(node.output):
+            context[out] = np_result[i]
+
+    def get_instream_width_padded(self, ind=0):
+        in_width = self.get_instream_width()
+        return roundup_to_integer_multiple(in_width, 8)
+
+    def get_verilog_top_module_intf_names(self):
+        intf_names = super().get_verilog_top_module_intf_names()
+        intf_names["s_axis"] = [("in0", self.get_instream_width_padded())]
+        intf_names["m_axis"] = []
+        for i in range(self.get_n_outputs()):
+            intf_names["m_axis"].append(("out_arr_%d" % i, self.get_instream_width_padded()))
+        return intf_names

From 8ea47f37f288195564b908c7a374b1ce913ef450 Mon Sep 17 00:00:00 2001
From: Michal Danilowicz <mdaniowi@amd.com>
Date: Mon, 16 Sep 2024 17:08:54 +0000
Subject: [PATCH 09/15] [Feature] Change signal name option added to
 hwcustomop.rtlsim_multi_io, useful for array interfaces

---
 src/finn/custom_op/fpgadataflow/hwcustomop.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/hwcustomop.py b/src/finn/custom_op/fpgadataflow/hwcustomop.py
index b40b8f3074..602a923424 100644
--- a/src/finn/custom_op/fpgadataflow/hwcustomop.py
+++ b/src/finn/custom_op/fpgadataflow/hwcustomop.py
@@ -284,11 +284,11 @@ def rtlsim(self, sim, inp, inp2=None):
             sim.stop_vcd_trace()
         return outputs
 
-    def rtlsim_multi_io(self, sim, io_dict):
+    def rtlsim_multi_io(self, sim, io_dict, sname=None):
         "Run rtlsim for this node, supports multiple i/o streams."
 
         # signal name
-        sname = "_" + self.hls_sname() + "_"
+        sname = "_" + self.hls_sname() + "_" if sname is None else sname
 
         trace_file = self.get_nodeattr("rtlsim_trace")
         if trace_file == "default":

From 59cfce74a4ba3788d0bf0596a6b0976ea5a030a0 Mon Sep 17 00:00:00 2001
From: Michal Danilowicz <mdaniowi@amd.com>
Date: Mon, 16 Sep 2024 17:11:43 +0000
Subject: [PATCH 10/15] [Feature] InferSplitlayer() added

---
 .../fpgadataflow/convert_to_hw_layers.py      | 66 +++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
index b02bc89db8..e4f10af3eb 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
@@ -1250,6 +1250,72 @@ def apply(self, model):
         return (model, graph_modified)
 
 
+class InferSplitLayer(Transformation):
+    """Convert suitable Split nodes (operating on last/-1 axis)
+    into StreamingSplit HW layers."""
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for node in graph.node:
+            node_ind += 1
+            if node.op_type == "Split":
+                # exactly two inputs (data + split_param); check before reading input[1]
+                if len(node.input) != 2:
+                    warnings.warn("Only one input allowed, skipping InferSplitLayer()")
+                    continue
+                split_param = node.input[1]
+                if model.get_initializer(split_param) is None:
+                    warnings.warn("Split param not constant, skipping InferSplitLayer()")
+                    continue
+                ishape = model.get_tensor_shape(node.input[0])
+                axis = get_by_name(node.attribute, "axis")
+                if (axis is None) or (ishape is None):
+                    continue
+                axis = axis.i
+                last_axis = len(ishape) - 1
+                # skip conversion if not using last axis
+                if (axis != -1) and (axis != last_axis):
+                    warnings.warn(
+                        "StreamingSplit supports only last axis, skipping InferSplitLayer()"
+                    )
+                    continue
+                # skip conversion if the input is static
+                if model.get_initializer(node.input[0]) is not None:
+                    warnings.warn("Static input detected, skipping InferSplitLayer()")
+                    continue
+                # skip conversion if inputs are not integers
+                if not model.get_tensor_datatype(node.input[0]).is_integer():
+                    warnings.warn("Non-integer input detected, skipping InferSplitLayer()")
+                    continue
+                # ready for conversion
+                channels_per_stream = [model.get_tensor_shape(x)[-1] for x in node.output]
+                inp_vec = list(model.get_tensor_shape(node.input[0])[:-1])
+                new_node = helper.make_node(
+                    "StreamingSplit",
+                    node.input,
+                    node.output,
+                    domain="finn.custom_op.fpgadataflow",
+                    backend="fpgadataflow",
+                    name="StreamingSplit_" + node.name,
+                    SIMD=1,
+                    ChannelsPerStream=channels_per_stream,
+                    inputDataType=model.get_tensor_datatype(node.input[0]).name,
+                    numInputVectors=inp_vec,
+                    outFIFODepths=[2] * len(node.output),
+                )
+                graph.node.insert(node_ind, new_node)
+                # remove old node
+                graph.node.remove(node)
+                graph_modified = True
+
+        if graph_modified:
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
+
+
 class InferStreamingEltwise(Transformation):
     """Convert eltwise Add, Sub or Sub -> Abs to StreamingEltwise layer
     with AddEltwise, SubEltwise or AbsDiffEltwise op."""

From 6960e1505d2c220c7363488852fb82157282f4e0 Mon Sep 17 00:00:00 2001
From: Michal Danilowicz <mdaniowi@amd.com>
Date: Mon, 16 Sep 2024 17:17:07 +0000
Subject: [PATCH 11/15] [Feature] fpgadataflow test for split added

---
 tests/fpgadataflow/test_fpgadataflow_split.py | 150 ++++++++++++++++++
 1 file changed, 150 insertions(+)
 create mode 100644 tests/fpgadataflow/test_fpgadataflow_split.py

diff --git a/tests/fpgadataflow/test_fpgadataflow_split.py b/tests/fpgadataflow/test_fpgadataflow_split.py
new file mode 100644
index 0000000000..5859b6d5a6
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_split.py
@@ -0,0 +1,150 @@
+# Copyright (c) 2021, Xilinx
+# Copyright (C) 2023, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+import numpy as np
+import onnx
+from onnx import helper as oh
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.transformation.general import GiveUniqueNodeNames
+
+from finn.core.onnx_exec import execute_onnx
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.convert_to_hw_layers import InferSplitLayer
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
+
+
+def make_split_model(IN_SHAPE, IN_DTYPE, SPLIT, AXIS):
+    out_shapes = [IN_SHAPE[:-1] + [s] for s in SPLIT]
+    outputs = []
+    for i in range(len(SPLIT)):
+        name = "global_out_" + str(i)
+        out = oh.make_tensor_value_info(name, onnx.TensorProto.FLOAT, out_shapes[i])
+        outputs.append(out)
+
+    inp = oh.make_tensor_value_info("global_in", onnx.TensorProto.FLOAT, IN_SHAPE)
+    split_init = onnx.numpy_helper.from_array(
+        np.array(SPLIT, dtype=np.int64), name="Split_0_param0"
+    )
+    split_node = oh.make_node(
+        "Split", [inp.name, split_init.name], [out.name for out in outputs], axis=AXIS
+    )
+    graph = oh.make_graph(nodes=[split_node], name="split_test", inputs=[inp], outputs=outputs)
+    model = oh.make_model(graph)
+    model = ModelWrapper(model)
+    for out in outputs:
+        model.set_tensor_datatype(out.name, IN_DTYPE)
+        model.set_tensor_layout(out.name, ["N", "H", "W", "C"])
+    model.set_tensor_datatype(inp.name, IN_DTYPE)
+    model.set_tensor_layout(inp.name, ["N", "H", "W", "C"])
+    model.set_initializer(split_init.name, np.array(SPLIT, dtype=np.int64))
+    model = model.transform(GiveUniqueNodeNames())
+
+    return model
+
+
+@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim", "stitched_rtlsim"])
+@pytest.mark.parametrize("idt", [DataType["INT3"]])
+@pytest.mark.fpgadataflow
+@pytest.mark.vivado
+@pytest.mark.slow
+def test_fpgadataflow_split(exec_mode, idt):
+    fpga_part = "xc7z020clg400-1"
+    clk_ns = 10
+    i_shape = [1, 5, 5, 10]
+    split = [2, 2, 6]
+    split_axis = 3
+    model = make_split_model(i_shape, idt, split, split_axis)
+    assert len(model.graph.output) == len(split)
+    exp_oshapes = []
+    for s in split:
+        oshape = i_shape.copy()
+        oshape[split_axis] = s
+        exp_oshapes.append(oshape)
+    onames = [o.name for o in model.graph.output]
+    assert all(model.get_tensor_shape(oname) == exp_oshapes[i] for i, oname in enumerate(onames))
+
+    inputs = []
+    for out_shape in exp_oshapes:
+        inputs.append(np.random.randint(idt.min(), idt.max() + 1, out_shape).astype(np.float32))
+    test_input = np.concatenate(inputs, axis=split_axis)
+    input_dict = {model.graph.input[0].name: test_input}
+    ret = execute_onnx(model, input_dict)
+    for i, (k, v) in enumerate(ret.items()):
+        assert (v == inputs[i]).all()
+
+    # call transformation to convert to HW and verify conversion
+    model = model.transform(InferSplitLayer())
+    assert model.graph.node[0].op_type == "StreamingSplit"
+    assert model.graph.node[0].domain == "finn.custom_op.fpgadataflow"
+    ret = execute_onnx(model, input_dict)
+    for i, (k, v) in enumerate(ret.items()):
+        assert (v == inputs[i]).all()
+
+    model = model.transform(SpecializeLayers(fpga_part))
+    assert model.graph.node[0].op_type == "StreamingSplit_hls"
+    assert model.graph.node[0].domain == "finn.custom_op.fpgadataflow.hls"
+    if exec_mode == "cppsim":
+        model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(PrepareCppSim())
+        model = model.transform(CompileCppSim())
+        model = model.transform(SetExecMode("cppsim"))
+    elif exec_mode == "rtlsim":
+        model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(PrepareIP(fpga_part, clk_ns))
+        model = model.transform(HLSSynthIP())
+        model = model.transform(SetExecMode("rtlsim"))
+        model = model.transform(PrepareRTLSim())
+    elif exec_mode == "stitched_rtlsim":
+        model = model.transform(InsertFIFO(create_shallow_fifos=True))
+        model = model.transform(SpecializeLayers(fpga_part))
+        model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(PrepareIP(fpga_part, clk_ns))
+        model = model.transform(HLSSynthIP())
+        model = model.transform(
+            CreateStitchedIP(
+                fpga_part,
+                clk_ns,
+                vitis=False,
+            )
+        )
+        model.set_metadata_prop("exec_mode", "rtlsim")
+        model.set_metadata_prop("rtlsim_trace", "trace.vcd")
+    ret_sim = execute_onnx(model, input_dict)
+    for i, (k, v) in enumerate(ret_sim.items()):
+        assert (v == inputs[i]).all()

From c8c8d49cef0c9374ccca4337bc60701fae3ef450 Mon Sep 17 00:00:00 2001
From: Michal Danilowicz <mdaniowi@amd.com>
Date: Mon, 23 Sep 2024 13:34:20 +0000
Subject: [PATCH 12/15] [Update] Finn-hlslib commit updated

---
 fetch-repos.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fetch-repos.sh b/fetch-repos.sh
index a4fc124fa4..078eb33ec0 100755
--- a/fetch-repos.sh
+++ b/fetch-repos.sh
@@ -32,7 +32,7 @@ FINN_EXP_COMMIT="0724be21111a21f0d81a072fccc1c446e053f851"
 BREVITAS_COMMIT="d4834bd2a0fad3c1fbc0ff7e1346d5dcb3797ea4"
 PYVERILATOR_COMMIT="ce0a08c20cb8c1d1e84181d6f392390f846adbd1"
 CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4"
-HLSLIB_COMMIT="16e5847a5e3ef76cffe84c8fad2f010d593457d3"
+HLSLIB_COMMIT="2c066e87f5b8d309693c5d46c206473ca20ac68c"
 OMX_COMMIT="0b59762f9e4c4f7e5aa535ee9bc29f292434ca7a"
 AVNET_BDF_COMMIT="2d49cfc25766f07792c0b314489f21fe916b639b"
 XIL_BDF_COMMIT="8cf4bb674a919ac34e3d99d8d71a9e60af93d14e"

From 823588bc1de3881e8a9dab0b91c7c0f4ad17be65 Mon Sep 17 00:00:00 2001
From: Michal Danilowicz <mdaniowi@amd.com>
Date: Mon, 23 Sep 2024 13:36:12 +0000
Subject: [PATCH 13/15] [Update] Finn-hlslib commit updated

---
 fetch-repos.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fetch-repos.sh b/fetch-repos.sh
index a4fc124fa4..078eb33ec0 100755
--- a/fetch-repos.sh
+++ b/fetch-repos.sh
@@ -32,7 +32,7 @@ FINN_EXP_COMMIT="0724be21111a21f0d81a072fccc1c446e053f851"
 BREVITAS_COMMIT="d4834bd2a0fad3c1fbc0ff7e1346d5dcb3797ea4"
 PYVERILATOR_COMMIT="ce0a08c20cb8c1d1e84181d6f392390f846adbd1"
 CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4"
-HLSLIB_COMMIT="16e5847a5e3ef76cffe84c8fad2f010d593457d3"
+HLSLIB_COMMIT="2c066e87f5b8d309693c5d46c206473ca20ac68c"
 OMX_COMMIT="0b59762f9e4c4f7e5aa535ee9bc29f292434ca7a"
 AVNET_BDF_COMMIT="2d49cfc25766f07792c0b314489f21fe916b639b"
 XIL_BDF_COMMIT="8cf4bb674a919ac34e3d99d8d71a9e60af93d14e"

From dbb387ee7f187d60a1f4ad6c9ddd2163f382c314 Mon Sep 17 00:00:00 2001
From: Christoph Berganski <christoph.berganski@gmail.com>
Date: Tue, 21 Jan 2025 15:06:57 +0100
Subject: [PATCH 14/15] [Streamline] Add MoveTransposePastJoinMul
 transformation

---
 src/finn/transformation/streamline/reorder.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py
index 8688145453..9797e6abf3 100644
--- a/src/finn/transformation/streamline/reorder.py
+++ b/src/finn/transformation/streamline/reorder.py
@@ -1325,6 +1325,20 @@ def are_producers_identical(self, model, producers):
         return True
 
 
+class MoveTransposePastJoinMul(MoveIdenticalOpPastJoinOp):
+    """MoveIdenticalOpPastJoinOp variant for Transpose producers joined by a Mul."""
+
+    def __init__(self):
+        super().__init__(["Transpose"], ["Mul"])
+
+    def are_producers_identical(self, model, producers):
+        # Beyond the base-class check, all producers must share the same perm
+        if not super().are_producers_identical(model, producers):
+            return False
+        perms = [get_by_name(p.attribute, "perm").ints for p in producers]
+        return all(perm == perms[0] for perm in perms)
+
+
 class MoveMulPastJoinAdd(MoveIdenticalOpPastJoinOp):
     def __init__(self):
         super().__init__(["Mul"], ["Add"])

From b50a63b039240afb11eb825c4a719763604ecd7a Mon Sep 17 00:00:00 2001
From: Christoph Berganski <christoph.berganski@gmail.com>
Date: Tue, 28 Jan 2025 17:35:00 +0100
Subject: [PATCH 15/15] [Folding] Add Split, Concat and RTL-based FMPadding to
 auto-folding

---
 .../transformation/fpgadataflow/set_folding.py    | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py
index 3ce17a27ff..0dd089597d 100644
--- a/src/finn/transformation/fpgadataflow/set_folding.py
+++ b/src/finn/transformation/fpgadataflow/set_folding.py
@@ -114,9 +114,13 @@ def apply(self, model):
         simd_ops = [
             "DownSampler_hls",
             "FMPadding_hls",
+            "FMPadding_rtl",
             "FMPadding_Pixel_hls",
             "ConvolutionInputGenerator_hls",
             "ConvolutionInputGenerator_rtl",
+            # Streaming Split and Concat are SIMD operations
+            "StreamingSplit_hls",
+            "StreamingConcat_hls"
         ]
         # these ops are preceded by depthwise SWG and have special behavior,
         # as explained in the SetFolding docstring
@@ -215,7 +219,16 @@ def apply(self, model):
                         # depthwise SWGs are handled separately
                         continue
                 else:
-                    max_simd = node_inst.get_nodeattr("NumChannels")
+                    # Note: Keep original behavior for all custom-ops defining
+                    # the NumChannels attribute as it is
+                    try:
+                        max_simd = node_inst.get_nodeattr("NumChannels")
+                    # Note: Some of the recent additions do not define the
+                    # NumChannels attribute
+                    except AttributeError:
+                        # We can extract the channels from the normal, i.e., not
+                        # folded, shape of the input in these cases
+                        max_simd = node_inst.get_normal_input_shape()[-1]
                     self.optimize_attribute_val(node_inst, max_simd, "SIMD")
             else:
                 warnings.warn("SetFolding doesn't know how to handle op_type " + op_type)