def validate_dataflow_conversion(model):
    """Report whether the fpgadataflow nodes of ``model`` form one unbroken run.

    Nodes are inspected in the order they appear in ``model.graph.node`` and
    classified with ``is_fpgadataflow_node``. Three outcomes are possible:

    * every node is an fpgadataflow node (an empty graph also lands here)
      -> valid, no block reported;
    * at least one fpgadataflow node exists -> valid only if no
      non-fpgadataflow node sits between the first and the last of them;
    * no fpgadataflow node exists -> invalid.

    Args:
        model: object exposing ``graph.node``; each node must provide
            ``name`` and ``op_type``.

    Returns:
        dict with the keys
        ``valid`` (bool),
        ``message`` (str, human-readable status),
        ``unconverted_layers`` (list of ``(index, name, op_type)`` tuples for
        every non-fpgadataflow node) and
        ``dataflow_block`` (``(first_index, last_index)`` or ``None``).

    Example:
        result = model.analysis(validate_dataflow_conversion)
        if not result['valid']:
            raise AssertionError(result['message'])
    """
    graph_nodes = model.graph.node
    # Classify every node once; positions in hw_flags mirror graph order.
    hw_flags = [is_fpgadataflow_node(n) for n in graph_nodes]
    entries = [(pos, n.name, n.op_type) for pos, n in enumerate(graph_nodes)]
    unconverted = [entry for entry, is_hw in zip(entries, hw_flags) if not is_hw]

    def _result(valid, message, layers, block):
        # Single place that fixes the shape of the returned dictionary.
        return {
            "valid": valid,
            "message": message,
            "unconverted_layers": layers,
            "dataflow_block": block,
        }

    def _listing():
        # One line per unconverted layer, used by both failure messages.
        return "\n".join(f" [{idx}] {name} (op_type: {op})" for idx, name, op in unconverted)

    # Case 1: nothing left unconverted.
    if not unconverted:
        return _result(
            True,
            "Dataflow conversion validation: All layers are fpgadataflow layers",
            [],
            None,
        )

    hw_positions = [pos for pos, is_hw in enumerate(hw_flags) if is_hw]

    # Case 3: nothing was converted at all.
    if not hw_positions:
        return _result(
            False,
            "No fpgadataflow layers found in model.\n"
            f"All layers remain unconverted:\n{_listing()}",
            unconverted,
            None,
        )

    # Case 2: positions are ascending, so the ends of the list bound the block.
    first_hw, last_hw = hw_positions[0], hw_positions[-1]
    span = (first_hw, last_hw)
    gap = next((pos for pos in range(first_hw, last_hw + 1) if not hw_flags[pos]), None)

    if gap is None:
        return _result(
            True,
            "Dataflow conversion validation: Fpgadataflow layers form contiguous block "
            f"(positions {first_hw}-{last_hw})",
            unconverted,
            span,
        )

    # Only the first offending node inside the block is named in the message.
    _, gap_name, gap_op = entries[gap]
    return _result(
        False,
        "Non-contiguous dataflow block detected.\n"
        f"Layer '{gap_name}' (op_type: {gap_op}) at position {gap} "
        "is not a fpgadataflow layer but is between dataflow layers.\n"
        f"Dataflow block spans positions {first_hw} to {last_hw}.\n"
        f"Unconverted layers:\n{_listing()}",
        unconverted,
        span,
    )
build_dataflow_phase_lookup from finn.builder.build_dataflow_steps import build_dataflow_step_lookup @@ -68,19 +69,53 @@ def flush(self): def resolve_build_steps(cfg: DataflowBuildConfig, partial: bool = True): + """Resolve build steps from config, supporting both phases and fine-grained steps. + + Note: When using phase-based builds with start_step/stop_step, specify phase names + (e.g., start_step="phase_build_hardware") rather than fine-grained step names. + Phases save intermediate models for each internal step, so checkpoints like + step_hw_ipgen.onnx will exist, but the build loop operates at the phase level. + """ steps = cfg.steps if steps is None: steps = default_build_dataflow_steps + + # Merge phase and step lookup dictionaries + all_steps = { + **build_dataflow_step_lookup, + **build_dataflow_phase_lookup, + } + steps_as_fxns = [] for transform_step in steps: + step_name = None + + # Get step function and name if type(transform_step) is str: - # lookup step function from step name - steps_as_fxns.append(build_dataflow_step_lookup[transform_step]) + step_name = transform_step + if transform_step in all_steps: + step_fn = all_steps[transform_step] + else: + raise ValueError(f"Unknown step or phase: {transform_step}") elif callable(transform_step): - # treat step as function to be called as-is - steps_as_fxns.append(transform_step) + step_fn = transform_step + step_name = getattr(transform_step, "__name__", None) else: - raise Exception("Could not resolve build step: " + str(transform_step)) + raise ValueError(f"Invalid step type: {type(transform_step)}") + + # Inject steps BEFORE this step + if step_name and step_name in cfg.inject_steps_before: + for injected_step in cfg.inject_steps_before[step_name]: + steps_as_fxns.append(injected_step) + + # Add the main step + steps_as_fxns.append(step_fn) + + # Inject steps AFTER this step + if step_name and step_name in cfg.inject_steps_after: + for injected_step in cfg.inject_steps_after[step_name]: + 
steps_as_fxns.append(injected_step) + if partial: step_names = list(map(lambda x: x.__name__, steps_as_fxns)) if cfg.start_step is None: diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index 96ecfeb6b7..e7348a86df 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -29,10 +29,10 @@ import numpy as np import os -from dataclasses import dataclass +from dataclasses import dataclass, field from dataclasses_json import dataclass_json from enum import Enum -from typing import Any, List, Optional +from typing import Any, Callable, Dict, List, Optional from finn.transformation.fpgadataflow.alveo_build import VitisOptStrategy from finn.util.basic import part_map, vitis_default_platform @@ -108,39 +108,20 @@ class VerificationStepType(str, Enum): #: specified order. Use the `steps` as part of build config to restrict which #: steps will be run. default_build_dataflow_steps = [ - "step_qonnx_to_finn", - "step_tidy_up", - "step_streamline", - "step_convert_to_hw", - "step_create_dataflow_partition", - "step_specialize_layers", - "step_target_fps_parallelization", - "step_apply_folding_config", - "step_minimize_bit_width", - "step_transpose_decomposition", - "step_generate_estimate_reports", - "step_hw_codegen", - "step_hw_ipgen", - "step_set_fifo_depths", - "step_create_stitched_ip", - "step_measure_rtlsim_performance", - "step_synthesize_bitfile", - "step_make_driver", - "step_deployment_package", + "phase_prepare_model", + "phase_optimize_model", + "phase_convert_to_hardware", + "phase_optimize_hardware", + "phase_build_hardware", + "phase_synthesize_hardware", ] #: List of steps to run for an estimate-only (no synthesis) dataflow build estimate_only_dataflow_steps = [ - "step_qonnx_to_finn", - "step_tidy_up", - "step_streamline", - "step_convert_to_hw", - "step_create_dataflow_partition", - "step_specialize_layers", - "step_target_fps_parallelization", - 
"step_apply_folding_config", - "step_minimize_bit_width", - "step_generate_estimate_reports", + "phase_prepare_model", + "phase_optimize_model", + "phase_convert_to_hardware", + "phase_optimize_hardware", ] #: List of steps to run for a dataflow build including HW code generation, but @@ -361,10 +342,14 @@ class DataflowBuildConfig: steps: Optional[List[Any]] = None #: If given, start from this step, loading the intermediate model generated - #: from the previous step (save_intermediate_models must be enabled) + #: from the previous step (save_intermediate_models must be enabled). + #: Note: When using phase-based builds (default), specify phase names + #: (e.g., "phase_build_hardware") rather than fine-grained step names. start_step: Optional[str] = None #: If given, stop at this step. + #: Note: When using phase-based builds (default), specify phase names + #: (e.g., "phase_build_hardware") rather than fine-grained step names. stop_step: Optional[str] = None #: The optional argument `max_multithreshold_bit_width` affects which Quant nodes @@ -406,6 +391,16 @@ class DataflowBuildConfig: #: Warnings and info will still be printed but errors will not halt the build. mute_config_assertions: Optional[bool] = False + #: Inject custom steps after named steps/phases. + #: Dict mapping step/phase names to list of callable functions to run after that step. + #: Example: inject_steps_after={"phase_optimize_model": [my_custom_verification]} + inject_steps_after: Dict[str, List[Callable]] = field(default_factory=dict) + + #: Inject custom steps before named steps/phases. + #: Dict mapping step/phase names to list of callable functions to run before that step. 
+ #: Example: inject_steps_before={"phase_build_hardware": [my_custom_analysis]} + inject_steps_before: Dict[str, List[Callable]] = field(default_factory=dict) + def _resolve_hls_clk_period(self): if self.hls_clk_period_ns is None: # use same clk for synth and hls if not explicitly specified diff --git a/src/finn/builder/build_dataflow_phases.py b/src/finn/builder/build_dataflow_phases.py new file mode 100644 index 0000000000..d5235331c6 --- /dev/null +++ b/src/finn/builder/build_dataflow_phases.py @@ -0,0 +1,341 @@ +# Copyright Advanced Micro Devices, Inc. +# SPDX-License-Identifier: BSD-3-Clause + +""" +Phases for FINN dataflow builder pipeline. + +Phases group related fine-grained steps into logical build phases. +All phases internally call functions from build_dataflow_steps.py. + +Users can: +- Use phases via default_build_dataflow_steps +- Still use fine-grained steps +- Mix phases and fine-grained steps in custom pipelines +- Replace individual phases with custom implementations +- Inject custom steps before/after phases using inject_steps_before/after config +""" + +import os +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp + +from finn.analysis.fpgadataflow.validate_dataflow_conversion import ( + validate_dataflow_conversion, +) +from finn.builder.build_dataflow_config import DataflowBuildConfig +from finn.builder.build_dataflow_steps import ( + step_apply_folding_config, + step_convert_to_hw, + step_create_dataflow_partition, + step_create_stitched_ip, + step_deployment_package, + step_generate_estimate_reports, + step_hw_codegen, + step_hw_ipgen, + step_loop_body_hw_ipgen, + step_loop_body_set_fifo_depths, + step_loop_rolling, + step_make_driver, + step_measure_rtlsim_performance, + step_minimize_bit_width, + step_qonnx_to_finn, + step_set_fifo_depths, + step_specialize_layers, + step_streamline, + step_synthesize_bitfile, + step_target_fps_parallelization, + step_tidy_up, + step_transpose_decomposition, +) 
def _step_label(step) -> str:
    """Return a name for *step* usable as injection key and checkpoint file stem.

    Plain functions report their ``__name__``. Callables without one (for
    example ``functools.partial`` objects, which expose their target as
    ``.func``) fall back to the wrapped callable's name, and finally to the
    name of the callable's type, so that naming a step can never raise.
    """
    name = getattr(step, "__name__", None)
    if name:
        return name
    wrapped = getattr(step, "func", None)
    if callable(wrapped):
        return _step_label(wrapped)
    return type(step).__name__


def _save_intermediate_model(model: ModelWrapper, step_name: str, cfg: DataflowBuildConfig):
    """Save ``model`` as ``<cfg.output_dir>/intermediate_models/<step_name>.onnx``.

    Args:
        model: object providing ``save(path)``.
        step_name: file stem of the checkpoint.
        cfg: build configuration; only ``output_dir`` is read.
    """
    intermediate_model_dir = os.path.join(cfg.output_dir, "intermediate_models")
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(intermediate_model_dir, exist_ok=True)
    model.save(os.path.join(intermediate_model_dir, f"{step_name}.onnx"))


def _execute_step(step_fn, model: ModelWrapper, cfg: DataflowBuildConfig):
    """Run ``step_fn`` surrounded by any steps injected for it, with checkpoints.

    Execution order:
      1. every callable in ``cfg.inject_steps_before[<name of step_fn>]``,
      2. ``step_fn`` itself,
      3. every callable in ``cfg.inject_steps_after[<name of step_fn>]``.

    Each callable is invoked as ``fn(model, cfg)`` and its return value becomes
    the model handed to the next one. When ``cfg.save_intermediate_models`` is
    truthy, the model is saved after every callable under that callable's name
    (see ``_step_label`` for how names are derived).

    Args:
        step_fn: the main step, called as ``step_fn(model, cfg)``.
        model: model passed to the first callable.
        cfg: build configuration; reads ``inject_steps_before``,
            ``inject_steps_after``, ``save_intermediate_models`` and
            (when saving) ``output_dir``.

    Returns:
        The model returned by the last callable that ran.
    """
    step_name = _step_label(step_fn)

    def _run(fn, current, label):
        # Run one callable and checkpoint its result if requested.
        current = fn(current, cfg)
        if cfg.save_intermediate_models:
            _save_intermediate_model(current, label, cfg)
        return current

    # Inject steps BEFORE this step
    for injected_step in cfg.inject_steps_before.get(step_name, ()):
        model = _run(injected_step, model, _step_label(injected_step))

    # Execute main step
    model = _run(step_fn, model, step_name)

    # Inject steps AFTER this step
    for injected_step in cfg.inject_steps_after.get(step_name, ()):
        model = _run(injected_step, model, _step_label(injected_step))

    return model
def phase_convert_to_hardware(model: ModelWrapper, cfg: DataflowBuildConfig):
    """Phase: run the hardware-conversion steps, then validate the outcome.

    Internal steps, each dispatched through ``_execute_step`` in this order:
      - step_convert_to_hw
      - step_create_dataflow_partition
      - step_specialize_layers
      - step_loop_rolling

    Afterwards the ``validate_dataflow_conversion`` analysis is run on the
    resulting model and its message is printed.

    Args:
        model: Input ModelWrapper
        cfg: Build configuration

    Returns:
        The model produced by the last internal step.

    Raises:
        AssertionError: If the validation result is not marked valid; the
            exception carries the validation message.
    """
    conversion_steps = (
        step_convert_to_hw,
        step_create_dataflow_partition,
        step_specialize_layers,
        step_loop_rolling,
    )
    for step in conversion_steps:
        model = _execute_step(step, model, cfg)

    # Validate dataflow conversion
    report = model.analysis(validate_dataflow_conversion)
    summary = report["message"]
    print(summary)
    if report["valid"]:
        return model
    raise AssertionError(summary)
def _apply_to_loop_bodies(model: ModelWrapper, cfg: DataflowBuildConfig, step_fn):
    """Run ``step_fn`` on the body of every FINNLoop node, innermost bodies first.

    For each FINNLoop node of ``model`` the ``body`` attribute is fetched; if
    that body itself contains FINNLoop nodes they are handled recursively
    before ``step_fn`` is applied to the body. The graph of the processed body
    is then written back to the node's ``body`` attribute.

    Args:
        model: ModelWrapper whose FINNLoop nodes are to be processed
        cfg: Build configuration, forwarded unchanged to ``step_fn``
        step_fn: Callable invoked as ``step_fn(body_model, cfg)``

    Returns:
        The same ``model`` object that was passed in.
    """
    loop_op = "FINNLoop"

    def _process(loop_node):
        wrapper = getCustomOp(loop_node)
        body = wrapper.get_nodeattr("body")

        # Descend first so nested loop bodies are finished before their parent.
        has_nested_loops = bool(body.get_nodes_by_op_type(loop_op))
        if has_nested_loops:
            body = _apply_to_loop_bodies(body, cfg, step_fn)

        print(f"Running {step_fn.__name__} for FINNLoop: {loop_node.name}")
        body = step_fn(body, cfg)

        # Persist the transformed body on the FINNLoop node.
        wrapper.set_nodeattr("body", body.graph)

    for loop_node in model.get_nodes_by_op_type(loop_op):
        _process(loop_node)

    return model
def phase_synthesize_hardware(model: ModelWrapper, cfg: DataflowBuildConfig):
    """Phase: run the final artifact-producing steps in a fixed order.

    Internal steps, each dispatched through ``_execute_step``:
      - step_create_stitched_ip
      - step_measure_rtlsim_performance
      - step_synthesize_bitfile
      - step_make_driver
      - step_deployment_package

    This function applies no condition of its own; whether a given step
    actually produces output is left to that step.

    Args:
        model: Input ModelWrapper
        cfg: Build configuration

    Returns:
        The model returned by the last internal step.
    """
    artifact_steps = (
        step_create_stitched_ip,
        step_measure_rtlsim_performance,
        step_synthesize_bitfile,
        step_make_driver,
        step_deployment_package,
    )
    for step in artifact_steps:
        model = _execute_step(step, model, cfg)
    return model
to check if there are other loop ops - loop_nodes = loop_model.get_nodes_by_op_type("FINNLoop") - for loop_node in loop_nodes: - prepare_loop_ops_fifo_sizing(loop_node, cfg) - loop_model = loop_model.transform( - PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()) - ) - loop_model = loop_model.transform(HLSSynthIP(cfg._resolve_hls_clk_period())) - loop_model = loop_model.transform(ReplaceVerilogRelPaths()) - if cfg.fifosim_save_waveform: - report_dir = cfg.output_dir + "/report" - os.makedirs(report_dir, exist_ok=True) - loop_model.set_metadata_prop( - "rtlsim_trace", os.path.abspath(report_dir) + f"/{node.name}_fifosim_trace.wdb" - ) - loop_model = loop_model.transform( - InsertAndSetFIFODepths( - cfg._resolve_fpga_part(), - cfg._resolve_hls_clk_period(), - swg_exception=cfg.default_swg_exception, - vivado_ram_style=cfg.large_fifo_mem_style, - fifosim_input_throttle=cfg.fifosim_input_throttle, - ) - ) - loop_model = loop_model.transform(SplitLargeFIFOs()) - loop_model = loop_model.transform(RemoveShallowFIFOs()) - loop_model = loop_model.transform(GiveUniqueNodeNames(prefix=node.name + "_")) - loop_model = loop_model.transform(GiveReadableTensorNames()) - node_inst.set_nodeattr("body", loop_model.graph) - - -def prepare_loop_ops_ipgen(node, cfg): - node_inst = getCustomOp(node) - loop_model = node_inst.get_nodeattr("body") - # go first into subgraph to check if there are other loop ops - loop_nodes = loop_model.get_nodes_by_op_type("FINNLoop") - for loop_node in loop_nodes: - prepare_loop_ops_ipgen(loop_node, cfg) - loop_model = loop_model.transform(HLSSynthIP(cfg._resolve_hls_clk_period())) - loop_model = loop_model.transform( - CreateStitchedIP( - cfg._resolve_fpga_part(), - cfg.synth_clk_period_ns, - ) - ) - node_inst.set_nodeattr("body", loop_model.graph) - - def step_qonnx_to_finn(model: ModelWrapper, cfg: DataflowBuildConfig): """ This step will only execute if QONNX nodes are found. 
def step_loop_body_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
    """Insert and size FIFOs in a single loop-body model.

    Sequence of transformations applied to ``model``:
      1. PrepareIP, HLSSynthIP, ReplaceVerilogRelPaths
      2. (only if ``cfg.fifosim_save_waveform``) set the ``rtlsim_trace``
         metadata property to a file inside ``<cfg.output_dir>/report``
      3. InsertAndSetFIFODepths
      4. SplitLargeFIFOs, RemoveShallowFIFOs, GiveUniqueNodeNames,
         GiveReadableTensorNames

    Args:
        model: Loop body ModelWrapper
        cfg: Build configuration

    Returns:
        The transformed loop body ModelWrapper
    """
    # NOTE(review): step_hw_ipgen builds HLSSynthIP from cfg._resolve_fpga_part()
    # while this step passes the HLS clock period - confirm which one is expected.
    ip_preparation = (
        PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()),
        HLSSynthIP(cfg._resolve_hls_clk_period()),
        ReplaceVerilogRelPaths(),
    )
    for transformation in ip_preparation:
        model = model.transform(transformation)

    if cfg.fifosim_save_waveform:
        report_dir = cfg.output_dir + "/report"
        os.makedirs(report_dir, exist_ok=True)
        # NOTE(review): the trace file name is constant, so every loop body this
        # step is applied to points at the same path - confirm this is intended.
        trace_file = os.path.abspath(report_dir) + "/loop_body_fifosim_trace.wdb"
        model.set_metadata_prop("rtlsim_trace", trace_file)

    fifo_sizing = InsertAndSetFIFODepths(
        cfg._resolve_fpga_part(),
        cfg._resolve_hls_clk_period(),
        swg_exception=cfg.default_swg_exception,
        vivado_ram_style=cfg.large_fifo_mem_style,
        fifosim_input_throttle=cfg.fifosim_input_throttle,
    )
    model = model.transform(fifo_sizing)

    cleanup = (
        SplitLargeFIFOs(),
        RemoveShallowFIFOs(),
        GiveUniqueNodeNames(),
        GiveReadableTensorNames(),
    )
    for transformation in cleanup:
        model = model.transform(transformation)

    return model
+ + Args: + model: Loop body ModelWrapper + cfg: Build configuration + + Returns: + Loop body ModelWrapper with synthesized IP and stitched IP created + """ + # HLS synthesis for this loop body + model = model.transform(HLSSynthIP(cfg._resolve_hls_clk_period())) + + # Create stitched IP for this loop body + model = model.transform( + CreateStitchedIP( + cfg._resolve_fpga_part(), + cfg.synth_clk_period_ns, + ) + ) + + return model + + #: map step name strings to step functions build_dataflow_step_lookup = { "step_qonnx_to_finn": step_qonnx_to_finn, @@ -1392,4 +1405,6 @@ def step_loop_rolling(model, cfg): "step_synthesize_bitfile": step_synthesize_bitfile, "step_deployment_package": step_deployment_package, "step_loop_rolling": step_loop_rolling, + "step_loop_body_set_fifo_depths": step_loop_body_set_fifo_depths, + "step_loop_body_hw_ipgen": step_loop_body_hw_ipgen, } diff --git a/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py index 018d8f0417..6c345adf99 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py @@ -220,8 +220,6 @@ def prepare_rtlsim(self, behav=False): return super().prepare_rtlsim(behav) def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") - if mode == "cppsim": - StreamingFIFO.execute_node(self, context, graph) - elif mode == "rtlsim": - RTLBackend.execute_node(self, context, graph) + # FIFOs just pass data through - use simple execution for all modes + # RTL simulation of FIFO is only relevant for timing/throughput, not functional verification + StreamingFIFO.execute_node(self, context, graph) diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py index fa317265a6..4755ededaa 100644 --- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py +++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py @@ 
-527,6 +527,7 @@ def apply(self, model): # clean up references to stitched IP and rtlsim objects # (the stitched IP needs to be re-done after FIFO sizing) + model.set_metadata_prop("exec_mode", "") model.set_metadata_prop("rtlsim_trace", "") model.set_metadata_prop("rtlsim_so", "") model.set_metadata_prop("vivado_stitch_proj", "") diff --git a/tests/fpgadataflow/test_fpgadataflow_finnloop.py b/tests/fpgadataflow/test_fpgadataflow_finnloop.py index 860eb6321c..74ea433f38 100644 --- a/tests/fpgadataflow/test_fpgadataflow_finnloop.py +++ b/tests/fpgadataflow/test_fpgadataflow_finnloop.py @@ -510,23 +510,14 @@ def test_finnloop_end2end_mlo( model.save(tmp_output_dir + "/mlo_model.onnx") - # steps are skipped because test model created with HLS and RTL layers + # Use phase-based pipeline + # Steps are adjusted because test model already has HLS and RTL layers steps = [ - # "step_qonnx_to_finn", - # "step_tidy_up", - # "step_streamline", - # "step_convert_to_hw", - "step_create_dataflow_partition", - # "step_specialize_layers", - "step_loop_rolling", - # "step_target_fps_parallelization", - "step_apply_folding_config", - "step_minimize_bit_width", - "step_generate_estimate_reports", - "step_hw_codegen", - "step_hw_ipgen", - "step_set_fifo_depths", - "step_create_stitched_ip", + "step_create_dataflow_partition", # Fine-grained (model already specialized) + "phase_convert_to_hardware", # Phase (includes loop rolling) + "phase_optimize_hardware", # Phase (includes folding, bit-width, reports) + "phase_build_hardware", # Phase (includes codegen, ipgen, FIFOs) + "step_create_stitched_ip", # Fine-grained (just IP, no full synth) ] cfg = build_cfg.DataflowBuildConfig( diff --git a/tests/fpgadataflow/test_fpgadataflow_layernorm.py b/tests/fpgadataflow/test_fpgadataflow_layernorm.py index 8d3cc7bf62..15d587cc95 100644 --- a/tests/fpgadataflow/test_fpgadataflow_layernorm.py +++ b/tests/fpgadataflow/test_fpgadataflow_layernorm.py @@ -448,19 +448,12 @@ def 
test_hls_rtl_dsp_conflict_detection(): with open(specialize_config_file, "w") as f: json.dump(specialize_config, f) - # Build steps - includes conversion to HW layers and specialization + # Build steps using phases steps = [ - "step_convert_to_hw", - "step_create_dataflow_partition", - "step_specialize_layers", - "step_target_fps_parallelization", - "step_apply_folding_config", - "step_minimize_bit_width", - "step_generate_estimate_reports", - "step_hw_codegen", - "step_hw_ipgen", - "step_set_fifo_depths", - "step_create_stitched_ip", + "phase_convert_to_hardware", # Includes convert_to_hw, partition, specialize, loop_rolling + "phase_optimize_hardware", # Includes target_fps, folding, bit_width, reports + "phase_build_hardware", # Includes codegen, ipgen, FIFO depths + "step_create_stitched_ip", # Fine-grained (just stitched IP) ] # Request verification steps that will trigger DSP conflict detection @@ -667,17 +660,10 @@ def test_integer_hls_elementwise_no_dsp_conflict(): # Build steps - includes conversion to HW layers and specialization steps = [ - "step_convert_to_hw", - "step_create_dataflow_partition", - "step_specialize_layers", - "step_target_fps_parallelization", - "step_apply_folding_config", - "step_minimize_bit_width", - "step_generate_estimate_reports", - "step_hw_codegen", - "step_hw_ipgen", - "step_set_fifo_depths", - "step_create_stitched_ip", + "phase_convert_to_hardware", # Includes convert_to_hw, partition, specialize, loop_rolling + "phase_optimize_hardware", # Includes target_fps, folding, bit_width, reports + "phase_build_hardware", # Includes codegen, ipgen, FIFO depths + "step_create_stitched_ip", # Fine-grained (just stitched IP) ] # Request verification steps - stitched_ip_rtlsim should NOT be skipped diff --git a/tests/fpgadataflow/test_validate_dataflow_conversion.py b/tests/fpgadataflow/test_validate_dataflow_conversion.py new file mode 100644 index 0000000000..b6933d82e6 --- /dev/null +++ 
b/tests/fpgadataflow/test_validate_dataflow_conversion.py @@ -0,0 +1,160 @@ +# Copyright Advanced Micro Devices, Inc. +# SPDX-License-Identifier: BSD-3-Clause + +"""Tests for dataflow conversion validation analysis pass.""" + +import pytest + +import numpy as np +from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model + +from finn.analysis.fpgadataflow.validate_dataflow_conversion import ( + validate_dataflow_conversion, +) +from finn.transformation.fpgadataflow.convert_to_hw_layers import ( + InferElementwiseBinaryOperation, + InferHWSoftmax, + InferQuantizedMatrixVectorActivation, + InferThresholdingLayer, +) +from finn.util.fpgadataflow import is_fpgadataflow_node + + +def make_test_model(): + """Create a small model with different layer types for testing validation. 
+ + Model structure (all non-fpgadataflow initially): + - Layer 0: Transpose + - Layer 1: MatMul (INT4 weights) + - Layer 2: MultiThreshold + - Layer 3: Mul + - Layer 4: Softmax + """ + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, 4, 4]) + out = helper.make_tensor_value_info("out", TensorProto.FLOAT, [1, 4, 4]) + + # Layer 0: Transpose + node0 = helper.make_node("Transpose", ["inp"], ["t0"], perm=[0, 1, 2]) + + # Layer 1: MatMul with INT4 weights + W1_data = gen_finn_dt_tensor(DataType["INT4"], (4, 4)) + W1 = helper.make_tensor("W1", TensorProto.FLOAT, [4, 4], W1_data.flatten().tolist()) + node1 = helper.make_node("MatMul", ["t0", "W1"], ["t1"]) + + # Layer 2: MultiThreshold (QONNX custom op) + # UINT4 has 16 values (0-15), so we need 15 thresholds per channel + T2_data = gen_finn_dt_tensor(DataType["INT16"], (4, 15)) + T2_data = np.sort(T2_data, axis=1) # Sort thresholds in increasing order + T2 = helper.make_tensor("T2", TensorProto.FLOAT, [4, 15], T2_data.flatten().tolist()) + node2 = helper.make_node( + "MultiThreshold", + ["t1", "T2"], + ["t2"], + domain="qonnx.custom_op.general", + out_dtype="UINT4", + data_layout="NHWC", + ) + + # Layer 3: Mul + scale_data = np.array([2.0], dtype=np.float32) + scale = helper.make_tensor("scale", TensorProto.FLOAT, [1], scale_data.tolist()) + node3 = helper.make_node("Mul", ["t2", "scale"], ["t3"]) + + # Layer 4: Softmax + node4 = helper.make_node("Softmax", ["t3"], ["out"], axis=-1) + + graph = helper.make_graph( + [node0, node1, node2, node3, node4], + "test_validation", + [inp], + [out], + initializer=[W1, T2, scale], + ) + + model = qonnx_make_model(graph) + model = ModelWrapper(model) + + # Set INT4 datatypes + model.set_tensor_datatype("inp", DataType["INT4"]) + model.set_tensor_datatype("W1", DataType["INT4"]) + model.set_tensor_datatype("T2", DataType["INT16"]) + + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + + return model + + +@pytest.mark.fpgadataflow 
+def test_validate_dataflow_conversion_scenarios(): + """Test validation through progressive conversion scenarios. + + Test plan: + 0. No conversions - should fail + 1. Convert layer 2 (MultiThreshold) → [non, non, fpga, non, non] - should pass + 2. Convert layer 4 (Softmax) → [non, non, fpga, non, fpga] - should FAIL (non-contiguous) + 3. Convert layer 3 and 1 (Mul and MatMul) → [non, fpga, fpga, fpga, fpga] - should pass + """ + + # Scenario 0: No fpgadataflow layers - should fail + print("\n--- Scenario 0: No fpgadataflow layers ---") + model = make_test_model() + result = model.analysis(validate_dataflow_conversion) + print(f"Valid: {result['valid']}") + print(f"Message: {result['message']}") + + assert result["valid"] is False, "Expected validation to fail with no fpgadataflow layers" + assert "No fpgadataflow layers found" in result["message"] + assert len(result["unconverted_layers"]) == 5 + + # Scenario 1: Convert layer 2 (MultiThreshold) → [non, non, fpga, non, non] + print("\n--- Scenario 1: Convert layer 2 (MultiThreshold) ---") + model = model.transform(InferThresholdingLayer()) + result = model.analysis(validate_dataflow_conversion) + print(f"Valid: {result['valid']}") + print(f"Message: {result['message']}") + + assert result["valid"] is True + assert "contiguous block" in result["message"].lower() + assert result["dataflow_block"] == (2, 2) + + # Scenario 2: Convert layer 4 (Softmax) → [non, non, fpga, non, fpga] - should FAIL + print("\n--- Scenario 2: Convert layer 4 (Softmax) - EXPECT FAILURE ---") + model = model.transform(InferHWSoftmax()) + result = model.analysis(validate_dataflow_conversion) + print(f"Valid: {result['valid']}") + print(f"Message: {result['message']}") + + assert ( + result["valid"] is False + ), "Expected validation to fail with non-contiguous dataflow block" + assert "Non-contiguous dataflow block detected" in result["message"] + + # Scenario 3: Convert layer 3 (Mul) and layer 1 (MatMul) → [non, fpga, fpga, fpga, fpga] + 
print("\n--- Scenario 3: Convert layers 3 (Mul) and 1 (MatMul) ---") + model = model.transform(InferElementwiseBinaryOperation()) + model = model.transform(InferQuantizedMatrixVectorActivation()) + result = model.analysis(validate_dataflow_conversion) + print(f"Valid: {result['valid']}") + print(f"Message: {result['message']}") + + assert result["valid"] is True + assert "contiguous block" in result["message"].lower() + assert result["dataflow_block"] == (1, 4) + + # Final verification + print("\n--- Final verification ---") + nodes = model.graph.node + fpgadataflow_count = sum(1 for node in nodes if is_fpgadataflow_node(node)) + print(f"Fpgadataflow layers: {fpgadataflow_count} / {len(nodes)}") + print(f"Total nodes: {len(nodes)}") + + # 4 out of 5 layers should be fpgadataflow (all except Transpose) + assert fpgadataflow_count == 4 + assert result["valid"] is True + assert len(result["unconverted_layers"]) == 1 # Only Transpose unconverted diff --git a/tests/util/test_build_dataflow.py b/tests/util/test_build_dataflow.py index 9edfecb352..8adc7d3a16 100644 --- a/tests/util/test_build_dataflow.py +++ b/tests/util/test_build_dataflow.py @@ -98,8 +98,11 @@ def test_end2end_build_dataflow_directory(): assert os.path.isfile(stitched_waveform_dir + f"/verify_rtlsim_{i}.wdb") # Check that node-by-node rtlsim waveforms were created for each node + # Skip FIFO nodes as they use pass-through execution (no RTL simulation) node_waveform_dir = verify_out_dir + "/node_by_node_rtlsim_waveforms" for node in model.graph.node: + if node.op_type.startswith("StreamingFIFO"): + continue assert os.path.isfile( node_waveform_dir + f"/{node.name}_rtlsim_{i}.wdb" ), f"Missing waveform for node {node.name} in batch {i}" diff --git a/tests/util/test_build_dataflow_checks.py b/tests/util/test_build_dataflow_checks.py index b6cdccd120..84c010ca2a 100644 --- a/tests/util/test_build_dataflow_checks.py +++ b/tests/util/test_build_dataflow_checks.py @@ -33,11 +33,11 @@ def 
make_test_model(build_dir): def cfg(output_dir, **kw): - """Create config that stops immediately after first step.""" + """Create config that stops immediately after first phase.""" return DataflowBuildConfig( output_dir=output_dir, synth_clk_period_ns=5.0, - stop_step="step_qonnx_to_finn", + stop_step="phase_prepare_model", generate_outputs=kw.pop("generate_outputs", [DataflowOutputType.ESTIMATE_REPORTS]), **kw )