diff --git a/src/finn/analysis/fpgadataflow/validate_dataflow_conversion.py b/src/finn/analysis/fpgadataflow/validate_dataflow_conversion.py
new file mode 100644
index 0000000000..e2a1b1cd2c
--- /dev/null
+++ b/src/finn/analysis/fpgadataflow/validate_dataflow_conversion.py
@@ -0,0 +1,100 @@
+# Copyright Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: BSD-3-Clause
+
+"""Analysis pass to validate that model has been properly converted to fpgadataflow layers."""
+
+from finn.util.fpgadataflow import is_fpgadataflow_node
+
+
+def validate_dataflow_conversion(model):
+    """Validate that model has been properly converted to dataflow layers.
+
+    Checks that either:
+    1. All layers are fpgadataflow layers (ideal case), OR
+    2. Fpgadataflow layers form a contiguous block in the middle of the model,
+       with only non-dataflow layers on the outside (partition case)
+
+    Returns a dictionary with validation results:
+    - 'valid': bool indicating if validation passed
+    - 'message': str with validation status message
+    - 'unconverted_layers': list of (index, name, op_type) tuples for non-dataflow layers
+    - 'dataflow_block': tuple (first_index, last_index) if dataflow forms a block, else None
+
+    Example usage in transformation:
+        result = model.analysis(validate_dataflow_conversion)
+        if not result['valid']:
+            raise AssertionError(result['message'])
+    """
+    nodes = model.graph.node
+    fpgadataflow_nodes = []
+    non_fpgadataflow_nodes = []
+
+    for i, node in enumerate(nodes):
+        if is_fpgadataflow_node(node):
+            fpgadataflow_nodes.append((i, node.name, node.op_type))
+        else:
+            non_fpgadataflow_nodes.append((i, node.name, node.op_type))
+
+    # Case 1: All nodes are fpgadataflow (ideal)
+    if len(non_fpgadataflow_nodes) == 0:
+        return {
+            "valid": True,
+            "message": "Dataflow conversion validation: All layers are fpgadataflow layers",
+            "unconverted_layers": [],
+            "dataflow_block": None,
+        }
+
+    # Case 2: Check if fpgadataflow nodes form contiguous block
+    if len(fpgadataflow_nodes) > 0:
+        dataflow_indices = [i for i, _, _ in fpgadataflow_nodes]
+        first_dataflow = min(dataflow_indices)
+        last_dataflow = max(dataflow_indices)
+
+        # Check all indices between first and last are dataflow
+        for i in range(first_dataflow, last_dataflow + 1):
+            node = nodes[i]
+            if not is_fpgadataflow_node(node):
+                # Found non-dataflow layer inside dataflow block
+                unconverted_str = "\n".join(
+                    [
+                        f"  [{idx}] {name} (op_type: {op})"
+                        for idx, name, op in non_fpgadataflow_nodes
+                    ]
+                )
+                return {
+                    "valid": False,
+                    "message": (
+                        "Non-contiguous dataflow block detected.\n"
+                        f"Layer '{node.name}' (op_type: {node.op_type}) at position {i} "
+                        "is not a fpgadataflow layer but is between dataflow layers.\n"
+                        f"Dataflow block spans positions {first_dataflow} to {last_dataflow}.\n"
+                        f"Unconverted layers:\n{unconverted_str}"
+                    ),
+                    "unconverted_layers": non_fpgadataflow_nodes,
+                    "dataflow_block": (first_dataflow, last_dataflow),
+                }
+
+        # Valid: fpgadataflow block in middle
+        return {
+            "valid": True,
+            "message": (
+                "Dataflow conversion validation: Fpgadataflow layers form contiguous block "
+                f"(positions {first_dataflow}-{last_dataflow})"
+            ),
+            "unconverted_layers": non_fpgadataflow_nodes,
+            "dataflow_block": (first_dataflow, last_dataflow),
+        }
+
+    # Case 3: No fpgadataflow layers at all
+    unconverted_str = "\n".join(
+        [f"  [{idx}] {name} (op_type: {op})" for idx, name, op in non_fpgadataflow_nodes]
+    )
+    return {
+        "valid": False,
+        "message": (
+            "No fpgadataflow layers found in model.\n"
+            f"All layers remain unconverted:\n{unconverted_str}"
+        ),
+        "unconverted_layers": non_fpgadataflow_nodes,
+        "dataflow_block": None,
+    }
diff --git a/src/finn/builder/build_dataflow.py b/src/finn/builder/build_dataflow.py
index 6c761e76cb..f7e1ff9ba6 100644
--- a/src/finn/builder/build_dataflow.py
+++ b/src/finn/builder/build_dataflow.py
@@ -45,6 +45,7 @@
     DataflowBuildConfig,
     default_build_dataflow_steps,
 )
+from finn.builder.build_dataflow_phases import build_dataflow_phase_lookup
 from finn.builder.build_dataflow_steps import build_dataflow_step_lookup
 
 
@@ -68,19 +69,53 @@ def flush(self):
 
 
 def resolve_build_steps(cfg: DataflowBuildConfig, partial: bool = True):
+    """Resolve build steps from config, supporting both phases and fine-grained steps.
+
+    Note: When using phase-based builds with start_step/stop_step, specify phase names
+    (e.g., start_step="phase_build_hardware") rather than fine-grained step names.
+    Phases save intermediate models for each internal step, so checkpoints like
+    step_hw_ipgen.onnx will exist, but the build loop operates at the phase level.
+    """
     steps = cfg.steps
     if steps is None:
         steps = default_build_dataflow_steps
+
+    # Merge phase and step lookup dictionaries
+    all_steps = {
+        **build_dataflow_step_lookup,
+        **build_dataflow_phase_lookup,
+    }
+
     steps_as_fxns = []
     for transform_step in steps:
+        step_name = None
+
+        # Get step function and name
         if type(transform_step) is str:
-            # lookup step function from step name
-            steps_as_fxns.append(build_dataflow_step_lookup[transform_step])
+            step_name = transform_step
+            if transform_step in all_steps:
+                step_fn = all_steps[transform_step]
+            else:
+                raise ValueError(f"Unknown step or phase: {transform_step}")
         elif callable(transform_step):
-            # treat step as function to be called as-is
-            steps_as_fxns.append(transform_step)
+            step_fn = transform_step
+            step_name = getattr(transform_step, "__name__", None)
         else:
-            raise Exception("Could not resolve build step: " + str(transform_step))
+            raise ValueError(f"Invalid step type: {type(transform_step)}")
+
+        # Inject steps BEFORE this step
+        if step_name and step_name in cfg.inject_steps_before:
+            for injected_step in cfg.inject_steps_before[step_name]:
+                steps_as_fxns.append(injected_step)
+
+        # Add the main step
+        steps_as_fxns.append(step_fn)
+
+        # Inject steps AFTER this step
+        if step_name and step_name in cfg.inject_steps_after:
+            for injected_step in cfg.inject_steps_after[step_name]:
+                steps_as_fxns.append(injected_step)
+
     if partial:
         step_names = list(map(lambda x: x.__name__, steps_as_fxns))
         if cfg.start_step is None:
diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py
index 96ecfeb6b7..e7348a86df 100644
--- a/src/finn/builder/build_dataflow_config.py
+++ b/src/finn/builder/build_dataflow_config.py
@@ -29,10 +29,10 @@
 
 import numpy as np
 import os
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from dataclasses_json import dataclass_json
 from enum import Enum
-from typing import Any, List, Optional
+from typing import Any, Callable, Dict, List, Optional
 
 from finn.transformation.fpgadataflow.alveo_build import VitisOptStrategy
 from finn.util.basic import part_map, vitis_default_platform
@@ -108,39 +108,20 @@ class VerificationStepType(str, Enum):
 #: specified order. Use the `steps` as part of build config to restrict which
 #: steps will be run.
 default_build_dataflow_steps = [
-    "step_qonnx_to_finn",
-    "step_tidy_up",
-    "step_streamline",
-    "step_convert_to_hw",
-    "step_create_dataflow_partition",
-    "step_specialize_layers",
-    "step_target_fps_parallelization",
-    "step_apply_folding_config",
-    "step_minimize_bit_width",
-    "step_transpose_decomposition",
-    "step_generate_estimate_reports",
-    "step_hw_codegen",
-    "step_hw_ipgen",
-    "step_set_fifo_depths",
-    "step_create_stitched_ip",
-    "step_measure_rtlsim_performance",
-    "step_synthesize_bitfile",
-    "step_make_driver",
-    "step_deployment_package",
+    "phase_prepare_model",
+    "phase_optimize_model",
+    "phase_convert_to_hardware",
+    "phase_optimize_hardware",
+    "phase_build_hardware",
+    "phase_synthesize_hardware",
 ]
 
 #: List of steps to run for an estimate-only (no synthesis) dataflow build
 estimate_only_dataflow_steps = [
-    "step_qonnx_to_finn",
-    "step_tidy_up",
-    "step_streamline",
-    "step_convert_to_hw",
-    "step_create_dataflow_partition",
-    "step_specialize_layers",
-    "step_target_fps_parallelization",
-    "step_apply_folding_config",
-    "step_minimize_bit_width",
-    "step_generate_estimate_reports",
+    "phase_prepare_model",
+    "phase_optimize_model",
+    "phase_convert_to_hardware",
+    "phase_optimize_hardware",
 ]
 
 #: List of steps to run for a dataflow build including HW code generation, but
@@ -361,10 +342,14 @@ class DataflowBuildConfig:
     steps: Optional[List[Any]] = None
 
     #: If given, start from this step, loading the intermediate model generated
-    #: from the previous step (save_intermediate_models must be enabled)
+    #: from the previous step (save_intermediate_models must be enabled).
+    #: Note: When using phase-based builds (default), specify phase names
+    #: (e.g., "phase_build_hardware") rather than fine-grained step names.
     start_step: Optional[str] = None
 
     #: If given, stop at this step.
+    #: Note: When using phase-based builds (default), specify phase names
+    #: (e.g., "phase_build_hardware") rather than fine-grained step names.
     stop_step: Optional[str] = None
 
     #: The optional argument `max_multithreshold_bit_width` affects which Quant nodes
@@ -406,6 +391,16 @@ class DataflowBuildConfig:
     #: Warnings and info will still be printed but errors will not halt the build.
     mute_config_assertions: Optional[bool] = False
 
+    #: Inject custom steps after named steps/phases.
+    #: Dict mapping step/phase names to list of callable functions to run after that step.
+    #: Example: inject_steps_after={"phase_optimize_model": [my_custom_verification]}
+    inject_steps_after: Dict[str, List[Callable]] = field(default_factory=dict)
+
+    #: Inject custom steps before named steps/phases.
+    #: Dict mapping step/phase names to list of callable functions to run before that step.
+    #: Example: inject_steps_before={"phase_build_hardware": [my_custom_analysis]}
+    inject_steps_before: Dict[str, List[Callable]] = field(default_factory=dict)
+
     def _resolve_hls_clk_period(self):
         if self.hls_clk_period_ns is None:
             # use same clk for synth and hls if not explicitly specified
diff --git a/src/finn/builder/build_dataflow_phases.py b/src/finn/builder/build_dataflow_phases.py
new file mode 100644
index 0000000000..d5235331c6
--- /dev/null
+++ b/src/finn/builder/build_dataflow_phases.py
@@ -0,0 +1,341 @@
+# Copyright Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: BSD-3-Clause
+
+"""
+Phases for FINN dataflow builder pipeline.
+
+Phases group related fine-grained steps into logical build phases.
+All phases internally call functions from build_dataflow_steps.py.
+
+Users can:
+- Use phases via default_phase_build_steps
+- Still use fine-grained steps
+- Mix phases and fine-grained steps in custom pipelines
+- Replace individual phases with custom implementations
+- Inject custom steps before/after phases using inject_steps_before/after config
+"""
+
+import os
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.registry import getCustomOp
+
+from finn.analysis.fpgadataflow.validate_dataflow_conversion import (
+    validate_dataflow_conversion,
+)
+from finn.builder.build_dataflow_config import DataflowBuildConfig
+from finn.builder.build_dataflow_steps import (
+    step_apply_folding_config,
+    step_convert_to_hw,
+    step_create_dataflow_partition,
+    step_create_stitched_ip,
+    step_deployment_package,
+    step_generate_estimate_reports,
+    step_hw_codegen,
+    step_hw_ipgen,
+    step_loop_body_hw_ipgen,
+    step_loop_body_set_fifo_depths,
+    step_loop_rolling,
+    step_make_driver,
+    step_measure_rtlsim_performance,
+    step_minimize_bit_width,
+    step_qonnx_to_finn,
+    step_set_fifo_depths,
+    step_specialize_layers,
+    step_streamline,
+    step_synthesize_bitfile,
+    step_target_fps_parallelization,
+    step_tidy_up,
+    step_transpose_decomposition,
+)
+from finn.util.mlo_sim import is_mlo
+
+
+def _save_intermediate_model(model: ModelWrapper, step_name: str, cfg: DataflowBuildConfig):
+    """Helper to save intermediate model checkpoint."""
+    intermediate_model_dir = cfg.output_dir + "/intermediate_models"
+    if not os.path.exists(intermediate_model_dir):
+        os.makedirs(intermediate_model_dir)
+    model.save(f"{intermediate_model_dir}/{step_name}.onnx")
+
+
+def _execute_step(step_fn, model: ModelWrapper, cfg: DataflowBuildConfig):
+    """Execute a step with injection support and save intermediate model if configured.
+
+    This helper allows phases to:
+    - Inject custom steps before/after any internal step using cfg.inject_steps_before/after
+    - Save intermediate models after each internal step for inspection
+
+    Step injection works at both phase and internal step level. For example:
+    - inject_steps_after={"step_hw_codegen": [my_func]} will run my_func after
+      step_hw_codegen, even when running phase_build_hardware.
+    """
+    step_name = step_fn.__name__
+
+    # Inject steps BEFORE this step
+    if step_name in cfg.inject_steps_before:
+        for injected_step in cfg.inject_steps_before[step_name]:
+            model = injected_step(model, cfg)
+            if cfg.save_intermediate_models:
+                _save_intermediate_model(model, injected_step.__name__, cfg)
+
+    # Execute main step
+    model = step_fn(model, cfg)
+
+    # Save main step checkpoint
+    if cfg.save_intermediate_models:
+        _save_intermediate_model(model, step_name, cfg)
+
+    # Inject steps AFTER this step
+    if step_name in cfg.inject_steps_after:
+        for injected_step in cfg.inject_steps_after[step_name]:
+            model = injected_step(model, cfg)
+            if cfg.save_intermediate_models:
+                _save_intermediate_model(model, injected_step.__name__, cfg)
+
+    return model
+
+
+def phase_prepare_model(model: ModelWrapper, cfg: DataflowBuildConfig):
+    """Phase: Import and prepare model for FINN transformations.
+
+    This phase handles the initial model import and cleanup, converting from
+    QONNX dialect to FINN and performing basic tidying operations.
+
+    Internal steps:
+    - step_qonnx_to_finn: Convert QONNX dialect to FINN
+    - step_tidy_up: Shape/dtype inference, constant folding, cleanup
+
+    Args:
+        model: Input ModelWrapper
+        cfg: Build configuration
+
+    Returns:
+        Prepared ModelWrapper ready for optimization
+    """
+    model = _execute_step(step_qonnx_to_finn, model, cfg)
+    model = _execute_step(step_tidy_up, model, cfg)
+    return model
+
+
+def phase_optimize_model(model: ModelWrapper, cfg: DataflowBuildConfig):
+    """Phase: Apply model-specific streamlining transformations.
+
+    This phase applies streamlining to move and absorb operations for hardware
+    efficiency. Streamlining is highly model-dependent and frequently customized.
+
+    Internal steps:
+    - step_streamline: Apply streamlining transformations
+
+    Note: This phase can be easily replaced with a custom streamline function
+    in the steps list for model-specific optimizations.
+
+    Args:
+        model: Input ModelWrapper
+        cfg: Build configuration
+
+    Returns:
+        Streamlined ModelWrapper
+    """
+    model = _execute_step(step_streamline, model, cfg)
+    return model
+
+
+def phase_convert_to_hardware(model: ModelWrapper, cfg: DataflowBuildConfig):
+    """Phase: Convert model to hardware-eligible operations and specialize.
+
+    This phase identifies hardware-eligible operations, creates the dataflow
+    partition, specializes layers for the target backend (HLS/RTL), and handles
+    loop rolling for FINNLoop nodes. After conversion, validates that all layers
+    are fpgadataflow layers or form a contiguous dataflow block.
+
+    Internal steps:
+    - step_convert_to_hw: Infer hardware layer types
+    - step_create_dataflow_partition: Create accelerator subgraph
+    - step_specialize_layers: Convert to HLS or RTL variants
+    - step_loop_rolling: Process FINNLoop nodes (auto-detects if needed)
+    - Validation: Check dataflow conversion success
+
+    Args:
+        model: Input ModelWrapper
+        cfg: Build configuration
+
+    Returns:
+        ModelWrapper with hardware-specialized operations
+
+    Raises:
+        AssertionError: If dataflow conversion validation fails
+    """
+    model = _execute_step(step_convert_to_hw, model, cfg)
+    model = _execute_step(step_create_dataflow_partition, model, cfg)
+    model = _execute_step(step_specialize_layers, model, cfg)
+    model = _execute_step(step_loop_rolling, model, cfg)
+
+    # Validate dataflow conversion
+    validation_result = model.analysis(validate_dataflow_conversion)
+    print(validation_result["message"])
+    if not validation_result["valid"]:
+        raise AssertionError(validation_result["message"])
+
+    return model
+
+
+def phase_optimize_hardware(model: ModelWrapper, cfg: DataflowBuildConfig):
+    """Phase: Configure parallelism, apply folding, optimize bit widths,
+    FIFO sizing, generate reports.
+
+    This phase configures the hardware parallelism and resource usage. It applies
+    folding configurations, minimizes bit widths (after folding), decomposes
+    transpose/shuffle operations, sizes FIFOs,
+    and generates analytical performance/resource reports.
+
+    Internal steps (each step checks its own config parameters):
+    - step_target_fps_parallelization: Auto-parallelization (if target_fps set)
+    - step_apply_folding_config: Apply folding configuration (if config provided)
+    - step_minimize_bit_width: Minimize weight/accumulator bit widths (if enabled)
+    - step_transpose_decomposition: Decompose Shuffle nodes
+    - step_set_fifo_depths: FIFO sizing (skipped for MLO, handled in phase_build_hardware)
+    - step_generate_estimate_reports: Generate analytical estimates (if requested)
+
+    For MLO models, FIFO sizing is deferred to phase_build_hardware because the
+    characterize strategy requires FINNLoop nodes to have stitched IP, which
+    depends on loop body FIFO sizing and IP generation happening first.
+
+    Args:
+        model: Input ModelWrapper
+        cfg: Build configuration
+
+    Returns:
+        ModelWrapper with optimized parallelism and resource configuration
+    """
+    model = _execute_step(step_target_fps_parallelization, model, cfg)
+    model = _execute_step(step_apply_folding_config, model, cfg)
+    model = _execute_step(step_minimize_bit_width, model, cfg)
+    model = _execute_step(step_transpose_decomposition, model, cfg)
+    # Skip FIFO sizing for MLO - handled in phase_build_hardware after loop body IPs are ready
+    if not is_mlo(model):
+        model = _execute_step(step_set_fifo_depths, model, cfg)
+    model = _execute_step(step_generate_estimate_reports, model, cfg)
+    return model
+
+
+def _apply_to_loop_bodies(model: ModelWrapper, cfg: DataflowBuildConfig, step_fn):
+    """Apply a step function to all FINNLoop bodies recursively (depth-first).
+
+    Args:
+        model: ModelWrapper containing FINNLoop nodes
+        cfg: Build configuration
+        step_fn: Step function to apply to each loop body
+
+    Returns:
+        ModelWrapper with step applied to all loop bodies
+    """
+    for node in model.get_nodes_by_op_type("FINNLoop"):
+        node_inst = getCustomOp(node)
+        loop_model = node_inst.get_nodeattr("body")
+
+        # Recursively process nested FINNLoop nodes first (depth-first)
+        if loop_model.get_nodes_by_op_type("FINNLoop"):
+            loop_model = _apply_to_loop_bodies(loop_model, cfg, step_fn)
+
+        # Apply step to this loop body
+        print(f"Running {step_fn.__name__} for FINNLoop: {node.name}")
+        loop_model = step_fn(loop_model, cfg)
+
+        node_inst.set_nodeattr("body", loop_model.graph)
+
+    return model
+
+
+def phase_build_hardware(model: ModelWrapper, cfg: DataflowBuildConfig):
+    """Phase: Generate hardware code, synthesize IP blocks.
+
+    This phase generates hardware code for each layer (HLS C++ for HLS layers,
+    RTL/SystemVerilog for RTL layers) and synthesizes IP blocks via Vitis HLS.
+
+    For models with FINNLoop nodes, loop bodies are processed in a specific order:
+    1. FIFO sizing for loop bodies (creates FIFO nodes in subgraphs)
+    2. step_hw_codegen for main model (applies to subgraphs, so loop body FIFOs get codegen)
+    3. Create stitched IP for loop bodies (subgraph IP needed by FINNLoop wrapper)
+    4. step_set_fifo_depths for main model (MLO only - needs loop body stitched IPs)
+    5. step_hw_ipgen for main model (FINNLoop ipgen uses the subgraph IPs)
+
+    For non-MLO models, step_set_fifo_depths already ran in phase_optimize_hardware.
+
+    Internal steps:
+    - step_loop_body_set_fifo_depths: FIFO sizing for loop bodies (MLO only)
+    - step_hw_codegen: Generate HLS C++ or RTL code via PrepareIP
+    - step_loop_body_hw_ipgen: Create stitched IP for loop bodies (MLO only)
+    - step_set_fifo_depths: FIFO sizing for main model (MLO only)
+    - step_hw_ipgen: Synthesize IP blocks via HLSSynthIP
+
+    Args:
+        model: Input ModelWrapper
+        cfg: Build configuration
+
+    Returns:
+        ModelWrapper with generated and synthesized IP blocks
+    """
+    # Step 1: FIFO sizing for loop bodies (creates FIFO nodes in subgraphs)
+    # Must happen before step_hw_codegen so the new FIFO nodes get code generated
+    model = _apply_to_loop_bodies(model, cfg, step_loop_body_set_fifo_depths)
+
+    # Step 2: HW codegen for main model (applies to subgraphs via apply_to_subgraphs=True)
+    model = _execute_step(step_hw_codegen, model, cfg)
+
+    # Step 3: Create stitched IP for loop bodies
+    # Must happen before step_set_fifo_depths (MLO) so FINNLoop can be simulated
+    model = _apply_to_loop_bodies(model, cfg, step_loop_body_hw_ipgen)
+
+    # Step 4: FIFO sizing for main model (MLO only)
+    # Must happen after loop body stitched IPs so FINNLoop can be characterized
+    if is_mlo(model):
+        model = _execute_step(step_set_fifo_depths, model, cfg)
+
+    # Step 5: HW ipgen for main model
+    model = _execute_step(step_hw_ipgen, model, cfg)
+
+    return model
+
+
+def phase_synthesize_hardware(model: ModelWrapper, cfg: DataflowBuildConfig):
+    """Phase: Create final hardware artifacts (stitched IP or bitfile + deployment package).
+
+    This phase creates the final hardware deliverables based on requested outputs.
+    It can generate stitched IP (including optional OOC synthesis), measure RTL
+    simulation performance, or create a complete bitfile with driver and deployment
+    package.
+
+    Internal steps (each step checks generate_outputs):
+    - step_create_stitched_ip: Create stitched IP (includes OOC synth if requested)
+    - step_measure_rtlsim_performance: Measure RTL sim performance (if requested)
+    - step_synthesize_bitfile: Full bitfile synthesis (if BITFILE requested)
+    - step_make_driver: Generate PYNQ or C++ driver (if BITFILE requested)
+    - step_deployment_package: Package for deployment (if requested)
+
+    Note: OOC (out-of-context) synthesis happens inside step_create_stitched_ip
+    when DataflowOutputType.OOC_SYNTH is requested, not as a separate step.
+
+    Args:
+        model: Input ModelWrapper
+        cfg: Build configuration
+
+    Returns:
+        ModelWrapper with final hardware artifacts generated
+    """
+    model = _execute_step(step_create_stitched_ip, model, cfg)
+    model = _execute_step(step_measure_rtlsim_performance, model, cfg)
+    model = _execute_step(step_synthesize_bitfile, model, cfg)
+    model = _execute_step(step_make_driver, model, cfg)
+    model = _execute_step(step_deployment_package, model, cfg)
+    return model
+
+
+#: Map phase name strings to phase functions
+build_dataflow_phase_lookup = {
+    "phase_prepare_model": phase_prepare_model,
+    "phase_optimize_model": phase_optimize_model,
+    "phase_convert_to_hardware": phase_convert_to_hardware,
+    "phase_optimize_hardware": phase_optimize_hardware,
+    "phase_build_hardware": phase_build_hardware,
+    "phase_synthesize_hardware": phase_synthesize_hardware,
+}
diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
index f2164ca2c1..0a95207a86 100644
--- a/src/finn/builder/build_dataflow_steps.py
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -284,58 +284,6 @@ def prepare_for_stitched_ip_rtlsim(verify_model, cfg):
     return verify_model
 
 
-def prepare_loop_ops_fifo_sizing(node, cfg):
-    node_inst = getCustomOp(node)
-    loop_model = node_inst.get_nodeattr("body")
-    loop_model = loop_model.transform(GiveUniqueNodeNames(prefix=node.name + "_"))
-    # go first into subgraph to check if there are other loop ops
-    loop_nodes = loop_model.get_nodes_by_op_type("FINNLoop")
-    for loop_node in loop_nodes:
-        prepare_loop_ops_fifo_sizing(loop_node, cfg)
-    loop_model = loop_model.transform(
-        PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())
-    )
-    loop_model = loop_model.transform(HLSSynthIP(cfg._resolve_hls_clk_period()))
-    loop_model = loop_model.transform(ReplaceVerilogRelPaths())
-    if cfg.fifosim_save_waveform:
-        report_dir = cfg.output_dir + "/report"
-        os.makedirs(report_dir, exist_ok=True)
-        loop_model.set_metadata_prop(
-            "rtlsim_trace", os.path.abspath(report_dir) + f"/{node.name}_fifosim_trace.wdb"
-        )
-    loop_model = loop_model.transform(
-        InsertAndSetFIFODepths(
-            cfg._resolve_fpga_part(),
-            cfg._resolve_hls_clk_period(),
-            swg_exception=cfg.default_swg_exception,
-            vivado_ram_style=cfg.large_fifo_mem_style,
-            fifosim_input_throttle=cfg.fifosim_input_throttle,
-        )
-    )
-    loop_model = loop_model.transform(SplitLargeFIFOs())
-    loop_model = loop_model.transform(RemoveShallowFIFOs())
-    loop_model = loop_model.transform(GiveUniqueNodeNames(prefix=node.name + "_"))
-    loop_model = loop_model.transform(GiveReadableTensorNames())
-    node_inst.set_nodeattr("body", loop_model.graph)
-
-
-def prepare_loop_ops_ipgen(node, cfg):
-    node_inst = getCustomOp(node)
-    loop_model = node_inst.get_nodeattr("body")
-    # go first into subgraph to check if there are other loop ops
-    loop_nodes = loop_model.get_nodes_by_op_type("FINNLoop")
-    for loop_node in loop_nodes:
-        prepare_loop_ops_ipgen(loop_node, cfg)
-    loop_model = loop_model.transform(HLSSynthIP(cfg._resolve_hls_clk_period()))
-    loop_model = loop_model.transform(
-        CreateStitchedIP(
-            cfg._resolve_fpga_part(),
-            cfg.synth_clk_period_ns,
-        )
-    )
-    node_inst.set_nodeattr("body", loop_model.graph)
-
-
 def step_qonnx_to_finn(model: ModelWrapper, cfg: DataflowBuildConfig):
     """
     This step will only execute if QONNX nodes are found.
@@ -845,11 +793,7 @@ def step_minimize_bit_width(model: ModelWrapper, cfg: DataflowBuildConfig):
 def step_hw_codegen(model: ModelWrapper, cfg: DataflowBuildConfig):
     """Generate Vitis HLS code to prepare HLSBackend nodes for IP generation.
     And fills RTL templates for RTLBackend nodes."""
-
     model = model.transform(GiveUniqueNodeNames())
-    loop_nodes = model.get_nodes_by_op_type("FINNLoop")
-    for node in loop_nodes:
-        prepare_loop_ops_fifo_sizing(node, cfg)
     model = model.transform(
         PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()),
         apply_to_subgraphs=True,
@@ -861,10 +805,6 @@ def step_hw_codegen(model: ModelWrapper, cfg: DataflowBuildConfig):
 def step_hw_ipgen(model: ModelWrapper, cfg: DataflowBuildConfig):
     """Run Vitis HLS synthesis on generated code for HLSBackend nodes,
     in order to generate IP blocks. For RTL nodes this step does not do anything."""
-
-    loop_nodes = model.get_nodes_by_op_type("FINNLoop")
-    for node in loop_nodes:
-        prepare_loop_ops_ipgen(node, cfg)
     model = model.transform(HLSSynthIP(cfg._resolve_fpga_part()))
     model = model.transform(ReplaceVerilogRelPaths())
     report_dir = cfg.output_dir + "/report"
@@ -1370,6 +1310,79 @@ def step_loop_rolling(model, cfg):
     return model
 
 
+def step_loop_body_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
+    """Set FIFO depths for loop body model.
+
+    This step is designed to be called on a loop body model (extracted from FINNLoop).
+    It performs PrepareIP, HLSSynthIP, and InsertAndSetFIFODepths with parameters
+    appropriate for loop bodies.
+
+    Args:
+        model: Loop body ModelWrapper
+        cfg: Build configuration
+
+    Returns:
+        Loop body ModelWrapper with FIFOs sized
+    """
+    # Prepare and synthesize IP for FIFO characterization
+    model = model.transform(PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()))
+    model = model.transform(HLSSynthIP(cfg._resolve_hls_clk_period()))
+    model = model.transform(ReplaceVerilogRelPaths())
+
+    # Set waveform trace if configured
+    if cfg.fifosim_save_waveform:
+        report_dir = cfg.output_dir + "/report"
+        os.makedirs(report_dir, exist_ok=True)
+        model.set_metadata_prop(
+            "rtlsim_trace", os.path.abspath(report_dir) + "/loop_body_fifosim_trace.wdb"
+        )
+
+    # Insert and size FIFOs
+    model = model.transform(
+        InsertAndSetFIFODepths(
+            cfg._resolve_fpga_part(),
+            cfg._resolve_hls_clk_period(),
+            swg_exception=cfg.default_swg_exception,
+            vivado_ram_style=cfg.large_fifo_mem_style,
+            fifosim_input_throttle=cfg.fifosim_input_throttle,
+        )
+    )
+    model = model.transform(SplitLargeFIFOs())
+    model = model.transform(RemoveShallowFIFOs())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+
+    return model
+
+
+def step_loop_body_hw_ipgen(model: ModelWrapper, cfg: DataflowBuildConfig):
+    """Run HLS synthesis and create stitched IP for loop body model.
+
+    This step is designed to be called on a loop body model (extracted from FINNLoop).
+    It performs HLSSynthIP and CreateStitchedIP with parameters appropriate for
+    loop bodies (no verification, no output directory operations).
+
+    Args:
+        model: Loop body ModelWrapper
+        cfg: Build configuration
+
+    Returns:
+        Loop body ModelWrapper with synthesized IP and stitched IP created
+    """
+    # HLS synthesis for this loop body
+    model = model.transform(HLSSynthIP(cfg._resolve_hls_clk_period()))
+
+    # Create stitched IP for this loop body
+    model = model.transform(
+        CreateStitchedIP(
+            cfg._resolve_fpga_part(),
+            cfg.synth_clk_period_ns,
+        )
+    )
+
+    return model
+
+
 #: map step name strings to step functions
 build_dataflow_step_lookup = {
     "step_qonnx_to_finn": step_qonnx_to_finn,
@@ -1392,4 +1405,6 @@ def step_loop_rolling(model, cfg):
     "step_synthesize_bitfile": step_synthesize_bitfile,
     "step_deployment_package": step_deployment_package,
     "step_loop_rolling": step_loop_rolling,
+    "step_loop_body_set_fifo_depths": step_loop_body_set_fifo_depths,
+    "step_loop_body_hw_ipgen": step_loop_body_hw_ipgen,
 }
diff --git a/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py
index 018d8f0417..6c345adf99 100644
--- a/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py
@@ -220,8 +220,6 @@ def prepare_rtlsim(self, behav=False):
         return super().prepare_rtlsim(behav)
 
     def execute_node(self, context, graph):
-        mode = self.get_nodeattr("exec_mode")
-        if mode == "cppsim":
-            StreamingFIFO.execute_node(self, context, graph)
-        elif mode == "rtlsim":
-            RTLBackend.execute_node(self, context, graph)
+        # FIFOs just pass data through - use simple execution for all modes
+        # RTL simulation of FIFO is only relevant for timing/throughput, not functional verification
+        StreamingFIFO.execute_node(self, context, graph)
diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
index fa317265a6..4755ededaa 100644
--- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py
+++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
@@ -527,6 +527,7 @@ def apply(self, model):
 
         # clean up references to stitched IP and rtlsim objects
         # (the stitched IP needs to be re-done after FIFO sizing)
+        model.set_metadata_prop("exec_mode", "")
         model.set_metadata_prop("rtlsim_trace", "")
         model.set_metadata_prop("rtlsim_so", "")
         model.set_metadata_prop("vivado_stitch_proj", "")
diff --git a/tests/fpgadataflow/test_fpgadataflow_finnloop.py b/tests/fpgadataflow/test_fpgadataflow_finnloop.py
index 860eb6321c..74ea433f38 100644
--- a/tests/fpgadataflow/test_fpgadataflow_finnloop.py
+++ b/tests/fpgadataflow/test_fpgadataflow_finnloop.py
@@ -510,23 +510,14 @@ def test_finnloop_end2end_mlo(
 
     model.save(tmp_output_dir + "/mlo_model.onnx")
 
-    # steps are skipped because test model created with HLS and RTL layers
+    # Use phase-based pipeline
+    # Steps are adjusted because test model already has HLS and RTL layers
     steps = [
-        # "step_qonnx_to_finn",
-        # "step_tidy_up",
-        # "step_streamline",
-        # "step_convert_to_hw",
-        "step_create_dataflow_partition",
-        # "step_specialize_layers",
-        "step_loop_rolling",
-        # "step_target_fps_parallelization",
-        "step_apply_folding_config",
-        "step_minimize_bit_width",
-        "step_generate_estimate_reports",
-        "step_hw_codegen",
-        "step_hw_ipgen",
-        "step_set_fifo_depths",
-        "step_create_stitched_ip",
+        "step_create_dataflow_partition",  # Fine-grained (model already specialized)
+        "phase_convert_to_hardware",  # Phase (includes loop rolling)
+        "phase_optimize_hardware",  # Phase (includes folding, bit-width, reports)
+        "phase_build_hardware",  # Phase (includes codegen, ipgen, FIFOs)
+        "step_create_stitched_ip",  # Fine-grained (just IP, no full synth)
     ]
 
     cfg = build_cfg.DataflowBuildConfig(
diff --git a/tests/fpgadataflow/test_fpgadataflow_layernorm.py b/tests/fpgadataflow/test_fpgadataflow_layernorm.py
index 8d3cc7bf62..15d587cc95 100644
--- a/tests/fpgadataflow/test_fpgadataflow_layernorm.py
+++ b/tests/fpgadataflow/test_fpgadataflow_layernorm.py
@@ -448,19 +448,12 @@ def test_hls_rtl_dsp_conflict_detection():
     with open(specialize_config_file, "w") as f:
         json.dump(specialize_config, f)
 
-    # Build steps - includes conversion to HW layers and specialization
+    # Build steps using phases
     steps = [
-        "step_convert_to_hw",
-        "step_create_dataflow_partition",
-        "step_specialize_layers",
-        "step_target_fps_parallelization",
-        "step_apply_folding_config",
-        "step_minimize_bit_width",
-        "step_generate_estimate_reports",
-        "step_hw_codegen",
-        "step_hw_ipgen",
-        "step_set_fifo_depths",
-        "step_create_stitched_ip",
+        "phase_convert_to_hardware",  # Includes convert_to_hw, partition, specialize, loop_rolling
+        "phase_optimize_hardware",  # Includes target_fps, folding, bit_width, reports
+        "phase_build_hardware",  # Includes codegen, ipgen, FIFO depths
+        "step_create_stitched_ip",  # Fine-grained (just stitched IP)
     ]
 
     # Request verification steps that will trigger DSP conflict detection
@@ -667,17 +660,10 @@ def test_integer_hls_elementwise_no_dsp_conflict():
 
     # Build steps - includes conversion to HW layers and specialization
     steps = [
-        "step_convert_to_hw",
-        "step_create_dataflow_partition",
-        "step_specialize_layers",
-        "step_target_fps_parallelization",
-        "step_apply_folding_config",
-        "step_minimize_bit_width",
-        "step_generate_estimate_reports",
-        "step_hw_codegen",
-        "step_hw_ipgen",
-        "step_set_fifo_depths",
-        "step_create_stitched_ip",
+        "phase_convert_to_hardware",  # Includes convert_to_hw, partition, specialize, loop_rolling
+        "phase_optimize_hardware",  # Includes target_fps, folding, bit_width, reports
+        "phase_build_hardware",  # Includes codegen, ipgen, FIFO depths
+        "step_create_stitched_ip",  # Fine-grained (just stitched IP)
     ]
 
     # Request verification steps - stitched_ip_rtlsim should NOT be skipped
diff --git a/tests/fpgadataflow/test_validate_dataflow_conversion.py b/tests/fpgadataflow/test_validate_dataflow_conversion.py
new file mode 100644
index 0000000000..b6933d82e6
--- /dev/null
+++ b/tests/fpgadataflow/test_validate_dataflow_conversion.py
@@ -0,0 +1,160 @@
+# Copyright Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: BSD-3-Clause
+
+"""Tests for dataflow conversion validation analysis pass."""
+
+import pytest
+
+import numpy as np
+from onnx import TensorProto, helper
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.transformation.infer_datatypes import InferDataTypes
+from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
+
+from finn.analysis.fpgadataflow.validate_dataflow_conversion import (
+    validate_dataflow_conversion,
+)
+from finn.transformation.fpgadataflow.convert_to_hw_layers import (
+    InferElementwiseBinaryOperation,
+    InferHWSoftmax,
+    InferQuantizedMatrixVectorActivation,
+    InferThresholdingLayer,
+)
+from finn.util.fpgadataflow import is_fpgadataflow_node
+
+
+def make_test_model():
+    """Create a small model with different layer types for testing validation.
+
+    Model structure (all non-fpgadataflow initially):
+    - Layer 0: Transpose
+    - Layer 1: MatMul (INT4 weights)
+    - Layer 2: MultiThreshold
+    - Layer 3: Mul
+    - Layer 4: Softmax
+    """
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, 4, 4])
+    out = helper.make_tensor_value_info("out", TensorProto.FLOAT, [1, 4, 4])
+
+    # Layer 0: Transpose
+    node0 = helper.make_node("Transpose", ["inp"], ["t0"], perm=[0, 1, 2])
+
+    # Layer 1: MatMul with INT4 weights
+    W1_data = gen_finn_dt_tensor(DataType["INT4"], (4, 4))
+    W1 = helper.make_tensor("W1", TensorProto.FLOAT, [4, 4], W1_data.flatten().tolist())
+    node1 = helper.make_node("MatMul", ["t0", "W1"], ["t1"])
+
+    # Layer 2: MultiThreshold (QONNX custom op)
+    # UINT4 has 16 values (0-15), so we need 15 thresholds per channel
+    T2_data = gen_finn_dt_tensor(DataType["INT16"], (4, 15))
+    T2_data = np.sort(T2_data, axis=1)  # Sort thresholds in increasing order
+    T2 = helper.make_tensor("T2", TensorProto.FLOAT, [4, 15], T2_data.flatten().tolist())
+    node2 = helper.make_node(
+        "MultiThreshold",
+        ["t1", "T2"],
+        ["t2"],
+        domain="qonnx.custom_op.general",
+        out_dtype="UINT4",
+        data_layout="NHWC",
+    )
+
+    # Layer 3: Mul
+    scale_data = np.array([2.0], dtype=np.float32)
+    scale = helper.make_tensor("scale", TensorProto.FLOAT, [1], scale_data.tolist())
+    node3 = helper.make_node("Mul", ["t2", "scale"], ["t3"])
+
+    # Layer 4: Softmax
+    node4 = helper.make_node("Softmax", ["t3"], ["out"], axis=-1)
+
+    graph = helper.make_graph(
+        [node0, node1, node2, node3, node4],
+        "test_validation",
+        [inp],
+        [out],
+        initializer=[W1, T2, scale],
+    )
+
+    model = qonnx_make_model(graph)
+    model = ModelWrapper(model)
+
+    # Set INT4 datatypes
+    model.set_tensor_datatype("inp", DataType["INT4"])
+    model.set_tensor_datatype("W1", DataType["INT4"])
+    model.set_tensor_datatype("T2", DataType["INT16"])
+
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+
+    return model
+
+
+@pytest.mark.fpgadataflow
+def test_validate_dataflow_conversion_scenarios():
+    """Test validation through progressive conversion scenarios.
+
+    Test plan:
+    0. No conversions - should fail
+    1. Convert layer 2 (MultiThreshold) → [non, non, fpga, non, non] - should pass
+    2. Convert layer 4 (Softmax) → [non, non, fpga, non, fpga] - should FAIL (non-contiguous)
+    3. Convert layer 3 and 1 (Mul and MatMul) → [non, fpga, fpga, fpga, fpga] - should pass
+    """
+
+    # Scenario 0: No fpgadataflow layers - should fail
+    print("\n--- Scenario 0: No fpgadataflow layers ---")
+    model = make_test_model()
+    result = model.analysis(validate_dataflow_conversion)
+    print(f"Valid: {result['valid']}")
+    print(f"Message: {result['message']}")
+
+    assert result["valid"] is False, "Expected validation to fail with no fpgadataflow layers"
+    assert "No fpgadataflow layers found" in result["message"]
+    assert len(result["unconverted_layers"]) == 5
+
+    # Scenario 1: Convert layer 2 (MultiThreshold) → [non, non, fpga, non, non]
+    print("\n--- Scenario 1: Convert layer 2 (MultiThreshold) ---")
+    model = model.transform(InferThresholdingLayer())
+    result = model.analysis(validate_dataflow_conversion)
+    print(f"Valid: {result['valid']}")
+    print(f"Message: {result['message']}")
+
+    assert result["valid"] is True
+    assert "contiguous block" in result["message"].lower()
+    assert result["dataflow_block"] == (2, 2)
+
+    # Scenario 2: Convert layer 4 (Softmax) → [non, non, fpga, non, fpga] - should FAIL
+    print("\n--- Scenario 2: Convert layer 4 (Softmax) - EXPECT FAILURE ---")
+    model = model.transform(InferHWSoftmax())
+    result = model.analysis(validate_dataflow_conversion)
+    print(f"Valid: {result['valid']}")
+    print(f"Message: {result['message']}")
+
+    assert (
+        result["valid"] is False
+    ), "Expected validation to fail with non-contiguous dataflow block"
+    assert "Non-contiguous dataflow block detected" in result["message"]
+
+    # Scenario 3: Convert layer 3 (Mul) and layer 1 (MatMul) → [non, fpga, fpga, fpga, fpga]
+    print("\n--- Scenario 3: Convert layers 3 (Mul) and 1 (MatMul) ---")
+    model = model.transform(InferElementwiseBinaryOperation())
+    model = model.transform(InferQuantizedMatrixVectorActivation())
+    result = model.analysis(validate_dataflow_conversion)
+    print(f"Valid: {result['valid']}")
+    print(f"Message: {result['message']}")
+
+    assert result["valid"] is True
+    assert "contiguous block" in result["message"].lower()
+    assert result["dataflow_block"] == (1, 4)
+
+    # Final verification
+    print("\n--- Final verification ---")
+    nodes = model.graph.node
+    fpgadataflow_count = sum(1 for node in nodes if is_fpgadataflow_node(node))
+    print(f"Fpgadataflow layers: {fpgadataflow_count} / {len(nodes)}")
+    print(f"Total nodes: {len(nodes)}")
+
+    # 4 out of 5 layers should be fpgadataflow (all except Transpose)
+    assert fpgadataflow_count == 4
+    assert result["valid"] is True
+    assert len(result["unconverted_layers"]) == 1  # Only Transpose unconverted
diff --git a/tests/util/test_build_dataflow.py b/tests/util/test_build_dataflow.py
index 9edfecb352..8adc7d3a16 100644
--- a/tests/util/test_build_dataflow.py
+++ b/tests/util/test_build_dataflow.py
@@ -98,8 +98,11 @@ def test_end2end_build_dataflow_directory():
         assert os.path.isfile(stitched_waveform_dir + f"/verify_rtlsim_{i}.wdb")
 
         # Check that node-by-node rtlsim waveforms were created for each node
+        # Skip FIFO nodes as they use pass-through execution (no RTL simulation)
         node_waveform_dir = verify_out_dir + "/node_by_node_rtlsim_waveforms"
         for node in model.graph.node:
+            if node.op_type.startswith("StreamingFIFO"):
+                continue
             assert os.path.isfile(
                 node_waveform_dir + f"/{node.name}_rtlsim_{i}.wdb"
             ), f"Missing waveform for node {node.name} in batch {i}"
diff --git a/tests/util/test_build_dataflow_checks.py b/tests/util/test_build_dataflow_checks.py
index b6cdccd120..84c010ca2a 100644
--- a/tests/util/test_build_dataflow_checks.py
+++ b/tests/util/test_build_dataflow_checks.py
@@ -33,11 +33,11 @@ def make_test_model(build_dir):
 
 
 def cfg(output_dir, **kw):
-    """Create config that stops immediately after first step."""
+    """Create config that stops immediately after first phase."""
     return DataflowBuildConfig(
         output_dir=output_dir,
         synth_clk_period_ns=5.0,
-        stop_step="step_qonnx_to_finn",
+        stop_step="phase_prepare_model",
         generate_outputs=kw.pop("generate_outputs", [DataflowOutputType.ESTIMATE_REPORTS]),
         **kw
     )