From 4d160bc55a98b8766a7373420eb9fd04360ef168 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Sun, 10 Nov 2024 10:38:20 -0500
Subject: [PATCH 01/64] Add new multithreaded TwoQubitPeepholeOptimization pass

This commit adds a new transpiler pass for physical optimization,
TwoQubitPeepholeOptimization. This replaces the use of Collect2qBlocks,
ConsolidateBlocks, and UnitarySynthesis in the optimization stage for
a default pass manager setup. The pass logically works the same way
where it analyzes the dag to get a list of 2q runs, calculates the matrix
of each run, and then synthesizes the matrix and substitutes it inplace.
The distinction this pass makes though is it does this all in a single
pass and also parallelizes the matrix calculation and synthesis steps
because there is no data dependency there.

This new pass is not meant to fully replace the Collect2qBlocks,
ConsolidateBlocks, or UnitarySynthesis passes as those also run in
contexts where we don't have a physical circuit. This is meant instead
to replace their usage in the optimization stage only. Accordingly this
new pass also changes the logic on how we select the synthesis to use
and when to make a substitution. Previously this logic was primarily done
via the ConsolidateBlocks pass by only consolidating to a UnitaryGate if
the number of basis gates needed based on the weyl chamber coordinates
was less than the number of 2q gates in the block (see #11659 for
discussion on this). Since this new pass skips the explicit
consolidation stage we go ahead and try all the available synthesizers
and only substitute the best result when it scores better than the
original run.

Right now this commit has a number of limitations, the largest are:

- It only supports describing the backend with a `Target`.
- It doesn't support any synthesizers besides the TwoQubitBasisDecomposer,
  because it's the only one in rust currently.

For plugin handling I left the logic as running the three pass series,
but I'm not sure this is the behavior we want. We could say keep the
synthesis plugins for `UnitarySynthesis` only and then rely on our
built-in methods for physical optimization only. But this also seems less
than ideal because the plugin mechanism is how we support synthesizing
to custom basis gates, and also more advanced approximate synthesis
methods. Both of those are things we need to do as part of the synthesis
here.

Additionally, this is currently missing tests and documentation and while
running it manually "works" as in it returns a circuit that looks valid,
I've not done any validation yet. This also likely will need several
rounds of performance optimization and tuning. At this point this is
just a rough proof of concept and will need a lot of refinement along with
larger changes to Qiskit's rust code before this is ready to merge.

Fixes #12007
Fixes #11659
---
 crates/accelerate/Cargo.toml                  |   7 +-
 crates/accelerate/src/consolidate_blocks.rs   |   8 +-
 .../accelerate/src/convert_2q_block_matrix.rs |  21 +-
 crates/accelerate/src/lib.rs                  |   1 +
 crates/accelerate/src/two_qubit_decompose.rs  |  19 +-
 crates/accelerate/src/two_qubit_peephole.rs   | 428 ++++++++++++++++++
 crates/accelerate/src/unitary_synthesis.rs    |   3 +-
 crates/pyext/src/lib.rs                       |   5 +
 qiskit/__init__.py                            |   1 +
 .../passes/optimization/two_qubit_peephole.py |  59 +++
 10 files changed, 527 insertions(+), 25 deletions(-)
 create mode 100644 crates/accelerate/src/two_qubit_peephole.rs
 create mode 100644 qiskit/transpiler/passes/optimization/two_qubit_peephole.py

diff --git a/crates/accelerate/Cargo.toml b/crates/accelerate/Cargo.toml
index cc624e7750d6..0a3c295474a7 100644
--- a/crates/accelerate/Cargo.toml
+++ b/crates/accelerate/Cargo.toml
@@ -9,6 +9,10 @@ license.workspace = true
 name = "qiskit_accelerate"
 doctest = false
 
+
+[features]
+cache_pygates = ["qiskit-circuit/cache_pygates"]
+
 [dependencies]
 rayon.workspace = true
 numpy.workspace = true
@@ -60,6 +64,3 @@ features = ["ndarray"]
 [dependencies.pulp]
 version = "0.18.22"
 features = ["macro"]
-
-[features]
-cache_pygates = ["qiskit-circuit/cache_pygates"]
diff --git a/crates/accelerate/src/consolidate_blocks.rs b/crates/accelerate/src/consolidate_blocks.rs
index 1edd592ce877..46c648af75c1 100644
--- a/crates/accelerate/src/consolidate_blocks.rs
+++ b/crates/accelerate/src/consolidate_blocks.rs
@@ -107,7 +107,7 @@ pub(crate) fn consolidate_blocks(
                 dag.get_qargs(inst.qubits),
             ) {
                 all_block_gates.insert(inst_node);
-                let matrix = match get_matrix_from_inst(py, inst) {
+                let matrix = match get_matrix_from_inst(inst) {
                     Ok(mat) => mat,
                     Err(_) => continue,
                 };
@@ -198,7 +198,7 @@ pub(crate) fn consolidate_blocks(
                 *block_qargs.iter().min().unwrap(),
                 *block_qargs.iter().max().unwrap(),
             ];
-            let matrix = blocks_to_matrix(py, dag, &block, block_index_map).ok();
+            let matrix = blocks_to_matrix(dag, &block, block_index_map).ok();
             if let Some(matrix) = matrix {
                 if force_consolidate
                     || decomposer.num_basis_gates_inner(matrix.view()) < basis_count
@@ -252,7 +252,7 @@ pub(crate) fn consolidate_blocks(
                     first_qubits,
                 )
             {
-                let matrix = match get_matrix_from_inst(py, first_inst) {
+                let matrix = match get_matrix_from_inst(first_inst) {
                     Ok(mat) => mat,
                     Err(_) => continue,
                 };
@@ -272,7 +272,7 @@ pub(crate) fn consolidate_blocks(
                     already_in_block = true;
                 }
                 let gate = dag.dag()[*node].unwrap_operation();
-                let operator = match get_matrix_from_inst(py, gate) {
+                let operator = match get_matrix_from_inst(gate) {
                     Ok(mat) => mat,
                     Err(_) => {
                         // Set this to skip this run because we can't compute the matrix of the
diff --git a/crates/accelerate/src/convert_2q_block_matrix.rs b/crates/accelerate/src/convert_2q_block_matrix.rs
index aefc5976e82f..c79bd655dce8 100644
--- a/crates/accelerate/src/convert_2q_block_matrix.rs
+++ b/crates/accelerate/src/convert_2q_block_matrix.rs
@@ -31,7 +31,7 @@ use crate::euler_one_qubit_decomposer::matmul_1q;
 use crate::QiskitError;
 
 #[inline]
-pub fn get_matrix_from_inst(py: Python, inst: &PackedInstruction) -> PyResult<Array2<Complex64>> {
+pub fn get_matrix_from_inst(inst: &PackedInstruction) -> PyResult<Array2<Complex64>> {
     if let Some(mat) = inst.op.matrix(inst.params_view()) {
         Ok(mat)
     } else if inst.op.try_standard_gate().is_some() {
@@ -39,13 +39,15 @@ pub fn get_matrix_from_inst(py: Python, inst: &PackedInstruction) -> PyResult<Ar
             "Parameterized gates can't be consolidated",
         ))
     } else if let OperationRef::Gate(gate) = inst.op.view() {
-        Ok(QI_OPERATOR
-            .get_bound(py)
-            .call1((gate.gate.clone_ref(py),))?
-            .getattr(intern!(py, "data"))?
-            .extract::<PyReadonlyArray2<Complex64>>()?
-            .as_array()
-            .to_owned())
+        Python::with_gil(|py| {
+            Ok(QI_OPERATOR
+                .get_bound(py)
+                .call1((gate.gate.clone_ref(py),))?
+                .getattr(intern!(py, "data"))?
+                .extract::<PyReadonlyArray2<Complex64>>()?
+                .as_array()
+                .to_owned())
+        })
     } else {
         Err(QiskitError::new_err(
             "Can't compute matrix of non-unitary op",
@@ -55,7 +57,6 @@ pub fn get_matrix_from_inst(py: Python, inst: &PackedInstruction) -> PyResult<Ar
 
 /// Return the matrix Operator resulting from a block of Instructions.
 pub fn blocks_to_matrix(
-    py: Python,
     dag: &DAGCircuit,
     op_list: &[NodeIndex],
     block_index_map: [Qubit; 2],
@@ -73,7 +74,7 @@ pub fn blocks_to_matrix(
     let mut output_matrix: Option<Array2<Complex64>> = None;
     for node in op_list {
         let inst = dag.dag()[*node].unwrap_operation();
-        let op_matrix = get_matrix_from_inst(py, inst)?;
+        let op_matrix = get_matrix_from_inst(inst)?;
         match dag
             .get_qargs(inst.qubits)
             .iter()
diff --git a/crates/accelerate/src/lib.rs b/crates/accelerate/src/lib.rs
index 45cf047a6808..eab307859c4b 100644
--- a/crates/accelerate/src/lib.rs
+++ b/crates/accelerate/src/lib.rs
@@ -56,6 +56,7 @@ pub mod synthesis;
 pub mod target_transpiler;
 pub mod twirling;
 pub mod two_qubit_decompose;
+pub mod two_qubit_peephole;
 pub mod uc_gate;
 pub mod unitary_synthesis;
 pub mod utils;
diff --git a/crates/accelerate/src/two_qubit_decompose.rs b/crates/accelerate/src/two_qubit_decompose.rs
index 4410d6f35e07..d0217dd5ff87 100644
--- a/crates/accelerate/src/two_qubit_decompose.rs
+++ b/crates/accelerate/src/two_qubit_decompose.rs
@@ -1244,9 +1244,9 @@ type TwoQubitSequenceVec = Vec<(Option<StandardGate>, SmallVec<[f64; 3]>, SmallV
 #[derive(Clone, Debug)]
 #[pyclass(sequence)]
 pub struct TwoQubitGateSequence {
-    gates: TwoQubitSequenceVec,
+    pub gates: TwoQubitSequenceVec,
     #[pyo3(get)]
-    global_phase: f64,
+    pub global_phase: f64,
 }
 
 impl TwoQubitGateSequence {
@@ -1709,7 +1709,7 @@ impl TwoQubitBasisDecomposer {
         gate: String,
         gate_matrix: ArrayView2<Complex64>,
         basis_fidelity: f64,
-        euler_basis: &str,
+        euler_basis: EulerBasis,
         pulse_optimize: Option<bool>,
     ) -> PyResult<Self> {
         let ipz: ArrayView2<Complex64> = aview2(&IPZ);
@@ -1817,7 +1817,7 @@ impl TwoQubitBasisDecomposer {
         Ok(TwoQubitBasisDecomposer {
             gate,
             basis_fidelity,
-            euler_basis: EulerBasis::__new__(euler_basis)?,
+            euler_basis,
             pulse_optimize,
             basis_decomposer,
             super_controlled,
@@ -1986,7 +1986,7 @@ impl TwoQubitBasisDecomposer {
             gate,
             gate_matrix.as_array(),
             basis_fidelity,
-            euler_basis,
+            EulerBasis::__new__(euler_basis)?,
             pulse_optimize,
         )
     }
@@ -2284,8 +2284,13 @@ fn two_qubit_decompose_up_to_diagonal(
     let (su4, phase) = u4_to_su4(mat_arr);
     let mut real_map = real_trace_transform(su4.view());
     let mapped_su4 = real_map.dot(&su4.view());
-    let decomp =
-        TwoQubitBasisDecomposer::new_inner("cx".to_string(), aview2(&CX_GATE), 1.0, "U", None)?;
+    let decomp = TwoQubitBasisDecomposer::new_inner(
+        "cx".to_string(),
+        aview2(&CX_GATE),
+        1.0,
+        EulerBasis::__new__("U")?,
+        None,
+    )?;
 
     let circ_seq = decomp.call_inner(mapped_su4.view(), None, true, None)?;
     let circ = CircuitData::from_standard_gates(
diff --git a/crates/accelerate/src/two_qubit_peephole.rs b/crates/accelerate/src/two_qubit_peephole.rs
new file mode 100644
index 000000000000..cf852fbbf997
--- /dev/null
+++ b/crates/accelerate/src/two_qubit_peephole.rs
@@ -0,0 +1,385 @@
+// This code is part of Qiskit.
+//
+// (C) Copyright IBM 2024
+//
+// This code is licensed under the Apache License, Version 2.0. You may
+// obtain a copy of this license in the LICENSE.txt file in the root directory
+// of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+//
+// Any modifications or derivative works of this code must retain this
+// copyright notice, and modified files need to carry a notice indicating
+// that they have been altered from the originals.
+
+use hashbrown::{HashMap, HashSet};
+use pyo3::prelude::*;
+use rayon::prelude::*;
+use rustworkx_core::petgraph::stable_graph::NodeIndex;
+use smallvec::{smallvec, SmallVec};
+
+use qiskit_circuit::circuit_instruction::ExtraInstructionAttributes;
+use qiskit_circuit::dag_circuit::{DAGCircuit, NodeType};
+use qiskit_circuit::operations::{Operation, OperationRef, Param, StandardGate};
+use qiskit_circuit::packed_instruction::PackedOperation;
+use qiskit_circuit::Qubit;
+
+use crate::convert_2q_block_matrix::blocks_to_matrix;
+use crate::euler_one_qubit_decomposer::{
+    EulerBasis, EulerBasisSet, EULER_BASES, EULER_BASIS_NAMES,
+};
+use crate::nlayout::PhysicalQubit;
+use crate::target_transpiler::Target;
+use crate::two_qubit_decompose::{TwoQubitBasisDecomposer, TwoQubitGateSequence};
+
+/// Build every [`TwoQubitBasisDecomposer`] the target allows for a pair of qubits.
+///
+/// A decomposer is created for each pairing of a gate with a matrix that the target reports
+/// for `qubits` (falling back to the reversed pair) with a one-qubit Euler basis whose gates
+/// are all reported for `qubits[0]`. If the one-qubit lookup fails every Euler basis is treated
+/// as supported.
+///
+/// Returns an empty list when the target reports no operations for the pair in either
+/// direction, so the caller can leave that run untouched instead of panicking.
+fn get_decomposers_from_target(
+    target: &Target,
+    qubits: &[Qubit],
+    fidelity: f64,
+) -> PyResult<Vec<TwoQubitBasisDecomposer>> {
+    let physical_qubits = smallvec![PhysicalQubit(qubits[0].0), PhysicalQubit(qubits[1].0)];
+    let gate_names = match target.operation_names_for_qargs(Some(&physical_qubits)) {
+        Ok(names) => names,
+        Err(_) => {
+            let reverse_qubits = physical_qubits.iter().rev().copied().collect();
+            match target.operation_names_for_qargs(Some(&reverse_qubits)) {
+                Ok(names) => names,
+                Err(_) => return Ok(Vec::new()),
+            }
+        }
+    };
+
+    let available_kak_gate: Vec<(&str, &PackedOperation, &[Param])> = gate_names
+        .iter()
+        .filter_map(|name| match target.operation_from_name(name) {
+            Ok(raw_op) => match raw_op.operation.view() {
+                OperationRef::Standard(_) | OperationRef::Gate(_) => {
+                    Some((*name, &raw_op.operation, raw_op.params.as_slice()))
+                }
+                _ => None,
+            },
+            Err(_) => None,
+        })
+        .collect();
+
+    let single_qubit_basis_list =
+        target.operation_names_for_qargs(Some(&smallvec![physical_qubits[0]]));
+    let mut target_basis_set = EulerBasisSet::new();
+    match single_qubit_basis_list {
+        Ok(basis_list) => {
+            EULER_BASES
+                .iter()
+                .enumerate()
+                .filter_map(|(idx, gates)| {
+                    if !gates.iter().all(|gate| basis_list.contains(gate)) {
+                        return None;
+                    }
+                    let basis = EULER_BASIS_NAMES[idx];
+                    Some(basis)
+                })
+                .for_each(|basis| target_basis_set.add_basis(basis));
+        }
+        Err(_) => target_basis_set.support_all(),
+    }
+    // When both bases of a pair are supported only one of them is kept (U321 over U3 and
+    // ZSXX over ZSX), so the pair doesn't produce two decomposers for the same gate.
+    if target_basis_set.basis_supported(EulerBasis::U3)
+        && target_basis_set.basis_supported(EulerBasis::U321)
+    {
+        target_basis_set.remove(EulerBasis::U3);
+    }
+    if target_basis_set.basis_supported(EulerBasis::ZSX)
+        && target_basis_set.basis_supported(EulerBasis::ZSXX)
+    {
+        target_basis_set.remove(EulerBasis::ZSX);
+    }
+
+    let euler_bases: Vec<EulerBasis> = target_basis_set.get_bases().collect();
+
+    available_kak_gate
+        .iter()
+        .filter_map(|(two_qubit_name, two_qubit_gate, params)| {
+            let matrix = two_qubit_gate.matrix(params);
+            matrix.map(|matrix| {
+                euler_bases.iter().map(move |euler_basis| {
+                    TwoQubitBasisDecomposer::new_inner(
+                        two_qubit_name.to_string(),
+                        matrix.view(),
+                        fidelity,
+                        *euler_basis,
+                        None,
+                    )
+                })
+            })
+        })
+        .flatten()
+        .collect()
+}
+
+/// Estimate the error of running `sequence` on `target`.
+///
+/// The estimate is one minus the product of `1 - error` over every gate in the sequence. An
+/// entry whose gate is `None` is looked up under `kak_gate_name`, and a gate for which the
+/// target reports no error contributes no error.
+#[inline]
+fn score_sequence<'a>(
+    target: &'a Target,
+    kak_gate_name: &str,
+    sequence: impl Iterator<Item = (Option<StandardGate>, SmallVec<[Qubit; 2]>)> + 'a,
+) -> f64 {
+    1. - sequence
+        .map(|(gate, local_qubits)| {
+            let qubits = local_qubits
+                .iter()
+                .map(|qubit| PhysicalQubit(qubit.0))
+                .collect::<Vec<_>>();
+            let name = match gate.as_ref() {
+                Some(g) => g.name(),
+                None => kak_gate_name,
+            };
+            1. - target.get_error(name, qubits.as_slice()).unwrap_or(0.)
+        })
+        .product::<f64>()
+}
+
+/// The synthesized sequence paired with the name of its two-qubit basis gate, and the run's
+/// qubits in ascending order. `None` when the run is left as it is.
+type MappingIterItem = Option<((TwoQubitGateSequence, String), [Qubit; 2])>;
+
+/// Resynthesize each run of two-qubit gates in `dag` when that lowers the estimated error.
+///
+/// Every run from `collect_2q_runs` is converted to a matrix and synthesized with each
+/// decomposer available from `target` for the run's qubits. The synthesis with the lowest
+/// estimated error replaces the run if the run contains a gate the target doesn't support, if
+/// its estimated error is lower than the run's, or if the errors are equal and it uses fewer
+/// two-qubit gates. A new [`DAGCircuit`] is returned and `dag` is not modified.
+///
+/// This transpiler pass can only run in a context where we've translated the circuit gates (or
+/// where we know all gates have a matrix). If any gate identified in a run fails to have a
+/// matrix defined (either in rust or python) that run will be skipped.
+#[pyfunction]
+pub(crate) fn two_qubit_unitary_peephole_optimize(
+    py: Python,
+    dag: &DAGCircuit,
+    target: &Target,
+    fidelity: f64,
+) -> PyResult<DAGCircuit> {
+    let runs: Vec<Vec<NodeIndex>> = dag.collect_2q_runs().unwrap();
+
+    // Build a vec of the best synthesized two qubit gate sequence for each collected run. This
+    // is done in parallel with the GIL released: computing a run's matrix can acquire the GIL
+    // on a worker thread for gates defined in Python, which can't succeed while this thread
+    // holds it and waits for the workers.
+    let run_mapping: PyResult<Vec<MappingIterItem>> = py.allow_threads(|| {
+        runs.par_iter()
+            .map(|node_indices| -> PyResult<MappingIterItem> {
+                // The run's qubits in ascending order, read from its first two-qubit gate. A
+                // run without one is left as it is.
+                let Some(block_qubit_map) = node_indices.iter().find_map(|node_index| {
+                    let inst = dag.dag()[*node_index].unwrap_operation();
+                    let qubits = dag.get_qargs(inst.qubits);
+                    if qubits.len() != 2 {
+                        None
+                    } else if qubits[0] > qubits[1] {
+                        Some([qubits[1], qubits[0]])
+                    } else {
+                        Some([qubits[0], qubits[1]])
+                    }
+                }) else {
+                    return Ok(None);
+                };
+                // A run containing a gate without a matrix can't be resynthesized, skip it.
+                let Ok(matrix) = blocks_to_matrix(dag, node_indices, block_qubit_map) else {
+                    return Ok(None);
+                };
+                let decomposers = get_decomposers_from_target(target, &block_qubit_map, fidelity)?;
+
+                let score_candidate = |candidate: &(TwoQubitGateSequence, String)| -> f64 {
+                    score_sequence(
+                        target,
+                        candidate.1.as_str(),
+                        candidate
+                            .0
+                            .gates
+                            .iter()
+                            .map(|(gate, _params, local_qubits)| {
+                                let qubits = local_qubits
+                                    .iter()
+                                    .map(|qubit| block_qubit_map[*qubit as usize])
+                                    .collect();
+                                (*gate, qubits)
+                            }),
+                    )
+                };
+
+                // Synthesize with every decomposer and keep the sequence with the lowest
+                // estimated error; on a tie the earlier decomposer wins.
+                let mut best: Option<(f64, (TwoQubitGateSequence, String))> = None;
+                for decomposer in &decomposers {
+                    let candidate = (
+                        decomposer.call_inner(matrix.view(), None, true, None)?,
+                        decomposer.gate_name().to_string(),
+                    );
+                    let score = score_candidate(&candidate);
+                    let improves = match &best {
+                        Some((lowest, _)) => score < *lowest,
+                        None => true,
+                    };
+                    if improves {
+                        best = Some((score, candidate));
+                    }
+                }
+                // Without a decomposer for these qubits there is nothing to substitute.
+                let Some((new_score, sequence)) = best else {
+                    return Ok(None);
+                };
+
+                let mut original_fidelity: f64 = 1.;
+                let mut outside_target = false;
+                for node_index in node_indices {
+                    let inst = dag.dag()[*node_index].unwrap_operation();
+                    let qubits = dag
+                        .get_qargs(inst.qubits)
+                        .iter()
+                        .map(|qubit| PhysicalQubit(qubit.0))
+                        .collect::<Vec<_>>();
+                    let name = inst.op.name();
+                    let gate_fidelity = match target.get_error(name, qubits.as_slice()) {
+                        Some(err) => 1. - err,
+                        None => {
+                            // If error rate is None this can mean either the gate is not
+                            // supported in the target or the gate is ideal. We need to do a
+                            // second lookup to determine if the gate is supported, and if it
+                            // isn't we don't need to finish scoring because we know we'll use
+                            // the synthesis output
+                            let physical_qargs = qubits.iter().copied().collect();
+                            if !target.instruction_supported(name, Some(&physical_qargs)) {
+                                outside_target = true;
+                                break;
+                            }
+                            1.
+                        }
+                    };
+                    original_fidelity *= gate_fidelity;
+                }
+
+                // A run with a gate outside the target is always replaced. Any other run is
+                // kept unless the synthesis output lowers the estimated error, or matches it
+                // with fewer two-qubit gates.
+                if !outside_target {
+                    let original_score = 1. - original_fidelity;
+                    let new_2q_count = sequence
+                        .0
+                        .gates
+                        .iter()
+                        .filter(|(_, _, qubits)| qubits.len() == 2)
+                        .count();
+                    let original_2q_count = node_indices
+                        .iter()
+                        .filter(|node_index| {
+                            let inst = dag.dag()[**node_index].unwrap_operation();
+                            dag.get_qargs(inst.qubits).len() == 2
+                        })
+                        .count();
+                    if new_score > original_score
+                        || (new_score == original_score && new_2q_count >= original_2q_count)
+                    {
+                        return Ok(None);
+                    }
+                }
+                Ok(Some((sequence, block_qubit_map)))
+            })
+            .collect()
+    });
+    let run_mapping = run_mapping?;
+
+    // Map every node of a substituted run to the index of that run.
+    let mut node_mapping: HashMap<NodeIndex, usize> =
+        HashMap::with_capacity(runs.iter().map(|run| run.len()).sum());
+    for (run_index, (run, replacement)) in runs.iter().zip(&run_mapping).enumerate() {
+        if replacement.is_some() {
+            node_mapping.extend(run.iter().map(|node| (*node, run_index)));
+        }
+    }
+
+    // After we've computed all the sequences to execute now serially build up a new dag.
+    let mut processed_runs: HashSet<usize> = HashSet::with_capacity(run_mapping.len());
+    let mut out_dag = dag.copy_empty_like(py, "alike")?;
+    for node in dag.topological_op_nodes()? {
+        let Some(run_index) = node_mapping.get(&node) else {
+            let NodeType::Operation(ref instr) = dag.dag()[node] else {
+                unreachable!("Must be an op node")
+            };
+            out_dag.push_back(py, instr.clone())?;
+            continue;
+        };
+        // The replacement is emitted once, where the first node of its run was.
+        if !processed_runs.insert(*run_index) {
+            continue;
+        }
+        let Some((sequence, qubit_map)) = run_mapping[*run_index].as_ref() else {
+            unreachable!("Only substituted runs are in the node mapping")
+        };
+        for (gate, params, local_qubits) in &sequence.0.gates {
+            let qubits: Vec<Qubit> = local_qubits
+                .iter()
+                .map(|index| qubit_map[*index as usize])
+                .collect();
+            let out_params = if params.is_empty() {
+                None
+            } else {
+                Some(params.iter().map(|val| Param::Float(*val)).collect())
+            };
+            let op = match gate {
+                Some(gate) => PackedOperation::from_standard(*gate),
+                // An entry without a gate is the two-qubit basis gate of the decomposer.
+                None => target
+                    .operation_from_name(sequence.1.as_str())
+                    .expect("The basis gate name was read from this target")
+                    .operation
+                    .clone(),
+            };
+            let applied = {
+                #[cfg(feature = "cache_pygates")]
+                {
+                    out_dag.apply_operation_back(
+                        py,
+                        op,
+                        qubits.as_slice(),
+                        &[],
+                        out_params,
+                        ExtraInstructionAttributes::default(),
+                        None,
+                    )
+                }
+                #[cfg(not(feature = "cache_pygates"))]
+                {
+                    out_dag.apply_operation_back(
+                        py,
+                        op,
+                        qubits.as_slice(),
+                        &[],
+                        out_params,
+                        ExtraInstructionAttributes::default(),
+                    )
+                }
+            };
+            applied?;
+        }
+        out_dag.add_global_phase(py, &Param::Float(sequence.0.global_phase))?;
+    }
+    Ok(out_dag)
+}
+
+/// Register the peephole optimization function on the Python module `m`.
+pub fn two_qubit_peephole_mod(m: &Bound<PyModule>) -> PyResult<()> {
+    m.add_wrapped(wrap_pyfunction!(two_qubit_unitary_peephole_optimize))?;
+    Ok(())
+}
diff --git a/crates/accelerate/src/unitary_synthesis.rs b/crates/accelerate/src/unitary_synthesis.rs
index 62f41c78084c..338e9481f05d 100644
--- a/crates/accelerate/src/unitary_synthesis.rs
+++ b/crates/accelerate/src/unitary_synthesis.rs
@@ -666,6 +666,7 @@ fn get_2q_decomposers_from_target(
             if let Some(approx_degree) = approximation_degree {
                 basis_2q_fidelity *= approx_degree;
             }
+            let basis_1q = EulerBasis::__new__(basis_1q)?;
             let decomposer = TwoQubitBasisDecomposer::new_inner(
                 gate.operation.name().to_string(),
                 gate.operation.matrix(&gate.params).unwrap().view(),
@@ -761,7 +762,7 @@ fn get_2q_decomposers_from_target(
                         pi_2_basis.to_string(),
                         StandardGate::CXGate.matrix(&[]).unwrap().view(),
                         fidelity,
-                        basis_1q,
+                        EulerBasis::__new__(basis_1q)?,
                         Some(true),
                     )?)
                 } else {
diff --git a/crates/pyext/src/lib.rs b/crates/pyext/src/lib.rs
index d8e59e04e51e..890cf486df88 100644
--- a/crates/pyext/src/lib.rs
+++ b/crates/pyext/src/lib.rs
@@ -28,6 +28,11 @@ where
 #[rustfmt::skip]
 #[pymodule]
 fn _accelerate(m: &Bound<PyModule>) -> PyResult<()> {
+    add_submodule(
+        m,
+        ::qiskit_accelerate::two_qubit_peephole::two_qubit_peephole_mod,
+        "two_qubit_peephole",
+    )?;
     add_submodule(m, ::qiskit_accelerate::barrier_before_final_measurement::barrier_before_final_measurements_mod, "barrier_before_final_measurement")?;
     add_submodule(m, ::qiskit_accelerate::basis::basis, "basis")?;
     add_submodule(m, ::qiskit_accelerate::check_map::check_map_mod, "check_map")?;
diff --git a/qiskit/__init__.py b/qiskit/__init__.py
index af543bea80c7..9105e39fed91 100644
--- a/qiskit/__init__.py
+++ b/qiskit/__init__.py
@@ -107,6 +107,7 @@
 sys.modules["qiskit._accelerate.inverse_cancellation"] = _accelerate.inverse_cancellation
 sys.modules["qiskit._accelerate.check_map"] = _accelerate.check_map
 sys.modules["qiskit._accelerate.filter_op_nodes"] = _accelerate.filter_op_nodes
+sys.modules["qiskit._accelerate.two_qubit_peephole"] = _accelerate.two_qubit_peephole
 sys.modules["qiskit._accelerate.twirling"] = _accelerate.twirling
 sys.modules["qiskit._accelerate.high_level_synthesis"] = _accelerate.high_level_synthesis
 sys.modules["qiskit._accelerate.remove_identity_equiv"] = _accelerate.remove_identity_equiv
diff --git a/qiskit/transpiler/passes/optimization/two_qubit_peephole.py b/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
new file mode 100644
index 000000000000..60536d09aa71
--- /dev/null
+++ b/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
@@ -0,0 +1,59 @@
+# This code is part of Qiskit.
+#
+# (C) Copyright IBM 2017, 2024.
+#
+# This code is licensed under the Apache License, Version 2.0. You may
+# obtain a copy of this license in the LICENSE.txt file in the root directory
+# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# Any modifications or derivative works of this code must retain this
+# copyright notice, and modified files need to carry a notice indicating
+# that they have been altered from the originals.
+
+"""Unified two-qubit peephole optimization: collect, resynthesize, and replace 2q gate runs."""
+
+from __future__ import annotations
+
+from qiskit.transpiler.basepasses import TransformationPass
+from qiskit.transpiler.passmanager import PassManager
+from qiskit.transpiler.passes.optimization import Collect2qBlocks, ConsolidateBlocks
+from qiskit.transpiler.passes.synthesis import UnitarySynthesis
+from qiskit.transpiler.target import Target
+from qiskit.dagcircuit.dagcircuit import DAGCircuit
+from qiskit._accelerate.two_qubit_peephole import two_qubit_unitary_peephole_optimize
+
+
+class TwoQubitPeepholeOptimization(TransformationPass):
+    """Unified two qubit unitary peephole optimization"""
+
+    def __init__(
+        self,
+        target: Target,
+        approximation_degree: float | None = 1.0,
+        method: str = "default",
+        plugin_config: dict = None,
+    ):
+        super().__init__()
+        self._target = target
+        self._approximation_degree = approximation_degree
+        self._pm = None
+        if method != "default":
+            self._pm = PassManager(
+                [
+                    Collect2qBlocks(),
+                    ConsolidateBlocks(
+                        target=self._target, approximation_degree=self._approximation_degree
+                    ),
+                    UnitarySynthesis(
+                        target=target,
+                        approximation_degree=approximation_degree,
+                        method=method,
+                        plugin_config=plugin_config,
+                    ),
+                ]
+            )
+
+    def run(self, dag: DAGCircuit) -> DAGCircuit:
+        if self._pm is not None:
+            return self._pm.run(dag)
+        return two_qubit_unitary_peephole_optimize(dag, self._target, self._approximation_degree)

From 746758fe15ccafb08eca954d910635cc004163a7 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Sun, 1 Dec 2024 02:58:56 -0500
Subject: [PATCH 02/64] Add support for running the
 TwoQubitControlledUDecomposer

Since #13139 merged we have another two qubit decomposer available to
run in rust, the TwoQubitControlledUDecomposer. This commit updates the
new TwoQubitPeepholeOptimization to call this decomposer if the target
supports appropriate 2q gates.
---
 crates/accelerate/src/two_qubit_decompose.rs | 44 +++++++++----------
 crates/accelerate/src/two_qubit_peephole.rs  | 45 +++++++++++++++++---
 2 files changed, 60 insertions(+), 29 deletions(-)

diff --git a/crates/accelerate/src/two_qubit_decompose.rs b/crates/accelerate/src/two_qubit_decompose.rs
index d0217dd5ff87..3eba734a9171 100644
--- a/crates/accelerate/src/two_qubit_decompose.rs
+++ b/crates/accelerate/src/two_qubit_decompose.rs
@@ -2436,23 +2436,23 @@ pub enum RXXEquivalent {
 }
 
 impl RXXEquivalent {
-    fn matrix(&self, py: Python, param: f64) -> PyResult<Array2<Complex64>> {
+    fn matrix(&self, param: f64) -> PyResult<Array2<Complex64>> {
         match self {
             Self::Standard(gate) => Ok(gate.matrix(&[Param::Float(param)]).unwrap()),
-            Self::CustomPython(gate_cls) => {
+            Self::CustomPython(gate_cls) => Python::with_gil(|py: Python| {
                 let gate_obj = gate_cls.bind(py).call1((param,))?;
                 let raw_matrix = gate_obj
                     .call_method0(intern!(py, "to_matrix"))?
                     .extract::<PyReadonlyArray2<Complex64>>()?;
                 Ok(raw_matrix.as_array().to_owned())
-            }
+            }),
         }
     }
 }
 
 #[pyclass(module = "qiskit._accelerate.two_qubit_decompose", subclass)]
 pub struct TwoQubitControlledUDecomposer {
-    rxx_equivalent_gate: RXXEquivalent,
+    pub rxx_equivalent_gate: RXXEquivalent,
     #[pyo3(get)]
     scale: f64,
 }
@@ -2467,7 +2467,6 @@ impl TwoQubitControlledUDecomposer {
     /// invert 2q gate sequence
     fn invert_2q_gate(
         &self,
-        py: Python,
         gate: (Option<StandardGate>, SmallVec<[f64; 3]>, SmallVec<[u8; 2]>),
     ) -> PyResult<InverseReturn> {
         let (gate, params, qubits) = gate;
@@ -2504,7 +2503,7 @@ impl TwoQubitControlledUDecomposer {
                         .collect::<SmallVec<_>>();
                     Ok((Some(inv_gate.0), inv_gate_params, qubits))
                 }
-                RXXEquivalent::CustomPython(gate_cls) => {
+                RXXEquivalent::CustomPython(gate_cls) => Python::with_gil(|py: Python| {
                     let gate_obj = gate_cls.bind(py).call1(PyTuple::new_bound(py, params))?;
                     let raw_inverse = gate_obj.call_method0(intern!(py, "inverse"))?;
                     let inverse: OperationFromPython = raw_inverse.extract()?;
@@ -2525,7 +2524,7 @@ impl TwoQubitControlledUDecomposer {
                             "rxx gate inverse is not valid for this decomposer",
                         ))
                     }
-                }
+                }),
             }
         }
     }
@@ -2538,14 +2537,14 @@ impl TwoQubitControlledUDecomposer {
     ///      Circuit: Circuit equivalent to an RXXGate.
     ///  Raises:
     ///      QiskitError: If the circuit is not equivalent to an RXXGate.
-    fn to_rxx_gate(&self, py: Python, angle: f64) -> PyResult<TwoQubitGateSequence> {
+    fn to_rxx_gate(&self, angle: f64) -> PyResult<TwoQubitGateSequence> {
         // The user-provided RXXGate equivalent gate may be locally equivalent to the RXXGate
         // but with some scaling in the rotation angle. For example, RXXGate(angle) has Weyl
         // parameters (angle, 0, 0) for angle in [0, pi/2] but the user provided gate, i.e.
         // :code:`self.rxx_equivalent_gate(angle)` might produce the Weyl parameters
         // (scale * angle, 0, 0) where scale != 1. This is the case for the CPhaseGate.
 
-        let mat = self.rxx_equivalent_gate.matrix(py, self.scale * angle)?;
+        let mat = self.rxx_equivalent_gate.matrix(self.scale * angle)?;
         let decomposer_inv =
             TwoQubitWeylDecomposition::new_inner(mat.view(), Some(DEFAULT_FIDELITY), None)?;
 
@@ -2611,18 +2610,17 @@ impl TwoQubitControlledUDecomposer {
     /// Appends U_d(a, b, c) to the circuit.
     fn weyl_gate(
         &self,
-        py: Python,
         circ: &mut TwoQubitGateSequence,
         target_decomposed: TwoQubitWeylDecomposition,
         atol: f64,
     ) -> PyResult<()> {
-        let circ_a = self.to_rxx_gate(py, -2.0 * target_decomposed.a)?;
+        let circ_a = self.to_rxx_gate(-2.0 * target_decomposed.a)?;
         circ.gates.extend(circ_a.gates);
         let mut global_phase = circ_a.global_phase;
 
         // translate the RYYGate(b) into a circuit based on the desired Ctrl-U gate.
         if (target_decomposed.b).abs() > atol {
-            let circ_b = self.to_rxx_gate(py, -2.0 * target_decomposed.b)?;
+            let circ_b = self.to_rxx_gate(-2.0 * target_decomposed.b)?;
             global_phase += circ_b.global_phase;
             circ.gates
                 .push((Some(StandardGate::SdgGate), smallvec![], smallvec![0]));
@@ -2644,7 +2642,7 @@ impl TwoQubitControlledUDecomposer {
             // circuit if c < 0.
             let mut gamma = -2.0 * target_decomposed.c;
             if gamma <= 0.0 {
-                let circ_c = self.to_rxx_gate(py, gamma)?;
+                let circ_c = self.to_rxx_gate(gamma)?;
                 global_phase += circ_c.global_phase;
                 circ.gates
                     .push((Some(StandardGate::HGate), smallvec![], smallvec![0]));
@@ -2658,7 +2656,7 @@ impl TwoQubitControlledUDecomposer {
             } else {
                 // invert the circuit above
                 gamma *= -1.0;
-                let circ_c = self.to_rxx_gate(py, gamma)?;
+                let circ_c = self.to_rxx_gate(gamma)?;
                 global_phase -= circ_c.global_phase;
                 circ.gates
                     .push((Some(StandardGate::HGate), smallvec![], smallvec![0]));
@@ -2666,7 +2664,7 @@ impl TwoQubitControlledUDecomposer {
                     .push((Some(StandardGate::HGate), smallvec![], smallvec![1]));
                 for gate in circ_c.gates.into_iter().rev() {
                     let (inv_gate_name, inv_gate_params, inv_gate_qubits) =
-                        self.invert_2q_gate(py, gate)?;
+                        self.invert_2q_gate(gate)?;
                     circ.gates
                         .push((inv_gate_name, inv_gate_params, inv_gate_qubits));
                 }
@@ -2683,9 +2681,8 @@ impl TwoQubitControlledUDecomposer {
 
     ///  Returns the Weyl decomposition in circuit form.
     ///  Note: atol is passed to OneQubitEulerDecomposer.
-    fn call_inner(
+    pub fn call_inner(
         &self,
-        py: Python,
         unitary: ArrayView2<Complex64>,
         atol: f64,
     ) -> PyResult<TwoQubitGateSequence> {
@@ -2729,7 +2726,7 @@ impl TwoQubitControlledUDecomposer {
             gates,
             global_phase,
         };
-        self.weyl_gate(py, &mut gates1, target_decomposed, atol)?;
+        self.weyl_gate(&mut gates1, target_decomposed, atol)?;
         global_phase += gates1.global_phase;
 
         if let Some(unitary_c1r) = unitary_c1r {
@@ -2760,7 +2757,7 @@ impl TwoQubitControlledUDecomposer {
     ///      QiskitError: If the gate is not locally equivalent to an :class:`.RXXGate`.
     #[new]
     #[pyo3(signature=(rxx_equivalent_gate))]
-    pub fn new(py: Python, rxx_equivalent_gate: RXXEquivalent) -> PyResult<Self> {
+    pub fn new(rxx_equivalent_gate: RXXEquivalent) -> PyResult<Self> {
         let atol = DEFAULT_ATOL;
         let test_angles = [0.2, 0.3, PI2];
 
@@ -2776,14 +2773,17 @@ impl TwoQubitControlledUDecomposer {
                         }
                     }
                     RXXEquivalent::CustomPython(gate_cls) => {
-                        if gate_cls.bind(py).call1((test_angle,)).ok().is_none() {
+                        let takes_param = Python::with_gil(|py: Python| {
+                            gate_cls.bind(py).call1((test_angle,)).ok().is_none()
+                        });
+                        if takes_param {
                             return Err(QiskitError::new_err(
                                 "Equivalent gate needs to take exactly 1 angle parameter.",
                             ));
                         }
                     }
                 };
-                let mat = rxx_equivalent_gate.matrix(py, test_angle)?;
+                let mat = rxx_equivalent_gate.matrix(test_angle)?;
                 let decomp =
                     TwoQubitWeylDecomposition::new_inner(mat.view(), Some(DEFAULT_FIDELITY), None)?;
                 let mat_rxx = StandardGate::RXXGate
@@ -2834,7 +2834,7 @@ impl TwoQubitControlledUDecomposer {
         unitary: PyReadonlyArray2<Complex64>,
         atol: f64,
     ) -> PyResult<CircuitData> {
-        let sequence = self.call_inner(py, unitary.as_array(), atol)?;
+        let sequence = self.call_inner(unitary.as_array(), atol)?;
         match &self.rxx_equivalent_gate {
             RXXEquivalent::Standard(rxx_gate) => CircuitData::from_standard_gates(
                 py,
diff --git a/crates/accelerate/src/two_qubit_peephole.rs b/crates/accelerate/src/two_qubit_peephole.rs
index cf852fbbf997..d6a59b93b4cf 100644
--- a/crates/accelerate/src/two_qubit_peephole.rs
+++ b/crates/accelerate/src/two_qubit_peephole.rs
@@ -31,13 +31,20 @@ use crate::euler_one_qubit_decomposer::{
 };
 use crate::nlayout::PhysicalQubit;
 use crate::target_transpiler::Target;
-use crate::two_qubit_decompose::{TwoQubitBasisDecomposer, TwoQubitGateSequence};
+use crate::two_qubit_decompose::{
+    RXXEquivalent, TwoQubitBasisDecomposer, TwoQubitControlledUDecomposer, TwoQubitGateSequence,
+};
+
+enum TwoQubitDecomposer {
+    Basis(TwoQubitBasisDecomposer),
+    ControlledU(TwoQubitControlledUDecomposer),
+}
 
 fn get_decomposers_from_target(
     target: &Target,
     qubits: &[Qubit],
     fidelity: f64,
-) -> PyResult<Vec<TwoQubitBasisDecomposer>> {
+) -> PyResult<Vec<TwoQubitDecomposer>> {
     let physical_qubits = smallvec![PhysicalQubit(qubits[0].0), PhysicalQubit(qubits[1].0)];
     let gate_names = match target.operation_names_for_qargs(Some(&physical_qubits)) {
         Ok(names) => names,
@@ -94,7 +101,7 @@ fn get_decomposers_from_target(
 
     let euler_bases: Vec<EulerBasis> = target_basis_set.get_bases().collect();
 
-    available_kak_gate
+    let decomposers: PyResult<Vec<TwoQubitDecomposer>> = available_kak_gate
         .iter()
         .filter_map(|(two_qubit_name, two_qubit_gate, params)| {
             let matrix = two_qubit_gate.matrix(params);
@@ -107,11 +114,26 @@ fn get_decomposers_from_target(
                         *euler_basis,
                         None,
                     )
+                    .map(TwoQubitDecomposer::Basis)
                 })
             })
         })
         .flatten()
-        .collect()
+        .collect();
+    let mut decomposers = decomposers?;
+    for gate in [
+        StandardGate::RXXGate,
+        StandardGate::RZZGate,
+        StandardGate::RYYGate,
+        StandardGate::RZXGate,
+    ] {
+        if gate_names.contains(gate.name()) {
+            decomposers.push(TwoQubitDecomposer::ControlledU(
+                TwoQubitControlledUDecomposer::new(RXXEquivalent::Standard(gate))?,
+            ));
+        }
+    }
+    Ok(decomposers)
 }
 
 #[inline]
@@ -229,13 +251,22 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
 
             let sequence = decomposers
                 .iter()
-                .map(|decomposer| {
-                    (
+                .map(|decomposer| match decomposer {
+                    TwoQubitDecomposer::Basis(decomposer) => (
                         decomposer
                             .call_inner(matrix.view(), None, true, None)
                             .unwrap(),
                         decomposer.gate_name().to_string(),
-                    )
+                    ),
+                    TwoQubitDecomposer::ControlledU(decomposer) => (
+                        decomposer.call_inner(matrix.view(), 1e-12).unwrap(),
+                        match decomposer.rxx_equivalent_gate {
+                            RXXEquivalent::Standard(gate) => gate.name().to_string(),
+                            RXXEquivalent::CustomPython(_) => {
+                                unreachable!("Decomposer only uses standard gates")
+                            }
+                        },
+                    ),
                 })
                 .enumerate()
                 .min_by(order_sequence)

From ee70e658fb20172f5653f6cf942226da7f119369 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Sun, 1 Dec 2024 15:51:42 -0500
Subject: [PATCH 03/64] Suppress clippy warning about mismatched enum variant
 sizes

Clippy is correctly warning that the size difference between the two
decomposer types in the TwoQubitDecomposer enum is large.
TwoQubitBasisDecomposer is 1640 bytes and TwoQubitControlledUDecomposer
is only 24 bytes. This means each element of ControlledU is wasting
more than 1600 bytes. However, in this case that is acceptable in order to
avoid a layer of pointer indirection as these are stored temporarily
in a vec inside a thread to decompose a unitary. A trait would be more
natural for this to define a common interface between all the two qubit
decomposers but since we keep them instantiated for each edge in a Vec
they need to be sized and doing something like
`Box<dyn TwoQubitDecomposer>` (assuming a trait `TwoQubitDecomposer`
instead of an enum) to get around this would have additional runtime
overhead. This is also considering that TwoQubitControlledUDecomposer
has far less likelihood in practice as it only works with some targets
that have RZZ, RXX, RYY, or RZX gates on an edge which is less common.
---
 crates/accelerate/src/two_qubit_peephole.rs | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/crates/accelerate/src/two_qubit_peephole.rs b/crates/accelerate/src/two_qubit_peephole.rs
index d6a59b93b4cf..ff03d3df73f7 100644
--- a/crates/accelerate/src/two_qubit_peephole.rs
+++ b/crates/accelerate/src/two_qubit_peephole.rs
@@ -35,6 +35,12 @@ use crate::two_qubit_decompose::{
     RXXEquivalent, TwoQubitBasisDecomposer, TwoQubitControlledUDecomposer, TwoQubitGateSequence,
 };
 
+// The size difference between these two variants is large:
+// TwoQubitBasisDecomposer is 1640 bytes while TwoQubitControlledUDecomposer is
+// only 24 bytes, so each ControlledU element wastes > 1600 bytes. That is
+// acceptable here to avoid a layer of pointer indirection, as the decomposers
+// are only stored temporarily in a vec inside a thread to decompose a unitary.
+#[allow(clippy::large_enum_variant)]
 enum TwoQubitDecomposer {
     Basis(TwoQubitBasisDecomposer),
     ControlledU(TwoQubitControlledUDecomposer),

From cb6b70fce19ba03768fee6814c9f394dec268559 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Mon, 2 Dec 2024 00:37:11 -0500
Subject: [PATCH 04/64] Embed 2q gate count into score as tie breaker

Also don't run scoring more than needed.
---
 crates/accelerate/src/two_qubit_peephole.rs | 94 ++++++++++-----------
 1 file changed, 45 insertions(+), 49 deletions(-)

diff --git a/crates/accelerate/src/two_qubit_peephole.rs b/crates/accelerate/src/two_qubit_peephole.rs
index ff03d3df73f7..0a5ed97e6f6d 100644
--- a/crates/accelerate/src/two_qubit_peephole.rs
+++ b/crates/accelerate/src/two_qubit_peephole.rs
@@ -142,25 +142,38 @@ fn get_decomposers_from_target(
     Ok(decomposers)
 }
 
+/// Score a given sequence using the error rates reported in the target
+///
+/// Return a tuple of the predicted error (1 - fidelity) and the 2q gate count of the sequence
 #[inline]
 fn score_sequence<'a>(
     target: &'a Target,
     kak_gate_name: &str,
     sequence: impl Iterator<Item = (Option<StandardGate>, SmallVec<[Qubit; 2]>)> + 'a,
-) -> f64 {
-    1. - sequence
-        .map(|(gate, local_qubits)| {
-            let qubits = local_qubits
-                .iter()
-                .map(|qubit| PhysicalQubit(qubit.0))
-                .collect::<Vec<_>>();
-            let name = match gate.as_ref() {
-                Some(g) => g.name(),
-                None => kak_gate_name,
-            };
-            1. - target.get_error(name, qubits.as_slice()).unwrap_or(0.)
-        })
-        .product::<f64>()
+) -> (f64, usize) {
+    let mut gate_count = 0;
+    let res = (
+        1. - sequence
+            .filter_map(|(gate, local_qubits)| {
+                let qubits = local_qubits
+                    .iter()
+                    .map(|qubit| PhysicalQubit(qubit.0))
+                    .collect::<Vec<_>>();
+                if qubits.len() == 2 {
+                    gate_count += 1;
+                }
+                let name = match gate.as_ref() {
+                    Some(g) => g.name(),
+                    None => kak_gate_name,
+                };
+                target
+                    .get_error(name, qubits.as_slice())
+                    .map(|error| 1. - error)
+            })
+            .product::<f64>(),
+        gate_count,
+    );
+    res
 }
 
 type MappingIterItem = Option<((TwoQubitGateSequence, String), [Qubit; 2])>;
@@ -204,7 +217,7 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
                 .unwrap();
             let matrix = blocks_to_matrix(dag, node_indices, block_qubit_map)?;
             let decomposers = get_decomposers_from_target(target, &block_qubit_map, fidelity)?;
-            let mut decomposer_scores: Vec<Option<f64>> = vec![None; decomposers.len()];
+            let mut decomposer_scores: Vec<Option<(f64, usize)>> = vec![None; decomposers.len()];
 
             let order_sequence =
                 |(index_a, sequence_a): &(usize, (TwoQubitGateSequence, String)),
@@ -212,7 +225,7 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
                     let score_a = match decomposer_scores[*index_a] {
                         Some(score) => score,
                         None => {
-                            let score: f64 =
+                            let score: (f64, usize) =
                                 score_sequence(
                                     target,
                                     sequence_a.1.as_str(),
@@ -234,7 +247,7 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
                     let score_b = match decomposer_scores[*index_b] {
                         Some(score) => score,
                         None => {
-                            let score: f64 =
+                            let score: (f64, usize) =
                                 score_sequence(
                                     target,
                                     sequence_b.1.as_str(),
@@ -276,9 +289,9 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
                 })
                 .enumerate()
                 .min_by(order_sequence)
-                .unwrap()
-                .1;
+                .unwrap();
             let mut original_err: f64 = 1.;
+            let mut original_count: usize = 0;
             let mut outside_target = false;
             for node_index in node_indices {
                 let NodeType::Operation(ref inst) = dag.dag()[*node_index] else {
@@ -289,6 +302,9 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
                     .iter()
                     .map(|qubit| PhysicalQubit(qubit.0))
                     .collect::<Vec<_>>();
+                if qubits.len() == 2 {
+                    original_count += 1;
+                }
                 let name = inst.op.name();
                 let gate_err = match target.get_error(name, qubits.as_slice()) {
                     Some(err) => 1. - err,
@@ -308,13 +324,15 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
                 };
                 original_err *= gate_err;
             }
-            let original_score = 1. - original_err;
-            let new_score: f64 = if !outside_target {
-                score_sequence(
+            let original_score = (1. - original_err, original_count);
+            let new_score: (f64, usize) = match decomposer_scores[sequence.0] {
+                Some(score) => score,
+                None => score_sequence(
                     target,
-                    sequence.1.as_str(),
+                    sequence.1 .1.as_str(),
                     sequence
-                        .0
+                        .1
+                         .0
                         .gates
                         .iter()
                         .map(|(gate, _params, local_qubits)| {
@@ -324,31 +342,9 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
                                 .collect();
                             (*gate, qubits)
                         }),
-                )
-            } else {
-                1.
+                ),
             };
-
-            if outside_target
-                || new_score > original_score
-                || (new_score == original_score
-                    && sequence
-                        .0
-                        .gates
-                        .iter()
-                        .filter(|(_, __, qubits)| qubits.len() == 2)
-                        .count()
-                        >= node_indices
-                            .iter()
-                            .filter(|node_index| {
-                                let NodeType::Operation(ref inst) = dag.dag()[**node_index] else {
-                                    unreachable!("All run nodes will be ops")
-                                };
-                                let qubits = dag.get_qargs(inst.qubits);
-                                qubits.len() == 2
-                            })
-                            .count())
-            {
+            if !outside_target && new_score > original_score {
                 return Ok(None);
             }
             // This is done at the end of the map in some attempt to minimize
@@ -358,7 +354,7 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
             for node in node_indices {
                 node_mapping.insert(*node, run_index);
             }
-            Ok(Some((sequence, block_qubit_map)))
+            Ok(Some((sequence.1, block_qubit_map)))
         })
         .collect();
 

From f06a070e16658792918289b8d46d901546e10fa1 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Mon, 2 Dec 2024 01:12:06 -0500
Subject: [PATCH 05/64] Release GIL during parallel portion

---
 crates/accelerate/src/two_qubit_peephole.rs | 229 ++++++++++----------
 1 file changed, 118 insertions(+), 111 deletions(-)

diff --git a/crates/accelerate/src/two_qubit_peephole.rs b/crates/accelerate/src/two_qubit_peephole.rs
index 0a5ed97e6f6d..ac1a0f13a8c7 100644
--- a/crates/accelerate/src/two_qubit_peephole.rs
+++ b/crates/accelerate/src/two_qubit_peephole.rs
@@ -193,35 +193,41 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
         HashMap::with_capacity(runs.iter().map(|run| run.len()).sum());
     let locked_node_mapping = Mutex::new(node_mapping);
 
-    // Build a vec of all the best synthesized two qubit gate sequences from the collected runs.
-    // This is done in parallel
-    let run_mapping: PyResult<Vec<MappingIterItem>> = runs
-        .par_iter()
-        .enumerate()
-        .map(|(run_index, node_indices)| {
-            let block_qubit_map = node_indices
-                .iter()
-                .find_map(|node_index| {
-                    let inst = dag.dag()[*node_index].unwrap_operation();
-                    let qubits = dag.get_qargs(inst.qubits);
-                    if qubits.len() == 2 {
-                        if qubits[0] > qubits[1] {
-                            Some([qubits[1], qubits[0]])
+    let run_mapping: PyResult<Vec<MappingIterItem>> = py.allow_threads(|| {
+        // Build a vec of all the best synthesized two qubit gate sequences from the collected runs.
+        // This is done in parallel
+        runs.par_iter()
+            .enumerate()
+            .map(|(run_index, node_indices)| {
+                let block_qubit_map = node_indices
+                    .iter()
+                    .find_map(|node_index| {
+                        let inst = dag.dag()[*node_index].unwrap_operation();
+                        let qubits = dag.get_qargs(inst.qubits);
+                        if qubits.len() == 2 {
+                            if qubits[0] > qubits[1] {
+                                Some([qubits[1], qubits[0]])
+                            } else {
+                                Some([qubits[0], qubits[1]])
+                            }
                         } else {
-                            Some([qubits[0], qubits[1]])
+                            None
                         }
-                    } else {
-                        None
-                    }
-                })
-                .unwrap();
-            let matrix = blocks_to_matrix(dag, node_indices, block_qubit_map)?;
-            let decomposers = get_decomposers_from_target(target, &block_qubit_map, fidelity)?;
-            let mut decomposer_scores: Vec<Option<(f64, usize)>> = vec![None; decomposers.len()];
+                    })
+                    .unwrap();
+                let matrix = blocks_to_matrix(dag, node_indices, block_qubit_map)?;
+                let decomposers = get_decomposers_from_target(target, &block_qubit_map, fidelity)?;
+                let mut decomposer_scores: Vec<Option<(f64, usize)>> =
+                    vec![None; decomposers.len()];
 
-            let order_sequence =
-                |(index_a, sequence_a): &(usize, (TwoQubitGateSequence, String)),
-                 (index_b, sequence_b): &(usize, (TwoQubitGateSequence, String))| {
+                let order_sequence = |(index_a, sequence_a): &(
+                    usize,
+                    (TwoQubitGateSequence, String),
+                ),
+                                      (index_b, sequence_b): &(
+                    usize,
+                    (TwoQubitGateSequence, String),
+                )| {
                     let score_a = match decomposer_scores[*index_a] {
                         Some(score) => score,
                         None => {
@@ -268,95 +274,96 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
                     score_a.partial_cmp(&score_b).unwrap_or(Ordering::Equal)
                 };
 
-            let sequence = decomposers
-                .iter()
-                .map(|decomposer| match decomposer {
-                    TwoQubitDecomposer::Basis(decomposer) => (
-                        decomposer
-                            .call_inner(matrix.view(), None, true, None)
-                            .unwrap(),
-                        decomposer.gate_name().to_string(),
-                    ),
-                    TwoQubitDecomposer::ControlledU(decomposer) => (
-                        decomposer.call_inner(matrix.view(), 1e-12).unwrap(),
-                        match decomposer.rxx_equivalent_gate {
-                            RXXEquivalent::Standard(gate) => gate.name().to_string(),
-                            RXXEquivalent::CustomPython(_) => {
-                                unreachable!("Decomposer only uses standard gates")
+                let sequence = decomposers
+                    .iter()
+                    .map(|decomposer| match decomposer {
+                        TwoQubitDecomposer::Basis(decomposer) => (
+                            decomposer
+                                .call_inner(matrix.view(), None, true, None)
+                                .unwrap(),
+                            decomposer.gate_name().to_string(),
+                        ),
+                        TwoQubitDecomposer::ControlledU(decomposer) => (
+                            decomposer.call_inner(matrix.view(), 1e-12).unwrap(),
+                            match decomposer.rxx_equivalent_gate {
+                                RXXEquivalent::Standard(gate) => gate.name().to_string(),
+                                RXXEquivalent::CustomPython(_) => {
+                                    unreachable!("Decomposer only uses standard gates")
+                                }
+                            },
+                        ),
+                    })
+                    .enumerate()
+                    .min_by(order_sequence)
+                    .unwrap();
+                let mut original_err: f64 = 1.;
+                let mut original_count: usize = 0;
+                let mut outside_target = false;
+                for node_index in node_indices {
+                    let NodeType::Operation(ref inst) = dag.dag()[*node_index] else {
+                        unreachable!("All run nodes will be ops")
+                    };
+                    let qubits = dag
+                        .get_qargs(inst.qubits)
+                        .iter()
+                        .map(|qubit| PhysicalQubit(qubit.0))
+                        .collect::<Vec<_>>();
+                    if qubits.len() == 2 {
+                        original_count += 1;
+                    }
+                    let name = inst.op.name();
+                    let gate_err = match target.get_error(name, qubits.as_slice()) {
+                        Some(err) => 1. - err,
+                        None => {
+                            // If error rate is None this can mean either the gate is not supported
+                            // in the target or the gate is ideal. We need to do a second lookup
+                            // to determine if the gate is supported, and if it isn't we don't need
+                            // to finish scoring because we know we'll use the synthesis output
+                            let physical_qargs =
+                                qubits.iter().map(|bit| PhysicalQubit(bit.0)).collect();
+                            if !target.instruction_supported(name, Some(&physical_qargs)) {
+                                outside_target = true;
+                                break;
                             }
-                        },
+                            1.
+                        }
+                    };
+                    original_err *= gate_err;
+                }
+                let original_score = (1. - original_err, original_count);
+                let new_score: (f64, usize) = match decomposer_scores[sequence.0] {
+                    Some(score) => score,
+                    None => score_sequence(
+                        target,
+                        sequence.1 .1.as_str(),
+                        sequence
+                            .1
+                             .0
+                            .gates
+                            .iter()
+                            .map(|(gate, _params, local_qubits)| {
+                                let qubits = local_qubits
+                                    .iter()
+                                    .map(|qubit| block_qubit_map[*qubit as usize])
+                                    .collect();
+                                (*gate, qubits)
+                            }),
                     ),
-                })
-                .enumerate()
-                .min_by(order_sequence)
-                .unwrap();
-            let mut original_err: f64 = 1.;
-            let mut original_count: usize = 0;
-            let mut outside_target = false;
-            for node_index in node_indices {
-                let NodeType::Operation(ref inst) = dag.dag()[*node_index] else {
-                    unreachable!("All run nodes will be ops")
                 };
-                let qubits = dag
-                    .get_qargs(inst.qubits)
-                    .iter()
-                    .map(|qubit| PhysicalQubit(qubit.0))
-                    .collect::<Vec<_>>();
-                if qubits.len() == 2 {
-                    original_count += 1;
+                if !outside_target && new_score > original_score {
+                    return Ok(None);
                 }
-                let name = inst.op.name();
-                let gate_err = match target.get_error(name, qubits.as_slice()) {
-                    Some(err) => 1. - err,
-                    None => {
-                        // If error rate is None this can mean either the gate is not supported
-                        // in the target or the gate is ideal. We need to do a second lookup
-                        // to determine if the gate is supported, and if it isn't we don't need
-                        // to finish scoring because we know we'll use the synthesis output
-                        let physical_qargs =
-                            qubits.iter().map(|bit| PhysicalQubit(bit.0)).collect();
-                        if !target.instruction_supported(name, Some(&physical_qargs)) {
-                            outside_target = true;
-                            break;
-                        }
-                        1.
-                    }
-                };
-                original_err *= gate_err;
-            }
-            let original_score = (1. - original_err, original_count);
-            let new_score: (f64, usize) = match decomposer_scores[sequence.0] {
-                Some(score) => score,
-                None => score_sequence(
-                    target,
-                    sequence.1 .1.as_str(),
-                    sequence
-                        .1
-                         .0
-                        .gates
-                        .iter()
-                        .map(|(gate, _params, local_qubits)| {
-                            let qubits = local_qubits
-                                .iter()
-                                .map(|qubit| block_qubit_map[*qubit as usize])
-                                .collect();
-                            (*gate, qubits)
-                        }),
-                ),
-            };
-            if !outside_target && new_score > original_score {
-                return Ok(None);
-            }
-            // This is done at the end of the map in some attempt to minimize
-            // lock contention. If this were serial code it'd make more sense
-            // to do this as part of the iteration building the
-            let mut node_mapping = locked_node_mapping.lock().unwrap();
-            for node in node_indices {
-                node_mapping.insert(*node, run_index);
-            }
-            Ok(Some((sequence.1, block_qubit_map)))
-        })
-        .collect();
+                // This is done at the end of the map in some attempt to minimize
+                // lock contention. If this were serial code it'd make more sense
+                // to do this as part of the iteration building the
+                let mut node_mapping = locked_node_mapping.lock().unwrap();
+                for node in node_indices {
+                    node_mapping.insert(*node, run_index);
+                }
+                Ok(Some((sequence.1, block_qubit_map)))
+            })
+            .collect()
+    });
 
     let run_mapping = run_mapping?;
     // After we've computed all the sequences to execute now serially build up a new dag.

From a175ee8bca1bfdf879d0dcf9c892e66c1522a862 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Mon, 13 Jan 2025 09:53:52 -0500
Subject: [PATCH 06/64] Fix lint

---
 crates/accelerate/src/two_qubit_peephole.rs | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/crates/accelerate/src/two_qubit_peephole.rs b/crates/accelerate/src/two_qubit_peephole.rs
index ac1a0f13a8c7..0763b1aa4def 100644
--- a/crates/accelerate/src/two_qubit_peephole.rs
+++ b/crates/accelerate/src/two_qubit_peephole.rs
@@ -152,7 +152,7 @@ fn score_sequence<'a>(
     sequence: impl Iterator<Item = (Option<StandardGate>, SmallVec<[Qubit; 2]>)> + 'a,
 ) -> (f64, usize) {
     let mut gate_count = 0;
-    let res = (
+    (
         1. - sequence
             .filter_map(|(gate, local_qubits)| {
                 let qubits = local_qubits
@@ -172,8 +172,7 @@ fn score_sequence<'a>(
             })
             .product::<f64>(),
         gate_count,
-    );
-    res
+    )
 }
 
 type MappingIterItem = Option<((TwoQubitGateSequence, String), [Qubit; 2])>;

From 79a46c5fc4a2d80b7e482f811c88b87323b492c5 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Sun, 26 Jan 2025 09:28:09 -0500
Subject: [PATCH 07/64] Only add a ControlledU decomposer if the target
 gate is continuous

---
 crates/accelerate/src/two_qubit_peephole.rs | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/crates/accelerate/src/two_qubit_peephole.rs b/crates/accelerate/src/two_qubit_peephole.rs
index 0763b1aa4def..ca72485661ae 100644
--- a/crates/accelerate/src/two_qubit_peephole.rs
+++ b/crates/accelerate/src/two_qubit_peephole.rs
@@ -134,9 +134,16 @@ fn get_decomposers_from_target(
         StandardGate::RZXGate,
     ] {
         if gate_names.contains(gate.name()) {
-            decomposers.push(TwoQubitDecomposer::ControlledU(
-                TwoQubitControlledUDecomposer::new(RXXEquivalent::Standard(gate))?,
-            ));
+            let op = target.operation_from_name(gate.name()).unwrap();
+            if op
+                .params
+                .iter()
+                .all(|x| matches!(x, Param::ParameterExpression(_)))
+            {
+                decomposers.push(TwoQubitDecomposer::ControlledU(
+                    TwoQubitControlledUDecomposer::new(RXXEquivalent::Standard(gate))?,
+                ));
+            }
         }
     }
     Ok(decomposers)

From 839b4c9d791d5e9ff880d2515ed7437d6a432d49 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Sun, 26 Jan 2025 17:04:35 -0500
Subject: [PATCH 08/64] Add reversed synthesis for two qubit basis decomposer

---
 crates/accelerate/src/two_qubit_peephole.rs   | 131 ++++++++++++------
 qiskit/transpiler/passes/__init__.py          |   2 +
 .../passes/optimization/__init__.py           |   1 +
 3 files changed, 94 insertions(+), 40 deletions(-)

diff --git a/crates/accelerate/src/two_qubit_peephole.rs b/crates/accelerate/src/two_qubit_peephole.rs
index ca72485661ae..d5b63f93fd79 100644
--- a/crates/accelerate/src/two_qubit_peephole.rs
+++ b/crates/accelerate/src/two_qubit_peephole.rs
@@ -14,6 +14,8 @@ use std::cmp::Ordering;
 use std::sync::Mutex;
 
 use hashbrown::{HashMap, HashSet};
+use ndarray::prelude::*;
+use num_complex::Complex64;
 use pyo3::prelude::*;
 use rayon::prelude::*;
 use rustworkx_core::petgraph::stable_graph::NodeIndex;
@@ -50,24 +52,30 @@ fn get_decomposers_from_target(
     target: &Target,
     qubits: &[Qubit],
     fidelity: f64,
-) -> PyResult<Vec<TwoQubitDecomposer>> {
+) -> PyResult<Vec<(TwoQubitDecomposer, bool)>> {
     let physical_qubits = smallvec![PhysicalQubit(qubits[0].0), PhysicalQubit(qubits[1].0)];
-    let gate_names = match target.operation_names_for_qargs(Some(&physical_qubits)) {
-        Ok(names) => names,
-        Err(_) => {
-            let reverse_qubits = physical_qubits.iter().rev().copied().collect();
-            target
-                .operation_names_for_qargs(Some(&reverse_qubits))
-                .unwrap()
-        }
-    };
+    let reverse_qubits = physical_qubits.iter().rev().copied().collect();
+    let mut gate_names: HashSet<(&str, bool)> = target
+        .operation_names_for_qargs(Some(&physical_qubits))
+        .unwrap()
+        .into_iter()
+        .map(|x| (x, false))
+        .collect();
+    let reverse_names = target
+        .operation_names_for_qargs(Some(&reverse_qubits))
+        .unwrap();
 
-    let available_kak_gate: Vec<(&str, &PackedOperation, &[Param])> = gate_names
+    if !reverse_names.is_empty() {
+        for name in reverse_names {
+            gate_names.insert((name, true));
+        }
+    }
+    let available_kak_gate: Vec<(&str, &PackedOperation, &[Param], bool)> = gate_names
         .iter()
-        .filter_map(|name| match target.operation_from_name(name) {
+        .filter_map(|(name, rev)| match target.operation_from_name(name) {
             Ok(raw_op) => match raw_op.operation.view() {
                 OperationRef::Standard(_) | OperationRef::Gate(_) => {
-                    Some((*name, &raw_op.operation, raw_op.params.as_slice()))
+                    Some((*name, &raw_op.operation, raw_op.params.as_slice(), *rev))
                 }
                 _ => None,
             },
@@ -105,22 +113,20 @@ fn get_decomposers_from_target(
         target_basis_set.remove(EulerBasis::ZSX);
     }
 
-    let euler_bases: Vec<EulerBasis> = target_basis_set.get_bases().collect();
-
-    let decomposers: PyResult<Vec<TwoQubitDecomposer>> = available_kak_gate
+    let decomposers: PyResult<Vec<(TwoQubitDecomposer, bool)>> = available_kak_gate
         .iter()
-        .filter_map(|(two_qubit_name, two_qubit_gate, params)| {
+        .filter_map(|(two_qubit_name, two_qubit_gate, params, rev)| {
             let matrix = two_qubit_gate.matrix(params);
             matrix.map(|matrix| {
-                euler_bases.iter().map(move |euler_basis| {
+                target_basis_set.get_bases().map(move |euler_basis| {
                     TwoQubitBasisDecomposer::new_inner(
                         two_qubit_name.to_string(),
                         matrix.view(),
                         fidelity,
-                        *euler_basis,
+                        euler_basis,
                         None,
                     )
-                    .map(TwoQubitDecomposer::Basis)
+                    .map(|x| (TwoQubitDecomposer::Basis(x), *rev))
                 })
             })
         })
@@ -133,15 +139,18 @@ fn get_decomposers_from_target(
         StandardGate::RYYGate,
         StandardGate::RZXGate,
     ] {
-        if gate_names.contains(gate.name()) {
+        if gate_names.contains(&(gate.name(), false)) {
             let op = target.operation_from_name(gate.name()).unwrap();
             if op
                 .params
                 .iter()
                 .all(|x| matches!(x, Param::ParameterExpression(_)))
             {
-                decomposers.push(TwoQubitDecomposer::ControlledU(
-                    TwoQubitControlledUDecomposer::new(RXXEquivalent::Standard(gate))?,
+                decomposers.push((
+                    TwoQubitDecomposer::ControlledU(TwoQubitControlledUDecomposer::new(
+                        RXXEquivalent::Standard(gate),
+                    )?),
+                    false,
                 ));
             }
         }
@@ -149,6 +158,15 @@ fn get_decomposers_from_target(
     Ok(decomposers)
 }
 
+fn reverse_mat(matrix: &mut Array2<Complex64>) {
+    // Swap rows 1 and 2
+    let (mut row_1, mut row_2) = matrix.multi_slice_mut((s![1, ..], s![2, ..]));
+    azip!((x in &mut row_1, y in &mut row_2) (*x, *y) = (*y, *x));
+    // Swap columns 1 and 2
+    let (mut col_1, mut col_2) = matrix.multi_slice_mut((s![.., 1], s![.., 2]));
+    azip!((x in &mut col_1, y in &mut col_2) (*x, *y) = (*y, *x));
+}
+
 /// Score a given sequence using the error rate reported in the target
 ///
 /// Return a tuple of the predicted fidelity and the number of 2q gates in the sequence
@@ -282,22 +300,55 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
 
                 let sequence = decomposers
                     .iter()
-                    .map(|decomposer| match decomposer {
-                        TwoQubitDecomposer::Basis(decomposer) => (
-                            decomposer
-                                .call_inner(matrix.view(), None, true, None)
-                                .unwrap(),
-                            decomposer.gate_name().to_string(),
-                        ),
-                        TwoQubitDecomposer::ControlledU(decomposer) => (
-                            decomposer.call_inner(matrix.view(), 1e-12).unwrap(),
-                            match decomposer.rxx_equivalent_gate {
-                                RXXEquivalent::Standard(gate) => gate.name().to_string(),
-                                RXXEquivalent::CustomPython(_) => {
-                                    unreachable!("Decomposer only uses standard gates")
+                    .map(|decomposer| {
+                        if decomposer.1 {
+                            let mut mat = matrix.clone();
+                            reverse_mat(&mut mat);
+                            match &decomposer.0 {
+                                TwoQubitDecomposer::Basis(decomposer) => {
+                                    let synth = decomposer
+                                        .call_inner(mat.view(), None, true, None)
+                                        .unwrap();
+                                    let mut reversed_gates = Vec::with_capacity(synth.gates.len());
+                                    let flip_bits: [u8; 2] = [1, 0];
+                                    for (gate, params, qubit_ids) in synth.gates() {
+                                        let new_qubit_ids = qubit_ids
+                                            .into_iter()
+                                            .map(|x| flip_bits[*x as usize])
+                                            .collect::<SmallVec<[u8; 2]>>();
+                                        reversed_gates.push((
+                                            *gate,
+                                            params.clone(),
+                                            new_qubit_ids.clone(),
+                                        ));
+                                    }
+                                    let mut reversed_synth: TwoQubitGateSequence =
+                                        TwoQubitGateSequence::new();
+                                    reversed_synth
+                                        .set_state((reversed_gates, synth.global_phase()));
+                                    (reversed_synth, decomposer.gate_name().to_string())
                                 }
-                            },
-                        ),
+                                _ => unreachable!("Only TwoQubitBasisDecomposer is reversible"),
+                            }
+                        } else {
+                            match &decomposer.0 {
+                                TwoQubitDecomposer::Basis(decomposer) => (
+                                    decomposer
+                                        .call_inner(matrix.view(), None, true, None)
+                                        .unwrap(),
+                                    decomposer.gate_name().to_string(),
+                                ),
+                                TwoQubitDecomposer::ControlledU(decomposer) => (
+                                    decomposer.call_inner(matrix.view(), 1e-12).unwrap(),
+                                    match decomposer.rxx_equivalent_gate {
+                                        RXXEquivalent::Standard(gate) => gate.name().to_string(),
+                                        RXXEquivalent::CustomPython(_) => {
+                                            unreachable!("Decomposer only uses standard gates")
+                                        }
+                                    },
+                                ),
+                            }
+                        }
                     })
                     .enumerate()
                     .min_by(order_sequence)
@@ -435,7 +486,7 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
                                     gate.operation.clone(),
                                     qubits.as_slice(),
                                     &[],
-                                    out_params,
+                                    Some(gate.params.clone()),
                                     ExtraInstructionAttributes::default(),
                                     None,
                                 )
@@ -447,7 +498,7 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
                                     gate.operation.clone(),
                                     qubits.as_slice(),
                                     &[],
-                                    out_params,
+                                    Some(gate.params.clone()),
                                     ExtraInstructionAttributes::default(),
                                 )
                             }
diff --git a/qiskit/transpiler/passes/__init__.py b/qiskit/transpiler/passes/__init__.py
index 8823e1ce7233..36104162e7b9 100644
--- a/qiskit/transpiler/passes/__init__.py
+++ b/qiskit/transpiler/passes/__init__.py
@@ -93,6 +93,7 @@
    OptimizeAnnotated
    Split2QUnitaries
    RemoveIdentityEquivalent
+   TwoQubitPeepholeOptimization
 
 Calibration
 =============
@@ -248,6 +249,7 @@
 from .optimization import OptimizeAnnotated
 from .optimization import RemoveIdentityEquivalent
 from .optimization import Split2QUnitaries
+from .optimization import TwoQubitPeepholeOptimization
 
 # circuit analysis
 from .analysis import ResourceEstimation
diff --git a/qiskit/transpiler/passes/optimization/__init__.py b/qiskit/transpiler/passes/optimization/__init__.py
index 0e5108f44d2a..cbb493018efe 100644
--- a/qiskit/transpiler/passes/optimization/__init__.py
+++ b/qiskit/transpiler/passes/optimization/__init__.py
@@ -40,3 +40,4 @@
 from .remove_identity_equiv import RemoveIdentityEquivalent
 from .split_2q_unitaries import Split2QUnitaries
 from .collect_and_collapse import CollectAndCollapse
+from .two_qubit_peephole import TwoQubitPeepholeOptimization

From d9399a618ef4426466a5c0a881ad8034e38f02ce Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Sun, 26 Jan 2025 18:39:17 -0500
Subject: [PATCH 09/64] Fix handling of single direction gates

---
 crates/accelerate/src/two_qubit_peephole.rs | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/crates/accelerate/src/two_qubit_peephole.rs b/crates/accelerate/src/two_qubit_peephole.rs
index d5b63f93fd79..20707e0b99c2 100644
--- a/crates/accelerate/src/two_qubit_peephole.rs
+++ b/crates/accelerate/src/two_qubit_peephole.rs
@@ -61,13 +61,11 @@ fn get_decomposers_from_target(
         .into_iter()
         .map(|x| (x, false))
         .collect();
-    let reverse_names = target
-        .operation_names_for_qargs(Some(&reverse_qubits))
-        .unwrap();
-
-    if !reverse_names.is_empty() {
-        for name in reverse_names {
-            gate_names.insert((name, true));
+    if let Ok(reverse_names) = target.operation_names_for_qargs(Some(&reverse_qubits)) {
+        if !reverse_names.is_empty() {
+            for name in reverse_names {
+                gate_names.insert((name, true));
+            }
         }
     }
     let available_kak_gate: Vec<(&str, &PackedOperation, &[Param], bool)> = gate_names
@@ -351,8 +349,11 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
                         }
                     })
                     .enumerate()
-                    .min_by(order_sequence)
-                    .unwrap();
+                    .min_by(order_sequence);
+                if sequence.is_none() {
+                    return Ok(None);
+                }
+                let sequence = sequence.unwrap();
                 let mut original_err: f64 = 1.;
                 let mut original_count: usize = 0;
                 let mut outside_target = false;

From b4c4360a6be0dcae7dc0a2474dcbebdbe6f11ef7 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Sun, 26 Jan 2025 18:41:16 -0500
Subject: [PATCH 10/64] Fix import cycle

---
 qiskit/transpiler/passes/optimization/two_qubit_peephole.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/qiskit/transpiler/passes/optimization/two_qubit_peephole.py b/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
index 60536d09aa71..2b8cc426308f 100644
--- a/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
+++ b/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
@@ -16,8 +16,6 @@
 
 from qiskit.transpiler.basepasses import TransformationPass
 from qiskit.transpiler.passmanager import PassManager
-from qiskit.transpiler.passes.optimization import Collect2qBlocks, ConsolidateBlocks
-from qiskit.transpiler.passes.synthesis import UnitarySynthesis
 from qiskit.transpiler.target import Target
 from qiskit.dagcircuit.dagcircuit import DAGCircuit
 from qiskit._accelerate.two_qubit_peephole import two_qubit_unitary_peephole_optimize
@@ -38,6 +36,9 @@ def __init__(
         self._approximation_degree = approximation_degree
         self._pm = None
         if method != "default":
+            from qiskit.transpiler.passes.optimization import Collect2qBlocks, ConsolidateBlocks
+            from qiskit.transpiler.passes.synthesis import UnitarySynthesis
+
             self._pm = PassManager(
                 [
                     Collect2qBlocks(),

From f2bb1fbf350ef30848b352bed742b4cfccddbde8 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Wed, 19 Feb 2025 17:48:55 -0500
Subject: [PATCH 11/64] Flip scoring value to (2q gate count, predicted error)

The first priority of the two-qubit peephole pass should be reducing the
2q gate count; the error-rate heuristic should only matter when the 2q
counts are the same. This commit flips the score tuple so that the 2q gate
count is compared first and the predicted error only breaks ties.
---
 crates/accelerate/src/two_qubit_peephole.rs | 57 +++++++++++----------
 1 file changed, 30 insertions(+), 27 deletions(-)

diff --git a/crates/accelerate/src/two_qubit_peephole.rs b/crates/accelerate/src/two_qubit_peephole.rs
index d6bc47793b63..984ec3ef85f6 100644
--- a/crates/accelerate/src/two_qubit_peephole.rs
+++ b/crates/accelerate/src/two_qubit_peephole.rs
@@ -176,10 +176,10 @@ fn score_sequence<'a>(
     target: &'a Target,
     kak_gate_name: &str,
     sequence: impl Iterator<Item = (Option<StandardGate>, SmallVec<[Qubit; 2]>)> + 'a,
-) -> (f64, usize) {
+) -> (usize, f64) {
     let mut gate_count = 0;
-    (
-        1. - sequence
+    let error = 1.
+        - sequence
             .filter_map(|(gate, local_qubits)| {
                 let qubits = local_qubits
                     .iter()
@@ -196,9 +196,8 @@ fn score_sequence<'a>(
                     .get_error(name, qubits.as_slice())
                     .map(|error| 1. - error)
             })
-            .product::<f64>(),
-        gate_count,
-    )
+            .product::<f64>();
+    (gate_count, error)
 }
 
 type MappingIterItem = Option<((TwoQubitGateSequence, String), [Qubit; 2])>;
@@ -242,7 +241,7 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
                     .unwrap();
                 let matrix = blocks_to_matrix(dag, node_indices, block_qubit_map)?;
                 let decomposers = get_decomposers_from_target(target, &block_qubit_map, fidelity)?;
-                let mut decomposer_scores: Vec<Option<(f64, usize)>> =
+                let mut decomposer_scores: Vec<Option<(usize, f64)>> =
                     vec![None; decomposers.len()];
 
                 let order_sequence = |(index_a, sequence_a): &(
@@ -253,11 +252,11 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
                     usize,
                     (TwoQubitGateSequence, String),
                 )| {
-                    let score_a = match decomposer_scores[*index_a] {
-                        Some(score) => score,
-                        None => {
-                            let score: (f64, usize) =
-                                score_sequence(
+                    let score_a = (
+                        match decomposer_scores[*index_a] {
+                            Some(score) => score,
+                            None => {
+                                let score: (usize, f64) = score_sequence(
                                     target,
                                     sequence_a.1.as_str(),
                                     sequence_a.0.gates.iter().map(
@@ -270,16 +269,18 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
                                         },
                                     ),
                                 );
-                            decomposer_scores[*index_a] = Some(score);
-                            score
-                        }
-                    };
+                                decomposer_scores[*index_a] = Some(score);
+                                score
+                            }
+                        },
+                        index_a,
+                    );
 
-                    let score_b = match decomposer_scores[*index_b] {
-                        Some(score) => score,
-                        None => {
-                            let score: (f64, usize) =
-                                score_sequence(
+                    let score_b = (
+                        match decomposer_scores[*index_b] {
+                            Some(score) => score,
+                            None => {
+                                let score: (usize, f64) = score_sequence(
                                     target,
                                     sequence_b.1.as_str(),
                                     sequence_b.0.gates.iter().map(
@@ -292,10 +293,12 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
                                         },
                                     ),
                                 );
-                            decomposer_scores[*index_b] = Some(score);
-                            score
-                        }
-                    };
+                                decomposer_scores[*index_b] = Some(score);
+                                score
+                            }
+                        },
+                        index_b,
+                    );
                     score_a.partial_cmp(&score_b).unwrap_or(Ordering::Equal)
                 };
 
@@ -391,8 +394,8 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
                     };
                     original_err *= gate_err;
                 }
-                let original_score = (1. - original_err, original_count);
-                let new_score: (f64, usize) = match decomposer_scores[sequence.0] {
+                let original_score = (original_count, 1. - original_err);
+                let new_score: (usize, f64) = match decomposer_scores[sequence.0] {
                     Some(score) => score,
                     None => score_sequence(
                         target,

From 55b05c04c5d484221b7c86193001fac91e94a970 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Wed, 19 Feb 2025 18:18:50 -0500
Subject: [PATCH 12/64] Add docstring to new pass

---
 .../passes/optimization/two_qubit_peephole.py | 45 +++++++++++++++++--
 1 file changed, 42 insertions(+), 3 deletions(-)

diff --git a/qiskit/transpiler/passes/optimization/two_qubit_peephole.py b/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
index 2b8cc426308f..61713c62a91a 100644
--- a/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
+++ b/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
@@ -22,7 +22,26 @@
 
 
 class TwoQubitPeepholeOptimization(TransformationPass):
-    """Unified two qubit unitary peephole optimization"""
+    """Unified two qubit unitary peephole optimization
+
+    This transpiler pass is designed to perform two qubit unitary peephole optimization. This pass
+    finds all the 2 qubit blocks in the circuit, computes the unitary of
+    that block, and then synthesizes that unitary. If the synthesized two
+    qubit unitary is "better" than the original subcircuit, the synthesized subcircuit
+    is used to replace the original. The heuristic used to determine if
+    it's better first looks at the two qubit gate count in the circuit, and
+    prefers the synthesis with fewer two qubit gates.
+
+    In case the target is overcomplete the pass will try all the
+    decomposers supported for all the gates supported on a given qubit.
+    The decomposition that has the best expected performance will be selected
+    and used to replace the block.
+
+    This pass is multithreaded, and will perform the analysis in parallel
+    and use all the cores available on your local system. You can refer to
+    the `configuration guide <https://docs.quantum.ibm.com/guides/configure-qiskit-local>`__
+    for details on how to control the threading behavior.
+    """
 
     def __init__(
         self,
@@ -31,17 +50,37 @@ def __init__(
         method: str = "default",
         plugin_config: dict = None,
     ):
+        """Initialize the pass
+
+        Args:
+            target: The target to run the pass for
+            approximation_degree: heuristic dial used for circuit approximation (1.0=no
+                approximation, 0.0=maximal approximation). Approximation can decrease the number
+                of gates used in the synthesized unitaries at the cost of straying from the
+                original unitary. If ``None``, approximation is done based on gate fidelities
+                specified in the ``target``.
+            method: The optional unitary synthesis plugin to run. If this is specified the pass
+                behaves identically to running :class:`.ConsolidateBlocks` and
+                :class:`.UnitarySynthesis` in sequence. The heuristic described above doesn't apply
+                and the block is only resynthesized if
+                :meth:`.TwoQubitBasisDecomposer.num_basis_gates` predicts fewer 2q gates are
+                required than the original block.
+            plugin_config: The optional configuration dictionary if a plugin method is
+                specified. Refer to the documentation for the plugin being used for
+                the options accepted and how to configure the plugin.
+        """
+
+
         super().__init__()
         self._target = target
         self._approximation_degree = approximation_degree
         self._pm = None
         if method != "default":
-            from qiskit.transpiler.passes.optimization import Collect2qBlocks, ConsolidateBlocks
+            from qiskit.transpiler.passes.optimization import ConsolidateBlocks
             from qiskit.transpiler.passes.synthesis import UnitarySynthesis
 
             self._pm = PassManager(
                 [
-                    Collect2qBlocks(),
                     ConsolidateBlocks(
                         target=self._target, approximation_degree=self._approximation_degree
                     ),

From 7756d1e75126a6b94d804d44f1535fa3ab1080bb Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Wed, 19 Feb 2025 18:30:18 -0500
Subject: [PATCH 13/64] Add release note

---
 .../passes/optimization/two_qubit_peephole.py | 11 +++---
 .../two_qubit_peephole-de6d3438ed7df6a9.yaml  | 36 +++++++++++++++++++
 2 files changed, 43 insertions(+), 4 deletions(-)
 create mode 100644 releasenotes/notes/two_qubit_peephole-de6d3438ed7df6a9.yaml

diff --git a/qiskit/transpiler/passes/optimization/two_qubit_peephole.py b/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
index 61713c62a91a..eb650405647d 100644
--- a/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
+++ b/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
@@ -15,7 +15,6 @@
 from __future__ import annotations
 
 from qiskit.transpiler.basepasses import TransformationPass
-from qiskit.transpiler.passmanager import PassManager
 from qiskit.transpiler.target import Target
 from qiskit.dagcircuit.dagcircuit import DAGCircuit
 from qiskit._accelerate.two_qubit_peephole import two_qubit_unitary_peephole_optimize
@@ -70,14 +69,18 @@ def __init__(
                 the options accepted and how to configure the plugin.
         """
 
-
         super().__init__()
         self._target = target
         self._approximation_degree = approximation_degree
         self._pm = None
         if method != "default":
-            from qiskit.transpiler.passes.optimization import ConsolidateBlocks
-            from qiskit.transpiler.passes.synthesis import UnitarySynthesis
+            from qiskit.transpiler.passes.optimization import (
+                ConsolidateBlocks,
+            )  # pylint: disable=cyclic-import
+            from qiskit.transpiler.passes.synthesis import (
+                UnitarySynthesis,
+            )  # pylint: disable=cyclic-import
+            from qiskit.transpiler.passmanager import PassManager  # pylint: disable=cyclic-import
 
             self._pm = PassManager(
                 [
diff --git a/releasenotes/notes/two_qubit_peephole-de6d3438ed7df6a9.yaml b/releasenotes/notes/two_qubit_peephole-de6d3438ed7df6a9.yaml
new file mode 100644
index 000000000000..a45914dcc832
--- /dev/null
+++ b/releasenotes/notes/two_qubit_peephole-de6d3438ed7df6a9.yaml
@@ -0,0 +1,36 @@
+---
+features_transpiler:
+  - |
+    Added a new transpiler pass :class:`.TwoQubitPeepholeOptimization` which
+    is intended to perform two qubit unitary peephole optimization, where it
+    analyzes the circuit to find two qubit blocks in the circuit, compute
+    the unitary of that subcircuit, and then replace the original block with
+    the synthesized unitary which uses fewer operations. For example:
+
+    .. plot::
+
+        from qiskit.circuit import QuantumCircuit
+        from qiskit.transpiler.passes import TwoQubitPeepholeOptimization
+        from qiskit.providers import GenericBackendV2
+
+        # Build an unoptimized 2 qubit circuit
+        unoptimized = QuantumCircuit(2)
+        for i in range(10):
+          if i % 2:
+            unoptimized.cx(0, 1)
+          else:
+            unoptimized.cx(1, 0)
+
+        # Generate a target with random error rates
+        target = GenericBackendV2(2, ["u", "cx"], coupling_map=[0, 1]).target
+        # Instantiate pass
+        peephole_pass = TwoQubitPeepholeOptimization(target)
+        # Run pass and visualize output
+        optimized = peephole_pass(unoptimized)
+        optimized.draw("mpl")
+
+    This functionality can be performed by running
+    :class:`.Collect2qBlocks`, :class:`.ConsolidateBlocks`, and
+    :class:`.UnitarySynthesis` sequentially. However this new pass offers
+    improved runtime performance and also better quality output in cases
+    of overcomplete and heterogeneous targets.

From 6a01332e6b46d7b93332f4a9ba56f07fa5669cd5 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Wed, 19 Feb 2025 18:42:36 -0500
Subject: [PATCH 14/64] Run serially in multiprocessing context

---
 crates/accelerate/src/two_qubit_peephole.rs | 355 ++++++++++----------
 1 file changed, 178 insertions(+), 177 deletions(-)

diff --git a/crates/accelerate/src/two_qubit_peephole.rs b/crates/accelerate/src/two_qubit_peephole.rs
index 984ec3ef85f6..87136db56601 100644
--- a/crates/accelerate/src/two_qubit_peephole.rs
+++ b/crates/accelerate/src/two_qubit_peephole.rs
@@ -31,6 +31,7 @@ use crate::convert_2q_block_matrix::blocks_to_matrix;
 use crate::euler_one_qubit_decomposer::{
     EulerBasis, EulerBasisSet, EULER_BASES, EULER_BASIS_NAMES,
 };
+use crate::getenv_use_multiple_threads;
 use crate::nlayout::PhysicalQubit;
 use crate::target_transpiler::Target;
 use crate::two_qubit_decompose::{
@@ -217,46 +218,38 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
         HashMap::with_capacity(runs.iter().map(|run| run.len()).sum());
     let locked_node_mapping = Mutex::new(node_mapping);
 
-    let run_mapping: PyResult<Vec<MappingIterItem>> = py.allow_threads(|| {
-        // Build a vec of all the best synthesized two qubit gate sequences from the collected runs.
-        // This is done in parallel
-        runs.par_iter()
-            .enumerate()
-            .map(|(run_index, node_indices)| {
-                let block_qubit_map = node_indices
-                    .iter()
-                    .find_map(|node_index| {
-                        let inst = dag.dag()[*node_index].unwrap_operation();
-                        let qubits = dag.get_qargs(inst.qubits);
-                        if qubits.len() == 2 {
-                            if qubits[0] > qubits[1] {
-                                Some([qubits[1], qubits[0]])
-                            } else {
-                                Some([qubits[0], qubits[1]])
-                            }
-                        } else {
-                            None
-                        }
-                    })
-                    .unwrap();
-                let matrix = blocks_to_matrix(dag, node_indices, block_qubit_map)?;
-                let decomposers = get_decomposers_from_target(target, &block_qubit_map, fidelity)?;
-                let mut decomposer_scores: Vec<Option<(usize, f64)>> =
-                    vec![None; decomposers.len()];
+    let find_best_sequence = |run_index: usize,
+                              node_indices: &[NodeIndex]|
+     -> PyResult<MappingIterItem> {
+        let block_qubit_map = node_indices
+            .iter()
+            .find_map(|node_index| {
+                let inst = dag.dag()[*node_index].unwrap_operation();
+                let qubits = dag.get_qargs(inst.qubits);
+                if qubits.len() == 2 {
+                    if qubits[0] > qubits[1] {
+                        Some([qubits[1], qubits[0]])
+                    } else {
+                        Some([qubits[0], qubits[1]])
+                    }
+                } else {
+                    None
+                }
+            })
+            .unwrap();
+        let matrix = blocks_to_matrix(dag, node_indices, block_qubit_map)?;
+        let decomposers = get_decomposers_from_target(target, &block_qubit_map, fidelity)?;
+        let mut decomposer_scores: Vec<Option<(usize, f64)>> = vec![None; decomposers.len()];
 
-                let order_sequence = |(index_a, sequence_a): &(
-                    usize,
-                    (TwoQubitGateSequence, String),
-                ),
-                                      (index_b, sequence_b): &(
-                    usize,
-                    (TwoQubitGateSequence, String),
-                )| {
-                    let score_a = (
-                        match decomposer_scores[*index_a] {
-                            Some(score) => score,
-                            None => {
-                                let score: (usize, f64) = score_sequence(
+        let order_sequence =
+            |(index_a, sequence_a): &(usize, (TwoQubitGateSequence, String)),
+             (index_b, sequence_b): &(usize, (TwoQubitGateSequence, String))| {
+                let score_a = (
+                    match decomposer_scores[*index_a] {
+                        Some(score) => score,
+                        None => {
+                            let score: (usize, f64) =
+                                score_sequence(
                                     target,
                                     sequence_a.1.as_str(),
                                     sequence_a.0.gates.iter().map(
@@ -269,18 +262,19 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
                                         },
                                     ),
                                 );
-                                decomposer_scores[*index_a] = Some(score);
-                                score
-                            }
-                        },
-                        index_a,
-                    );
+                            decomposer_scores[*index_a] = Some(score);
+                            score
+                        }
+                    },
+                    index_a,
+                );
 
-                    let score_b = (
-                        match decomposer_scores[*index_b] {
-                            Some(score) => score,
-                            None => {
-                                let score: (usize, f64) = score_sequence(
+                let score_b = (
+                    match decomposer_scores[*index_b] {
+                        Some(score) => score,
+                        None => {
+                            let score: (usize, f64) =
+                                score_sequence(
                                     target,
                                     sequence_b.1.as_str(),
                                     sequence_b.0.gates.iter().map(
@@ -293,141 +287,148 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
                                         },
                                     ),
                                 );
-                                decomposer_scores[*index_b] = Some(score);
-                                score
-                            }
-                        },
-                        index_b,
-                    );
-                    score_a.partial_cmp(&score_b).unwrap_or(Ordering::Equal)
-                };
+                            decomposer_scores[*index_b] = Some(score);
+                            score
+                        }
+                    },
+                    index_b,
+                );
+                score_a.partial_cmp(&score_b).unwrap_or(Ordering::Equal)
+            };
 
-                let sequence = decomposers
-                    .iter()
-                    .map(|decomposer| {
-                        if decomposer.1 {
-                            let mut mat = matrix.clone();
-                            reverse_mat(&mut mat);
-                            match &decomposer.0 {
-                                TwoQubitDecomposer::Basis(decomposer) => {
-                                    let synth = decomposer
-                                        .call_inner(mat.view(), None, true, None)
-                                        .unwrap();
-                                    let mut reversed_gates = Vec::with_capacity(synth.gates.len());
-                                    let flip_bits: [u8; 2] = [1, 0];
-                                    for (gate, params, qubit_ids) in synth.gates() {
-                                        let new_qubit_ids = qubit_ids
-                                            .into_iter()
-                                            .map(|x| flip_bits[*x as usize])
-                                            .collect::<SmallVec<[u8; 2]>>();
-                                        reversed_gates.push((
-                                            *gate,
-                                            params.clone(),
-                                            new_qubit_ids.clone(),
-                                        ));
-                                    }
-                                    let mut reversed_synth: TwoQubitGateSequence =
-                                        TwoQubitGateSequence::new();
-                                    reversed_synth
-                                        .set_state((reversed_gates, synth.global_phase()));
-                                    (reversed_synth, decomposer.gate_name().to_string())
-                                }
-                                _ => unreachable!("Only TwoQubitBasisDecomposer is reversible"),
-                            }
-                        } else {
-                            match &decomposer.0 {
-                                TwoQubitDecomposer::Basis(decomposer) => (
-                                    decomposer
-                                        .call_inner(matrix.view(), None, true, None)
-                                        .unwrap(),
-                                    decomposer.gate_name().to_string(),
-                                ),
-                                TwoQubitDecomposer::ControlledU(decomposer) => (
-                                    decomposer.call_inner(matrix.view(), Some(1e-12)).unwrap(),
-                                    match decomposer.rxx_equivalent_gate {
-                                        RXXEquivalent::Standard(gate) => gate.name().to_string(),
-                                        RXXEquivalent::CustomPython(_) => {
-                                            unreachable!("Decomposer only uses standard gates")
-                                        }
-                                    },
-                                ),
+        let sequence = decomposers
+            .iter()
+            .map(|decomposer| {
+                if decomposer.1 {
+                    let mut mat = matrix.clone();
+                    reverse_mat(&mut mat);
+                    match &decomposer.0 {
+                        TwoQubitDecomposer::Basis(decomposer) => {
+                            let synth =
+                                decomposer.call_inner(mat.view(), None, true, None).unwrap();
+                            let mut reversed_gates = Vec::with_capacity(synth.gates.len());
+                            let flip_bits: [u8; 2] = [1, 0];
+                            for (gate, params, qubit_ids) in synth.gates() {
+                                let new_qubit_ids = qubit_ids
+                                    .into_iter()
+                                    .map(|x| flip_bits[*x as usize])
+                                    .collect::<SmallVec<[u8; 2]>>();
+                                reversed_gates.push((*gate, params.clone(), new_qubit_ids.clone()));
                             }
+                            let mut reversed_synth: TwoQubitGateSequence =
+                                TwoQubitGateSequence::new();
+                            reversed_synth.set_state((reversed_gates, synth.global_phase()));
+                            (reversed_synth, decomposer.gate_name().to_string())
                         }
-                    })
-                    .enumerate()
-                    .min_by(order_sequence);
-                if sequence.is_none() {
-                    return Ok(None);
+                        _ => unreachable!("Only TwoQubitBasisDecomposer is reversible"),
+                    }
+                } else {
+                    match &decomposer.0 {
+                        TwoQubitDecomposer::Basis(decomposer) => (
+                            decomposer
+                                .call_inner(matrix.view(), None, true, None)
+                                .unwrap(),
+                            decomposer.gate_name().to_string(),
+                        ),
+                        TwoQubitDecomposer::ControlledU(decomposer) => (
+                            decomposer.call_inner(matrix.view(), Some(1e-12)).unwrap(),
+                            match decomposer.rxx_equivalent_gate {
+                                RXXEquivalent::Standard(gate) => gate.name().to_string(),
+                                RXXEquivalent::CustomPython(_) => {
+                                    unreachable!("Decomposer only uses standard gates")
+                                }
+                            },
+                        ),
+                    }
                 }
-                let sequence = sequence.unwrap();
-                let mut original_err: f64 = 1.;
-                let mut original_count: usize = 0;
-                let mut outside_target = false;
-                for node_index in node_indices {
-                    let NodeType::Operation(ref inst) = dag.dag()[*node_index] else {
-                        unreachable!("All run nodes will be ops")
-                    };
-                    let qubits = dag
-                        .get_qargs(inst.qubits)
-                        .iter()
-                        .map(|qubit| PhysicalQubit(qubit.0))
-                        .collect::<Vec<_>>();
-                    if qubits.len() == 2 {
-                        original_count += 1;
+            })
+            .enumerate()
+            .min_by(order_sequence);
+        if sequence.is_none() {
+            return Ok(None);
+        }
+        let sequence = sequence.unwrap();
+        let mut original_err: f64 = 1.;
+        let mut original_count: usize = 0;
+        let mut outside_target = false;
+        for node_index in node_indices {
+            let NodeType::Operation(ref inst) = dag.dag()[*node_index] else {
+                unreachable!("All run nodes will be ops")
+            };
+            let qubits = dag
+                .get_qargs(inst.qubits)
+                .iter()
+                .map(|qubit| PhysicalQubit(qubit.0))
+                .collect::<Vec<_>>();
+            if qubits.len() == 2 {
+                original_count += 1;
+            }
+            let name = inst.op.name();
+            let gate_err = match target.get_error(name, qubits.as_slice()) {
+                Some(err) => 1. - err,
+                None => {
+                    // If error rate is None this can mean either the gate is not supported
+                    // in the target or the gate is ideal. We need to do a second lookup
+                    // to determine if the gate is supported, and if it isn't we don't need
+                    // to finish scoring because we know we'll use the synthesis output
+                    let physical_qargs = qubits.iter().map(|bit| PhysicalQubit(bit.0)).collect();
+                    if !target.instruction_supported(name, Some(&physical_qargs)) {
+                        outside_target = true;
+                        break;
                     }
-                    let name = inst.op.name();
-                    let gate_err = match target.get_error(name, qubits.as_slice()) {
-                        Some(err) => 1. - err,
-                        None => {
-                            // If error rate is None this can mean either the gate is not supported
-                            // in the target or the gate is ideal. We need to do a second lookup
-                            // to determine if the gate is supported, and if it isn't we don't need
-                            // to finish scoring because we know we'll use the synthesis output
-                            let physical_qargs =
-                                qubits.iter().map(|bit| PhysicalQubit(bit.0)).collect();
-                            if !target.instruction_supported(name, Some(&physical_qargs)) {
-                                outside_target = true;
-                                break;
-                            }
-                            1.
-                        }
-                    };
-                    original_err *= gate_err;
+                    1.
                 }
-                let original_score = (original_count, 1. - original_err);
-                let new_score: (usize, f64) = match decomposer_scores[sequence.0] {
-                    Some(score) => score,
-                    None => score_sequence(
-                        target,
-                        sequence.1 .1.as_str(),
-                        sequence
-                            .1
-                             .0
-                            .gates
+            };
+            original_err *= gate_err;
+        }
+        let original_score = (original_count, 1. - original_err);
+        let new_score: (usize, f64) = match decomposer_scores[sequence.0] {
+            Some(score) => score,
+            None => score_sequence(
+                target,
+                sequence.1 .1.as_str(),
+                sequence
+                    .1
+                     .0
+                    .gates
+                    .iter()
+                    .map(|(gate, _params, local_qubits)| {
+                        let qubits = local_qubits
                             .iter()
-                            .map(|(gate, _params, local_qubits)| {
-                                let qubits = local_qubits
-                                    .iter()
-                                    .map(|qubit| block_qubit_map[*qubit as usize])
-                                    .collect();
-                                (*gate, qubits)
-                            }),
-                    ),
-                };
-                if !outside_target && new_score > original_score {
-                    return Ok(None);
-                }
-                // This is done at the end of the map in some attempt to minimize
-                // lock contention. If this were serial code it'd make more sense
-                // to do this as part of the iteration building the
-                let mut node_mapping = locked_node_mapping.lock().unwrap();
-                for node in node_indices {
-                    node_mapping.insert(*node, run_index);
-                }
-                Ok(Some((sequence.1, block_qubit_map)))
-            })
+                            .map(|qubit| block_qubit_map[*qubit as usize])
+                            .collect();
+                        (*gate, qubits)
+                    }),
+            ),
+        };
+        if !outside_target && new_score > original_score {
+            return Ok(None);
+        }
+        // This is done at the end of the closure in some attempt to minimize
+        // lock contention. If this were serial code it'd make more sense
+        // to do this as part of the iteration building the run mapping.
+        let mut node_mapping = locked_node_mapping.lock().unwrap();
+        for node in node_indices {
+            node_mapping.insert(*node, run_index);
+        }
+        Ok(Some((sequence.1, block_qubit_map)))
+    };
+
+    let run_mapping: PyResult<Vec<MappingIterItem>> = if getenv_use_multiple_threads() {
+        py.allow_threads(|| {
+            // Build a vec of all the best synthesized two qubit gate sequences from the collected runs.
+            // This is done in parallel
+            runs.par_iter()
+                .enumerate()
+                .map(|(index, sequence)| find_best_sequence(index, sequence.as_slice()))
+                .collect()
+        })
+    } else {
+        runs.iter()
+            .enumerate()
+            .map(|(index, sequence)| find_best_sequence(index, sequence.as_slice()))
             .collect()
-    });
+    };
 
     let run_mapping = run_mapping?;
     // After we've computed all the sequences to execute now serially build up a new dag.

From d83562b0d77cfdff7675ef0f52ef49c167d1bd59 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Wed, 19 Feb 2025 20:12:05 -0500
Subject: [PATCH 15/64] Fix cache build

---
 crates/accelerate/src/two_qubit_peephole.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/accelerate/src/two_qubit_peephole.rs b/crates/accelerate/src/two_qubit_peephole.rs
index 87136db56601..0cbac9086bea 100644
--- a/crates/accelerate/src/two_qubit_peephole.rs
+++ b/crates/accelerate/src/two_qubit_peephole.rs
@@ -465,7 +465,7 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
                             {
                                 out_dag.apply_operation_back(
                                     py,
-                                    PackedOperation::from_standard(*gate),
+                                    PackedOperation::from_standard_gate(*gate),
                                     qubits.as_slice(),
                                     &[],
                                     out_params,

From 5f73b9359c6fec4dc3fc1e88ed468efd8f83152b Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Wed, 26 Feb 2025 12:53:52 -0500
Subject: [PATCH 16/64] Add tests

---
 .../transpiler/test_two_qubit_peephole.py     | 471 ++++++++++++++++++
 1 file changed, 471 insertions(+)
 create mode 100644 test/python/transpiler/test_two_qubit_peephole.py

diff --git a/test/python/transpiler/test_two_qubit_peephole.py b/test/python/transpiler/test_two_qubit_peephole.py
new file mode 100644
index 000000000000..74203b0349f8
--- /dev/null
+++ b/test/python/transpiler/test_two_qubit_peephole.py
@@ -0,0 +1,471 @@
+# This code is part of Qiskit.
+#
+# (C) Copyright IBM 2025
+#
+# This code is licensed under the Apache License, Version 2.0. You may
+# obtain a copy of this license in the LICENSE.txt file in the root directory
+# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# Any modifications or derivative works of this code must retain this
+# copyright notice, and modified files need to carry a notice indicating
+# that they have been altered from the originals.
+
+# pylint: disable=missing-function-docstring
+
+"""
+Tests for the default UnitarySynthesis transpiler pass.
+"""
+
+import unittest
+import math
+import numpy as np
+import scipy
+from ddt import ddt, data
+
+from qiskit import transpile, generate_preset_pass_manager
+from qiskit.providers.fake_provider import GenericBackendV2
+from qiskit.circuit import QuantumCircuit, QuantumRegister, ClassicalRegister
+from qiskit.circuit.library import quantum_volume
+from qiskit.circuit.parameterexpression import ParameterValueType
+from qiskit.converters import circuit_to_dag, dag_to_circuit
+from qiskit.transpiler.passes import UnitarySynthesis
+from qiskit.quantum_info.operators import Operator
+from qiskit.quantum_info.random import random_unitary
+from qiskit.transpiler import PassManager, CouplingMap, Target, InstructionProperties
+from qiskit.exceptions import QiskitError
+from qiskit.transpiler.passes import TwoQubitPeepholeOptimization, TrivialLayout
+from qiskit.circuit.library import (
+    IGate,
+    CXGate,
+    RZGate,
+    RXGate,
+    SXGate,
+    XGate,
+    iSwapGate,
+    ECRGate,
+    UGate,
+    ZGate,
+    RYYGate,
+    RZZGate,
+    RXXGate,
+    PauliEvolutionGate,
+    CPhaseGate,
+)
+from qiskit.quantum_info import SparsePauliOp
+from qiskit.circuit import Measure
+from qiskit.circuit.controlflow import IfElseOp
+from qiskit.circuit import Parameter, Gate
+from qiskit.synthesis.unitary.qsd import qs_decomposition
+
+from test import combine  # pylint: disable=wrong-import-order
+from test import QiskitTestCase  # pylint: disable=wrong-import-order
+from test.python.providers.fake_mumbai_v2 import (  # pylint: disable=wrong-import-order
+    FakeMumbaiFractionalCX,
+)
+from ..legacy_cmaps import YORKTOWN_CMAP
+
+
+class FakeBackend2QV2(GenericBackendV2):
+    """A 2-qubit fake backend"""
+
+    def __init__(self):
+        super().__init__(num_qubits=2, basis_gates=["rx", "u"], seed=42)
+        cx_props = {
+            (0, 1): InstructionProperties(duration=5.23e-7, error=0.00098115),
+        }
+        self._target.add_instruction(CXGate(), cx_props)
+        ecr_props = {
+            (1, 0): InstructionProperties(duration=4.52e-9, error=0.0000132115),
+        }
+        self._target.add_instruction(ECRGate(), ecr_props)
+
+
+class FakeBackend5QV2(GenericBackendV2):
+    """A 5-qubit fake backend"""
+
+    def __init__(self, bidirectional=True):
+        super().__init__(num_qubits=5, basis_gates=["u"], seed=42)
+        cx_props = {
+            (0, 1): InstructionProperties(duration=5.23e-7, error=0.00098115),
+            (3, 4): InstructionProperties(duration=5.23e-7, error=0.00098115),
+        }
+        if bidirectional:
+            cx_props[(1, 0)] = InstructionProperties(duration=6.23e-7, error=0.00099115)
+            cx_props[(4, 3)] = InstructionProperties(duration=7.23e-7, error=0.00099115)
+        self._target.add_instruction(CXGate(), cx_props)
+        ecr_props = {
+            (1, 2): InstructionProperties(duration=4.52e-9, error=0.0000132115),
+            (2, 3): InstructionProperties(duration=4.52e-9, error=0.0000132115),
+        }
+        if bidirectional:
+            ecr_props[(2, 1)] = InstructionProperties(duration=5.52e-9, error=0.0000232115)
+            ecr_props[(3, 2)] = InstructionProperties(duration=5.52e-9, error=0.0000232115)
+
+@ddt
+class TestTwoQubitPeepholeOptimization(QiskitTestCase):
+    """Test TwoQubitPeepholeOptimization."""
+
+    @combine(
+        bidirectional=[True, False],
+        dsc=(
+            "test natural_direction works with transpile using a"
+            "target with multiple 2q gates with bidirectional={bidirectional}"
+        ),
+        name="bidirectional_{bidirectional}",
+    )
+    def test_coupling_map_transpile_with_backendv2(self, bidirectional):
+        backend = FakeBackend5QV2(bidirectional)
+        qr = QuantumRegister(2)
+        circ = QuantumCircuit(qr)
+        circ.append(random_unitary(4, seed=1), [0, 1])
+        circ_01 = TwoQubitPeepholeOptimization(backend.target)(circ)
+        circ_01_index = {qubit: index for index, qubit in enumerate(circ_01.qubits)}
+        self.assertGreaterEqual(len(circ_01.get_instructions("cx")), 1)
+        for instr in circ_01.get_instructions("cx"):
+            self.assertEqual(
+                (0, 1), (circ_01_index[instr.qubits[0]], circ_01_index[instr.qubits[1]])
+            )
+
+    @combine(
+        bidirectional=[True, False],
+        dsc=(
+            "Test direction with transpile using a "
+            "target with multiple 2q gates with bidirectional={bidirectional}"
+            "direction [0, 1] is lower error and should be picked."
+        ),
+        name="bidirectional_{bidirectional}",
+    )
+    def test_coupling_unequal_duration_with_backendv2(self, bidirectional):
+        qr = QuantumRegister(2)
+        circ = QuantumCircuit(qr)
+        circ.append(random_unitary(4, seed=1), [1, 0])
+        backend = FakeBackend5QV2(bidirectional)
+        tqc = TwoQubitPeepholeOptimization(backend.target)(circ)
+        tqc_index = {qubit: index for index, qubit in enumerate(tqc.qubits)}
+        self.assertGreaterEqual(len(tqc.get_instructions("cx")), 1)
+        for instr in tqc.get_instructions("cx"):
+            self.assertEqual((0, 1), (tqc_index[instr.qubits[0]], tqc_index[instr.qubits[1]]))
+
+    def test_non_overlapping_kak_gates_with_backendv2(self):
+        qr = QuantumRegister(2)
+        circ = QuantumCircuit(qr)
+        circ.append(random_unitary(4, seed=1), [1, 0])
+        backend = FakeBackend2QV2()
+        tqc = TwoQubitPeepholeOptimization(backend.target)(circ)
+
+        tqc_index = {qubit: index for index, qubit in enumerate(tqc.qubits)}
+        self.assertGreaterEqual(len(tqc.get_instructions("ecr")), 1)
+        for instr in tqc.get_instructions("ecr"):
+            self.assertEqual((1, 0), (tqc_index[instr.qubits[0]], tqc_index[instr.qubits[1]]))
+
+    def test_fractional_cx_with_backendv2(self):
+        """Test fractional CX gets used if present in target."""
+        qr = QuantumRegister(2)
+        circ = QuantumCircuit(qr)
+        circ.append(random_unitary(4, seed=1), [0, 1])
+        backend = FakeMumbaiFractionalCX()
+        synth_pass = TwoQubitPeepholeOptimization(target=backend.target)
+        tqc = synth_pass(circ)
+        tqc_index = {qubit: index for index, qubit in enumerate(tqc.qubits)}
+        print(tqc)
+        self.assertGreaterEqual(len(tqc.get_instructions("rzx")), 1)
+        for instr in tqc.get_instructions("rzx"):
+            self.assertEqual((0, 1), (tqc_index[instr.qubits[0]], tqc_index[instr.qubits[1]]))
+
+    def test_reverse_direction(self):
+        target = Target(2)
+        target.add_instruction(CXGate(), {(0, 1): InstructionProperties(error=1.2e-6)})
+        target.add_instruction(ECRGate(), {(0, 1): InstructionProperties(error=1.2e-7)})
+        target.add_instruction(
+            UGate(Parameter("theta"), Parameter("phi"), Parameter("lam")), {(0,): None, (1,): None}
+        )
+        qr = QuantumRegister(2)
+        circ = QuantumCircuit(qr)
+        circ.append(random_unitary(4, seed=1), [1, 0])
+        tqc = TwoQubitPeepholeOptimization(target)(circ)
+        tqc_index = {qubit: index for index, qubit in enumerate(tqc.qubits)}
+        self.assertGreaterEqual(len(tqc.get_instructions("ecr")), 1)
+        for instr in tqc.get_instructions("ecr"):
+            self.assertEqual((0, 1), (tqc_index[instr.qubits[0]], tqc_index[instr.qubits[1]]))
+
+    def test_controlled_basis(self):
+        target = Target(2)
+        target.add_instruction(RYYGate(np.pi / 8), {(0, 1): InstructionProperties(error=1.2e-6)})
+        target.add_instruction(
+            UGate(Parameter("theta"), Parameter("phi"), Parameter("lam")), {(0,): None, (1,): None}
+        )
+        qr = QuantumRegister(2)
+        circ = QuantumCircuit(qr)
+        circ.append(random_unitary(4, seed=1), [1, 0])
+        tqc = TwoQubitPeepholeOptimization(target)(circ)
+        self.assertGreaterEqual(len(tqc.get_instructions("ryy")), 1)
+        self.assertEqual(Operator(tqc), Operator(circ))
+
+    def test_approximation_controlled(self):
+        target = Target(2)
+        target.add_instruction(RZZGate(np.pi / 10), {(0, 1): InstructionProperties(error=0.006)})
+        target.add_instruction(RXXGate(np.pi / 3), {(0, 1): InstructionProperties(error=0.01)})
+        target.add_instruction(
+            UGate(Parameter("theta"), Parameter("phi"), Parameter("lam")),
+            {(0,): InstructionProperties(error=0.001), (1,): InstructionProperties(error=0.002)},
+        )
+        circ = QuantumCircuit(2)
+        circ.append(random_unitary(4, seed=7), [1, 0])
+
+        dag = circuit_to_dag(circ)
+        dag_100 = TwoQubitPeepholeOptimization(target=target, approximation_degree=1.0).run(dag)
+        dag_99 = TwoQubitPeepholeOptimization(target=target, approximation_degree=0.99).run(dag)
+        self.assertGreaterEqual(dag_100.depth(), dag_99.depth())
+        self.assertEqual(Operator(dag_to_circuit(dag_100)), Operator(circ))
+
+    def test_mapping_control_flow(self):
+        """Test that inner dags use proper qubit mapping."""
+        qr = QuantumRegister(3, "q")
+        qc = QuantumCircuit(qr)
+
+        # Create target that supports CX only between 0 and 2.
+        fake_target = Target()
+        fake_target.add_instruction(CXGate(), {(0, 2): None})
+        fake_target.add_instruction(
+            UGate(Parameter("t"), Parameter("p"), Parameter("l")),
+            {
+                (0,): None,
+                (1,): None,
+                (2,): None,
+            },
+        )
+
+        qc_uni1 = QuantumCircuit(2)
+        qc_uni1.swap(0, 1)
+        qc_uni1_mat = Operator(qc_uni1)
+
+        loop_body = QuantumCircuit(2)
+        loop_body.unitary(qc_uni1_mat, [0, 1])
+
+        # Loop body uses qubits 0 and 2, mapped to 0 and 1 in the block.
+        # If synthesis doesn't handle recursive mapping, it'll incorrectly
+        # look for a CX on (0, 1) instead of on (0, 2).
+        qc.for_loop((0,), None, loop_body, [0, 2], [])
+
+        result = TwoQubitPeepholeOptimization(fake_target)(qc)
+        self.assertIsInstance(result, QuantumCircuit)
+
+    def test_single_qubit_with_target(self):
+        """Test input circuit with only 1q works with target."""
+        qc = QuantumCircuit(1)
+        qc.append(ZGate(), [qc.qubits[0]])
+        dag = circuit_to_dag(qc)
+        backend = GenericBackendV2(num_qubits=5, seed=42)
+        unitary_synth_pass = TwoQubitPeepholeOptimization(target=backend.target)
+        result_dag = unitary_synth_pass.run(dag)
+        result_qc = dag_to_circuit(result_dag)
+        self.assertEqual(qc, result_qc)
+
+    def test_single_qubit_identity_with_target(self):
+        """Test input single qubit identity works with target."""
+        qc = QuantumCircuit(1)
+        qc.unitary([[1.0, 0.0], [0.0, 1.0]], 0)
+        dag = circuit_to_dag(qc)
+        backend = GenericBackendV2(num_qubits=5)
+        unitary_synth_pass = TwoQubitPeepholeOptimization(target=backend.target)
+        result_dag = unitary_synth_pass.run(dag)
+        result_qc = dag_to_circuit(result_dag)
+        self.assertEqual(result_qc, QuantumCircuit(1))
+
+    def test_unitary_synthesis_with_ideal_and_variable_width_ops(self):
+        """Test unitary synthesis works with a target that contains ideal and variadic ops."""
+        qc = QuantumCircuit(2)
+        qc.unitary(np.eye(4), [0, 1])
+        dag = circuit_to_dag(qc)
+        target = GenericBackendV2(num_qubits=5).target
+        target.add_instruction(IfElseOp, name="if_else")
+        target.add_instruction(ZGate())
+        target.add_instruction(ECRGate())
+        unitary_synth_pass = TwoQubitPeepholeOptimization(target=target)
+        result_dag = unitary_synth_pass.run(dag)
+        result_qc = dag_to_circuit(result_dag)
+        self.assertEqual(result_qc, QuantumCircuit(2))
+
+    def test_unitary_synthesis_custom_gate_target(self):
+        qc = QuantumCircuit(2)
+        qc.unitary(np.eye(4), [0, 1])
+        dag = circuit_to_dag(qc)
+
+        class CustomGate(Gate):
+            """Custom Opaque Gate"""
+
+            def __init__(self):
+                super().__init__("custom", 2, [])
+
+        target = Target(num_qubits=2)
+        target.add_instruction(
+            UGate(Parameter("t"), Parameter("p"), Parameter("l")), {(0,): None, (1,): None}
+        )
+        target.add_instruction(CustomGate(), {(0, 1): None, (1, 0): None})
+        unitary_synth_pass = TwoQubitPeepholeOptimization(target=target)
+        result_dag = unitary_synth_pass.run(dag)
+        result_qc = dag_to_circuit(result_dag)
+        self.assertEqual(result_qc, qc)
+
+    def test_iswap_no_cx_synthesis_succeeds(self):
+        """Test basis set with iswap but no cx can synthesize a circuit"""
+        target = Target()
+        theta = Parameter("theta")
+
+        i_props = {
+            (0,): InstructionProperties(duration=35.5e-9, error=0.000413),
+            (1,): InstructionProperties(duration=35.5e-9, error=0.000502),
+        }
+        target.add_instruction(IGate(), i_props)
+        rz_props = {
+            (0,): InstructionProperties(duration=0, error=0),
+            (1,): InstructionProperties(duration=0, error=0),
+        }
+        target.add_instruction(RZGate(theta), rz_props)
+        sx_props = {
+            (0,): InstructionProperties(duration=35.5e-9, error=0.000413),
+            (1,): InstructionProperties(duration=35.5e-9, error=0.000502),
+        }
+        target.add_instruction(SXGate(), sx_props)
+        x_props = {
+            (0,): InstructionProperties(duration=35.5e-9, error=0.000413),
+            (1,): InstructionProperties(duration=35.5e-9, error=0.000502),
+        }
+        target.add_instruction(XGate(), x_props)
+        iswap_props = {
+            (0, 1): InstructionProperties(duration=519.11e-9, error=0.01201),
+            (1, 0): InstructionProperties(duration=554.66e-9, error=0.01201),
+        }
+        target.add_instruction(iSwapGate(), iswap_props)
+        measure_props = {
+            (0,): InstructionProperties(duration=5.813e-6, error=0.0751),
+            (1,): InstructionProperties(duration=5.813e-6, error=0.0225),
+        }
+        target.add_instruction(Measure(), measure_props)
+
+        qc = QuantumCircuit(2)
+        cxmat = Operator(CXGate()).to_matrix()
+        qc.unitary(cxmat, [0, 1])
+        unitary_synth_pass = TwoQubitPeepholeOptimization(target=target)
+        dag = circuit_to_dag(qc)
+        result_dag = unitary_synth_pass.run(dag)
+        result_qc = dag_to_circuit(result_dag)
+        self.assertTrue(np.allclose(Operator(result_qc.to_gate()).to_matrix(), cxmat))
+
+
+    def test_rxx_gate_in_target(self):
+        """Test synthesis with custom parameterized gate in target."""
+
+        class CustomXXGate(RXXGate):
+            """Custom RXXGate subclass that's not a standard gate"""
+
+            _standard_gate = None
+
+            def __init__(self, theta, label=None):
+                super().__init__(theta, label)
+                self.name = "MyCustomXXGate"
+
+        theta = Parameter("θ")
+        lam = Parameter("λ")
+        phi = Parameter("ϕ")
+
+        target = Target(num_qubits=2)
+        target.add_instruction(RZGate(lam))
+        target.add_instruction(RXGate(phi))
+        target.add_instruction(RXXGate(theta))
+
+        qc = QuantumCircuit(2)
+        qc.unitary(random_unitary(4, seed=1234), [0, 1])
+        qc_transpiled = TwoQubitPeepholeOptimization(target=target)(qc)
+        opcount = qc_transpiled.count_ops()
+        self.assertTrue(set(opcount).issubset({"rz", "rx", "rxx"}))
+
+        self.assertTrue(np.allclose(Operator(qc_transpiled), Operator(qc)))
+
+    def test_custom_parameterized_gate_in_target_skips(self):
+        """Test that synthesis is skipped with custom parameterized
+        gate in target that is not RXX equivalent."""
+
+        class CustomXYGate(Gate):
+            """Custom Gate subclass that's not a standard gate and not RXX equivalent"""
+
+            _standard_gate = None
+
+            def __init__(self, theta: ParameterValueType, label=None):
+                """Create new custom rotstion XY gate."""
+                super().__init__("MyCustomXYGate", 2, [theta])
+
+            def __array__(self, dtype=None):
+                """Return a Numpy.array for the custom gate."""
+                theta = self.params[0]
+                cos = math.cos(theta)
+                isin = 1j * math.sin(theta)
+                return np.array(
+                    [[1, 0, 0, 0], [0, cos, -isin, 0], [0, -isin, cos, 0], [0, 0, 0, 1]],
+                    dtype=dtype,
+                )
+
+            def inverse(self, annotated: bool = False):
+                return CustomXYGate(-self.params[0])
+
+        theta = Parameter("θ")
+        lam = Parameter("λ")
+        phi = Parameter("ϕ")
+
+        target = Target(num_qubits=2)
+        target.add_instruction(RZGate(lam))
+        target.add_instruction(RXGate(phi))
+        target.add_instruction(CustomXYGate(theta))
+
+        qc = QuantumCircuit(2)
+        qc.unitary(random_unitary(4, seed=1234), [0, 1])
+        qc_transpiled = TwoQubitPeepholeOptimization(target=target)(qc)
+        opcount = qc_transpiled.count_ops()
+        self.assertTrue(set(opcount).issubset({"unitary"}))
+        self.assertTrue(np.allclose(Operator(qc_transpiled), Operator(qc)))
+
+    def test_determinism(self):
+        """Test that the decomposition is deterministic."""
+        gate_counts = {"rx": 6, "rz": 12, "iswap": 2}
+        basis_gates = ["rx", "rz", "iswap"]
+        target = Target.from_configuration(basis_gates=basis_gates)
+        pm = generate_preset_pass_manager(target=target, optimization_level=2, seed_transpiler=42)
+
+        qc = QuantumCircuit(2)
+        qc.h(0)
+        qc.cx(0, 1)
+
+        for _ in range(10):
+            out = pm.run(qc)
+            self.assertTrue(Operator(out).equiv(qc))
+            self.assertTrue(set(out.count_ops()).issubset(basis_gates))
+            for basis_gate in basis_gates:
+                self.assertLessEqual(out.count_ops()[basis_gate], gate_counts[basis_gate])
+
+    @combine(gate=["unitary", "swap"])
+    def test_two_qubit_synthesis_to_directional_cx_target(self, gate):
+        """Verify two qubit unitaries are synthesized to match basis gates."""
+        # TODO: should make check more explicit e.g. explicitly set gate
+        # direction in test instead of using specific fake backend
+        backend = GenericBackendV2(
+            num_qubits=5,
+            basis_gates=["id", "rz", "sx", "x", "cx", "reset"],
+            coupling_map=YORKTOWN_CMAP,
+            seed=1,
+        )
+        coupling_map = CouplingMap(backend.coupling_map)
+        triv_layout_pass = TrivialLayout(coupling_map)
+
+        qr = QuantumRegister(2)
+        qc = QuantumCircuit(qr)
+        if gate == "unitary":
+            qc.unitary(random_unitary(4, seed=12), [0, 1])
+        elif gate == "swap":
+            qc.swap(qr[0], qr[1])
+
+        unisynth_pass = TwoQubitPeepholeOptimization(
+            target=backend.target,
+        )
+        pm = PassManager([triv_layout_pass, unisynth_pass])
+        qc_out = pm.run(qc)
+        self.assertEqual(Operator(qc), Operator(qc_out))

From 8c7e67c3e4a92ce3d48ab071abb1d53de507cbed Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Fri, 11 Apr 2025 15:58:35 -0400
Subject: [PATCH 17/64] Rebase updates

---
 crates/accelerate/src/high_level_synthesis.rs |  2 +-
 crates/accelerate/src/two_qubit_peephole.rs   | 19 +++++++++----------
 .../transpiler/test_two_qubit_peephole.py     |  2 +-
 3 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/crates/accelerate/src/high_level_synthesis.rs b/crates/accelerate/src/high_level_synthesis.rs
index 9f081e8bf378..80722c5bc84c 100644
--- a/crates/accelerate/src/high_level_synthesis.rs
+++ b/crates/accelerate/src/high_level_synthesis.rs
@@ -698,7 +698,7 @@ fn extract_definition(
                         "cx".to_string(),
                         aview2(&CX_GATE),
                         1.0,
-                        "U",
+                        EulerBasis::U,
                         None,
                     )?;
                     let two_qubit_sequence =
diff --git a/crates/accelerate/src/two_qubit_peephole.rs b/crates/accelerate/src/two_qubit_peephole.rs
index 0cbac9086bea..e36012a04ae4 100644
--- a/crates/accelerate/src/two_qubit_peephole.rs
+++ b/crates/accelerate/src/two_qubit_peephole.rs
@@ -21,7 +21,6 @@ use rayon::prelude::*;
 use rustworkx_core::petgraph::stable_graph::NodeIndex;
 use smallvec::{smallvec, SmallVec};
 
-use qiskit_circuit::circuit_instruction::ExtraInstructionAttributes;
 use qiskit_circuit::dag_circuit::{DAGCircuit, NodeType};
 use qiskit_circuit::operations::{Operation, OperationRef, Param, StandardGate};
 use qiskit_circuit::packed_instruction::PackedOperation;
@@ -133,10 +132,10 @@ fn get_decomposers_from_target(
         .collect();
     let mut decomposers = decomposers?;
     for gate in [
-        StandardGate::RXXGate,
-        StandardGate::RZZGate,
-        StandardGate::RYYGate,
-        StandardGate::RZXGate,
+        StandardGate::RXX,
+        StandardGate::RZZ,
+        StandardGate::RYY,
+        StandardGate::RZX,
     ] {
         if gate_names.contains(&(gate.name(), false)) {
             let op = target.operation_from_name(gate.name()).unwrap();
@@ -469,7 +468,7 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
                                     qubits.as_slice(),
                                     &[],
                                     out_params,
-                                    ExtraInstructionAttributes::default(),
+                                    None,
                                     None,
                                 )
                             }
@@ -481,7 +480,7 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
                                     qubits.as_slice(),
                                     &[],
                                     out_params,
-                                    ExtraInstructionAttributes::default(),
+                                    None,
                                 )
                             }
                         }
@@ -495,7 +494,7 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
                                     qubits.as_slice(),
                                     &[],
                                     Some(gate.params.clone()),
-                                    ExtraInstructionAttributes::default(),
+                                    None,
                                     None,
                                 )
                             }
@@ -507,13 +506,13 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
                                     qubits.as_slice(),
                                     &[],
                                     Some(gate.params.clone()),
-                                    ExtraInstructionAttributes::default(),
+                                    None,
                                 )
                             }
                         }
                     }?;
                 }
-                out_dag.add_global_phase(py, &Param::Float(sequence.0.global_phase))?;
+                out_dag.add_global_phase(&Param::Float(sequence.0.global_phase))?;
                 processed_runs.insert(*run_index);
             }
             None => {
diff --git a/test/python/transpiler/test_two_qubit_peephole.py b/test/python/transpiler/test_two_qubit_peephole.py
index 74203b0349f8..1df97d1c1a38 100644
--- a/test/python/transpiler/test_two_qubit_peephole.py
+++ b/test/python/transpiler/test_two_qubit_peephole.py
@@ -101,6 +101,7 @@ def __init__(self, bidirectional=True):
             ecr_props[(2, 1)] = InstructionProperties(duration=5.52e-9, error=0.0000232115)
             ecr_props[(3, 2)] = InstructionProperties(duration=5.52e-9, error=0.0000232115)
 
+
 @ddt
 class TestTwoQubitPeepholeOptimization(QiskitTestCase):
     """Test TwoQubitPeepholeOptimization."""
@@ -352,7 +353,6 @@ def test_iswap_no_cx_synthesis_succeeds(self):
         result_qc = dag_to_circuit(result_dag)
         self.assertTrue(np.allclose(Operator(result_qc.to_gate()).to_matrix(), cxmat))
 
-
     def test_rxx_gate_in_target(self):
         """Test synthesis with custom parameterized gate in target."""
 

From f41e8559a87737437aad42410db5b7a6db5fc561 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Fri, 11 Apr 2025 16:16:09 -0400
Subject: [PATCH 18/64] Adjust tests

---
 test/python/transpiler/test_two_qubit_peephole.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/test/python/transpiler/test_two_qubit_peephole.py b/test/python/transpiler/test_two_qubit_peephole.py
index 1df97d1c1a38..d7b221365ed8 100644
--- a/test/python/transpiler/test_two_qubit_peephole.py
+++ b/test/python/transpiler/test_two_qubit_peephole.py
@@ -168,7 +168,6 @@ def test_fractional_cx_with_backendv2(self):
         synth_pass = TwoQubitPeepholeOptimization(target=backend.target)
         tqc = synth_pass(circ)
         tqc_index = {qubit: index for index, qubit in enumerate(tqc.qubits)}
-        print(tqc)
         self.assertGreaterEqual(len(tqc.get_instructions("rzx")), 1)
         for instr in tqc.get_instructions("rzx"):
             self.assertEqual((0, 1), (tqc_index[instr.qubits[0]], tqc_index[instr.qubits[1]]))
@@ -262,16 +261,16 @@ def test_single_qubit_with_target(self):
         result_qc = dag_to_circuit(result_dag)
         self.assertEqual(qc, result_qc)
 
-    def test_single_qubit_identity_with_target(self):
+    def test_two_qubit_identity_with_target(self):
         """Test input single qubit identity works with target."""
-        qc = QuantumCircuit(1)
-        qc.unitary([[1.0, 0.0], [0.0, 1.0]], 0)
+        qc = QuantumCircuit(2)
+        qc.unitary(np.eye(4, dtype=complex), [0, 1])
         dag = circuit_to_dag(qc)
         backend = GenericBackendV2(num_qubits=5)
         unitary_synth_pass = TwoQubitPeepholeOptimization(target=backend.target)
         result_dag = unitary_synth_pass.run(dag)
         result_qc = dag_to_circuit(result_dag)
-        self.assertEqual(result_qc, QuantumCircuit(1))
+        self.assertEqual(result_qc, QuantumCircuit(2))
 
     def test_unitary_synthesis_with_ideal_and_variable_width_ops(self):
         """Test unitary synthesis works with a target that contains ideal and variadic ops."""
@@ -353,7 +352,8 @@ def test_iswap_no_cx_synthesis_succeeds(self):
         result_qc = dag_to_circuit(result_dag)
         self.assertTrue(np.allclose(Operator(result_qc.to_gate()).to_matrix(), cxmat))
 
-    def test_rxx_gate_in_target(self):
+    @unittest.skip("Add support for custom parameterized gates")
+    def test_custom_rxx_gate_in_target(self):
         """Test synthesis with custom parameterized gate in target."""
 
         class CustomXXGate(RXXGate):

From 3cdee4f7ee23d3cf340b92dfb2958cc622b3d013 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Thu, 17 Apr 2025 16:23:04 -0400
Subject: [PATCH 19/64] Clean-up test lint

---
 .../transpiler/test_two_qubit_peephole.py     | 25 +++----------------
 1 file changed, 3 insertions(+), 22 deletions(-)

diff --git a/test/python/transpiler/test_two_qubit_peephole.py b/test/python/transpiler/test_two_qubit_peephole.py
index d7b221365ed8..c3099461a591 100644
--- a/test/python/transpiler/test_two_qubit_peephole.py
+++ b/test/python/transpiler/test_two_qubit_peephole.py
@@ -16,23 +16,18 @@
 Tests for the default UnitarySynthesis transpiler pass.
 """
 
-import unittest
 import math
 import numpy as np
-import scipy
-from ddt import ddt, data
+import ddt
 
-from qiskit import transpile, generate_preset_pass_manager
+from qiskit import generate_preset_pass_manager
 from qiskit.providers.fake_provider import GenericBackendV2
-from qiskit.circuit import QuantumCircuit, QuantumRegister, ClassicalRegister
-from qiskit.circuit.library import quantum_volume
+from qiskit.circuit import QuantumCircuit, QuantumRegister
 from qiskit.circuit.parameterexpression import ParameterValueType
 from qiskit.converters import circuit_to_dag, dag_to_circuit
-from qiskit.transpiler.passes import UnitarySynthesis
 from qiskit.quantum_info.operators import Operator
 from qiskit.quantum_info.random import random_unitary
 from qiskit.transpiler import PassManager, CouplingMap, Target, InstructionProperties
-from qiskit.exceptions import QiskitError
 from qiskit.transpiler.passes import TwoQubitPeepholeOptimization, TrivialLayout
 from qiskit.circuit.library import (
     IGate,
@@ -48,14 +43,10 @@
     RYYGate,
     RZZGate,
     RXXGate,
-    PauliEvolutionGate,
-    CPhaseGate,
 )
-from qiskit.quantum_info import SparsePauliOp
 from qiskit.circuit import Measure
 from qiskit.circuit.controlflow import IfElseOp
 from qiskit.circuit import Parameter, Gate
-from qiskit.synthesis.unitary.qsd import qs_decomposition
 
 from test import combine  # pylint: disable=wrong-import-order
 from test import QiskitTestCase  # pylint: disable=wrong-import-order
@@ -352,19 +343,9 @@ def test_iswap_no_cx_synthesis_succeeds(self):
         result_qc = dag_to_circuit(result_dag)
         self.assertTrue(np.allclose(Operator(result_qc.to_gate()).to_matrix(), cxmat))
 
-    @unittest.skip("Add support for custom parameterized gates")
     def test_custom_rxx_gate_in_target(self):
         """Test synthesis with custom parameterized gate in target."""
 
-        class CustomXXGate(RXXGate):
-            """Custom RXXGate subclass that's not a standard gate"""
-
-            _standard_gate = None
-
-            def __init__(self, theta, label=None):
-                super().__init__(theta, label)
-                self.name = "MyCustomXXGate"
-
         theta = Parameter("θ")
         lam = Parameter("λ")
         phi = Parameter("ϕ")

From fd96145aef0a5ca7e54378b95cb8fd8c098d0a43 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Thu, 17 Apr 2025 16:23:10 -0400
Subject: [PATCH 20/64] Remove plugin method support from the new pass

This commit removes the unitary synthesis plugin mechanism from the
pass. Supporting it was a layer violation, because the pass logic
doesn't actually support using the plugin interface. If plugin
interface usage is desired, it is easier and clearer to handle that
in the pass manager construction rather than have this pass internally
build a pass manager and execute other passes to emulate behavior it
doesn't have.
---
 .../passes/optimization/two_qubit_peephole.py | 61 ++++++-------------
 1 file changed, 17 insertions(+), 44 deletions(-)

diff --git a/qiskit/transpiler/passes/optimization/two_qubit_peephole.py b/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
index eb650405647d..6dafe944a0f3 100644
--- a/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
+++ b/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
@@ -23,13 +23,13 @@
 class TwoQubitPeepholeOptimization(TransformationPass):
     """Unified two qubit unitary peephole optimization
 
-    This transpiler pass is designed to perform two qubit unitary peephole optimization. This pass
-    finds all the 2 qubit blocks in the circuit, computes the unitary of
-    that block, and then synthesizes that unitary. If the synthesized two
-    qubit unitary is "better" than the original subcircuit that subcircuit
-    is used to replace the original. The heuristic used to determine if
-    it's better first looks at the two qubit gate count in the circuit, and
-    prefers the synthesis with fewer two qubit gates.
+    This transpiler pass is designed to perform two qubit unitary peephole
+    optimization. This pass finds all the 2 qubit blocks in the circuit,
+    computes the unitary of that block, and then synthesizes that unitary.
+    If the synthesized two qubit unitary is "better" than the original
+    subcircuit that subcircuit is used to replace the original. The heuristic
+    used to determine if it's better first looks at the two qubit gate count
+    in the circuit, and prefers the synthesis with fewer two qubit gates.
 
     In case the target is overcomplete the pass will try all the
     decomposers supported for all the gates supported on a given qubit.
@@ -39,15 +39,22 @@ class TwoQubitPeepholeOptimization(TransformationPass):
     This pass is multithreaded, and will perform the analysis in parallel
     and use all the cores available on your local system. You can refer to
     the `configuration guide <https://docs.quantum.ibm.com/guides/configure-qiskit-local>`__
-    for details on how to control the threading behavior
+    for details on how to control the threading behavior for Qiskit more broadly
+    which will also control this pass.
+
+    Unlike :class:`.UnitarySynthesis` pass this does not use the :ref`unitary-synth-plugin`.
+    This is a tradeoff for performance and it forgoes the pluggability exposed
+    via that interface. Internally it currently only uses the :class:`.TwoQubitBasisDecomposer`
+    and :class:`.TwoQubitControlledUDecomposer` for synthesizing the two qubit unitaries.
+    You should not use this pass if you need to use the pluggable interface and the ability
+    to use different synthesis algorithms, instead you should use a combination of
+    :class:`.ConsolidateBlocks` and :class:`.UnitarySynthesis` to use the plugin mechanism
     """
 
     def __init__(
         self,
         target: Target,
         approximation_degree: float | None = 1.0,
-        method: str = "default",
-        plugin_config: dict = None,
     ):
         """Initialize the pass
 
@@ -58,45 +65,11 @@ def __init__(
                 of gates used in the synthesized unitaries smaller at the cost of straying from the
                 original unitary. If ``None``, approximation is done based on gate fidelities
                 specified in the ``target``.
-            method: The optional unitary synthesis plugin to run. If this is specified the pass
-                behaves identically to running :class:`.ConsolidateBlocks` and
-                :class:`.UnitarySynthesis` in sequence. The heuristic described above doesn't apply
-                and the block is only resynthesized if
-                :meth:`.TwoQubitBasisDecomposer.num_basis_gates` predicts fewer 2q gates are
-                required than the original block.
-            plugin_config: The optional configuration dictionary if a plugin method is
-                specified. Refer to the documentation for the plugin being used for
-                the options accepted and how to configure the plugin.
         """
 
         super().__init__()
         self._target = target
         self._approximation_degree = approximation_degree
-        self._pm = None
-        if method != "default":
-            from qiskit.transpiler.passes.optimization import (
-                ConsolidateBlocks,
-            )  # pylint: disable=cyclic-import
-            from qiskit.transpiler.passes.synthesis import (
-                UnitarySynthesis,
-            )  # pylint: disable=cyclic-import
-            from qiskit.transpiler.passmanager import PassManager  # pylint: disable=cyclic-import
-
-            self._pm = PassManager(
-                [
-                    ConsolidateBlocks(
-                        target=self._target, approximation_degree=self._approximation_degree
-                    ),
-                    UnitarySynthesis(
-                        target=target,
-                        approximation_degree=approximation_degree,
-                        method=method,
-                        plugin_config=plugin_config,
-                    ),
-                ]
-            )
 
     def run(self, dag: DAGCircuit) -> DAGCircuit:
-        if self._pm is not None:
-            return self._pm.run(dag)
         return two_qubit_unitary_peephole_optimize(dag, self._target, self._approximation_degree)

From 89931a49566381a257260e6a310ad201c0269fe7 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Thu, 17 Apr 2025 16:40:28 -0400
Subject: [PATCH 21/64] Fix oversight in test code

---
 test/python/transpiler/test_two_qubit_peephole.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/python/transpiler/test_two_qubit_peephole.py b/test/python/transpiler/test_two_qubit_peephole.py
index c3099461a591..e7cff0f3431a 100644
--- a/test/python/transpiler/test_two_qubit_peephole.py
+++ b/test/python/transpiler/test_two_qubit_peephole.py
@@ -93,7 +93,7 @@ def __init__(self, bidirectional=True):
             ecr_props[(3, 2)] = InstructionProperties(duration=5.52e-9, error=0.0000232115)
 
 
-@ddt
+@ddt.ddt
 class TestTwoQubitPeepholeOptimization(QiskitTestCase):
     """Test TwoQubitPeepholeOptimization."""
 

From 14e421c7aa61a7a28d9bb5cb0ede3b83b02b2ddf Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Thu, 17 Apr 2025 17:51:41 -0400
Subject: [PATCH 22/64] Fix test failures

Testing identified two issues which required fixes to the pass and
adjustments to the tests based on limitations in the pass. The first
issue was that the parameters for the target gate were not handled
correctly. When using the Controlled U decomposer we were not passing
the computed parameter value to the output circuit and instead the
ParameterExpression from the target was being used. The second issue
was that controlled gates (not supercontrolled) with a fixed angle,
which are normally intended for the xx decomposer, were incorrectly
being passed to the TwoQubitBasisDecomposer which can't work with them.
This was resulting in invalid circuit outputs. The
TwoQubitBasisDecomposer is now only used with supercontrolled gates.
The tests were adjusted for this limitation because they were mostly
copied from the UnitarySynthesis tests, and UnitarySynthesis supports
the xx decomposer.
---
 crates/accelerate/src/two_qubit_decompose.rs  |  6 ++++
 crates/accelerate/src/two_qubit_peephole.rs   | 15 ++++++---
 .../transpiler/test_two_qubit_peephole.py     | 31 ++++++-------------
 3 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/crates/accelerate/src/two_qubit_decompose.rs b/crates/accelerate/src/two_qubit_decompose.rs
index 8dff573bf298..312d44573081 100644
--- a/crates/accelerate/src/two_qubit_decompose.rs
+++ b/crates/accelerate/src/two_qubit_decompose.rs
@@ -1383,6 +1383,12 @@ impl TwoQubitBasisDecomposer {
         __num_basis_gates(self.basis_decomposer.b, self.basis_fidelity, u)
     }
 
+    /// Is the gate super controlled
+    #[inline]
+    pub fn super_controlled(&self) -> bool {
+        self.super_controlled
+    }
+
     fn decomp1_inner(
         &self,
         target: &TwoQubitWeylDecomposition,
diff --git a/crates/accelerate/src/two_qubit_peephole.rs b/crates/accelerate/src/two_qubit_peephole.rs
index e36012a04ae4..1e17cef55c32 100644
--- a/crates/accelerate/src/two_qubit_peephole.rs
+++ b/crates/accelerate/src/two_qubit_peephole.rs
@@ -116,7 +116,7 @@ fn get_decomposers_from_target(
         .filter_map(|(two_qubit_name, two_qubit_gate, params, rev)| {
             let matrix = two_qubit_gate.matrix(params);
             matrix.map(|matrix| {
-                target_basis_set.get_bases().map(move |euler_basis| {
+                target_basis_set.get_bases().filter_map(move |euler_basis| {
                     TwoQubitBasisDecomposer::new_inner(
                         two_qubit_name.to_string(),
                         matrix.view(),
@@ -124,7 +124,14 @@ fn get_decomposers_from_target(
                         euler_basis,
                         None,
                     )
-                    .map(|x| (TwoQubitDecomposer::Basis(x), *rev))
+                    .map(|decomp| {
+                        if !decomp.super_controlled() {
+                            None
+                        } else {
+                            Some((TwoQubitDecomposer::Basis(decomp), *rev))
+                        }
+                    })
+                    .transpose()
                 })
             })
         })
@@ -493,7 +500,7 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
                                     gate.operation.clone(),
                                     qubits.as_slice(),
                                     &[],
-                                    Some(gate.params.clone()),
+                                    Some(out_params.unwrap_or(gate.params.clone())),
                                     None,
                                     None,
                                 )
@@ -505,7 +512,7 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
                                     gate.operation.clone(),
                                     qubits.as_slice(),
                                     &[],
-                                    Some(gate.params.clone()),
+                                    Some(out_params.unwrap_or(gate.params.clone())),
                                     None,
                                 )
                             }
diff --git a/test/python/transpiler/test_two_qubit_peephole.py b/test/python/transpiler/test_two_qubit_peephole.py
index e7cff0f3431a..f13c463975af 100644
--- a/test/python/transpiler/test_two_qubit_peephole.py
+++ b/test/python/transpiler/test_two_qubit_peephole.py
@@ -159,8 +159,11 @@ def test_fractional_cx_with_backendv2(self):
         synth_pass = TwoQubitPeepholeOptimization(target=backend.target)
         tqc = synth_pass(circ)
         tqc_index = {qubit: index for index, qubit in enumerate(tqc.qubits)}
-        self.assertGreaterEqual(len(tqc.get_instructions("rzx")), 1)
-        for instr in tqc.get_instructions("rzx"):
+        # RZX with discrete angles would be lower error but because XX Decomposer is not
+        # supported/availble in rust we can't synthesize to it so CX ends up being used
+        self.assertGreaterEqual(len(tqc.get_instructions("rzx")), 0)
+        self.assertEqual(len(tqc.get_instructions("cx")), 3)
+        for instr in tqc.get_instructions("cx"):
             self.assertEqual((0, 1), (tqc_index[instr.qubits[0]], tqc_index[instr.qubits[1]]))
 
     def test_reverse_direction(self):
@@ -189,25 +192,9 @@ def test_controlled_basis(self):
         circ = QuantumCircuit(qr)
         circ.append(random_unitary(4, seed=1), [1, 0])
         tqc = TwoQubitPeepholeOptimization(target)(circ)
-        self.assertGreaterEqual(len(tqc.get_instructions("ryy")), 1)
-        self.assertEqual(Operator(tqc), Operator(circ))
-
-    def test_approximation_controlled(self):
-        target = Target(2)
-        target.add_instruction(RZZGate(np.pi / 10), {(0, 1): InstructionProperties(error=0.006)})
-        target.add_instruction(RXXGate(np.pi / 3), {(0, 1): InstructionProperties(error=0.01)})
-        target.add_instruction(
-            UGate(Parameter("theta"), Parameter("phi"), Parameter("lam")),
-            {(0,): InstructionProperties(error=0.001), (1,): InstructionProperties(error=0.002)},
-        )
-        circ = QuantumCircuit(2)
-        circ.append(random_unitary(4, seed=7), [1, 0])
-
-        dag = circuit_to_dag(circ)
-        dag_100 = TwoQubitPeepholeOptimization(target=target, approximation_degree=1.0).run(dag)
-        dag_99 = TwoQubitPeepholeOptimization(target=target, approximation_degree=0.99).run(dag)
-        self.assertGreaterEqual(dag_100.depth(), dag_99.depth())
-        self.assertEqual(Operator(dag_to_circuit(dag_100)), Operator(circ))
+        # Until XX decomposer is ported we can't synthesize using a RYY(pi/8) as the only 2q
+        # gate in the target
+        self.assertGreaterEqual(len(tqc.get_instructions("unitary")), 1)
 
     def test_mapping_control_flow(self):
         """Test that inner dags use proper qubit mapping."""
@@ -343,7 +330,7 @@ def test_iswap_no_cx_synthesis_succeeds(self):
         result_qc = dag_to_circuit(result_dag)
         self.assertTrue(np.allclose(Operator(result_qc.to_gate()).to_matrix(), cxmat))
 
-    def test_custom_rxx_gate_in_target(self):
+    def test_rxx_gate_in_target(self):
         """Test synthesis with custom parameterized gate in target."""
 
         theta = Parameter("θ")

From c7ba671d048b2949f334a3a95c356a668fa804ed Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Thu, 17 Apr 2025 18:19:24 -0400
Subject: [PATCH 23/64] Update target usage for recent changes

---
 crates/accelerate/src/two_qubit_peephole.rs | 28 ++++++++++++++-------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/crates/accelerate/src/two_qubit_peephole.rs b/crates/accelerate/src/two_qubit_peephole.rs
index 1e17cef55c32..6b0f73f318e9 100644
--- a/crates/accelerate/src/two_qubit_peephole.rs
+++ b/crates/accelerate/src/two_qubit_peephole.rs
@@ -32,7 +32,7 @@ use crate::euler_one_qubit_decomposer::{
 };
 use crate::getenv_use_multiple_threads;
 use crate::nlayout::PhysicalQubit;
-use crate::target_transpiler::Target;
+use crate::target_transpiler::{Target, TargetOperation};
 use crate::two_qubit_decompose::{
     RXXEquivalent, TwoQubitBasisDecomposer, TwoQubitControlledUDecomposer, TwoQubitGateSequence,
 };
@@ -71,13 +71,19 @@ fn get_decomposers_from_target(
     let available_kak_gate: Vec<(&str, &PackedOperation, &[Param], bool)> = gate_names
         .iter()
         .filter_map(|(name, rev)| match target.operation_from_name(name) {
-            Ok(raw_op) => match raw_op.operation.view() {
-                OperationRef::StandardGate(_) | OperationRef::Gate(_) => {
-                    Some((*name, &raw_op.operation, raw_op.params.as_slice(), *rev))
+            Some(raw_op) => {
+                if let TargetOperation::Normal(op) = raw_op {
+                    match op.operation.view() {
+                        OperationRef::StandardGate(_) | OperationRef::Gate(_) => {
+                            Some((*name, &op.operation, op.params.as_slice(), *rev))
+                        }
+                        _ => None,
+                    }
+                } else {
+                    None
                 }
-                _ => None,
-            },
-            Err(_) => None,
+            }
+            None => None,
         })
         .collect();
 
@@ -147,7 +153,7 @@ fn get_decomposers_from_target(
         if gate_names.contains(&(gate.name(), false)) {
             let op = target.operation_from_name(gate.name()).unwrap();
             if op
-                .params
+                .params()
                 .iter()
                 .all(|x| matches!(x, Param::ParameterExpression(_)))
             {
@@ -492,7 +498,11 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
                             }
                         }
                         None => {
-                            let gate = target.operation_from_name(sequence.1.as_str()).unwrap();
+                            let Some(TargetOperation::Normal(gate)) =
+                                target.operation_from_name(sequence.1.as_str())
+                            else {
+                                unreachable!()
+                            };
                             #[cfg(feature = "cache_pygates")]
                             {
                                 out_dag.apply_operation_back(

From 9f820c1f3a87cb51d44989d724eb4a4c57f2f8e6 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Fri, 18 Apr 2025 07:02:33 -0400
Subject: [PATCH 24/64] Simplify argument typing on new dag builder

The typing for some of the new methods on the DAGCircuitBuilder was a
bit too strict and required the caller to do more work than was
necessary. This commit loosens the typing to make it a bit more
ergonomic and straightforward to use. It also more closely matches the
DAGCircuit methods the builder struct is mirroring. Right now the only
signature difference is qubits and clbits are wrapped in an Option while
on DAGCircuit it's not. This commit doesn't change this difference,
although there really isn't a reason to make this distinction and both
methods could have the same signature.
---
 .../src/basis/basis_translator/mod.rs         | 30 +++++++++----------
 crates/accelerate/src/unitary_synthesis.rs    | 15 +++++-----
 crates/circuit/src/dag_circuit.rs             | 23 +++++++-------
 3 files changed, 33 insertions(+), 35 deletions(-)

diff --git a/crates/accelerate/src/basis/basis_translator/mod.rs b/crates/accelerate/src/basis/basis_translator/mod.rs
index 042b5ee7ccb3..f552920fc3f1 100644
--- a/crates/accelerate/src/basis/basis_translator/mod.rs
+++ b/crates/accelerate/src/basis/basis_translator/mod.rs
@@ -509,12 +509,12 @@ fn apply_translation(
                 out_dag_builder.apply_operation_back(
                     py,
                     new_op.operation,
-                    Some(node_qarg.into()),
-                    Some(node_carg.into()),
+                    Some(node_qarg),
+                    Some(node_carg),
                     if new_op.params.is_empty() {
                         None
                     } else {
-                        Some(Box::new(new_op.params))
+                        Some(new_op.params)
                     },
                     new_op.label.as_deref().cloned(),
                     #[cfg(feature = "cache_pygates")]
@@ -524,9 +524,9 @@ fn apply_translation(
                 out_dag_builder.apply_operation_back(
                     py,
                     node_obj.op.clone(),
-                    Some(node_qarg.into()),
-                    Some(node_carg.into()),
-                    node_obj.params.clone(),
+                    Some(node_qarg),
+                    Some(node_carg),
+                    node_obj.params.as_ref().map(|x| *x.clone()),
                     node_obj.label.as_deref().cloned(),
                     #[cfg(feature = "cache_pygates")]
                     None,
@@ -542,9 +542,9 @@ fn apply_translation(
             out_dag_builder.apply_operation_back(
                 py,
                 node_obj.op.clone(),
-                Some(node_qarg.into()),
-                Some(node_carg.into()),
-                node_obj.params.clone(),
+                Some(node_qarg),
+                Some(node_carg),
+                node_obj.params.as_ref().map(|x| *x.clone()),
                 node_obj.label.as_deref().cloned(),
                 #[cfg(feature = "cache_pygates")]
                 None,
@@ -625,12 +625,12 @@ fn replace_node(
             dag.apply_operation_back(
                 py,
                 new_op,
-                Some(new_qubits.into()),
-                Some(new_clbits.into()),
+                Some(&new_qubits),
+                Some(&new_clbits),
                 if new_params.is_empty() {
                     None
                 } else {
-                    Some(Box::new(new_params))
+                    Some(new_params)
                 },
                 node.label.as_deref().cloned(),
                 #[cfg(feature = "cache_pygates")]
@@ -729,12 +729,12 @@ fn replace_node(
             dag.apply_operation_back(
                 py,
                 new_op,
-                Some(new_qubits.into()),
-                Some(new_clbits.into()),
+                Some(&new_qubits),
+                Some(&new_clbits),
                 if new_params.is_empty() {
                     None
                 } else {
-                    Some(Box::new(new_params))
+                    Some(new_params)
                 },
                 inner_node.label.as_deref().cloned(),
                 #[cfg(feature = "cache_pygates")]
diff --git a/crates/accelerate/src/unitary_synthesis.rs b/crates/accelerate/src/unitary_synthesis.rs
index da06cf8c0727..b17009a3e760 100644
--- a/crates/accelerate/src/unitary_synthesis.rs
+++ b/crates/accelerate/src/unitary_synthesis.rs
@@ -144,7 +144,7 @@ fn apply_synth_dag(
             .iter()
             .map(|qarg| out_qargs[qarg.0 as usize])
             .collect();
-        out_packed_instr.qubits = out_dag.insert_qargs(mapped_qargs.into());
+        out_packed_instr.qubits = out_dag.insert_qargs(&mapped_qargs);
         out_dag.push_back(py, out_packed_instr)?;
     }
     out_dag.add_global_phase(&synth_dag.get_global_phase())?;
@@ -167,15 +167,15 @@ fn apply_synth_sequence(
             Some(gate) => &PackedOperation::from_standard_gate(*gate),
         };
         let mapped_qargs: Vec<Qubit> = qubit_ids.iter().map(|id| out_qargs[*id as usize]).collect();
-        let new_params: Option<Box<SmallVec<[Param; 3]>>> = match gate {
-            Some(_) => Some(Box::new(params.iter().map(|p| Param::Float(*p)).collect())),
+        let new_params: Option<SmallVec<[Param; 3]>> = match gate {
+            Some(_) => Some(params.iter().map(|p| Param::Float(*p)).collect()),
             None => {
                 if !sequence.decomp_params.is_empty()
                     && matches!(sequence.decomp_params[0], Param::Float(_))
                 {
-                    Some(Box::new(sequence.decomp_params.clone()))
+                    Some(sequence.decomp_params.clone())
                 } else {
-                    Some(Box::new(params.iter().map(|p| Param::Float(*p)).collect()))
+                    Some(params.iter().map(|p| Param::Float(*p)).collect())
                 }
             }
         };
@@ -187,7 +187,6 @@ fn apply_synth_sequence(
                     "params",
                     new_params
                         .as_deref()
-                        .map(SmallVec::as_slice)
                         .unwrap_or(&[])
                         .iter()
                         .map(|param| param.clone_ref(py))
@@ -213,7 +212,7 @@ fn apply_synth_sequence(
         out_dag.apply_operation_back(
             py,
             new_op,
-            Some(mapped_qargs.into()),
+            Some(&mapped_qargs),
             None,
             new_params,
             None,
@@ -1083,7 +1082,7 @@ fn reversed_synth_su4_dag(
             .iter()
             .map(|x| flip_bits[x.0 as usize])
             .collect();
-        inst.qubits = target_dag_builder.insert_qargs(qubits.into());
+        inst.qubits = target_dag_builder.insert_qargs(&qubits);
         target_dag_builder.push_back(py, inst)?;
     }
     Ok(target_dag_builder.build())
diff --git a/crates/circuit/src/dag_circuit.rs b/crates/circuit/src/dag_circuit.rs
index 543f1ec8e9e6..246e5cdd1683 100644
--- a/crates/circuit/src/dag_circuit.rs
+++ b/crates/circuit/src/dag_circuit.rs
@@ -10,7 +10,6 @@
 // copyright notice, and modified files need to carry a notice indicating
 // that they have been altered from the originals.
 
-use std::borrow::Cow;
 use std::hash::Hash;
 use std::sync::OnceLock;
 
@@ -7278,9 +7277,9 @@ impl DAGCircuitBuilder {
         &mut self,
         py: Python,
         op: PackedOperation,
-        qubits: Option<Cow<[Qubit]>>,
-        clbits: Option<Cow<[Clbit]>>,
-        params: Option<Box<SmallVec<[Param; 3]>>>,
+        qubits: Option<&[Qubit]>,
+        clbits: Option<&[Clbit]>,
+        params: Option<SmallVec<[Param; 3]>>,
         label: Option<String>,
         #[cfg(feature = "cache_pygates")] py_op: Option<PyObject>,
     ) -> PyResult<NodeIndex> {
@@ -7386,9 +7385,9 @@ impl DAGCircuitBuilder {
     pub fn pack_instruction(
         &mut self,
         op: PackedOperation,
-        qubits: Option<Cow<[Qubit]>>,
-        clbits: Option<Cow<[Clbit]>>,
-        params: Option<Box<SmallVec<[Param; 3]>>>,
+        qubits: Option<&[Qubit]>,
+        clbits: Option<&[Clbit]>,
+        params: Option<SmallVec<[Param; 3]>>,
         label: Option<String>,
         #[cfg(feature = "cache_pygates")] py_op: Option<PyObject>,
     ) -> PackedInstruction {
@@ -7412,7 +7411,7 @@ impl DAGCircuitBuilder {
             op,
             qubits,
             clbits,
-            params,
+            params: params.map(Box::new),
             label: label.map(|label| label.into()),
             #[cfg(feature = "cache_pygates")]
             py_op,
@@ -7430,13 +7429,13 @@ impl DAGCircuitBuilder {
     }
 
     /// Packs qargs into the circuit.
-    pub fn insert_qargs(&mut self, qargs: Cow<[Qubit]>) -> Interned<[Qubit]> {
-        self.dag.qargs_interner.insert_cow(qargs)
+    pub fn insert_qargs(&mut self, qargs: &[Qubit]) -> Interned<[Qubit]> {
+        self.dag.qargs_interner.insert(qargs)
     }
 
     /// Packs qargs into the circuit.
-    pub fn insert_cargs(&mut self, cargs: Cow<[Clbit]>) -> Interned<[Clbit]> {
-        self.dag.cargs_interner.insert_cow(cargs)
+    pub fn insert_cargs(&mut self, cargs: &[Clbit]) -> Interned<[Clbit]> {
+        self.dag.cargs_interner.insert(cargs)
     }
 
     /// Adds a new value to the global phase of the inner [DAGCircuit].

From bfdfe462fc2bb1d2ff1dc82e9f99e82e60da6aae Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Fri, 18 Apr 2025 08:27:21 -0400
Subject: [PATCH 25/64] Use dag builder api

---
 crates/accelerate/src/two_qubit_peephole.rs | 37 +++++++++++----------
 1 file changed, 19 insertions(+), 18 deletions(-)

diff --git a/crates/accelerate/src/two_qubit_peephole.rs b/crates/accelerate/src/two_qubit_peephole.rs
index 6b0f73f318e9..d9b642eb67bf 100644
--- a/crates/accelerate/src/two_qubit_peephole.rs
+++ b/crates/accelerate/src/two_qubit_peephole.rs
@@ -445,7 +445,8 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
     let run_mapping = run_mapping?;
     // After we've computed all the sequences to execute now serially build up a new dag.
     let mut processed_runs: HashSet<usize> = HashSet::with_capacity(run_mapping.len());
-    let mut out_dag = dag.copy_empty_like(py, "alike")?;
+    let out_dag = dag.copy_empty_like(py, "alike")?;
+    let mut out_dag_builder = out_dag.into_builder(py);
     let node_mapping = locked_node_mapping.into_inner().unwrap();
     for node in dag.topological_op_nodes()? {
         match node_mapping.get(&node) {
@@ -457,7 +458,7 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
                     let NodeType::Operation(ref instr) = dag.dag()[node] else {
                         unreachable!("Must be an op node")
                     };
-                    out_dag.push_back(py, instr.clone())?;
+                    out_dag_builder.push_back(py, instr.clone())?;
                     continue;
                 }
                 let (sequence, qubit_map) = &run_mapping[*run_index].as_ref().unwrap();
@@ -469,17 +470,17 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
                     let out_params = if params.is_empty() {
                         None
                     } else {
-                        Some(params.iter().map(|val| Param::Float(*val)).collect())
+                        Some(params.into_iter().map(|val| Param::Float(*val)).collect())
                     };
                     match gate {
                         Some(gate) => {
                             #[cfg(feature = "cache_pygates")]
                             {
-                                out_dag.apply_operation_back(
+                                out_dag_builder.apply_operation_back(
                                     py,
                                     PackedOperation::from_standard_gate(*gate),
-                                    qubits.as_slice(),
-                                    &[],
+                                    Some(&qubits),
+                                    None,
                                     out_params,
                                     None,
                                     None,
@@ -487,11 +488,11 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
                             }
                             #[cfg(not(feature = "cache_pygates"))]
                             {
-                                out_dag.apply_operation_back(
+                                out_dag_builder.apply_operation_back(
                                     py,
                                     PackedOperation::from_standard_gate(*gate),
-                                    qubits.as_slice(),
-                                    &[],
+                                    Some(&qubits),
+                                    None,
                                     out_params,
                                     None,
                                 )
@@ -505,11 +506,11 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
                             };
                             #[cfg(feature = "cache_pygates")]
                             {
-                                out_dag.apply_operation_back(
+                                out_dag_builder.apply_operation_back(
                                     py,
                                     gate.operation.clone(),
-                                    qubits.as_slice(),
-                                    &[],
+                                    Some(&qubits),
+                                    None,
                                     Some(out_params.unwrap_or(gate.params.clone())),
                                     None,
                                     None,
@@ -517,11 +518,11 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
                             }
                             #[cfg(not(feature = "cache_pygates"))]
                             {
-                                out_dag.apply_operation_back(
+                                out_dag_builder.apply_operation_back(
                                     py,
                                     gate.operation.clone(),
-                                    qubits.as_slice(),
-                                    &[],
+                                    Some(&qubits),
+                                    None,
                                     Some(out_params.unwrap_or(gate.params.clone())),
                                     None,
                                 )
@@ -529,18 +530,18 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
                         }
                     }?;
                 }
-                out_dag.add_global_phase(&Param::Float(sequence.0.global_phase))?;
+                out_dag_builder.add_global_phase(&Param::Float(sequence.0.global_phase))?;
                 processed_runs.insert(*run_index);
             }
             None => {
                 let NodeType::Operation(ref instr) = dag.dag()[node] else {
                     unreachable!("Must be an op node")
                 };
-                out_dag.push_back(py, instr.clone())?;
+                out_dag_builder.push_back(py, instr.clone())?;
             }
         }
     }
-    Ok(out_dag)
+    Ok(out_dag_builder.build())
 }
 
 pub fn two_qubit_peephole_mod(m: &Bound<PyModule>) -> PyResult<()> {

From f72df7d401ceefe58b0642531a959ffd6850e7db Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Fri, 18 Apr 2025 08:52:21 -0400
Subject: [PATCH 26/64] Fix error handling for target lookup

---
 crates/accelerate/src/two_qubit_peephole.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/crates/accelerate/src/two_qubit_peephole.rs b/crates/accelerate/src/two_qubit_peephole.rs
index d9b642eb67bf..29fe6002b7d5 100644
--- a/crates/accelerate/src/two_qubit_peephole.rs
+++ b/crates/accelerate/src/two_qubit_peephole.rs
@@ -32,6 +32,7 @@ use crate::euler_one_qubit_decomposer::{
 };
 use crate::getenv_use_multiple_threads;
 use crate::nlayout::PhysicalQubit;
+use crate::target_transpiler::exceptions::TranspilerError;
 use crate::target_transpiler::{Target, TargetOperation};
 use crate::two_qubit_decompose::{
     RXXEquivalent, TwoQubitBasisDecomposer, TwoQubitControlledUDecomposer, TwoQubitGateSequence,
@@ -57,7 +58,7 @@ fn get_decomposers_from_target(
     let reverse_qubits = physical_qubits.iter().rev().copied().collect();
     let mut gate_names: HashSet<(&str, bool)> = target
         .operation_names_for_qargs(Some(&physical_qubits))
-        .unwrap()
+        .map_err(|e| TranspilerError::new_err(e.message))?
         .into_iter()
         .map(|x| (x, false))
         .collect();

From bb31eb5a9d4d2094e742d71534292e587ed54bfa Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Fri, 18 Apr 2025 18:15:50 -0400
Subject: [PATCH 27/64] Fix some bugs and expand tests

This commit updates some of the pass's logic and fixes some bugs that
were found during development. It also adds a couple of new test cases,
one of which is failing. There still seem to be bugs in the case where
we need to use a reverse edge in the target.
---
 crates/accelerate/src/two_qubit_peephole.rs   | 476 +++++++++---------
 .../transpiler/test_two_qubit_peephole.py     | 128 ++++-
 2 files changed, 357 insertions(+), 247 deletions(-)

diff --git a/crates/accelerate/src/two_qubit_peephole.rs b/crates/accelerate/src/two_qubit_peephole.rs
index 29fe6002b7d5..68dd30d5c96c 100644
--- a/crates/accelerate/src/two_qubit_peephole.rs
+++ b/crates/accelerate/src/two_qubit_peephole.rs
@@ -42,7 +42,7 @@ use crate::two_qubit_decompose::{
 // is 1640 bytes and TwoQubitControlledUDecomposer is only 24 bytes. This means
 // each element of ControlledU is wasting > 1600 bytes but that is acceptable in
 // this case to avoid the layer of pointer indirection as these are stored
-// temporarily in a vec inside a thread to decompose a unitary.
+// temporarily in a vec inside a thread to decompose a unitary and don't persist.
 #[allow(clippy::large_enum_variant)]
 enum TwoQubitDecomposer {
     Basis(TwoQubitBasisDecomposer),
@@ -53,23 +53,33 @@ fn get_decomposers_from_target(
     target: &Target,
     qubits: &[Qubit],
     fidelity: f64,
-) -> PyResult<Vec<(TwoQubitDecomposer, bool)>> {
+) -> PyResult<Vec<(TwoQubitDecomposer, Option<bool>)>> {
     let physical_qubits = smallvec![PhysicalQubit(qubits[0].0), PhysicalQubit(qubits[1].0)];
     let reverse_qubits = physical_qubits.iter().rev().copied().collect();
-    let mut gate_names: HashSet<(&str, bool)> = target
-        .operation_names_for_qargs(Some(&physical_qubits))
-        .map_err(|e| TranspilerError::new_err(e.message))?
-        .into_iter()
-        .map(|x| (x, false))
-        .collect();
-    if let Ok(reverse_names) = target.operation_names_for_qargs(Some(&reverse_qubits)) {
-        if !reverse_names.is_empty() {
-            for name in reverse_names {
-                gate_names.insert((name, true));
+    let mut reverse_used = false;
+    let mut gate_names: HashSet<(&str, Option<bool>)> =
+        match target.operation_names_for_qargs(Some(&physical_qubits)) {
+            Ok(names) => names.into_iter().map(|x| (x, None)).collect(),
+            Err(err) => {
+                reverse_used = true;
+                target
+                    .operation_names_for_qargs(Some(&reverse_qubits))
+                    .map_err(|_| TranspilerError::new_err(err.message))?
+                    .into_iter()
+                    .map(|x| (x, Some(false)))
+                    .collect()
+            }
+        };
+    if !reverse_used {
+        if let Ok(reverse_names) = target.operation_names_for_qargs(Some(&reverse_qubits)) {
+            if !reverse_names.is_empty() {
+                for name in reverse_names {
+                    gate_names.insert((name, Some(true)));
+                }
             }
         }
     }
-    let available_kak_gate: Vec<(&str, &PackedOperation, &[Param], bool)> = gate_names
+    let available_kak_gate: Vec<(&str, &PackedOperation, &[Param], Option<bool>)> = gate_names
         .iter()
         .filter_map(|(name, rev)| match target.operation_from_name(name) {
             Some(raw_op) => {
@@ -118,7 +128,7 @@ fn get_decomposers_from_target(
         target_basis_set.remove(EulerBasis::ZSX);
     }
 
-    let decomposers: PyResult<Vec<(TwoQubitDecomposer, bool)>> = available_kak_gate
+    let decomposers: PyResult<Vec<(TwoQubitDecomposer, Option<bool>)>> = available_kak_gate
         .iter()
         .filter_map(|(two_qubit_name, two_qubit_gate, params, rev)| {
             let matrix = two_qubit_gate.matrix(params);
@@ -151,7 +161,7 @@ fn get_decomposers_from_target(
         StandardGate::RYY,
         StandardGate::RZX,
     ] {
-        if gate_names.contains(&(gate.name(), false)) {
+        if gate_names.contains(&(gate.name(), None)) {
             let op = target.operation_from_name(gate.name()).unwrap();
             if op
                 .params()
@@ -164,7 +174,7 @@ fn get_decomposers_from_target(
                             RXXEquivalent::Standard(gate),
                             euler_basis.as_str(),
                         )?),
-                        false,
+                        None,
                     ));
                 }
             }
@@ -192,26 +202,24 @@ fn score_sequence<'a>(
     sequence: impl Iterator<Item = (Option<StandardGate>, SmallVec<[Qubit; 2]>)> + 'a,
 ) -> (usize, f64) {
     let mut gate_count = 0;
-    let error = 1.
-        - sequence
-            .filter_map(|(gate, local_qubits)| {
-                let qubits = local_qubits
-                    .iter()
-                    .map(|qubit| PhysicalQubit(qubit.0))
-                    .collect::<Vec<_>>();
-                if qubits.len() == 2 {
-                    gate_count += 1;
-                }
-                let name = match gate.as_ref() {
-                    Some(g) => g.name(),
-                    None => kak_gate_name,
-                };
-                target
-                    .get_error(name, qubits.as_slice())
-                    .map(|error| 1. - error)
-            })
-            .product::<f64>();
-    (gate_count, error)
+    let fidelity = sequence
+        .filter_map(|(gate, local_qubits)| {
+            let qubits = local_qubits
+                .iter()
+                .map(|qubit| PhysicalQubit(qubit.0))
+                .collect::<Vec<_>>();
+            if qubits.len() == 2 {
+                gate_count += 1;
+            }
+            let name = match gate.as_ref() {
+                Some(g) => g.name(),
+                None => kak_gate_name,
+            };
+            let error = target.get_error(name, qubits.as_slice());
+            error.map(|error| 1. - error)
+        })
+        .product::<f64>();
+    (gate_count, 1. - fidelity)
 }
 
 type MappingIterItem = Option<((TwoQubitGateSequence, String), [Qubit; 2])>;
@@ -231,38 +239,36 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
         HashMap::with_capacity(runs.iter().map(|run| run.len()).sum());
     let locked_node_mapping = Mutex::new(node_mapping);
 
-    let find_best_sequence = |run_index: usize,
-                              node_indices: &[NodeIndex]|
-     -> PyResult<MappingIterItem> {
-        let block_qubit_map = node_indices
-            .iter()
-            .find_map(|node_index| {
-                let inst = dag.dag()[*node_index].unwrap_operation();
-                let qubits = dag.get_qargs(inst.qubits);
-                if qubits.len() == 2 {
-                    if qubits[0] > qubits[1] {
-                        Some([qubits[1], qubits[0]])
+    let find_best_sequence =
+        |run_index: usize, node_indices: &[NodeIndex]| -> PyResult<MappingIterItem> {
+            let block_qubit_map = node_indices
+                .iter()
+                .find_map(|node_index| {
+                    let inst = dag.dag()[*node_index].unwrap_operation();
+                    let qubits = dag.get_qargs(inst.qubits);
+                    if qubits.len() == 2 {
+                        if qubits[0] > qubits[1] {
+                            Some([qubits[1], qubits[0]])
+                        } else {
+                            Some([qubits[0], qubits[1]])
+                        }
                     } else {
-                        Some([qubits[0], qubits[1]])
+                        None
                     }
-                } else {
-                    None
-                }
-            })
-            .unwrap();
-        let matrix = blocks_to_matrix(dag, node_indices, block_qubit_map)?;
-        let decomposers = get_decomposers_from_target(target, &block_qubit_map, fidelity)?;
-        let mut decomposer_scores: Vec<Option<(usize, f64)>> = vec![None; decomposers.len()];
+                })
+                .unwrap();
+            let matrix = blocks_to_matrix(dag, node_indices, block_qubit_map)?;
+            let decomposers = get_decomposers_from_target(target, &block_qubit_map, fidelity)?;
+            let mut decomposer_scores: Vec<Option<(usize, f64)>> = vec![None; decomposers.len()];
 
-        let order_sequence =
-            |(index_a, sequence_a): &(usize, (TwoQubitGateSequence, String)),
-             (index_b, sequence_b): &(usize, (TwoQubitGateSequence, String))| {
-                let score_a = (
-                    match decomposer_scores[*index_a] {
-                        Some(score) => score,
-                        None => {
-                            let score: (usize, f64) =
-                                score_sequence(
+            let order_sequence =
+                |(index_a, sequence_a): &(usize, (TwoQubitGateSequence, String)),
+                 (index_b, sequence_b): &(usize, (TwoQubitGateSequence, String))| {
+                    let score_a = (
+                        match decomposer_scores[*index_a] {
+                            Some(score) => score,
+                            None => {
+                                let score: (usize, f64) = score_sequence(
                                     target,
                                     sequence_a.1.as_str(),
                                     sequence_a.0.gates.iter().map(
@@ -275,19 +281,18 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
                                         },
                                     ),
                                 );
-                            decomposer_scores[*index_a] = Some(score);
-                            score
-                        }
-                    },
-                    index_a,
-                );
+                                decomposer_scores[*index_a] = Some(score);
+                                score
+                            }
+                        },
+                        index_a,
+                    );
 
-                let score_b = (
-                    match decomposer_scores[*index_b] {
-                        Some(score) => score,
-                        None => {
-                            let score: (usize, f64) =
-                                score_sequence(
+                    let score_b = (
+                        match decomposer_scores[*index_b] {
+                            Some(score) => score,
+                            None => {
+                                let score: (usize, f64) = score_sequence(
                                     target,
                                     sequence_b.1.as_str(),
                                     sequence_b.0.gates.iter().map(
@@ -300,132 +305,139 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
                                         },
                                     ),
                                 );
-                            decomposer_scores[*index_b] = Some(score);
-                            score
-                        }
-                    },
-                    index_b,
-                );
-                score_a.partial_cmp(&score_b).unwrap_or(Ordering::Equal)
-            };
-
-        let sequence = decomposers
-            .iter()
-            .map(|decomposer| {
-                if decomposer.1 {
-                    let mut mat = matrix.clone();
-                    reverse_mat(&mut mat);
-                    match &decomposer.0 {
-                        TwoQubitDecomposer::Basis(decomposer) => {
-                            let synth =
-                                decomposer.call_inner(mat.view(), None, true, None).unwrap();
-                            let mut reversed_gates = Vec::with_capacity(synth.gates.len());
-                            let flip_bits: [u8; 2] = [1, 0];
-                            for (gate, params, qubit_ids) in synth.gates() {
-                                let new_qubit_ids = qubit_ids
-                                    .into_iter()
-                                    .map(|x| flip_bits[*x as usize])
-                                    .collect::<SmallVec<[u8; 2]>>();
-                                reversed_gates.push((*gate, params.clone(), new_qubit_ids.clone()));
+                                decomposer_scores[*index_b] = Some(score);
+                                score
                             }
-                            let mut reversed_synth: TwoQubitGateSequence =
-                                TwoQubitGateSequence::new();
-                            reversed_synth.set_state((reversed_gates, synth.global_phase()));
-                            (reversed_synth, decomposer.gate_name().to_string())
-                        }
-                        _ => unreachable!("Only TwoQubitBasisDecomposer is reversible"),
-                    }
-                } else {
-                    match &decomposer.0 {
-                        TwoQubitDecomposer::Basis(decomposer) => (
-                            decomposer
-                                .call_inner(matrix.view(), None, true, None)
-                                .unwrap(),
-                            decomposer.gate_name().to_string(),
-                        ),
-                        TwoQubitDecomposer::ControlledU(decomposer) => (
-                            decomposer.call_inner(matrix.view(), Some(1e-12)).unwrap(),
-                            match decomposer.rxx_equivalent_gate {
-                                RXXEquivalent::Standard(gate) => gate.name().to_string(),
-                                RXXEquivalent::CustomPython(_) => {
-                                    unreachable!("Decomposer only uses standard gates")
+                        },
+                        index_b,
+                    );
+                    score_a.partial_cmp(&score_b).unwrap_or(Ordering::Equal)
+                };
+            let sequence = decomposers
+                .iter()
+                .map(|decomposer| {
+                    if let Some(rev) = decomposer.1 {
+                        let mut mat = matrix.clone();
+                        reverse_mat(&mut mat);
+                        match &decomposer.0 {
+                            TwoQubitDecomposer::Basis(decomposer) => {
+                                let synth =
+                                    decomposer.call_inner(mat.view(), None, true, None).unwrap();
+                                if rev {
+                                    let mut reversed_gates = Vec::with_capacity(synth.gates.len());
+                                    let flip_bits: [u8; 2] = [1, 0];
+                                    for (gate, params, qubit_ids) in synth.gates() {
+                                        let new_qubit_ids = qubit_ids
+                                            .into_iter()
+                                            .map(|x| flip_bits[*x as usize])
+                                            .collect::<SmallVec<[u8; 2]>>();
+                                        reversed_gates.push((
+                                            *gate,
+                                            params.clone(),
+                                            new_qubit_ids.clone(),
+                                        ));
+                                    }
+                                    let mut reversed_synth: TwoQubitGateSequence =
+                                        TwoQubitGateSequence::new();
+                                    reversed_synth
+                                        .set_state((reversed_gates, synth.global_phase()));
+                                    (reversed_synth, decomposer.gate_name().to_string())
+                                } else {
+                                    (synth, decomposer.gate_name().to_string())
                                 }
-                            },
-                        ),
+                            }
+                            _ => unreachable!("Only TwoQubitBasisDecomposer is reversible"),
+                        }
+                    } else {
+                        match &decomposer.0 {
+                            TwoQubitDecomposer::Basis(decomposer) => (
+                                decomposer
+                                    .call_inner(matrix.view(), None, true, None)
+                                    .unwrap(),
+                                decomposer.gate_name().to_string(),
+                            ),
+                            TwoQubitDecomposer::ControlledU(decomposer) => (
+                                decomposer.call_inner(matrix.view(), Some(1e-12)).unwrap(),
+                                match decomposer.rxx_equivalent_gate {
+                                    RXXEquivalent::Standard(gate) => gate.name().to_string(),
+                                    RXXEquivalent::CustomPython(_) => {
+                                        unreachable!("Decomposer only uses standard gates")
+                                    }
+                                },
+                            ),
+                        }
                     }
-                }
-            })
-            .enumerate()
-            .min_by(order_sequence);
-        if sequence.is_none() {
-            return Ok(None);
-        }
-        let sequence = sequence.unwrap();
-        let mut original_err: f64 = 1.;
-        let mut original_count: usize = 0;
-        let mut outside_target = false;
-        for node_index in node_indices {
-            let NodeType::Operation(ref inst) = dag.dag()[*node_index] else {
-                unreachable!("All run nodes will be ops")
-            };
-            let qubits = dag
-                .get_qargs(inst.qubits)
-                .iter()
-                .map(|qubit| PhysicalQubit(qubit.0))
-                .collect::<Vec<_>>();
-            if qubits.len() == 2 {
-                original_count += 1;
+                })
+                .enumerate()
+                .min_by(order_sequence);
+            if sequence.is_none() {
+                return Ok(None);
             }
-            let name = inst.op.name();
-            let gate_err = match target.get_error(name, qubits.as_slice()) {
-                Some(err) => 1. - err,
-                None => {
-                    // If error rate is None this can mean either the gate is not supported
-                    // in the target or the gate is ideal. We need to do a second lookup
-                    // to determine if the gate is supported, and if it isn't we don't need
-                    // to finish scoring because we know we'll use the synthesis output
-                    let physical_qargs = qubits.iter().map(|bit| PhysicalQubit(bit.0)).collect();
-                    if !target.instruction_supported(name, Some(&physical_qargs)) {
-                        outside_target = true;
-                        break;
-                    }
-                    1.
+            let sequence = sequence.unwrap();
+            let mut original_fidelity: f64 = 1.;
+            let mut original_count: usize = 0;
+            let mut outside_target = false;
+            for node_index in node_indices {
+                let NodeType::Operation(ref inst) = dag.dag()[*node_index] else {
+                    unreachable!("All run nodes will be ops")
+                };
+                let qubits: SmallVec<[PhysicalQubit; 2]> = dag
+                    .get_qargs(inst.qubits)
+                    .iter()
+                    .map(|qubit| PhysicalQubit(qubit.0))
+                    .collect();
+                if qubits.len() == 2 {
+                    original_count += 1;
                 }
+                let name = inst.op.name();
+                let gate_fidelity = match target.get_error(name, qubits.as_slice()) {
+                    Some(err) => 1. - err,
+                    None => {
+                        // If error rate is None this can mean either the gate is not supported
+                        // in the target or the gate is ideal. We need to do a second lookup
+                        // to determine if the gate is supported, and if it isn't we don't need
+                        // to finish scoring because we know we'll use the synthesis output
+                        if !target.instruction_supported(name, Some(&qubits)) {
+                            outside_target = true;
+                            break;
+                        }
+                        1.
+                    }
+                };
+                original_fidelity *= gate_fidelity;
+            }
+            let original_score = (original_count, 1. - original_fidelity);
+            let new_score: (usize, f64) = match decomposer_scores[sequence.0] {
+                Some(score) => score,
+                None => score_sequence(
+                    target,
+                    sequence.1 .1.as_str(),
+                    sequence
+                        .1
+                         .0
+                        .gates
+                        .iter()
+                        .map(|(gate, _params, local_qubits)| {
+                            let qubits = local_qubits
+                                .iter()
+                                .map(|qubit| block_qubit_map[*qubit as usize])
+                                .collect();
+                            (*gate, qubits)
+                        }),
+                ),
             };
-            original_err *= gate_err;
-        }
-        let original_score = (original_count, 1. - original_err);
-        let new_score: (usize, f64) = match decomposer_scores[sequence.0] {
-            Some(score) => score,
-            None => score_sequence(
-                target,
-                sequence.1 .1.as_str(),
-                sequence
-                    .1
-                     .0
-                    .gates
-                    .iter()
-                    .map(|(gate, _params, local_qubits)| {
-                        let qubits = local_qubits
-                            .iter()
-                            .map(|qubit| block_qubit_map[*qubit as usize])
-                            .collect();
-                        (*gate, qubits)
-                    }),
-            ),
+            if !outside_target && new_score > original_score {
+                return Ok(None);
+            }
+            // This is done at the end of the map in some attempt to minimize
+            // lock contention. If this were serial code it'd make more sense
+            // to do this as part of the iteration building the
+            let mut node_mapping = locked_node_mapping.lock().unwrap();
+            for node in node_indices {
+                node_mapping.insert(*node, run_index);
+            }
+            Ok(Some((sequence.1, block_qubit_map)))
         };
-        if !outside_target && new_score > original_score {
-            return Ok(None);
-        }
-        // This is done at the end of the map in some attempt to minimize
-        // lock contention. If this were serial code it'd make more sense
-        // to do this as part of the iteration building the
-        let mut node_mapping = locked_node_mapping.lock().unwrap();
-        for node in node_indices {
-            node_mapping.insert(*node, run_index);
-        }
-        Ok(Some((sequence.1, block_qubit_map)))
-    };
 
     let run_mapping: PyResult<Vec<MappingIterItem>> = if getenv_use_multiple_threads() {
         py.allow_threads(|| {
@@ -455,79 +467,53 @@ pub(crate) fn two_qubit_unitary_peephole_optimize(
                 if processed_runs.contains(run_index) {
                     continue;
                 }
-                if run_mapping[*run_index].is_none() {
+                let run = run_mapping[*run_index].as_ref();
+                if run.is_none() {
                     let NodeType::Operation(ref instr) = dag.dag()[node] else {
                         unreachable!("Must be an op node")
                     };
                     out_dag_builder.push_back(py, instr.clone())?;
                     continue;
                 }
-                let (sequence, qubit_map) = &run_mapping[*run_index].as_ref().unwrap();
+                let (sequence, qubit_map) = run.unwrap();
                 for (gate, params, local_qubits) in &sequence.0.gates {
                     let qubits: Vec<Qubit> = local_qubits
                         .iter()
                         .map(|index| qubit_map[*index as usize])
                         .collect();
+
                     let out_params = if params.is_empty() {
                         None
                     } else {
                         Some(params.into_iter().map(|val| Param::Float(*val)).collect())
                     };
                     match gate {
-                        Some(gate) => {
+                        Some(gate) => out_dag_builder.apply_operation_back(
+                            py,
+                            PackedOperation::from_standard_gate(*gate),
+                            Some(&qubits),
+                            None,
+                            out_params,
+                            None,
                             #[cfg(feature = "cache_pygates")]
-                            {
-                                out_dag_builder.apply_operation_back(
-                                    py,
-                                    PackedOperation::from_standard_gate(*gate),
-                                    Some(&qubits),
-                                    None,
-                                    out_params,
-                                    None,
-                                    None,
-                                )
-                            }
-                            #[cfg(not(feature = "cache_pygates"))]
-                            {
-                                out_dag_builder.apply_operation_back(
-                                    py,
-                                    PackedOperation::from_standard_gate(*gate),
-                                    Some(&qubits),
-                                    None,
-                                    out_params,
-                                    None,
-                                )
-                            }
-                        }
+                            None,
+                        ),
                         None => {
                             let Some(TargetOperation::Normal(gate)) =
                                 target.operation_from_name(sequence.1.as_str())
                             else {
                                 unreachable!()
                             };
-                            #[cfg(feature = "cache_pygates")]
-                            {
-                                out_dag_builder.apply_operation_back(
-                                    py,
-                                    gate.operation.clone(),
-                                    Some(&qubits),
-                                    None,
-                                    Some(out_params.unwrap_or(gate.params.clone())),
-                                    None,
-                                    None,
-                                )
-                            }
-                            #[cfg(not(feature = "cache_pygates"))]
-                            {
-                                out_dag_builder.apply_operation_back(
-                                    py,
-                                    gate.operation.clone(),
-                                    Some(&qubits),
-                                    None,
-                                    Some(out_params.unwrap_or(gate.params.clone())),
-                                    None,
-                                )
-                            }
+                            out_dag_builder.apply_operation_back(
+                                py,
+                                gate.operation.clone(),
+                                Some(&qubits),
+                                None,
+                                Some(out_params.unwrap_or(gate.params.clone())),
+                                None,
+                                #[cfg(feature = "cache_pygates")]
+                                None,
+                            )
                         }
                     }?;
                 }
diff --git a/test/python/transpiler/test_two_qubit_peephole.py b/test/python/transpiler/test_two_qubit_peephole.py
index f13c463975af..c8e8361f7622 100644
--- a/test/python/transpiler/test_two_qubit_peephole.py
+++ b/test/python/transpiler/test_two_qubit_peephole.py
@@ -20,7 +20,7 @@
 import numpy as np
 import ddt
 
-from qiskit import generate_preset_pass_manager
+from qiskit import generate_preset_pass_manager, transpile
 from qiskit.providers.fake_provider import GenericBackendV2
 from qiskit.circuit import QuantumCircuit, QuantumRegister
 from qiskit.circuit.parameterexpression import ParameterValueType
@@ -41,7 +41,7 @@
     UGate,
     ZGate,
     RYYGate,
-    RZZGate,
+    CZGate,
     RXXGate,
 )
 from qiskit.circuit import Measure
@@ -437,3 +437,127 @@ def test_two_qubit_synthesis_to_directional_cx_target(self, gate):
         pm = PassManager([triv_layout_pass, unisynth_pass])
         qc_out = pm.run(qc)
         self.assertEqual(Operator(qc), Operator(qc_out))
+
+    def test_swap_on_cz_target(self):
+        """Test that swap runs on a cz target are left unchanged or collapsed as expected."""
+        target = Target(num_qubits=2)
+        target.add_instruction(
+            CZGate(),
+            {
+                (0, 1): InstructionProperties(error=3.058e-3, duration=6.8e-8),
+                (1, 0): InstructionProperties(error=3.058e-3, duration=6.8e-8),
+            },
+        )
+        target.add_instruction(
+            RZGate(Parameter("theta")),
+            {
+                (0,): InstructionProperties(error=0, duration=0),
+                (1,): InstructionProperties(error=0, duration=0),
+            },
+        )
+        target.add_instruction(
+            SXGate(),
+            {
+                (0,): InstructionProperties(error=2.421e-4, duration=2.4e-8),
+                (1,): InstructionProperties(error=2.229e-4, duration=2.41e-8),
+            },
+        )
+        target.add_instruction(
+            XGate(),
+            {
+                (0,): InstructionProperties(error=2.41e-4, duration=3.4e-8),
+                (1,): InstructionProperties(error=2.29e-4, duration=3.41e-8),
+            },
+        )
+        peephole = TwoQubitPeepholeOptimization(target)
+        qc = QuantumCircuit(2)
+        qc.swap(0, 1)
+        qc = transpile(qc, target=target, seed_transpiler=1234, optimization_level=0)
+        res = peephole(qc)
+        self.assertEqual(res, qc)
+        # 100 swaps compose to the identity, so an empty circuit is expected
+        qc_duplicated = QuantumCircuit(2)
+        for _ in range(100):
+            qc_duplicated.swap(0, 1)
+        qc_duplicated = transpile(
+            qc_duplicated, target=target, seed_transpiler=1234, optimization_level=0
+        )
+        res = peephole(qc_duplicated)
+        self.assertEqual(res, QuantumCircuit(2))
+
+        qc_duplicated = QuantumCircuit(2)
+        for _ in range(101):
+            qc_duplicated.swap(0, 1)
+        qc_duplicated = transpile(
+            qc_duplicated, target=target, seed_transpiler=1234, optimization_level=0
+        )
+        res = peephole(qc_duplicated)
+        self.assertEqual(Operator(res), Operator(qc))
+
+    def test_pass_respects_directionality(self):
+        """Test that a cz target synthesizes simple circuits correctly."""
+        target = Target(num_qubits=2)
+        target.add_instruction(
+            CXGate(),
+            {
+                (1, 0): InstructionProperties(error=3.058e-3, duration=6.8e-8),
+            },
+        )
+        target.add_instruction(
+            RZGate(Parameter("theta")),
+            {
+                (0,): InstructionProperties(error=0, duration=0),
+                (1,): InstructionProperties(error=0, duration=0),
+            },
+        )
+        target.add_instruction(
+            SXGate(),
+            {
+                (0,): InstructionProperties(error=2.421e-4, duration=2.4e-8),
+                (1,): InstructionProperties(error=2.229e-4, duration=2.41e-8),
+            },
+        )
+        target.add_instruction(
+            XGate(),
+            {
+                (0,): InstructionProperties(error=2.41e-4, duration=3.4e-8),
+                (1,): InstructionProperties(error=2.29e-4, duration=3.41e-8),
+            },
+        )
+        peephole = TwoQubitPeepholeOptimization(target)
+        qc = QuantumCircuit(2)
+        qc.swap(0, 1)
+        qc = transpile(qc, target=target, seed_transpiler=1234, optimization_level=0)
+        res = peephole(qc)
+        self.assertTrue(self.all_inst_in_target(res, target))
+        self.assertEqual(res, qc)
+        # Check run of swaps
+        qc_duplicated = QuantumCircuit(2)
+        for _ in range(100):
+            qc_duplicated.swap(0, 1)
+        qc_duplicated = transpile(
+            qc_duplicated, target=target, seed_transpiler=1234, optimization_level=0
+        )
+        res = peephole(qc_duplicated)
+        self.assertTrue(self.all_inst_in_target(res, target))
+        self.assertEqual(Operator(res), QuantumCircuit(2))
+
+        qc_duplicated = QuantumCircuit(2)
+        for _ in range(101):
+            qc_duplicated.swap(0, 1)
+        qc_duplicated = transpile(
+            qc_duplicated, target=target, seed_transpiler=1234, optimization_level=0
+        )
+        res = peephole(qc_duplicated)
+        self.assertTrue(self.all_inst_in_target(res, target))
+        self.assertEqual(Operator(res), Operator(qc))
+
+    def all_inst_in_target(self, circuit: QuantumCircuit, target: Target):
+        for inst in circuit.data:
+            if not target.instruction_supported(
+                inst.name, tuple(circuit.find_bit(x).index for x in inst.qubits)
+            ):
+                raise self.fail(
+                    f"{inst.name} {tuple(circuit.find_bit(x).index for x in inst.qubits)} not supported"
+                )
+        return True

From ed262ac6d64cc3f2ea1b2ed1483824387c903fbf Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Sat, 14 Jun 2025 11:01:30 -0400
Subject: [PATCH 28/64] Deduplicate synthesis path between unitary synthesis
 and peephole

This commit deduplicates the code paths between the unitary synthesis
code and the two qubit peephole pass. The two passes both have the job
of taking a unitary matrix and synthesizing it to a given gate sequence
in roughly the same way. The contexts in which the two passes run are slightly
different, as unitary synthesis is more general while two qubit peephole
optimization runs only on physical circuits and only
operates on 4x4 matrices. But we didn't need to duplicate the code that
goes from a matrix and decomposer to a sequence. This commit moves the
code from unitary synthesis into a common module and then updates
the peephole pass to use that instead of duplicating the implementation.
---
 crates/transpiler/src/passes/mod.rs           |   1 +
 .../src/passes/two_qubit_peephole.rs          | 173 +++++--------
 .../two_qubit_unitary_synthesis_utils.rs      | 244 ++++++++++++++++++
 .../src/passes/unitary_synthesis.rs           | 218 +---------------
 4 files changed, 320 insertions(+), 316 deletions(-)
 create mode 100644 crates/transpiler/src/passes/two_qubit_unitary_synthesis_utils.rs

diff --git a/crates/transpiler/src/passes/mod.rs b/crates/transpiler/src/passes/mod.rs
index e4010eefc383..34705fdfea06 100644
--- a/crates/transpiler/src/passes/mod.rs
+++ b/crates/transpiler/src/passes/mod.rs
@@ -41,6 +41,7 @@ mod remove_identity_equiv;
 pub mod sabre;
 mod split_2q_unitaries;
 mod two_qubit_peephole;
+mod two_qubit_unitary_synthesis_utils;
 mod unitary_synthesis;
 mod vf2;
 
diff --git a/crates/transpiler/src/passes/two_qubit_peephole.rs b/crates/transpiler/src/passes/two_qubit_peephole.rs
index 3d426c0ce2f2..427825dcaf13 100644
--- a/crates/transpiler/src/passes/two_qubit_peephole.rs
+++ b/crates/transpiler/src/passes/two_qubit_peephole.rs
@@ -14,8 +14,6 @@ use std::cmp::Ordering;
 use std::sync::Mutex;
 
 use hashbrown::{HashMap, HashSet};
-use ndarray::prelude::*;
-use num_complex::Complex64;
 use pyo3::prelude::*;
 use rayon::prelude::*;
 use rustworkx_core::petgraph::stable_graph::NodeIndex;
@@ -26,7 +24,10 @@ use qiskit_circuit::operations::{Operation, OperationRef, Param, StandardGate};
 use qiskit_circuit::packed_instruction::PackedOperation;
 use qiskit_circuit::Qubit;
 
-use crate::target::{Target, TargetOperation};
+use super::two_qubit_unitary_synthesis_utils::{
+    preferred_direction, synth_su4_sequence, DecomposerElement, DecomposerType,
+};
+use crate::target::{Qargs, Target, TargetOperation};
 use crate::TranspilerError;
 use qiskit_circuit::getenv_use_multiple_threads;
 use qiskit_circuit::PhysicalQubit;
@@ -38,57 +39,44 @@ use qiskit_synthesis::two_qubit_decompose::{
     RXXEquivalent, TwoQubitBasisDecomposer, TwoQubitControlledUDecomposer, TwoQubitGateSequence,
 };
 
-// The difference between these two types is large where TwoQubitBasisDecomposer
-// is 1640 bytes and TwoQubitControlledUDecomposer is only 24 bytes. This means
-// each element of ControlledU is wasting > 1600 bytes but that is acceptable in
-// this case to avoid the layer of pointer indirection as these are stored
-// temporarily in a vec inside a thread to decompose a unitary and don't persist.
-#[allow(clippy::large_enum_variant)]
-enum TwoQubitDecomposer {
-    Basis(TwoQubitBasisDecomposer),
-    ControlledU(TwoQubitControlledUDecomposer),
-}
-
 fn get_decomposers_from_target(
     target: &Target,
     qubits: &[Qubit],
     fidelity: f64,
-) -> PyResult<Vec<(TwoQubitDecomposer, Option<bool>)>> {
+) -> PyResult<Vec<DecomposerElement>> {
     let physical_qubits: SmallVec<[PhysicalQubit; 2]> =
         smallvec![PhysicalQubit(qubits[0].0), PhysicalQubit(qubits[1].0)];
     let reverse_qubits: SmallVec<[PhysicalQubit; 2]> =
         physical_qubits.iter().rev().copied().collect();
     let mut reverse_used = false;
-    let mut gate_names: HashSet<(&str, Option<bool>)> =
-        match target.operation_names_for_qargs(&physical_qubits) {
-            Ok(names) => names.into_iter().map(|x| (x, None)).collect(),
-            Err(err) => {
-                reverse_used = true;
-                target
-                    .operation_names_for_qargs(&reverse_qubits)
-                    .map_err(|_| TranspilerError::new_err(err.to_string()))?
-                    .into_iter()
-                    .map(|x| (x, Some(false)))
-                    .collect()
-            }
-        };
+    let mut gate_names: HashSet<&str> = match target.operation_names_for_qargs(&physical_qubits) {
+        Ok(names) => names.into_iter().collect(),
+        Err(err) => {
+            reverse_used = true;
+            target
+                .operation_names_for_qargs(&reverse_qubits)
+                .map_err(|_| TranspilerError::new_err(err.to_string()))?
+                .into_iter()
+                .collect()
+        }
+    };
     if !reverse_used {
         if let Ok(reverse_names) = target.operation_names_for_qargs(&reverse_qubits) {
             if !reverse_names.is_empty() {
                 for name in reverse_names {
-                    gate_names.insert((name, Some(true)));
+                    gate_names.insert(name);
                 }
             }
         }
     }
-    let available_kak_gate: Vec<(&str, &PackedOperation, &[Param], Option<bool>)> = gate_names
+    let available_kak_gate: Vec<(&str, &PackedOperation, &[Param])> = gate_names
         .iter()
-        .filter_map(|(name, rev)| match target.operation_from_name(name) {
+        .filter_map(|name| match target.operation_from_name(name) {
             Some(raw_op) => {
                 if let TargetOperation::Normal(op) = raw_op {
                     match op.operation.view() {
                         OperationRef::StandardGate(_) | OperationRef::Gate(_) => {
-                            Some((*name, &op.operation, op.params.as_slice(), *rev))
+                            Some((*name, &op.operation, op.params.as_slice()))
                         }
                         _ => None,
                     }
@@ -129,9 +117,9 @@ fn get_decomposers_from_target(
         target_basis_set.remove(EulerBasis::ZSX);
     }
 
-    let decomposers: PyResult<Vec<(TwoQubitDecomposer, Option<bool>)>> = available_kak_gate
+    let decomposers: PyResult<Vec<DecomposerElement>> = available_kak_gate
         .iter()
-        .filter_map(|(two_qubit_name, two_qubit_gate, params, rev)| {
+        .filter_map(|(two_qubit_name, two_qubit_gate, params)| {
             let matrix = two_qubit_gate.matrix(params);
             matrix.map(|matrix| {
                 target_basis_set.get_bases().filter_map(move |euler_basis| {
@@ -146,7 +134,11 @@ fn get_decomposers_from_target(
                         if !decomp.super_controlled() {
                             None
                         } else {
-                            Some((TwoQubitDecomposer::Basis(decomp), *rev))
+                            Some(DecomposerElement {
+                                decomposer: DecomposerType::TwoQubitBasis(Box::new(decomp)),
+                                packed_op: (*two_qubit_gate).clone(),
+                                params: params.iter().cloned().collect(),
+                            })
                         }
                     })
                     .transpose()
@@ -162,7 +154,7 @@ fn get_decomposers_from_target(
         StandardGate::RYY,
         StandardGate::RZX,
     ] {
-        if gate_names.contains(&(gate.name(), None)) {
+        if gate_names.contains(gate.name()) {
             let op = target.operation_from_name(gate.name()).unwrap();
             if op
                 .params()
@@ -170,13 +162,16 @@ fn get_decomposers_from_target(
                 .all(|x| matches!(x, Param::ParameterExpression(_)))
             {
                 for euler_basis in target_basis_set.get_bases() {
-                    decomposers.push((
-                        TwoQubitDecomposer::ControlledU(TwoQubitControlledUDecomposer::new(
-                            RXXEquivalent::Standard(gate),
-                            euler_basis.as_str(),
-                        )?),
-                        None,
-                    ));
+                    decomposers.push(DecomposerElement {
+                        decomposer: DecomposerType::TwoQubitControlledU(Box::new(
+                            TwoQubitControlledUDecomposer::new(
+                                RXXEquivalent::Standard(gate),
+                                euler_basis.as_str(),
+                            )?,
+                        )),
+                        packed_op: gate.into(),
+                        params: op.params().iter().cloned().collect(),
+                    });
                 }
             }
         }
@@ -184,15 +179,6 @@ fn get_decomposers_from_target(
     Ok(decomposers)
 }
 
-fn reverse_mat(matrix: &mut Array2<Complex64>) {
-    // Swap rows 1 and 2
-    let (mut row_1, mut row_2) = matrix.multi_slice_mut((s![1, ..], s![2, ..]));
-    azip!((x in &mut row_1, y in &mut row_2) (*x, *y) = (*y, *x));
-    // Swap columns 1 and 2
-    let (mut col_1, mut col_2) = matrix.multi_slice_mut((s![.., 1], s![.., 2]));
-    azip!((x in &mut col_1, y in &mut col_2) (*x, *y) = (*y, *x));
-}
-
 /// Score a given sequence using the error rate reported in the target
 ///
 /// Return a tuple of the predicted fidelity and the number of 2q gates in the sequence
@@ -239,6 +225,20 @@ pub fn two_qubit_unitary_peephole_optimize(
     let node_mapping: HashMap<NodeIndex, usize> =
         HashMap::with_capacity(runs.iter().map(|run| run.len()).sum());
     let locked_node_mapping = Mutex::new(node_mapping);
+    let coupling_edges = target
+        .qargs()
+        .unwrap()
+        .filter_map(|qargs| match qargs {
+            Qargs::Concrete(qargs) => {
+                if qargs.len() == 2 {
+                    Some([qargs[0], qargs[1]])
+                } else {
+                    None
+                }
+            }
+            Qargs::Global => None,
+        })
+        .collect();
 
     let find_best_sequence =
         |run_index: usize, node_indices: &[NodeIndex]| -> PyResult<MappingIterItem> {
@@ -317,57 +317,24 @@ pub fn two_qubit_unitary_peephole_optimize(
             let sequence = decomposers
                 .iter()
                 .map(|decomposer| {
-                    if let Some(rev) = decomposer.1 {
-                        let mut mat = matrix.clone();
-                        reverse_mat(&mut mat);
-                        match &decomposer.0 {
-                            TwoQubitDecomposer::Basis(decomposer) => {
-                                let synth =
-                                    decomposer.call_inner(mat.view(), None, true, None).unwrap();
-                                if rev {
-                                    let mut reversed_gates = Vec::with_capacity(synth.gates.len());
-                                    let flip_bits: [u8; 2] = [1, 0];
-                                    for (gate, params, qubit_ids) in synth.gates() {
-                                        let new_qubit_ids = qubit_ids
-                                            .into_iter()
-                                            .map(|x| flip_bits[*x as usize])
-                                            .collect::<SmallVec<[u8; 2]>>();
-                                        reversed_gates.push((
-                                            *gate,
-                                            params.clone(),
-                                            new_qubit_ids.clone(),
-                                        ));
-                                    }
-                                    let mut reversed_synth: TwoQubitGateSequence =
-                                        TwoQubitGateSequence::new();
-                                    reversed_synth
-                                        .set_state((reversed_gates, synth.global_phase()));
-                                    (reversed_synth, decomposer.gate_name().to_string())
-                                } else {
-                                    (synth, decomposer.gate_name().to_string())
-                                }
-                            }
-                            _ => unreachable!("Only TwoQubitBasisDecomposer is reversible"),
-                        }
-                    } else {
-                        match &decomposer.0 {
-                            TwoQubitDecomposer::Basis(decomposer) => (
-                                decomposer
-                                    .call_inner(matrix.view(), None, true, None)
-                                    .unwrap(),
-                                decomposer.gate_name().to_string(),
-                            ),
-                            TwoQubitDecomposer::ControlledU(decomposer) => (
-                                decomposer.call_inner(matrix.view(), Some(1e-12)).unwrap(),
-                                match decomposer.rxx_equivalent_gate {
-                                    RXXEquivalent::Standard(gate) => gate.name().to_string(),
-                                    RXXEquivalent::CustomPython(_) => {
-                                        unreachable!("Decomposer only uses standard gates")
-                                    }
-                                },
-                            ),
-                        }
-                    }
+                    let physical_block_qubit_map: [PhysicalQubit; 2] = [
+                        PhysicalQubit(block_qubit_map[0].0),
+                        PhysicalQubit(block_qubit_map[1].0),
+                    ];
+                    let dir = preferred_direction(
+                        &physical_block_qubit_map,
+                        Some(true),
+                        &coupling_edges,
+                        Some(target),
+                        decomposer,
+                    )
+                    .unwrap();
+                    (
+                        synth_su4_sequence(matrix.view(), decomposer, dir, Some(fidelity))
+                            .unwrap()
+                            .gate_sequence,
+                        decomposer.packed_op.name().to_string(),
+                    )
                 })
                 .enumerate()
                 .min_by(order_sequence);
diff --git a/crates/transpiler/src/passes/two_qubit_unitary_synthesis_utils.rs b/crates/transpiler/src/passes/two_qubit_unitary_synthesis_utils.rs
new file mode 100644
index 000000000000..0da1a549c3f5
--- /dev/null
+++ b/crates/transpiler/src/passes/two_qubit_unitary_synthesis_utils.rs
@@ -0,0 +1,244 @@
+// This code is part of Qiskit.
+//
+// (C) Copyright IBM 2024
+//
+// This code is licensed under the Apache License, Version 2.0. You may
+// obtain a copy of this license in the LICENSE.txt file in the root directory
+// of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+//
+// Any modifications or derivative works of this code must retain this
+// copyright notice, and modified files need to carry a notice indicating
+// that they have been altered from the originals.
+#![allow(clippy::too_many_arguments)]
+
+use hashbrown::HashSet;
+use ndarray::prelude::*;
+use num_complex::Complex64;
+use smallvec::SmallVec;
+
+use pyo3::prelude::*;
+
+use qiskit_circuit::operations::{Operation, Param};
+use qiskit_circuit::packed_instruction::PackedOperation;
+
+use crate::target::Qargs;
+use crate::target::Target;
+use crate::QiskitError;
+use qiskit_circuit::PhysicalQubit;
+use qiskit_synthesis::two_qubit_decompose::{
+    TwoQubitBasisDecomposer, TwoQubitControlledUDecomposer, TwoQubitGateSequence,
+};
+
+#[derive(Clone, Debug)]
+pub(crate) enum DecomposerType {
+    TwoQubitBasis(Box<TwoQubitBasisDecomposer>), // handled by `synth_su4_sequence`
+    TwoQubitControlledU(Box<TwoQubitControlledUDecomposer>), // handled by `synth_su4_sequence`
+    XX(PyObject), // Python object; `synth_su4_sequence` treats this variant as unreachable
+}
+
+#[derive(Clone, Debug)]
+pub(crate) struct DecomposerElement {
+    pub(crate) decomposer: DecomposerType, // synthesis routine to invoke
+    pub(crate) packed_op: PackedOperation, // name keys target lookups in `preferred_direction`
+    pub(crate) params: SmallVec<[Param; 3]>, // cloned into the sequence's `decomp_params`
+}
+
+#[derive(Clone, Debug)]
+pub(crate) struct TwoQubitUnitarySequence {
+    pub(crate) gate_sequence: TwoQubitGateSequence, // synthesized gates and global phase
+    pub(crate) decomp_op: PackedOperation, // the producing element's `packed_op`
+    pub(crate) decomp_params: SmallVec<[Param; 3]>, // the producing element's `params`
+}
+
+/// Determine the hardware-native direction for a two-qubit gate on `ref_qubits`, using
+/// the coupling edges first and, on a tie, the target's duration then error for the gate.
+/// Returns `Ok(Some(true))` for the given qubit order, `Ok(Some(false))` for the flipped
+/// order, and `Ok(None)` if `natural_direction` is `Some(false)` or nothing decides it.
+/// Errors when `natural_direction` is `Some(true)` and no direction could be determined.
+#[inline]
+pub(crate) fn preferred_direction(
+    ref_qubits: &[PhysicalQubit; 2],
+    natural_direction: Option<bool>,
+    coupling_edges: &HashSet<[PhysicalQubit; 2]>,
+    target: Option<&Target>,
+    decomposer: &DecomposerElement,
+) -> PyResult<Option<bool>> {
+    let qubits: [PhysicalQubit; 2] = *ref_qubits;
+    let mut reverse_qubits: [PhysicalQubit; 2] = qubits;
+    reverse_qubits.reverse();
+
+    let preferred_direction = match natural_direction {
+        Some(false) => None,
+        _ => {
+            // None or Some(true): try the coupling edges, then the target costs
+            let zero_one = coupling_edges.contains(&qubits);
+            let one_zero = coupling_edges.contains(&[qubits[1], qubits[0]]);
+
+            match (zero_one, one_zero) {
+                (true, false) => Some(true),
+                (false, true) => Some(false),
+                _ => {
+                    match target {
+                        Some(target) => {
+                            let mut cost_0_1: f64 = f64::INFINITY;
+                            let mut cost_1_0: f64 = f64::INFINITY;
+
+                            let compute_cost = |lengths: bool,
+                                                q_tuple: [PhysicalQubit; 2],
+                                                in_cost: f64|
+                             -> PyResult<f64> {
+                                let cost = match target
+                                    .qargs_for_operation_name(decomposer.packed_op.name())
+                                {
+                                    Ok(_) => match target[decomposer.packed_op.name()]
+                                        .get(&Qargs::from(q_tuple))
+                                    {
+                                        Some(Some(_props)) => {
+                                            if lengths {
+                                                _props.duration.unwrap_or(in_cost)
+                                            } else {
+                                                _props.error.unwrap_or(in_cost)
+                                            }
+                                        }
+                                        _ => in_cost,
+                                    },
+                                    Err(_) => in_cost,
+                                };
+                                Ok(cost)
+                            };
+                            // First compare the gate durations reported by the target
+                            cost_0_1 = compute_cost(true, qubits, cost_0_1)?;
+                            cost_1_0 = compute_cost(true, reverse_qubits, cost_1_0)?;
+
+                            // If neither direction reported a duration, fall back to the error rates
+                            if !(cost_0_1 < f64::INFINITY || cost_1_0 < f64::INFINITY) {
+                                cost_0_1 = compute_cost(false, qubits, cost_0_1)?;
+                                cost_1_0 = compute_cost(false, reverse_qubits, cost_1_0)?;
+                            }
+
+                            if cost_0_1 < cost_1_0 {
+                                Some(true)
+                            } else if cost_1_0 < cost_0_1 {
+                                Some(false)
+                            } else {
+                                None
+                            }
+                        }
+                        None => None,
+                    }
+                }
+            }
+        }
+    };
+    if natural_direction == Some(true) && preferred_direction.is_none() {
+        return Err(QiskitError::new_err(format!(
+            concat!(
+                "No preferred direction of gate on qubits {:?} ",
+                "could be determined from coupling map or gate lengths / gate errors."
+            ),
+            qubits
+        )));
+    }
+    Ok(preferred_direction)
+}
+
+/// Synthesize `su4_mat` with a sequence-producing decomposer (`TwoQubitBasis` or
+/// `TwoQubitControlledU`) and orient the result along `preferred_direction`.
+///
+/// The basis decomposer is asked for an exact synthesis only when
+/// `approximation_degree` is exactly `Some(1.0)`.
+/// Panics if `decomposer_2q` holds `DecomposerType::XX`.
+#[inline]
+pub(crate) fn synth_su4_sequence(
+    su4_mat: ArrayView2<Complex64>,
+    decomposer_2q: &DecomposerElement,
+    preferred_direction: Option<bool>,
+    approximation_degree: Option<f64>,
+) -> PyResult<TwoQubitUnitarySequence> {
+    let is_approximate = approximation_degree != Some(1.0);
+    let synth = match &decomposer_2q.decomposer {
+        DecomposerType::TwoQubitBasis(basis) => {
+            basis.call_inner(su4_mat.view(), None, is_approximate, None)?
+        }
+        DecomposerType::TwoQubitControlledU(ctrl_u) => ctrl_u.call_inner(su4_mat.view(), None)?,
+        DecomposerType::XX(_) => {
+            unreachable!("synth_su4_sequence should only be called for TwoQubitBasisDecomposer or TwoQubitControlledUDecomposer.")
+        }
+    };
+    let sequence = TwoQubitUnitarySequence {
+        gate_sequence: synth,
+        decomp_op: decomposer_2q.packed_op.clone(),
+        decomp_params: decomposer_2q.params.clone(),
+    };
+    // Without a direction constraint the direct synthesis is returned untouched.
+    let preferred_dir = match preferred_direction {
+        Some(dir) => dir,
+        None => return Ok(sequence),
+    };
+    // Record the qubit order of the last entry whose gate is `None` or is named "cx";
+    // later matches overwrite earlier ones.
+    let mut last_2q_order: Option<SmallVec<[u8; 2]>> = None;
+    for (gate, _, qubits) in sequence.gate_sequence.gates() {
+        if gate.is_none() || gate.unwrap().name() == "cx" {
+            last_2q_order = Some(qubits.clone());
+        }
+    }
+    let synth_dir = match last_2q_order.as_deref() {
+        // No such entry was found, so there is nothing to orient.
+        None => return Ok(sequence),
+        Some([0, 1]) => true,
+        Some([1, 0]) => false,
+        Some(_) => unreachable!(),
+    };
+    if synth_dir == preferred_dir {
+        return Ok(sequence);
+    }
+    // The synthesis runs against the preferred direction: resynthesize the operator
+    // conjugated by swaps, which is locally equivalent to the original.
+    reversed_synth_su4_sequence(su4_mat.to_owned(), decomposer_2q, approximation_degree)
+}
+
+/// Synthesize `su4_mat` with its two qubits exchanged, then relabel the result.
+/// Called by `synth_su4_sequence` when the "direct" synthesis does not match the preferred
+/// direction. Supports `TwoQubitBasis` and `TwoQubitControlledU`; panics on `XX`.
+fn reversed_synth_su4_sequence(
+    mut su4_mat: Array2<Complex64>,
+    decomposer_2q: &DecomposerElement,
+    approximation_degree: Option<f64>,
+) -> PyResult<TwoQubitUnitarySequence> {
+    let is_approximate = approximation_degree != Some(1.0);
+    // Swap rows 1 and 2
+    let (mut row_1, mut row_2) = su4_mat.multi_slice_mut((s![1, ..], s![2, ..]));
+    azip!((x in &mut row_1, y in &mut row_2) (*x, *y) = (*y, *x));
+
+    // Swap columns 1 and 2
+    let (mut col_1, mut col_2) = su4_mat.multi_slice_mut((s![.., 1], s![.., 2]));
+    azip!((x in &mut col_1, y in &mut col_2) (*x, *y) = (*y, *x));
+
+    let synth = match &decomposer_2q.decomposer {
+        DecomposerType::TwoQubitBasis(decomp) => {
+            decomp.call_inner(su4_mat.view(), None, is_approximate, None)?
+        }
+        DecomposerType::TwoQubitControlledU(decomp) => decomp.call_inner(su4_mat.view(), None)?,
+        DecomposerType::XX(_) => unreachable!(
+            "reversed_synth_su4_sequence should only be called for TwoQubitBasisDecomposer or TwoQubitControlledUDecomposer."
+        ),
+    };
+    // Exchange qubit indices 0 and 1 on every gate of the synthesized sequence.
+    let flip_bits: [u8; 2] = [1, 0];
+    let mut reversed_gates = Vec::with_capacity(synth.gates().len());
+    for (gate, params, qubit_ids) in synth.gates() {
+        let new_qubit_ids = qubit_ids
+            .iter()
+            .map(|x| flip_bits[*x as usize])
+            .collect::<SmallVec<[u8; 2]>>();
+        reversed_gates.push((*gate, params.clone(), new_qubit_ids));
+    }
+    let mut reversed_synth: TwoQubitGateSequence = TwoQubitGateSequence::new();
+    reversed_synth.set_state((reversed_gates, synth.global_phase()));
+    Ok(TwoQubitUnitarySequence {
+        gate_sequence: reversed_synth,
+        decomp_op: decomposer_2q.packed_op.clone(),
+        decomp_params: decomposer_2q.params.clone(),
+    })
+}
diff --git a/crates/transpiler/src/passes/unitary_synthesis.rs b/crates/transpiler/src/passes/unitary_synthesis.rs
index 9b66d42e1cee..19b2d1242586 100644
--- a/crates/transpiler/src/passes/unitary_synthesis.rs
+++ b/crates/transpiler/src/passes/unitary_synthesis.rs
@@ -37,6 +37,10 @@ use qiskit_circuit::operations::{Operation, OperationRef, Param, PyGate, Standar
 use qiskit_circuit::packed_instruction::{PackedInstruction, PackedOperation};
 use qiskit_circuit::Qubit;
 
+use super::two_qubit_unitary_synthesis_utils::{
+    preferred_direction, synth_su4_sequence, DecomposerElement, DecomposerType,
+    TwoQubitUnitarySequence,
+};
 use crate::target::{NormalOperation, Target, TargetOperation};
 use crate::target::{Qargs, QargsRef};
 use crate::QiskitError;
@@ -45,34 +49,13 @@ use qiskit_synthesis::euler_one_qubit_decomposer::{
     unitary_to_gate_sequence_inner, EulerBasis, EulerBasisSet, EULER_BASES, EULER_BASIS_NAMES,
 };
 use qiskit_synthesis::two_qubit_decompose::{
-    RXXEquivalent, TwoQubitBasisDecomposer, TwoQubitControlledUDecomposer, TwoQubitGateSequence,
+    RXXEquivalent, TwoQubitBasisDecomposer, TwoQubitControlledUDecomposer,
     TwoQubitWeylDecomposition,
 };
 
 const PI2: f64 = PI / 2.;
 const PI4: f64 = PI / 4.;
 
-#[derive(Clone, Debug)]
-enum DecomposerType {
-    TwoQubitBasis(Box<TwoQubitBasisDecomposer>),
-    TwoQubitControlledU(Box<TwoQubitControlledUDecomposer>),
-    XX(PyObject),
-}
-
-#[derive(Clone, Debug)]
-struct DecomposerElement {
-    decomposer: DecomposerType,
-    packed_op: PackedOperation,
-    params: SmallVec<[Param; 3]>,
-}
-
-#[derive(Clone, Debug)]
-struct TwoQubitUnitarySequence {
-    gate_sequence: TwoQubitGateSequence,
-    decomp_op: PackedOperation,
-    decomp_params: SmallVec<[Param; 3]>,
-}
-
 // These two variables are used to exit the decomposer search early in
 // `get_2q_decomposers_from_target`.
 // If the available 2q basis is a subset of GOODBYE_SET, TwoQubitBasisDecomposer provides
@@ -837,197 +820,6 @@ fn get_2q_decomposers_from_target(
     Ok(Some(decomposers))
 }
 
-/// Function to evaluate hardware-native direction, this allows to correct
-/// the synthesis output to match the target constraints.
-/// Returns:
-///     * `true` if gate qubits are in the hardware-native direction
-///     * `false` if gate qubits must be flipped to match hardware-native direction
-fn preferred_direction(
-    ref_qubits: &[PhysicalQubit; 2],
-    natural_direction: Option<bool>,
-    coupling_edges: &HashSet<[PhysicalQubit; 2]>,
-    target: Option<&Target>,
-    decomposer: &DecomposerElement,
-) -> PyResult<Option<bool>> {
-    let qubits: [PhysicalQubit; 2] = *ref_qubits;
-    let mut reverse_qubits: [PhysicalQubit; 2] = qubits;
-    reverse_qubits.reverse();
-
-    let preferred_direction = match natural_direction {
-        Some(false) => None,
-        _ => {
-            // None or Some(true)
-            let zero_one = coupling_edges.contains(&qubits);
-            let one_zero = coupling_edges.contains(&[qubits[1], qubits[0]]);
-
-            match (zero_one, one_zero) {
-                (true, false) => Some(true),
-                (false, true) => Some(false),
-                _ => {
-                    match target {
-                        Some(target) => {
-                            let mut cost_0_1: f64 = f64::INFINITY;
-                            let mut cost_1_0: f64 = f64::INFINITY;
-
-                            let compute_cost = |lengths: bool,
-                                                q_tuple: [PhysicalQubit; 2],
-                                                in_cost: f64|
-                             -> PyResult<f64> {
-                                let cost = match target
-                                    .qargs_for_operation_name(decomposer.packed_op.name())
-                                {
-                                    Ok(_) => match target[decomposer.packed_op.name()]
-                                        .get(&Qargs::from(q_tuple))
-                                    {
-                                        Some(Some(_props)) => {
-                                            if lengths {
-                                                _props.duration.unwrap_or(in_cost)
-                                            } else {
-                                                _props.error.unwrap_or(in_cost)
-                                            }
-                                        }
-                                        _ => in_cost,
-                                    },
-                                    Err(_) => in_cost,
-                                };
-                                Ok(cost)
-                            };
-                            // Try to find the cost in gate_lengths
-                            cost_0_1 = compute_cost(true, qubits, cost_0_1)?;
-                            cost_1_0 = compute_cost(true, reverse_qubits, cost_1_0)?;
-
-                            // If no valid cost was found in gate_lengths, check gate_errors
-                            if !(cost_0_1 < f64::INFINITY || cost_1_0 < f64::INFINITY) {
-                                cost_0_1 = compute_cost(false, qubits, cost_0_1)?;
-                                cost_1_0 = compute_cost(false, reverse_qubits, cost_1_0)?;
-                            }
-
-                            if cost_0_1 < cost_1_0 {
-                                Some(true)
-                            } else if cost_1_0 < cost_0_1 {
-                                Some(false)
-                            } else {
-                                None
-                            }
-                        }
-                        None => None,
-                    }
-                }
-            }
-        }
-    };
-    if natural_direction == Some(true) && preferred_direction.is_none() {
-        return Err(QiskitError::new_err(format!(
-            concat!(
-                "No preferred direction of gate on qubits {:?} ",
-                "could be determined from coupling map or gate lengths / gate errors."
-            ),
-            qubits
-        )));
-    }
-    Ok(preferred_direction)
-}
-
-/// Apply synthesis for decomposers that return a SEQUENCE (TwoQubitBasis and TwoQubitControlledU).
-fn synth_su4_sequence(
-    su4_mat: ArrayView2<Complex64>,
-    decomposer_2q: &DecomposerElement,
-    preferred_direction: Option<bool>,
-    approximation_degree: Option<f64>,
-) -> PyResult<TwoQubitUnitarySequence> {
-    let is_approximate = approximation_degree.is_none() || approximation_degree.unwrap() != 1.0;
-    let synth = if let DecomposerType::TwoQubitBasis(decomp) = &decomposer_2q.decomposer {
-        decomp.call_inner(su4_mat.view(), None, is_approximate, None)?
-    } else if let DecomposerType::TwoQubitControlledU(decomp) = &decomposer_2q.decomposer {
-        decomp.call_inner(su4_mat.view(), None)?
-    } else {
-        unreachable!("synth_su4_sequence should only be called for TwoQubitBasisDecomposer.")
-    };
-    let sequence = TwoQubitUnitarySequence {
-        gate_sequence: synth,
-        decomp_op: decomposer_2q.packed_op.clone(),
-        decomp_params: decomposer_2q.params.clone(),
-    };
-    match preferred_direction {
-        None => Ok(sequence),
-        Some(preferred_dir) => {
-            let mut synth_direction: Option<SmallVec<[u8; 2]>> = None;
-            // if the gates in synthesis are in the opposite direction of the preferred direction
-            // resynthesize a new operator which is the original conjugated by swaps.
-            // this new operator is doubly mirrored from the original and is locally equivalent.
-            for (gate, _, qubits) in sequence.gate_sequence.gates() {
-                if gate.is_none() || gate.unwrap().name() == "cx" {
-                    synth_direction = Some(qubits.clone());
-                }
-            }
-            match synth_direction {
-                None => Ok(sequence),
-                Some(synth_direction) => {
-                    let synth_dir = match synth_direction.as_slice() {
-                        [0, 1] => true,
-                        [1, 0] => false,
-                        _ => unreachable!(),
-                    };
-                    if synth_dir != preferred_dir {
-                        reversed_synth_su4_sequence(
-                            su4_mat.to_owned(),
-                            decomposer_2q,
-                            approximation_degree,
-                        )
-                    } else {
-                        Ok(sequence)
-                    }
-                }
-            }
-        }
-    }
-}
-
-/// Apply reverse synthesis for decomposers that return a SEQUENCE (TwoQubitBasis and TwoQubitControlledU).
-/// This function is called by `synth_su4_sequence`` if the "direct" synthesis
-/// doesn't match the hardware restrictions.
-fn reversed_synth_su4_sequence(
-    mut su4_mat: Array2<Complex64>,
-    decomposer_2q: &DecomposerElement,
-    approximation_degree: Option<f64>,
-) -> PyResult<TwoQubitUnitarySequence> {
-    let is_approximate = approximation_degree.is_none() || approximation_degree.unwrap() != 1.0;
-    // Swap rows 1 and 2
-    let (mut row_1, mut row_2) = su4_mat.multi_slice_mut((s![1, ..], s![2, ..]));
-    azip!((x in &mut row_1, y in &mut row_2) (*x, *y) = (*y, *x));
-
-    // Swap columns 1 and 2
-    let (mut col_1, mut col_2) = su4_mat.multi_slice_mut((s![.., 1], s![.., 2]));
-    azip!((x in &mut col_1, y in &mut col_2) (*x, *y) = (*y, *x));
-
-    let synth = if let DecomposerType::TwoQubitBasis(decomp) = &decomposer_2q.decomposer {
-        decomp.call_inner(su4_mat.view(), None, is_approximate, None)?
-    } else if let DecomposerType::TwoQubitControlledU(decomp) = &decomposer_2q.decomposer {
-        decomp.call_inner(su4_mat.view(), None)?
-    } else {
-        unreachable!(
-            "reversed_synth_su4_sequence should only be called for TwoQubitBasisDecomposer."
-        )
-    };
-    let flip_bits: [u8; 2] = [1, 0];
-    let mut reversed_gates = Vec::with_capacity(synth.gates().len());
-    for (gate, params, qubit_ids) in synth.gates() {
-        let new_qubit_ids = qubit_ids
-            .into_iter()
-            .map(|x| flip_bits[*x as usize])
-            .collect::<SmallVec<[u8; 2]>>();
-        reversed_gates.push((*gate, params.clone(), new_qubit_ids.clone()));
-    }
-    let mut reversed_synth: TwoQubitGateSequence = TwoQubitGateSequence::new();
-    reversed_synth.set_state((reversed_gates, synth.global_phase()));
-    let sequence = TwoQubitUnitarySequence {
-        gate_sequence: reversed_synth,
-        decomp_op: decomposer_2q.packed_op.clone(),
-        decomp_params: decomposer_2q.params.clone(),
-    };
-    Ok(sequence)
-}
-
 /// Apply synthesis for decomposers that return a DAG (XX).
 fn synth_su4_dag(
     py: Python,

From 817c2b160bfd3d1a1b43efd19b240c26d2d77a9c Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Sat, 14 Jun 2025 12:21:26 -0400
Subject: [PATCH 29/64] Fix target handling in peephole pass

After the previous commit deduplicated the synthesis path to fix the
directionality bug and also simplify the shared code, there was one
issue introduced. The peephole pass is target only and was previously
using the target to store details about custom operations when a
sentinel value was found in the decomposition gate sequence. However,
unitary synthesis can work in the absence of a target and was handling
this manually. This caused a mismatch in expectations: the peephole
pass was not using the correct name for target lookups in some cases,
and when applying the synthesis, custom gates in the target would not
be inserted correctly.

Ideally this will all be cleaned up after #14418 is fixed since we won't
have sentinel values to represent non-CX gates anymore in the two qubit
decomposers.
---
 crates/synthesis/src/two_qubit_decompose.rs   |   5 +
 .../src/passes/two_qubit_peephole.rs          | 168 +++++++++++-------
 .../two_qubit_unitary_synthesis_utils.rs      |   4 +
 .../src/passes/unitary_synthesis.rs           |  14 +-
 .../transpiler/test_two_qubit_peephole.py     |   6 +-
 5 files changed, 120 insertions(+), 77 deletions(-)

diff --git a/crates/synthesis/src/two_qubit_decompose.rs b/crates/synthesis/src/two_qubit_decompose.rs
index e0231ebf70ee..ff027f956374 100644
--- a/crates/synthesis/src/two_qubit_decompose.rs
+++ b/crates/synthesis/src/two_qubit_decompose.rs
@@ -524,6 +524,11 @@ pub struct TwoQubitWeylDecomposition {
 }
 
 impl TwoQubitWeylDecomposition {
+    #[inline]
+    pub fn is_supercontrolled(&self) -> bool {
+        relative_eq!(self.a, PI4) && relative_eq!(self.c, 0.0)
+    }
+
     pub fn a(&self) -> f64 {
         self.a
     }
diff --git a/crates/transpiler/src/passes/two_qubit_peephole.rs b/crates/transpiler/src/passes/two_qubit_peephole.rs
index 427825dcaf13..835a2394e3b3 100644
--- a/crates/transpiler/src/passes/two_qubit_peephole.rs
+++ b/crates/transpiler/src/passes/two_qubit_peephole.rs
@@ -26,6 +26,7 @@ use qiskit_circuit::Qubit;
 
 use super::two_qubit_unitary_synthesis_utils::{
     preferred_direction, synth_su4_sequence, DecomposerElement, DecomposerType,
+    TwoQubitUnitarySequence,
 };
 use crate::target::{Qargs, Target, TargetOperation};
 use crate::TranspilerError;
@@ -36,7 +37,8 @@ use qiskit_synthesis::euler_one_qubit_decomposer::{
     EulerBasis, EulerBasisSet, EULER_BASES, EULER_BASIS_NAMES,
 };
 use qiskit_synthesis::two_qubit_decompose::{
-    RXXEquivalent, TwoQubitBasisDecomposer, TwoQubitControlledUDecomposer, TwoQubitGateSequence,
+    RXXEquivalent, TwoQubitBasisDecomposer, TwoQubitControlledUDecomposer,
+    TwoQubitWeylDecomposition,
 };
 
 fn get_decomposers_from_target(
@@ -75,8 +77,41 @@ fn get_decomposers_from_target(
             Some(raw_op) => {
                 if let TargetOperation::Normal(op) = raw_op {
                     match op.operation.view() {
-                        OperationRef::StandardGate(_) | OperationRef::Gate(_) => {
-                            Some((*name, &op.operation, op.params.as_slice()))
+                        OperationRef::StandardGate(gate) => {
+                            if matches!(
+                                gate,
+                                StandardGate::CX | StandardGate::CZ | StandardGate::ECR
+                            ) {
+                                Some((*name, &op.operation, op.params.as_slice()))
+                            } else if let Some(matrix) = gate.matrix(&op.params) {
+                                if let Ok(weyl) =
+                                    TwoQubitWeylDecomposition::new_inner(matrix.view(), None, None)
+                                {
+                                    if weyl.is_supercontrolled() {
+                                        Some((*name, &op.operation, op.params.as_slice()))
+                                    } else {
+                                        None
+                                    }
+                                } else {
+                                    None
+                                }
+                            } else {
+                                None
+                            }
+                        }
+                        OperationRef::Gate(gate) => {
+                            if let Some(matrix) = gate.matrix(&op.params) {
+                                let weyl =
+                                    TwoQubitWeylDecomposition::new_inner(matrix.view(), None, None)
+                                        .unwrap();
+                                if weyl.is_supercontrolled() {
+                                    Some((*name, &op.operation, op.params.as_slice()))
+                                } else {
+                                    None
+                                }
+                            } else {
+                                None
+                            }
                         }
                         _ => None,
                     }
@@ -138,6 +173,7 @@ fn get_decomposers_from_target(
                                 decomposer: DecomposerType::TwoQubitBasis(Box::new(decomp)),
                                 packed_op: (*two_qubit_gate).clone(),
                                 params: params.iter().cloned().collect(),
+                                target_name: two_qubit_name.to_string(),
                             })
                         }
                     })
@@ -170,7 +206,13 @@ fn get_decomposers_from_target(
                             )?,
                         )),
                         packed_op: gate.into(),
-                        params: op.params().iter().cloned().collect(),
+                        // TODO: Add param when ParameterExpression doesn't
+                        // need python. This is a corrupt param for the gates
+                        // here, but it unused in the passes and needs to be
+                        // an unbound  parameter. Do not use this value for
+                        // constructing a circuit.
+                        params: smallvec![],
+                        target_name: gate.name().to_string(),
                     });
                 }
             }
@@ -209,7 +251,7 @@ fn score_sequence<'a>(
     (gate_count, 1. - fidelity)
 }
 
-type MappingIterItem = Option<((TwoQubitGateSequence, String), [Qubit; 2])>;
+type MappingIterItem = Option<(TwoQubitUnitarySequence, [Qubit; 2])>;
 
 /// This transpiler pass can only run in a context where we've translated the circuit gates (or
 /// where we know all gates have a matrix). If any gate identified in the run fails to have a
@@ -225,21 +267,21 @@ pub fn two_qubit_unitary_peephole_optimize(
     let node_mapping: HashMap<NodeIndex, usize> =
         HashMap::with_capacity(runs.iter().map(|run| run.len()).sum());
     let locked_node_mapping = Mutex::new(node_mapping);
-    let coupling_edges = target
-        .qargs()
-        .unwrap()
-        .filter_map(|qargs| match qargs {
-            Qargs::Concrete(qargs) => {
-                if qargs.len() == 2 {
-                    Some([qargs[0], qargs[1]])
-                } else {
-                    None
+    let coupling_edges = match target.qargs() {
+        Some(qargs) => qargs
+            .filter_map(|qargs| match qargs {
+                Qargs::Concrete(qargs) => {
+                    if qargs.len() == 2 {
+                        Some([qargs[0], qargs[1]])
+                    } else {
+                        None
+                    }
                 }
-            }
-            Qargs::Global => None,
-        })
-        .collect();
-
+                Qargs::Global => None,
+            })
+            .collect(),
+        None => HashSet::new(),
+    };
     let find_best_sequence =
         |run_index: usize, node_indices: &[NodeIndex]| -> PyResult<MappingIterItem> {
             let block_qubit_map = node_indices
@@ -263,16 +305,16 @@ pub fn two_qubit_unitary_peephole_optimize(
             let mut decomposer_scores: Vec<Option<(usize, f64)>> = vec![None; decomposers.len()];
 
             let order_sequence =
-                |(index_a, sequence_a): &(usize, (TwoQubitGateSequence, String)),
-                 (index_b, sequence_b): &(usize, (TwoQubitGateSequence, String))| {
+                |(index_a, sequence_a): &(usize, TwoQubitUnitarySequence),
+                 (index_b, sequence_b): &(usize, TwoQubitUnitarySequence)| {
                     let score_a = (
                         match decomposer_scores[*index_a] {
                             Some(score) => score,
                             None => {
                                 let score: (usize, f64) = score_sequence(
                                     target,
-                                    sequence_a.1.as_str(),
-                                    sequence_a.0.gates.iter().map(
+                                    sequence_a.target_name.as_str(),
+                                    sequence_a.gate_sequence.gates.iter().map(
                                         |(gate, _params, local_qubits)| {
                                             let qubits = local_qubits
                                                 .iter()
@@ -295,8 +337,8 @@ pub fn two_qubit_unitary_peephole_optimize(
                             None => {
                                 let score: (usize, f64) = score_sequence(
                                     target,
-                                    sequence_b.1.as_str(),
-                                    sequence_b.0.gates.iter().map(
+                                    sequence_b.target_name.as_str(),
+                                    sequence_b.gate_sequence.gates.iter().map(
                                         |(gate, _params, local_qubits)| {
                                             let qubits = local_qubits
                                                 .iter()
@@ -328,13 +370,9 @@ pub fn two_qubit_unitary_peephole_optimize(
                         Some(target),
                         decomposer,
                     )
-                    .unwrap();
-                    (
-                        synth_su4_sequence(matrix.view(), decomposer, dir, Some(fidelity))
-                            .unwrap()
-                            .gate_sequence,
-                        decomposer.packed_op.name().to_string(),
-                    )
+                    .ok()
+                    .flatten();
+                    synth_su4_sequence(matrix.view(), decomposer, dir, Some(fidelity)).unwrap()
                 })
                 .enumerate()
                 .min_by(order_sequence);
@@ -375,25 +413,23 @@ pub fn two_qubit_unitary_peephole_optimize(
                 original_fidelity *= gate_fidelity;
             }
             let original_score = (original_count, 1. - original_fidelity);
-            let new_score: (usize, f64) = match decomposer_scores[sequence.0] {
-                Some(score) => score,
-                None => score_sequence(
-                    target,
-                    sequence.1 .1.as_str(),
-                    sequence
-                        .1
-                         .0
-                        .gates
-                        .iter()
-                        .map(|(gate, _params, local_qubits)| {
-                            let qubits = local_qubits
-                                .iter()
-                                .map(|qubit| block_qubit_map[*qubit as usize])
-                                .collect();
-                            (*gate, qubits)
-                        }),
-                ),
-            };
+            let new_score: (usize, f64) =
+                match decomposer_scores[sequence.0] {
+                    Some(score) => score,
+                    None => score_sequence(
+                        target,
+                        sequence.1.target_name.as_str(),
+                        sequence.1.gate_sequence.gates.iter().map(
+                            |(gate, _params, local_qubits)| {
+                                let qubits = local_qubits
+                                    .iter()
+                                    .map(|qubit| block_qubit_map[*qubit as usize])
+                                    .collect();
+                                (*gate, qubits)
+                            },
+                        ),
+                    ),
+                };
             if !outside_target && new_score > original_score {
                 return Ok(None);
             }
@@ -444,7 +480,7 @@ pub fn two_qubit_unitary_peephole_optimize(
                     continue;
                 }
                 let (sequence, qubit_map) = run.unwrap();
-                for (gate, params, local_qubits) in &sequence.0.gates {
+                for (gate, params, local_qubits) in &sequence.gate_sequence.gates {
                     let qubits: Vec<Qubit> = local_qubits
                         .iter()
                         .map(|index| qubit_map[*index as usize])
@@ -465,25 +501,19 @@ pub fn two_qubit_unitary_peephole_optimize(
                             #[cfg(feature = "cache_pygates")]
                             None,
                         ),
-                        None => {
-                            let Some(TargetOperation::Normal(gate)) =
-                                target.operation_from_name(sequence.1.as_str())
-                            else {
-                                unreachable!()
-                            };
-                            out_dag_builder.apply_operation_back(
-                                gate.operation.clone(),
-                                &qubits,
-                                &[],
-                                Some(out_params.unwrap_or(gate.params.clone())),
-                                None,
-                                #[cfg(feature = "cache_pygates")]
-                                None,
-                            )
-                        }
+                        None => out_dag_builder.apply_operation_back(
+                            sequence.decomp_op.clone(),
+                            &qubits,
+                            &[],
+                            Some(out_params.unwrap_or(sequence.decomp_params.clone())),
+                            None,
+                            #[cfg(feature = "cache_pygates")]
+                            None,
+                        ),
                     }?;
                 }
-                out_dag_builder.add_global_phase(&Param::Float(sequence.0.global_phase))?;
+                out_dag_builder
+                    .add_global_phase(&Param::Float(sequence.gate_sequence.global_phase))?;
                 processed_runs.insert(*run_index);
             }
             None => {
diff --git a/crates/transpiler/src/passes/two_qubit_unitary_synthesis_utils.rs b/crates/transpiler/src/passes/two_qubit_unitary_synthesis_utils.rs
index 0da1a549c3f5..596ba5fd8882 100644
--- a/crates/transpiler/src/passes/two_qubit_unitary_synthesis_utils.rs
+++ b/crates/transpiler/src/passes/two_qubit_unitary_synthesis_utils.rs
@@ -41,6 +41,7 @@ pub(crate) struct DecomposerElement {
     pub(crate) decomposer: DecomposerType,
     pub(crate) packed_op: PackedOperation,
     pub(crate) params: SmallVec<[Param; 3]>,
+    pub(crate) target_name: String,
 }
 
 #[derive(Clone, Debug)]
@@ -48,6 +49,7 @@ pub(crate) struct TwoQubitUnitarySequence {
     pub(crate) gate_sequence: TwoQubitGateSequence,
     pub(crate) decomp_op: PackedOperation,
     pub(crate) decomp_params: SmallVec<[Param; 3]>,
+    pub(crate) target_name: String,
 }
 
 /// Function to evaluate hardware-native direction, this allows to correct
@@ -162,6 +164,7 @@ pub(crate) fn synth_su4_sequence(
         gate_sequence: synth,
         decomp_op: decomposer_2q.packed_op.clone(),
         decomp_params: decomposer_2q.params.clone(),
+        target_name: decomposer_2q.target_name.clone(),
     };
     match preferred_direction {
         None => Ok(sequence),
@@ -239,6 +242,7 @@ fn reversed_synth_su4_sequence(
         gate_sequence: reversed_synth,
         decomp_op: decomposer_2q.packed_op.clone(),
         decomp_params: decomposer_2q.params.clone(),
+        target_name: decomposer_2q.target_name.clone(),
     };
     Ok(sequence)
 }
diff --git a/crates/transpiler/src/passes/unitary_synthesis.rs b/crates/transpiler/src/passes/unitary_synthesis.rs
index 19b2d1242586..38457552ffa9 100644
--- a/crates/transpiler/src/passes/unitary_synthesis.rs
+++ b/crates/transpiler/src/passes/unitary_synthesis.rs
@@ -54,7 +54,6 @@ use qiskit_synthesis::two_qubit_decompose::{
 };
 
 const PI2: f64 = PI / 2.;
-const PI4: f64 = PI / 4.;
 
 // These two variables are used to exit the decomposer search early in
 // `get_2q_decomposers_from_target`.
@@ -496,6 +495,7 @@ fn get_2q_decomposer_from_basis(
                 decomposer: DecomposerType::TwoQubitControlledU(Box::new(decomposer)),
                 packed_op: PackedOperation::from_standard_gate(std_gate),
                 params: SmallVec::new(),
+                target_name: kak_gates[0].to_string(),
             }));
         };
     };
@@ -521,6 +521,7 @@ fn get_2q_decomposer_from_basis(
             decomposer: DecomposerType::TwoQubitBasis(Box::new(decomposer)),
             packed_op: PackedOperation::from_standard_gate(std_gate),
             params: SmallVec::new(),
+            target_name: kak_gates[0].to_string(),
         }));
     }
     Ok(None)
@@ -647,6 +648,7 @@ fn get_2q_decomposers_from_target(
                         decomposer: DecomposerType::TwoQubitControlledU(Box::new(decomposer)),
                         packed_op: gate.operation.clone(),
                         params: gate.params.clone(),
+                        target_name: gate.operation.name().to_string(),
                     });
                 }
                 Err(_) => continue,
@@ -668,9 +670,9 @@ fn get_2q_decomposers_from_target(
         match op.operation.matrix(&op.params) {
             None => false,
             Some(unitary_matrix) => {
-                let kak = TwoQubitWeylDecomposition::new_inner(unitary_matrix.view(), None, None)
-                    .unwrap();
-                relative_eq!(kak.a(), PI4) && relative_eq!(kak.c(), 0.0)
+                TwoQubitWeylDecomposition::new_inner(unitary_matrix.view(), None, None)
+                    .unwrap()
+                    .is_supercontrolled()
             }
         }
     }
@@ -704,6 +706,7 @@ fn get_2q_decomposers_from_target(
                 decomposer: DecomposerType::TwoQubitBasis(Box::new(decomposer)),
                 packed_op: gate.operation.clone(),
                 params: gate.params.clone(),
+                target_name: gate.operation.name().to_string(),
             });
         }
     }
@@ -809,11 +812,12 @@ fn get_2q_decomposers_from_target(
             let decomposer_gate = decomposer
                 .getattr(intern!(py, "gate"))?
                 .extract::<NormalOperation>()?;
-
+            let gate_name = decomposer_gate.operation.name().to_string();
             decomposers.push(DecomposerElement {
                 decomposer: DecomposerType::XX(decomposer.into()),
                 packed_op: decomposer_gate.operation,
                 params: decomposer_gate.params.clone(),
+                target_name: gate_name,
             });
         }
     }
diff --git a/test/python/transpiler/test_two_qubit_peephole.py b/test/python/transpiler/test_two_qubit_peephole.py
index c8e8361f7622..4f2e712d1250 100644
--- a/test/python/transpiler/test_two_qubit_peephole.py
+++ b/test/python/transpiler/test_two_qubit_peephole.py
@@ -527,7 +527,7 @@ def test_pass_respects_directionality(self):
         peephole = TwoQubitPeepholeOptimization(target)
         qc = QuantumCircuit(2)
         qc.swap(0, 1)
-        qc = transpile(qc, target=target, seed_transpiler=1234, optimization_level=0)
+        qc = transpile(qc, target=target, seed_transpiler=1234, optimization_level=2)
         res = peephole(qc)
         self.assertTrue(self.all_inst_in_target(res, target))
         self.assertEqual(res, qc)
@@ -540,7 +540,7 @@ def test_pass_respects_directionality(self):
         )
         res = peephole(qc_duplicated)
         self.assertTrue(self.all_inst_in_target(res, target))
-        self.assertEqual(Operator(res), QuantumCircuit(2))
+        self.assertEqual(Operator(res), Operator(QuantumCircuit(2)))
 
         qc_duplicated = QuantumCircuit(2)
         for _ in range(101):
@@ -550,7 +550,7 @@ def test_pass_respects_directionality(self):
         )
         res = peephole(qc_duplicated)
         self.assertTrue(self.all_inst_in_target(res, target))
-        self.assertEqual(Operator(res), Operator(qc))
+        self.assertEqual(Operator(res), Operator(qc_duplicated))
 
     def all_inst_in_target(self, circuit: QuantumCircuit, target: Target):
         for inst in circuit.data:

From dbbcb3757afafd2456f3e91fd51b9b35fa4f229b Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Sat, 14 Jun 2025 12:42:18 -0400
Subject: [PATCH 30/64] Fix merge conflict

---
 crates/transpiler/src/passes/two_qubit_peephole.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/crates/transpiler/src/passes/two_qubit_peephole.rs b/crates/transpiler/src/passes/two_qubit_peephole.rs
index 835a2394e3b3..52057bc2db9c 100644
--- a/crates/transpiler/src/passes/two_qubit_peephole.rs
+++ b/crates/transpiler/src/passes/two_qubit_peephole.rs
@@ -19,7 +19,7 @@ use rayon::prelude::*;
 use rustworkx_core::petgraph::stable_graph::NodeIndex;
 use smallvec::{smallvec, SmallVec};
 
-use qiskit_circuit::dag_circuit::{DAGCircuit, NodeType};
+use qiskit_circuit::dag_circuit::{DAGCircuit, NodeType, VarsMode};
 use qiskit_circuit::operations::{Operation, OperationRef, Param, StandardGate};
 use qiskit_circuit::packed_instruction::PackedOperation;
 use qiskit_circuit::Qubit;
@@ -462,7 +462,7 @@ pub fn two_qubit_unitary_peephole_optimize(
     let run_mapping = run_mapping?;
     // After we've computed all the sequences to execute now serially build up a new dag.
     let mut processed_runs: HashSet<usize> = HashSet::with_capacity(run_mapping.len());
-    let out_dag = dag.copy_empty_like("alike")?;
+    let out_dag = dag.copy_empty_like(VarsMode::Alike)?;
     let mut out_dag_builder = out_dag.into_builder();
     let node_mapping = locked_node_mapping.into_inner().unwrap();
     for node in dag.topological_op_nodes()? {

From 439e1ebfa46f9b26127d41a9158305b1898fa93f Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Sat, 14 Jun 2025 12:56:56 -0400
Subject: [PATCH 31/64] Expand test coverage to check individual gate synthesis

This commit expands the test coverage to ensure that the synthesis of a
single two qubit gate matches expectations with different bases.

Co-authored-by: Shelly Garion <46566946+ShellyGarion@users.noreply.github.com>
---
 .../src/passes/two_qubit_peephole.rs          |   4 +-
 .../transpiler/test_two_qubit_peephole.py     | 165 +++++++++++++++++-
 2 files changed, 167 insertions(+), 2 deletions(-)

diff --git a/crates/transpiler/src/passes/two_qubit_peephole.rs b/crates/transpiler/src/passes/two_qubit_peephole.rs
index 52057bc2db9c..ca3391df1729 100644
--- a/crates/transpiler/src/passes/two_qubit_peephole.rs
+++ b/crates/transpiler/src/passes/two_qubit_peephole.rs
@@ -430,7 +430,9 @@ pub fn two_qubit_unitary_peephole_optimize(
                         ),
                     ),
                 };
-            if !outside_target && new_score > original_score {
+            // If the we are not outside the target and the new score isn't any better just use the
+            // original (this includes a tie).
+            if !outside_target && new_score >= original_score {
                 return Ok(None);
             }
             // This is done at the end of the map in some attempt to minimize
diff --git a/test/python/transpiler/test_two_qubit_peephole.py b/test/python/transpiler/test_two_qubit_peephole.py
index 4f2e712d1250..5f4fa07e8176 100644
--- a/test/python/transpiler/test_two_qubit_peephole.py
+++ b/test/python/transpiler/test_two_qubit_peephole.py
@@ -28,7 +28,12 @@
 from qiskit.quantum_info.operators import Operator
 from qiskit.quantum_info.random import random_unitary
 from qiskit.transpiler import PassManager, CouplingMap, Target, InstructionProperties
-from qiskit.transpiler.passes import TwoQubitPeepholeOptimization, TrivialLayout
+from qiskit.transpiler.passes import (
+    TwoQubitPeepholeOptimization,
+    TrivialLayout,
+    ConsolidateBlocks,
+    UnitarySynthesis,
+)
 from qiskit.circuit.library import (
     IGate,
     CXGate,
@@ -43,6 +48,13 @@
     RYYGate,
     CZGate,
     RXXGate,
+    RZZGate,
+    RZXGate,
+    CPhaseGate,
+    CRZGate,
+    CRXGate,
+    CRYGate,
+    CUGate,
 )
 from qiskit.circuit import Measure
 from qiskit.circuit.controlflow import IfElseOp
@@ -561,3 +573,154 @@ def all_inst_in_target(self, circuit: QuantumCircuit, target: Target):
                     f"{inst.name} {tuple(circuit.find_bit(x).index for x in inst.qubits)} not supported"
                 )
         return True
+
+    @combine(
+        gate=[
+            RXXGate(0.1),
+            RYYGate(0.1),
+            RZZGate(0.1),
+            RZXGate(0.1),
+            CPhaseGate(0.1),
+            CRZGate(0.1),
+            CRXGate(0.1),
+            CRYGate(0.1),
+            CUGate(0.1, 0.2, 0.3, 0.4),
+        ],
+        target_gate=[
+            CXGate(),
+            CZGate(),
+            ECRGate(),
+        ],
+        add_noise=[True, False],
+        name="{gate}_{target_gate}_noise={add_noise}",
+    )
+    def test_two_qubit_parametrized_gates_basis_decomp_target(self, gate, target_gate, add_noise):
+        """Test the synthesis of a circuit containing a 2-qubit parametrized gate
+        on a target with a CX gate"""
+        theta = Parameter("θ")
+        target = Target(num_qubits=2)
+        if add_noise:
+            target.add_instruction(
+                target_gate, {(i, i + 1): InstructionProperties(error=0.001) for i in [0]}
+            )
+            target.add_instruction(RZGate(theta))
+            target.add_instruction(
+                SXGate(), {(i,): InstructionProperties(error=0.0001) for i in [0, 1]}
+            )
+        else:
+            target.add_instruction(target_gate)
+            target.add_instruction(RZGate(theta))
+            target.add_instruction(SXGate())
+
+        qc = QuantumCircuit(2)
+        qc.append(gate, [0, 1])
+
+        peephole = TwoQubitPeepholeOptimization(target)
+        transpiled_circuit = peephole(qc)
+
+        legacy_path = PassManager(
+            [
+                ConsolidateBlocks(target=target),
+                UnitarySynthesis(
+                    target=target,
+                ),
+            ]
+        )
+
+        legacy = legacy_path.run(qc)
+        self.all_inst_in_target(transpiled_circuit, target)
+        self.assertEqual(Operator(transpiled_circuit), Operator(qc))
+        self.assertDictEqual(
+            dict(sorted(transpiled_circuit.count_ops().items())),
+            dict(sorted(legacy.count_ops().items())),
+        )
+
+    @combine(
+        gate=[
+            RXXGate(0.1),
+            RYYGate(0.1),
+            RZZGate(0.1),
+            RZXGate(0.1),
+            CPhaseGate(0.1),
+            CRZGate(0.1),
+            CRXGate(0.1),
+            CRYGate(0.1),
+            CUGate(0.1, 0.2, 0.3, 0.4),
+        ],
+        target_gate_cls=[
+            RZZGate,
+            RXXGate,
+            RZXGate,
+            RYYGate,
+        ],
+        add_noise=[True, False],
+        name="{gate}_{target_gate_cls}_noise={add_noise}",
+    )
+    def test_two_qubit_parametrized_gates_controlled_u_target(
+        self, gate, target_gate_cls, add_noise
+    ):
+        """Test the synthesis of a circuit containing a 2-qubit parametrized gate
+        on a target with a RZZ gate"""
+        theta = Parameter("θ")
+        lam = Parameter("λ")
+        phi = Parameter("ϕ")
+        target = Target(num_qubits=2)
+        target_gate = target_gate_cls(phi)
+        if add_noise:
+            target.add_instruction(
+                RXGate(lam), {(i,): InstructionProperties(error=0.0001) for i in [0, 1]}
+            )
+            target.add_instruction(RZGate(theta))
+            target.add_instruction(
+                target_gate, {(i, i + 1): InstructionProperties(error=0.001) for i in [0]}
+            )
+        else:
+            target.add_instruction(RXGate(lam))
+            target.add_instruction(RZGate(theta))
+            target.add_instruction(target_gate)
+
+        qc = QuantumCircuit(2)
+        qc.append(gate, [0, 1])
+
+        peephole = TwoQubitPeepholeOptimization(target)
+        transpiled_circuit = peephole(qc)
+
+        legacy_path = PassManager(
+            [
+                ConsolidateBlocks(target=target),
+                UnitarySynthesis(
+                    target=target,
+                ),
+            ]
+        )
+
+        legacy = legacy_path.run(qc)
+        self.all_inst_in_target(transpiled_circuit, target)
+        self.assertEqual(Operator(transpiled_circuit), Operator(qc))
+        self.assertDictEqual(
+            dict(sorted(transpiled_circuit.count_ops().items())),
+            dict(sorted(legacy.count_ops().items())),
+        )
+
+    def test_two_qubit_rzz_cz_gates_rzz_target(self):
+        """Test the synthesis of a circuit containing a RZZ and CZ gates
+        on a target with RZZ and CZ gates"""
+        theta = Parameter("θ")
+        lam = Parameter("λ")
+        phi = Parameter("ϕ")
+        target = Target(num_qubits=2)
+        target.add_instruction(RXGate(lam))
+        target.add_instruction(RZGate(theta))
+        target.add_instruction(RZZGate(phi))
+        target.add_instruction(CZGate())
+
+        qc = QuantumCircuit(2)
+        qc.rzz(0.2, 0, 1)
+        qc.cz(0, 1)
+
+        peephole = TwoQubitPeepholeOptimization(target)
+        transpiled_circuit = peephole(qc)
+        self.all_inst_in_target(transpiled_circuit, target)
+        self.assertEqual(Operator(transpiled_circuit), Operator(qc))
+        self.assertTrue(set(transpiled_circuit.count_ops()).issubset({"rz", "rx", "rzz"}))
+        self.assertEqual(transpiled_circuit.count_ops()["rzz"], 1)

From b24088ab766fb1a6d026e114048f9d5c87d5af1c Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Sat, 14 Jun 2025 13:25:51 -0400
Subject: [PATCH 32/64] Fix release note example

---
 releasenotes/notes/two_qubit_peephole-de6d3438ed7df6a9.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/releasenotes/notes/two_qubit_peephole-de6d3438ed7df6a9.yaml b/releasenotes/notes/two_qubit_peephole-de6d3438ed7df6a9.yaml
index a45914dcc832..7acb7cb5f614 100644
--- a/releasenotes/notes/two_qubit_peephole-de6d3438ed7df6a9.yaml
+++ b/releasenotes/notes/two_qubit_peephole-de6d3438ed7df6a9.yaml
@@ -11,7 +11,7 @@ features_transpiler:
 
         from qiskit.circuit import QuantumCircuit
         from qiskit.transpiler.passes import TwoQubitPeepholeOptimization
-        from qiskit.providers import GenericBackendV2
+        from qiskit.providers.fake_provider import GenericBackendV2
 
         # Build an unoptimized 2 qubit circuit
         unoptimized = QuantumCircuit(2)

From 3b926024ff13c56bcbe6607e953bd9e13032c8a0 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Wed, 9 Jul 2025 08:36:23 -0400
Subject: [PATCH 33/64] Include single qubit gate count in the heuristic

This commit adds a third comparison to the heuristic used for comparing
the sequences. Previously it would first compare the 2q gate counts and
pick the sequence with the fewest 2q gates; if the 2q gate counts were
the same, it would then compute the expected error rate of each sequence
and pick the one with the best expected fidelity. This commit updates
this so that if the expected fidelities are also the same, it will pick
the sequence with the fewest total gates.
---
 .../src/passes/two_qubit_peephole.rs          | 30 ++++++++++++-------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/crates/transpiler/src/passes/two_qubit_peephole.rs b/crates/transpiler/src/passes/two_qubit_peephole.rs
index ca3391df1729..7286e71c4822 100644
--- a/crates/transpiler/src/passes/two_qubit_peephole.rs
+++ b/crates/transpiler/src/passes/two_qubit_peephole.rs
@@ -229,16 +229,18 @@ fn score_sequence<'a>(
     target: &'a Target,
     kak_gate_name: &str,
     sequence: impl Iterator<Item = (Option<StandardGate>, SmallVec<[Qubit; 2]>)> + 'a,
-) -> (usize, f64) {
-    let mut gate_count = 0;
+) -> (usize, f64, usize) {
+    let mut two_gate_count = 0;
+    let mut total_gate_count = 0;
     let fidelity = sequence
         .filter_map(|(gate, local_qubits)| {
+            total_gate_count += 1;
             let qubits = local_qubits
                 .iter()
                 .map(|qubit| PhysicalQubit(qubit.0))
                 .collect::<Vec<_>>();
             if qubits.len() == 2 {
-                gate_count += 1;
+                two_gate_count += 1;
             }
             let name = match gate.as_ref() {
                 Some(g) => g.name(),
@@ -248,7 +250,7 @@ fn score_sequence<'a>(
             error.map(|error| 1. - error)
         })
         .product::<f64>();
-    (gate_count, 1. - fidelity)
+    (two_gate_count, 1. - fidelity, total_gate_count)
 }
 
 type MappingIterItem = Option<(TwoQubitUnitarySequence, [Qubit; 2])>;
@@ -302,7 +304,8 @@ pub fn two_qubit_unitary_peephole_optimize(
                 .unwrap();
             let matrix = blocks_to_matrix(dag, node_indices, block_qubit_map)?;
             let decomposers = get_decomposers_from_target(target, &block_qubit_map, fidelity)?;
-            let mut decomposer_scores: Vec<Option<(usize, f64)>> = vec![None; decomposers.len()];
+            let mut decomposer_scores: Vec<Option<(usize, f64, usize)>> =
+                vec![None; decomposers.len()];
 
             let order_sequence =
                 |(index_a, sequence_a): &(usize, TwoQubitUnitarySequence),
@@ -311,7 +314,7 @@ pub fn two_qubit_unitary_peephole_optimize(
                         match decomposer_scores[*index_a] {
                             Some(score) => score,
                             None => {
-                                let score: (usize, f64) = score_sequence(
+                                let score: (usize, f64, usize) = score_sequence(
                                     target,
                                     sequence_a.target_name.as_str(),
                                     sequence_a.gate_sequence.gates.iter().map(
@@ -335,7 +338,7 @@ pub fn two_qubit_unitary_peephole_optimize(
                         match decomposer_scores[*index_b] {
                             Some(score) => score,
                             None => {
-                                let score: (usize, f64) = score_sequence(
+                                let score: (usize, f64, usize) = score_sequence(
                                     target,
                                     sequence_b.target_name.as_str(),
                                     sequence_b.gate_sequence.gates.iter().map(
@@ -381,7 +384,8 @@ pub fn two_qubit_unitary_peephole_optimize(
             }
             let sequence = sequence.unwrap();
             let mut original_fidelity: f64 = 1.;
-            let mut original_count: usize = 0;
+            let mut original_2q_count: usize = 0;
+            let original_total_count: usize = node_indices.len();
             let mut outside_target = false;
             for node_index in node_indices {
                 let NodeType::Operation(ref inst) = dag.dag()[*node_index] else {
@@ -393,7 +397,7 @@ pub fn two_qubit_unitary_peephole_optimize(
                     .map(|qubit| PhysicalQubit(qubit.0))
                     .collect();
                 if qubits.len() == 2 {
-                    original_count += 1;
+                    original_2q_count += 1;
                 }
                 let name = inst.op.name();
                 let gate_fidelity = match target.get_error(name, qubits.as_slice()) {
@@ -412,8 +416,12 @@ pub fn two_qubit_unitary_peephole_optimize(
                 };
                 original_fidelity *= gate_fidelity;
             }
-            let original_score = (original_count, 1. - original_fidelity);
-            let new_score: (usize, f64) =
+            let original_score = (
+                original_2q_count,
+                1. - original_fidelity,
+                original_total_count,
+            );
+            let new_score: (usize, f64, usize) =
                 match decomposer_scores[sequence.0] {
                     Some(score) => score,
                     None => score_sequence(

From 03741852370cfb8289b39532cd20c647f0de7bf3 Mon Sep 17 00:00:00 2001
From: Jake Lishman <jake.lishman@ibm.com>
Date: Fri, 13 Mar 2026 20:23:42 +0000
Subject: [PATCH 34/64] Fix approximate-by-default behaviour of
 `UnitarySynthesis`

In gh-15492[^1], we accidentally swapped the behaviour of
`approximation_degree` such that `None` was treated as "exact" and `1.0`
as "up to gate error".  Despite all _other_ explicit float values
meaning up-to-gate-fidelity multiplied by the value,
`approximation_degree=1.0` has historically _actually_ meant "exact
synthesis", and `None` is "up to gate error" instead.

While the oversight is easy to correct (and now encapsulated on entry to
`UnitarySynthesis` rather than dealing with two different systems), the
more worrying aspect was that our test suite did not catch the swap; we
came unfortunately close to releasing Qiskit 2.4 with approximate
synthesis turned on by default.

[^1]: 0b8bceb0: Rewrite default `UnitarySynthesis` to cache decomposers
---
 .../transpiler/passes/unitary_synthesis.rs    | 14 +--
 .../cext/src/transpiler/transpile_function.rs | 14 ++-
 crates/transpiler/src/passes/mod.rs           |  2 +-
 .../passes/unitary_synthesis/decomposers.rs   | 93 +++++++------------
 .../src/passes/unitary_synthesis/mod.rs       | 79 ++++++++++++++--
 crates/transpiler/src/transpiler.rs           |  5 +-
 .../transpiler/test_unitary_synthesis.py      | 65 ++++++++++++-
 7 files changed, 191 insertions(+), 81 deletions(-)

diff --git a/crates/cext/src/transpiler/passes/unitary_synthesis.rs b/crates/cext/src/transpiler/passes/unitary_synthesis.rs
index f4ddbf549a5e..c0a08883e1c6 100644
--- a/crates/cext/src/transpiler/passes/unitary_synthesis.rs
+++ b/crates/cext/src/transpiler/passes/unitary_synthesis.rs
@@ -17,6 +17,7 @@ use qiskit_circuit::circuit_data::CircuitData;
 use qiskit_circuit::dag_circuit::DAGCircuit;
 use qiskit_transpiler::passes::{
     UnitarySynthesisConfig, UnitarySynthesisState, run_unitary_synthesis,
+    unitary_synthesis::Approximation,
 };
 use qiskit_transpiler::target::Target;
 
@@ -78,16 +79,17 @@ pub unsafe extern "C" fn qk_transpiler_pass_standalone_unitary_synthesis(
         Ok(dag) => dag,
         Err(e) => panic!("{}", e),
     };
-    let approximation_degree = if approximation_degree.is_nan() {
-        None
-    } else {
-        Some(approximation_degree)
-    };
+    let approximation =
+        Approximation::from_py_approximation_degree(if approximation_degree.is_nan() {
+            None
+        } else {
+            Some(approximation_degree)
+        });
     let physical_qubits = (0..dag.num_qubits() as u32)
         .map(PhysicalQubit::new)
         .collect::<Vec<_>>();
     let mut synthesis_state = UnitarySynthesisState::new(UnitarySynthesisConfig {
-        approximation_degree,
+        approximation,
         run_python_decomposers: false,
         ..Default::default()
     });
diff --git a/crates/cext/src/transpiler/transpile_function.rs b/crates/cext/src/transpiler/transpile_function.rs
index 1b6702799391..5bbdcdb94a87 100644
--- a/crates/cext/src/transpiler/transpile_function.rs
+++ b/crates/cext/src/transpiler/transpile_function.rs
@@ -15,7 +15,7 @@ use std::ffi::c_char;
 use qiskit_circuit::circuit_data::CircuitData;
 use qiskit_circuit::dag_circuit::DAGCircuit;
 use qiskit_transpiler::commutation_checker::get_standard_commutation_checker;
-use qiskit_transpiler::passes::{UnitarySynthesisConfig, UnitarySynthesisState};
+use qiskit_transpiler::passes::{UnitarySynthesisConfig, UnitarySynthesisState, unitary_synthesis};
 use qiskit_transpiler::standard_equivalence_library::generate_standard_equivalence_library;
 use qiskit_transpiler::target::Target;
 use qiskit_transpiler::transpile;
@@ -164,7 +164,9 @@ pub unsafe extern "C" fn qk_transpile_stage_init(
         Some(options.approximation_degree)
     };
     let mut synthesis_state = UnitarySynthesisState::new(UnitarySynthesisConfig {
-        approximation_degree,
+        approximation: unitary_synthesis::Approximation::from_py_approximation_degree(
+            approximation_degree,
+        ),
         run_python_decomposers: false,
         ..Default::default()
     });
@@ -411,7 +413,9 @@ pub unsafe extern "C" fn qk_transpile_stage_optimization(
         Some(options.approximation_degree)
     };
     let mut synthesis_state = UnitarySynthesisState::new(UnitarySynthesisConfig {
-        approximation_degree,
+        approximation: unitary_synthesis::Approximation::from_py_approximation_degree(
+            approximation_degree,
+        ),
         run_python_decomposers: false,
         ..Default::default()
     });
@@ -518,7 +522,9 @@ pub unsafe extern "C" fn qk_transpile_stage_translation(
         Some(options.approximation_degree)
     };
     let mut synthesis_state = UnitarySynthesisState::new(UnitarySynthesisConfig {
-        approximation_degree,
+        approximation: unitary_synthesis::Approximation::from_py_approximation_degree(
+            approximation_degree,
+        ),
         run_python_decomposers: false,
         ..Default::default()
     });
diff --git a/crates/transpiler/src/passes/mod.rs b/crates/transpiler/src/passes/mod.rs
index 2186f8b93bde..276613d39805 100644
--- a/crates/transpiler/src/passes/mod.rs
+++ b/crates/transpiler/src/passes/mod.rs
@@ -49,7 +49,7 @@ mod remove_identity_equiv;
 pub mod sabre;
 mod split_2q_unitaries;
 mod substitute_pi4_rotations;
-mod unitary_synthesis;
+pub mod unitary_synthesis;
 mod unroll_3q_or_more;
 pub mod vf2;
 mod wrap_angles;
diff --git a/crates/transpiler/src/passes/unitary_synthesis/decomposers.rs b/crates/transpiler/src/passes/unitary_synthesis/decomposers.rs
index 1119998f7554..b16d527dcd13 100644
--- a/crates/transpiler/src/passes/unitary_synthesis/decomposers.rs
+++ b/crates/transpiler/src/passes/unitary_synthesis/decomposers.rs
@@ -35,8 +35,8 @@ use pyo3::prelude::*;
 use pyo3::types::{IntoPyDict, PyDict};
 
 use super::{
-    DecompositionDirection2q, QpuConstraint, QpuConstraintKind, UnitarySynthesisConfig,
-    UsePulseOptimizer,
+    Approximation, DecompositionDirection2q, NormalizedFidelity, QpuConstraint, QpuConstraintKind,
+    UnitarySynthesisConfig, UsePulseOptimizer,
 };
 use crate::QiskitError;
 use crate::passes::optimize_clifford_t::CLIFFORD_T_GATE_NAMES;
@@ -53,42 +53,6 @@ use qiskit_synthesis::two_qubit_decompose::{
     TwoQubitWeylDecomposition,
 };
 
-/// The fidelity of the 2q basis gate used in a decomposer.
-///
-/// This is necessarily between 0.0 and 1.0 and we normalise away negative zero, which together are
-/// why it's safe to use with total equality and hashing.
-#[derive(Clone, Copy, Debug, PartialEq)]
-pub struct ApproximationDegree(f64);
-impl ApproximationDegree {
-    pub const EXACT: Self = Self(1.0);
-
-    #[inline]
-    pub fn new(val: f64) -> Option<Self> {
-        // The `abs` is normalising signed zero.
-        (0.0..=1.0).contains(&val).then(|| Self(val.abs()))
-    }
-    /// Get the value.  This is guaranteed to be finite, sign positive and in `[0.0, 1.0]`.
-    #[inline]
-    pub fn get(&self) -> f64 {
-        self.0
-    }
-
-    /// Does this represent approximate synthesis?
-    #[inline]
-    pub fn is_approximate(&self) -> bool {
-        *self != Self::EXACT
-    }
-}
-// `impl Eq` is safe for this float-derived quantity because we only permit the range `[0.0, 1.0]`
-// and forbid negative zero.
-impl Eq for ApproximationDegree {}
-impl hash::Hash for ApproximationDegree {
-    fn hash<H: hash::Hasher>(&self, state: &mut H) {
-        // This is safe because we're in the range `[0.0, 1.0]` and normalised out negative zero.
-        self.0.to_le_bytes().hash(state)
-    }
-}
-
 /// Constructor for a 2q Ising-like decomposer.  This corresponds to
 /// [TwoQubitControlledUDecomposer], and requires gates that are locally equivalent to RXX (and
 /// construct like it, if they're Python-space objects).
@@ -215,7 +179,7 @@ impl StaticKakSource {
 struct StaticKakConstructor {
     source: StaticKakSource,
     euler: EulerBasis,
-    approximation: ApproximationDegree,
+    fidelity: NormalizedFidelity,
     use_pulse_optimizer: UsePulseOptimizer,
 }
 impl StaticKakConstructor {
@@ -238,7 +202,7 @@ impl StaticKakConstructor {
             self.source.gate.clone(),
             self.source.params.clone(),
             matrix.view(),
-            self.approximation.get(),
+            self.fidelity.get(),
             self.euler.as_str(),
             self.use_pulse_optimizer.to_py_pulse_optimize(),
         )
@@ -294,7 +258,7 @@ impl Decomposer2qConstructor {
                     .try_build()
                     .map(|decomposer| Decomposer2q::StaticKak {
                         decomposer: Box::new(decomposer),
-                        approximation: constructor.approximation,
+                        fidelity: constructor.fidelity,
                     })
             }
         }
@@ -314,7 +278,7 @@ pub enum Decomposer2q {
     },
     StaticKak {
         decomposer: Box<TwoQubitBasisDecomposer>,
-        approximation: ApproximationDegree,
+        fidelity: NormalizedFidelity,
     },
 }
 impl Decomposer2q {
@@ -331,8 +295,8 @@ impl Decomposer2q {
             }),
             Self::StaticKak {
                 decomposer,
-                approximation,
-            } => decomposer.call_inner(matrix, None, approximation.is_approximate(), None),
+                fidelity,
+            } => decomposer.call_inner(matrix, None, fidelity.get() < 1.0, None),
         }
     }
 }
@@ -630,13 +594,15 @@ fn get_2q_decomposers(
                     gate: kak_gate.into(),
                     params: smallvec![],
                 };
-                let approximation =
-                    ApproximationDegree::new(config.approximation_degree.unwrap_or(1.0))
-                        .unwrap_or(ApproximationDegree::EXACT);
+                let fidelity = config.approximation.synthesis_fidelity(0.0).map_err(|e| {
+                    PyValueError::new_err(format!(
+                        "requested synthesis fidelity is out of range: {e}"
+                    ))
+                })?;
                 let constructor = Decomposer2qConstructor::StaticKak(StaticKakConstructor {
                     source,
                     euler,
-                    approximation,
+                    fidelity,
                     use_pulse_optimizer: config.use_pulse_optimizer,
                 });
                 let flip = choose_flip(direction, &constructor);
@@ -714,11 +680,13 @@ fn get_2q_decomposers(
                     continue;
                 };
                 let fidelity = config
-                    .approximation_degree
-                    .map(|a| a * (1. - candidate.error))
-                    .unwrap_or(1.);
-                let approximation =
-                    ApproximationDegree::new(fidelity).unwrap_or(ApproximationDegree::EXACT);
+                    .approximation
+                    .synthesis_fidelity(candidate.error)
+                    .map_err(|e| {
+                        PyValueError::new_err(format!(
+                            "requested synthesis fidelity is out of range: {e}"
+                        ))
+                    })?;
                 // TODO: the 2q decomposers internally already do everything that's needed to handle
                 // _all_ of the 1q bases simultaneously without further decompositions, but don't
                 // expose that functionality.  This wastes huge amounts of time and needs a fix.
@@ -732,7 +700,7 @@ fn get_2q_decomposers(
                     let constructor = Decomposer2qConstructor::StaticKak(StaticKakConstructor {
                         source,
                         euler,
-                        approximation,
+                        fidelity,
                         use_pulse_optimizer: config.use_pulse_optimizer,
                     });
                     let flip = choose_flip(candidate.direction, &constructor);
@@ -776,8 +744,7 @@ fn get_xx_decomposers(
     // `StandardGate` into a a known strength (or function of strength).
     let embodiments_lookup = imports::XX_EMBODIMENTS.get_bound(py).cast::<PyDict>()?;
     let xx_decomposer_class = imports::XX_DECOMPOSER.get_bound(py);
-    let approximation_degree = config.approximation_degree.unwrap_or(1.);
-    let is_approximate = approximation_degree != 1.;
+    let approximation = config.approximation;
 
     let mut extend_with_flip = |out: &mut Vec<(usize, FlipDirection)>,
                                 candidates: &[Candidate2q],
@@ -805,7 +772,17 @@ fn get_xx_decomposers(
                 embodiment
             };
             embodiments.set_item(strength, embodiment)?;
-            fidelities.set_item(strength, (1.0 - candidate.error) * approximation_degree)?;
+            fidelities.set_item(
+                strength,
+                approximation
+                    .synthesis_fidelity(candidate.error)
+                    .map_err(|e| {
+                        PyValueError::new_err(format!(
+                            "requested synthesis fidelity is out of range: {e}"
+                        ))
+                    })?
+                    .get(),
+            )?;
         }
         if !fidelities.is_truthy()? {
             return Ok(());
@@ -818,7 +795,7 @@ fn get_xx_decomposers(
             ))?;
             let constructor = Decomposer2qConstructor::DiscretePauli(DiscretePauliConstructor {
                 decomposer: ob.unbind(),
-                is_approximate,
+                is_approximate: approximation != Approximation::Exact,
             });
             if let Some(index) = cache.cache(constructor) {
                 out.push((index, flip));
diff --git a/crates/transpiler/src/passes/unitary_synthesis/mod.rs b/crates/transpiler/src/passes/unitary_synthesis/mod.rs
index 8b35cf93c9bb..3890aec6df0d 100644
--- a/crates/transpiler/src/passes/unitary_synthesis/mod.rs
+++ b/crates/transpiler/src/passes/unitary_synthesis/mod.rs
@@ -17,6 +17,7 @@ use indexmap::IndexSet;
 use nalgebra::{DMatrix, Matrix2};
 use ndarray::prelude::*;
 use num_complex::Complex64;
+use std::hash;
 
 use numpy::PyReadonlyArray2;
 use pyo3::prelude::*;
@@ -40,7 +41,69 @@ use qiskit_synthesis::two_qubit_decompose::TwoQubitGateSequence;
 #[cfg(feature = "cache_pygates")]
 use std::sync::OnceLock;
 
-/// A borrowed view onto the hardawre constraint.
+/// The fidelity of the 2q basis gate used in a decomposer.
+///
+/// This is necessarily between 0.0 and 1.0 and we normalise away negative zero, which together are
+/// why it's safe to use with total equality and hashing.
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub struct NormalizedFidelity(f64);
+impl NormalizedFidelity {
+    #[inline]
+    pub fn new(val: f64) -> Option<Self> {
+        // The `abs` is normalising signed zero.
+        (0.0..=1.0).contains(&val).then(|| Self(val.abs()))
+    }
+    /// Get the value.  This is guaranteed to be finite, sign positive and in `[0.0, 1.0]`.
+    #[inline]
+    pub fn get(&self) -> f64 {
+        self.0
+    }
+}
+// `impl Eq` is safe for this float-derived quantity because we only permit the range `[0.0, 1.0]`
+// and forbid negative zero.
+impl Eq for NormalizedFidelity {}
+impl hash::Hash for NormalizedFidelity {
+    fn hash<H: hash::Hasher>(&self, state: &mut H) {
+        // This is safe because we're in the range `[0.0, 1.0]` and normalised out negative zero.
+        self.0.to_le_bytes().hash(state)
+    }
+}
+
+/// Whether to do approximate synthesis.
+#[derive(Clone, Copy, Debug, PartialEq, Default)]
+pub enum Approximation {
+    /// Do perfect synthesis, regardless of reported gate errors.
+    #[default]
+    Exact,
+    /// Scale the reported gate fidelity by the given amount.
+    ScaleFidelity(f64),
+}
+impl Approximation {
+    /// Convert from the Python-space representation of `approximation_degree`.
+    pub fn from_py_approximation_degree(val: Option<f64>) -> Self {
+        match val {
+            // ... yeah, I don't know why we've historically done this either.
+            None => Self::ScaleFidelity(1.0),
+            Some(1.0) => Self::Exact,
+            Some(val) => Self::ScaleFidelity(val),
+        }
+    }
+
+    /// Get the fidelity target that should be used for a given gate error.
+    ///
+    /// Returns `Err` with the value of an out-of-bounds requested fidelity.
+    pub fn synthesis_fidelity(&self, gate_error: f64) -> Result<NormalizedFidelity, f64> {
+        match self {
+            Self::Exact => Ok(NormalizedFidelity::new(1.0).expect("1.0 should be in bounds")),
+            Self::ScaleFidelity(scale) => {
+                let fidelity = scale * (1.0 - gate_error);
+                NormalizedFidelity::new(fidelity).ok_or(fidelity)
+            }
+        }
+    }
+}
+
+/// A borrowed view onto the hardware constraint.
 #[derive(Clone, Copy, Debug)]
 pub enum QpuConstraint<'a> {
     Target(&'a Target),
@@ -132,12 +195,8 @@ impl DecompositionDirection2q {
 /// This implements `Default`, which is a convenient constructor.
 #[derive(Clone, Copy, Debug, PartialEq)]
 pub struct UnitarySynthesisConfig {
-    /// Whether to allow approximations (`Some`) or not (`None`).
-    ///
-    /// If `Some`, the weight is a multiplicative multiplier on fidelity, such that `1.0` means "use
-    /// the gate fidelity exactly" and `0.5` would mean "treat the gate as having half its natural
-    /// fidelity", etc.
-    pub approximation_degree: Option<f64>,
+    /// Whether to do approximate synthesis.
+    pub approximation: Approximation,
     pub use_pulse_optimizer: UsePulseOptimizer,
     pub decomposition_direction_2q: DecompositionDirection2q,
     /// Whether to allow use of Python-space decomposers.
@@ -146,7 +205,7 @@ pub struct UnitarySynthesisConfig {
 impl Default for UnitarySynthesisConfig {
     fn default() -> Self {
         Self {
-            approximation_degree: None,
+            approximation: Approximation::Exact,
             use_pulse_optimizer: UsePulseOptimizer::IfBetter,
             decomposition_direction_2q: DecompositionDirection2q::BestValid,
             run_python_decomposers: false,
@@ -603,7 +662,7 @@ pub fn py_unitary_synthesis(
     pulse_optimize: Option<bool>,
 ) -> PyResult<DAGCircuit> {
     let config = UnitarySynthesisConfig {
-        approximation_degree,
+        approximation: Approximation::from_py_approximation_degree(approximation_degree),
         use_pulse_optimizer: UsePulseOptimizer::from_py_pulse_optimize(pulse_optimize),
         decomposition_direction_2q: DecompositionDirection2q::from_py_natural_direction(
             natural_direction,
@@ -647,7 +706,7 @@ pub fn py_synthesize_unitary_matrix(
     pulse_optimize: Option<bool>,
 ) -> PyResult<DAGCircuit> {
     let config = UnitarySynthesisConfig {
-        approximation_degree,
+        approximation: Approximation::from_py_approximation_degree(approximation_degree),
         use_pulse_optimizer: UsePulseOptimizer::from_py_pulse_optimize(pulse_optimize),
         decomposition_direction_2q: DecompositionDirection2q::from_py_natural_direction(
             natural_direction,
diff --git a/crates/transpiler/src/transpiler.rs b/crates/transpiler/src/transpiler.rs
index cf31fc7fd5ae..f8d4fa8462a9 100644
--- a/crates/transpiler/src/transpiler.rs
+++ b/crates/transpiler/src/transpiler.rs
@@ -16,6 +16,7 @@ use crate::commutation_checker::CommutationChecker;
 use crate::commutation_checker::get_standard_commutation_checker;
 use crate::equivalence::EquivalenceLibrary;
 use crate::passes::sabre::route::PyRoutingTarget;
+use crate::passes::unitary_synthesis;
 use crate::passes::*;
 use crate::standard_equivalence_library::generate_standard_equivalence_library;
 use crate::target::Target;
@@ -469,7 +470,9 @@ pub fn transpile(
     let mut equivalence_library = generate_standard_equivalence_library();
     let sabre_heuristic = get_sabre_heuristic(target)?;
     let mut synthesis_state = UnitarySynthesisState::new(UnitarySynthesisConfig {
-        approximation_degree,
+        approximation: unitary_synthesis::Approximation::from_py_approximation_degree(
+            approximation_degree,
+        ),
         run_python_decomposers: false,
         ..Default::default()
     });
diff --git a/test/python/transpiler/test_unitary_synthesis.py b/test/python/transpiler/test_unitary_synthesis.py
index f1571294e0e4..ba1e172dd0bf 100644
--- a/test/python/transpiler/test_unitary_synthesis.py
+++ b/test/python/transpiler/test_unitary_synthesis.py
@@ -62,7 +62,7 @@
     PauliEvolutionGate,
     CPhaseGate,
 )
-from qiskit.quantum_info import SparsePauliOp
+from qiskit.quantum_info import SparsePauliOp, average_gate_fidelity
 from qiskit.circuit import Measure
 from qiskit.circuit.controlflow import IfElseOp
 from qiskit.circuit import Parameter, Gate
@@ -1055,6 +1055,69 @@ def test_two_qubit_natural_direction_true_duration_fallback_target(self):
             {(0, 1)},
         )
 
+    def test_approximate_synthesis(self):
+        # Arbitrary Hermitian matrix with a norm known to be sensibly sized (it's about 2.8).
+        herm = np.array(
+            [
+                [-0.742, 0.643 + 0.341j, 0.166 - 0.432j, 0.803 + 0.285j],
+                [0.643 - 0.341j, 1.462, -0.477 + 0.0674j, 0.216 + 0.653j],
+                [0.166 + 0.432j, -0.477 - 0.0674j, 0.007, -0.231 - 0.226j],
+                [0.803 - 0.285j, 0.216 - 0.653j, -0.231 + 0.226j, -0.743],
+            ]
+        )
+        # A unitary perturbation that is a small distance from the identity.  It needs 3 cx to
+        # synthesise.
+        perturbation = scipy.linalg.expm(-1j * herm * 1e-3)
+
+        target = Target(2)
+        target.add_instruction(CXGate(), {(0, 1): InstructionProperties(error=1e-4)})
+        target.add_instruction(RZGate(Parameter("a")))
+        target.add_instruction(SXGate())
+
+        pass_exact = UnitarySynthesis(target=target, approximation_degree=1.0)
+        pass_approximate = UnitarySynthesis(target=target, approximation_degree=None)
+
+        # iSwap can be synthesised with 2 CX.
+        near_2cx = iSwapGate().to_matrix() @ perturbation
+        qc = QuantumCircuit(2)
+        qc.ensure_physical()
+        qc.unitary(near_2cx, [0, 1])
+
+        # First, a sanity check: the pass defaults should produce an exact synthesis, and it should
+        # have taken 3 cx, since the perturbation pushes the unitary out of the exact 2-cx class.
+        from_default = UnitarySynthesis(target=target)(qc)
+        self.assertLess(
+            1 - average_gate_fidelity(Operator(near_2cx), Operator(from_default)), 1e-15
+        )
+        self.assertEqual(from_default.count_ops()["cx"], 3)
+        # These two circuits should be exactly identical, since it's the same decomposition.
+        self.assertEqual(from_default, pass_exact(qc))
+        # ... but now if we allow approximation up to the gate error, we should be able to find the
+        # 2-cx synthesis of iSwap (or something else that's nearby).
+        self.assertEqual(pass_approximate(qc).count_ops()["cx"], 2)
+
+        # The same applies for gates that are near a 1-cx decomposition...
+        near_1cx = CXGate().to_matrix() @ perturbation
+        qc = QuantumCircuit(2)
+        qc.ensure_physical()
+        qc.unitary(near_1cx, [0, 1])
+        from_exact = pass_exact(qc)
+        self.assertLess(1 - average_gate_fidelity(Operator(near_1cx), Operator(from_exact)), 1e-15)
+        self.assertEqual(from_exact.count_ops()["cx"], 3)
+        self.assertEqual(pass_approximate(qc).count_ops()["cx"], 1)
+
+        # ... and near a 0-cx decomposition.
+        near_separable = np.kron(XGate().to_matrix(), ZGate().to_matrix()) @ perturbation
+        qc = QuantumCircuit(2)
+        qc.ensure_physical()
+        qc.unitary(near_separable, [0, 1])
+        from_exact = pass_exact(qc)
+        self.assertLess(
+            1 - average_gate_fidelity(Operator(near_separable), Operator(from_exact)), 1e-15
+        )
+        self.assertEqual(from_exact.count_ops()["cx"], 3)
+        self.assertNotIn("cx", pass_approximate(qc).count_ops())
+
 
 if __name__ == "__main__":
     unittest.main()

From a64f6c093537897854f44e7854e45c5bc037529a Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Mon, 16 Mar 2026 09:21:32 -0400
Subject: [PATCH 35/64] Update the pass to leverage unitary synthesis logic for
 decomposition

Since this pass was originally written, the UnitarySynthesis pass has
been rewritten to have a much cleaner internal interface for
synthesizing unitaries. This commit reworks the new pass to rely on the
unitary synthesis code for the actual synthesis and decomposer handling.
This simplifies the new logic in the pass as it no longer needs to
duplicate the logic internally.
---
 .../src/convert_2q_block_matrix.rs            |   1 -
 crates/synthesis/src/qsd.rs                   |   2 +-
 crates/synthesis/src/two_qubit_decompose.rs   | 249 +--------
 .../src/passes/consolidate_blocks.rs          |   5 +-
 crates/transpiler/src/passes/mod.rs           |   5 +-
 .../src/passes/two_qubit_peephole.rs          | 528 +++++-------------
 .../two_qubit_unitary_synthesis_utils.rs      | 248 --------
 .../passes/unitary_synthesis/decomposers.rs   |   2 +-
 .../src/passes/unitary_synthesis/mod.rs       |  96 ++--
 .../passes/optimization/two_qubit_peephole.py |  26 +-
 .../transpiler/test_two_qubit_peephole.py     |  76 ++-
 11 files changed, 290 insertions(+), 948 deletions(-)
 delete mode 100644 crates/transpiler/src/passes/two_qubit_unitary_synthesis_utils.rs

diff --git a/crates/quantum_info/src/convert_2q_block_matrix.rs b/crates/quantum_info/src/convert_2q_block_matrix.rs
index 491bf02bd738..66d68eaf670b 100644
--- a/crates/quantum_info/src/convert_2q_block_matrix.rs
+++ b/crates/quantum_info/src/convert_2q_block_matrix.rs
@@ -32,7 +32,6 @@ use crate::versor_u2::{VersorSU2, VersorU2, VersorU2Error};
 
 #[inline]
 pub fn get_matrix_from_inst(inst: &PackedInstruction) -> PyResult<Array2<Complex64>> {
-
     if let Some(mat) = inst.try_matrix() {
         return Ok(mat);
     }
diff --git a/crates/synthesis/src/qsd.rs b/crates/synthesis/src/qsd.rs
index 168f644cd01f..a12d367c4cc1 100644
--- a/crates/synthesis/src/qsd.rs
+++ b/crates/synthesis/src/qsd.rs
@@ -104,7 +104,7 @@ pub fn quantum_shannon_decomposition(
         smallvec![],
         aview2(&qiskit_circuit::gate_matrix::CX_GATE),
         1.0,
-        "U",
+        EulerBasis::U,
         None,
     )?;
     let one_qubit_decomposer = one_qubit_decomposer_basis_set.unwrap_or(&default_1q_basis);
diff --git a/crates/synthesis/src/two_qubit_decompose.rs b/crates/synthesis/src/two_qubit_decompose.rs
index 0aeef89fc9d7..9e5dbb787dcd 100644
--- a/crates/synthesis/src/two_qubit_decompose.rs
+++ b/crates/synthesis/src/two_qubit_decompose.rs
@@ -18,6 +18,8 @@
 // of real components and one of imaginary components.
 // In order to avoid copying we want to use `MatRef<c64>` or `MatMut<c64>`.
 
+use std::str::FromStr;
+
 use approx::{abs_diff_eq, relative_eq};
 use num_complex::{Complex, Complex64, ComplexFloat};
 use num_traits::Zero;
@@ -1974,243 +1976,6 @@ fn decomp0_inner(target: &TwoQubitWeylDecomposition) -> SmallVec<[Array2<Complex
     smallvec![target.K1r.dot(&target.K2r), target.K1l.dot(&target.K2l),]
 }
 
-#[pymethods]
-impl TwoQubitBasisDecomposer {
-    fn __getnewargs__(&self, py: Python) -> (String, PyObject, f64, &str, Option<bool>) {
-        (
-            self.gate.clone(),
-            self.basis_decomposer
-                .unitary_matrix
-                .to_pyarray(py)
-                .into_any()
-                .unbind(),
-            self.basis_fidelity,
-            self.euler_basis.as_str(),
-            self.pulse_optimize,
-        )
-    }
-
-    #[new]
-    #[pyo3(signature=(gate, gate_matrix, basis_fidelity=1.0, euler_basis="U", pulse_optimize=None))]
-    fn new(
-        gate: String,
-        gate_matrix: PyReadonlyArray2<Complex64>,
-        basis_fidelity: f64,
-        euler_basis: &str,
-        pulse_optimize: Option<bool>,
-    ) -> PyResult<Self> {
-        TwoQubitBasisDecomposer::new_inner(
-            gate,
-            gate_matrix.as_array(),
-            basis_fidelity,
-            EulerBasis::__new__(euler_basis)?,
-            pulse_optimize,
-        )
-    }
-
-    fn traces(&self, target: &TwoQubitWeylDecomposition) -> [Complex64; 4] {
-        [
-            4. * c64(
-                target.a.cos() * target.b.cos() * target.c.cos(),
-                target.a.sin() * target.b.sin() * target.c.sin(),
-            ),
-            4. * c64(
-                (PI4 - target.a).cos()
-                    * (self.basis_decomposer.b - target.b).cos()
-                    * target.c.cos(),
-                (PI4 - target.a).sin()
-                    * (self.basis_decomposer.b - target.b).sin()
-                    * target.c.sin(),
-            ),
-            c64(4. * target.c.cos(), 0.),
-            c64(4., 0.),
-        ]
-    }
-
-    /// Decompose target :math:`\sim U_d(x, y, z)` with :math:`0` uses of the basis gate.
-    /// Result :math:`U_r` has trace:
-    ///
-    /// .. math::
-    ///
-    ///     \Big\vert\text{Tr}(U_r\cdot U_\text{target}^{\dag})\Big\vert =
-    ///     4\Big\vert (\cos(x)\cos(y)\cos(z)+ j \sin(x)\sin(y)\sin(z)\Big\vert
-    ///
-    /// which is optimal for all targets and bases
-    #[staticmethod]
-    fn decomp0(py: Python, target: &TwoQubitWeylDecomposition) -> SmallVec<[PyObject; 2]> {
-        decomp0_inner(target)
-            .into_iter()
-            .map(|x| x.into_pyarray(py).into_any().unbind())
-            .collect()
-    }
-
-    /// Decompose target :math:`\sim U_d(x, y, z)` with :math:`1` use of the basis gate
-    /// math:`\sim U_d(a, b, c)`.
-    /// Result :math:`U_r` has trace:
-    ///
-    /// .. math::
-    ///
-    ///     \Big\vert\text{Tr}(U_r \cdot U_\text{target}^{\dag})\Big\vert =
-    ///     4\Big\vert \cos(x-a)\cos(y-b)\cos(z-c) + j \sin(x-a)\sin(y-b)\sin(z-c)\Big\vert
-    ///
-    /// which is optimal for all targets and bases with ``z==0`` or ``c==0``.
-    fn decomp1(&self, py: Python, target: &TwoQubitWeylDecomposition) -> SmallVec<[PyObject; 4]> {
-        self.decomp1_inner(target)
-            .into_iter()
-            .map(|x| x.into_pyarray(py).into_any().unbind())
-            .collect()
-    }
-
-    /// Decompose target :math:`\sim U_d(x, y, z)` with :math:`2` uses of the basis gate.
-    ///
-    /// For supercontrolled basis :math:`\sim U_d(\pi/4, b, 0)`, all b, result :math:`U_r` has trace
-    ///
-    /// .. math::
-    ///
-    ///     \Big\vert\text{Tr}(U_r \cdot U_\text{target}^\dag) \Big\vert = 4\cos(z)
-    ///
-    /// which is the optimal approximation for basis of CNOT-class :math:`\sim U_d(\pi/4, 0, 0)`
-    /// or DCNOT-class :math:`\sim U_d(\pi/4, \pi/4, 0)` and any target. It may
-    /// be sub-optimal for :math:`b \neq 0` (i.e. there exists an exact decomposition for any target
-    /// using :math:`B \sim U_d(\pi/4, \pi/8, 0)`, but it may not be this decomposition).
-    /// This is an exact decomposition for supercontrolled basis and target :math:`\sim U_d(x, y, 0)`.
-    /// No guarantees for non-supercontrolled basis.
-    fn decomp2_supercontrolled(
-        &self,
-        py: Python,
-        target: &TwoQubitWeylDecomposition,
-    ) -> SmallVec<[PyObject; 6]> {
-        self.decomp2_supercontrolled_inner(target)
-            .into_iter()
-            .map(|x| x.into_pyarray(py).into_any().unbind())
-            .collect()
-    }
-
-    /// Decompose target with :math:`3` uses of the basis.
-    ///
-    /// This is an exact decomposition for supercontrolled basis :math:`\sim U_d(\pi/4, b, 0)`, all b,
-    /// and any target. No guarantees for non-supercontrolled basis.
-    fn decomp3_supercontrolled(
-        &self,
-        py: Python,
-        target: &TwoQubitWeylDecomposition,
-    ) -> SmallVec<[PyObject; 8]> {
-        self.decomp3_supercontrolled_inner(target)
-            .into_iter()
-            .map(|x| x.into_pyarray(py).into_any().unbind())
-            .collect()
-    }
-
-    /// Decompose a two-qubit ``unitary`` over fixed basis and :math:`SU(2)` using the best
-    /// approximation given that each basis application has a finite ``basis_fidelity``.
-    fn generate_sequence(
-        &self,
-        unitary: PyReadonlyArray2<Complex64>,
-        basis_fidelity: Option<f64>,
-        approximate: bool,
-        _num_basis_uses: Option<u8>,
-    ) -> PyResult<TwoQubitGateSequence> {
-        let basis_fidelity = if !approximate {
-            1.0
-        } else {
-            basis_fidelity.unwrap_or(self.basis_fidelity)
-        };
-        let target_decomposed =
-            TwoQubitWeylDecomposition::new(unitary, Some(DEFAULT_FIDELITY), None)?;
-        let traces = self.traces(&target_decomposed);
-        let best_nbasis = traces
-            .into_iter()
-            .enumerate()
-            .map(|(idx, trace)| (idx, trace.trace_to_fid() * basis_fidelity.powi(idx as i32)))
-            .min_by(|(_idx1, fid1), (_idx2, fid2)| fid2.partial_cmp(fid1).unwrap())
-            .unwrap()
-            .0;
-        let best_nbasis = _num_basis_uses.unwrap_or(best_nbasis as u8);
-        let decomposition = match best_nbasis {
-            0 => decomp0_inner(&target_decomposed),
-            1 => self.decomp1_inner(&target_decomposed),
-            2 => self.decomp2_supercontrolled_inner(&target_decomposed),
-            3 => self.decomp3_supercontrolled_inner(&target_decomposed),
-            _ => unreachable!("Invalid basis to use"),
-        };
-        let pulse_optimize = self.pulse_optimize.unwrap_or(true);
-        let sequence = if pulse_optimize {
-            self.pulse_optimal_chooser(best_nbasis, &decomposition, &target_decomposed)?
-        } else {
-            None
-        };
-        if let Some(seq) = sequence {
-            return Ok(seq);
-        }
-        let mut target_1q_basis_list = EulerBasisSet::new();
-        target_1q_basis_list.add_basis(self.euler_basis);
-        let euler_decompositions: SmallVec<[Option<OneQubitGateSequence>; 8]> = decomposition
-            .iter()
-            .map(|decomp| {
-                unitary_to_gate_sequence_inner(
-                    decomp.view(),
-                    &target_1q_basis_list,
-                    0,
-                    None,
-                    true,
-                    None,
-                )
-            })
-            .collect();
-        let mut gates = Vec::with_capacity(TWO_QUBIT_SEQUENCE_DEFAULT_CAPACITY);
-        let mut global_phase = target_decomposed.global_phase;
-        global_phase -= best_nbasis as f64 * self.basis_decomposer.global_phase;
-        if best_nbasis == 2 {
-            global_phase += PI;
-        }
-        for i in 0..best_nbasis as usize {
-            if let Some(euler_decomp) = &euler_decompositions[2 * i] {
-                for gate in &euler_decomp.gates {
-                    gates.push((gate.0.into(), gate.1.clone(), smallvec![0]));
-                }
-                global_phase += euler_decomp.global_phase
-            }
-            if let Some(euler_decomp) = &euler_decompositions[2 * i + 1] {
-                for gate in &euler_decomp.gates {
-                    gates.push((gate.0.into(), gate.1.clone(), smallvec![1]));
-                }
-                global_phase += euler_decomp.global_phase
-            }
-            gates.push((self.gate.clone(), self.gate_params.clone(), smallvec![0, 1]));
-        }
-        if let Some(euler_decomp) = &euler_decompositions[2 * best_nbasis as usize] {
-            for gate in &euler_decomp.gates {
-                gates.push((gate.0.into(), gate.1.clone(), smallvec![0]));
-            }
-            global_phase += euler_decomp.global_phase
-        }
-        if let Some(euler_decomp) = &euler_decompositions[2 * best_nbasis as usize + 1] {
-            for gate in &euler_decomp.gates {
-                gates.push((gate.0.into(), gate.1.clone(), smallvec![1]));
-            }
-            global_phase += euler_decomp.global_phase
-        }
-        Ok(TwoQubitGateSequence {
-            gates,
-            global_phase,
-        })
-    }
-}
-
-static K12R_ARR: GateArray1Q = [
-    [c64(0., FRAC_1_SQRT_2), c64(FRAC_1_SQRT_2, 0.)],
-    [c64(-FRAC_1_SQRT_2, 0.), c64(0., -FRAC_1_SQRT_2)],
-];
-
-static K12L_ARR: GateArray1Q = [
-    [c64(0.5, 0.5), c64(0.5, 0.5)],
-    [c64(-0.5, 0.5), c64(0.5, -0.5)],
-];
-
-fn decomp0_inner(target: &TwoQubitWeylDecomposition) -> SmallVec<[Array2<Complex64>; 8]> {
-    smallvec![target.K1r.dot(&target.K2r), target.K1l.dot(&target.K2l),]
-}
-
 type PickleNewArgs<'a> = (Py<PyAny>, Py<PyAny>, f64, &'a str, Option<bool>);
 
 #[pymethods]
@@ -2267,7 +2032,7 @@ impl TwoQubitBasisDecomposer {
             gate_params?,
             gate_matrix.as_array(),
             basis_fidelity,
-            euler_basis,
+            EulerBasis::from_str(euler_basis).unwrap(),
             pulse_optimize,
         )
     }
@@ -2385,8 +2150,8 @@ impl TwoQubitBasisDecomposer {
         approximate: bool,
         _num_basis_uses: Option<u8>,
     ) -> PyResult<DAGCircuit> {
-        let sequence =
-            self.generate_sequence(unitary, basis_fidelity, approximate, _num_basis_uses)?;
+        let array = unitary.as_array();
+        let sequence = self.call_inner(array, basis_fidelity, approximate, _num_basis_uses)?;
         let mut dag = DAGCircuit::with_capacity(2, 0, None, Some(sequence.gates.len()), None, None);
         dag.set_global_phase_f64(sequence.global_phase);
         dag.add_qubit_unchecked(ShareableQubit::new_anonymous())?;
@@ -2428,8 +2193,8 @@ impl TwoQubitBasisDecomposer {
         approximate: bool,
         _num_basis_uses: Option<u8>,
     ) -> PyResult<PyCircuitData> {
-        let sequence =
-            self.generate_sequence(unitary, basis_fidelity, approximate, _num_basis_uses)?;
+        let array = unitary.as_array();
+        let sequence = self.call_inner(array, basis_fidelity, approximate, _num_basis_uses)?;
         Ok(CircuitData::from_packed_operations(
             2,
             0,
diff --git a/crates/transpiler/src/passes/consolidate_blocks.rs b/crates/transpiler/src/passes/consolidate_blocks.rs
index c1d0fcc59196..aa80e1446ac8 100644
--- a/crates/transpiler/src/passes/consolidate_blocks.rs
+++ b/crates/transpiler/src/passes/consolidate_blocks.rs
@@ -33,6 +33,7 @@ use qiskit_circuit::operations::StandardGate;
 use qiskit_circuit::operations::{ArrayType, Operation, Param, UnitaryGate};
 use qiskit_circuit::packed_instruction::PackedOperation;
 use qiskit_quantum_info::convert_2q_block_matrix::{blocks_to_matrix, get_matrix_from_inst};
+use qiskit_synthesis::euler_one_qubit_decomposer::EulerBasis;
 use qiskit_synthesis::two_qubit_decompose::RXXEquivalent;
 use qiskit_synthesis::two_qubit_decompose::{
     TwoQubitBasisDecomposer, TwoQubitControlledUDecomposer,
@@ -120,7 +121,7 @@ fn get_decomposer_and_basis_gate(
                         SmallVec::default(),
                         get_matrix(&gate),
                         approximation_degree,
-                        "U",
+                        EulerBasis::U,
                         None,
                     )
                     .unwrap_or_else(|_| {
@@ -142,7 +143,7 @@ fn get_decomposer_and_basis_gate(
                 SmallVec::default(),
                 aview2(&CX_GATE),
                 1.0,
-                "U",
+                EulerBasis::U,
                 None,
             )
             .expect("Error while creating TwoQubitBasisDecomposer using a 'cx' gate."),
diff --git a/crates/transpiler/src/passes/mod.rs b/crates/transpiler/src/passes/mod.rs
index 3a34ab2ae115..594a25ad5d30 100644
--- a/crates/transpiler/src/passes/mod.rs
+++ b/crates/transpiler/src/passes/mod.rs
@@ -48,9 +48,8 @@ mod remove_diagonal_gates_before_measure;
 mod remove_identity_equiv;
 pub mod sabre;
 mod split_2q_unitaries;
-mod two_qubit_peephole;
-mod two_qubit_unitary_synthesis_utils;
 mod substitute_pi4_rotations;
+mod two_qubit_peephole;
 pub mod unitary_synthesis;
 mod unroll_3q_or_more;
 pub mod vf2;
@@ -97,8 +96,8 @@ pub use remove_diagonal_gates_before_measure::{
 };
 pub use remove_identity_equiv::{remove_identity_equiv_mod, run_remove_identity_equiv};
 pub use split_2q_unitaries::{run_split_2q_unitaries, split_2q_unitaries_mod};
-pub use two_qubit_peephole::{two_qubit_peephole_mod, two_qubit_unitary_peephole_optimize};
 pub use substitute_pi4_rotations::{py_run_substitute_pi4_rotations, substitute_pi4_rotations_mod};
+pub use two_qubit_peephole::{two_qubit_peephole_mod, two_qubit_unitary_peephole_optimize};
 pub use unitary_synthesis::{
     UnitarySynthesisConfig, UnitarySynthesisState, run_unitary_synthesis, unitary_synthesis_mod,
 };
diff --git a/crates/transpiler/src/passes/two_qubit_peephole.rs b/crates/transpiler/src/passes/two_qubit_peephole.rs
index 7286e71c4822..a0119c80c99f 100644
--- a/crates/transpiler/src/passes/two_qubit_peephole.rs
+++ b/crates/transpiler/src/passes/two_qubit_peephole.rs
@@ -10,283 +10,73 @@
 // copyright notice, and modified files need to carry a notice indicating
 // that they have been altered from the originals.
 
-use std::cmp::Ordering;
 use std::sync::Mutex;
+#[cfg(feature = "cache_pygates")]
+use std::sync::OnceLock;
 
 use hashbrown::{HashMap, HashSet};
+use pyo3::Python;
+use pyo3::intern;
 use pyo3::prelude::*;
 use rayon::prelude::*;
 use rustworkx_core::petgraph::stable_graph::NodeIndex;
-use smallvec::{smallvec, SmallVec};
+use smallvec::SmallVec;
 
-use qiskit_circuit::dag_circuit::{DAGCircuit, NodeType, VarsMode};
-use qiskit_circuit::operations::{Operation, OperationRef, Param, StandardGate};
-use qiskit_circuit::packed_instruction::PackedOperation;
-use qiskit_circuit::Qubit;
+use qiskit_circuit::dag_circuit::{DAGCircuit, NodeType};
+use qiskit_circuit::instruction::Parameters;
+use qiskit_circuit::operations::{
+    Operation, OperationRef, Param, PyOperationTypes, PythonOperation,
+};
+use qiskit_circuit::packed_instruction::{PackedInstruction, PackedOperation};
+use qiskit_circuit::{BlocksMode, Qubit, VarsMode};
 
-use super::two_qubit_unitary_synthesis_utils::{
-    preferred_direction, synth_su4_sequence, DecomposerElement, DecomposerType,
-    TwoQubitUnitarySequence,
+use crate::passes::unitary_synthesis::{
+    Approximation, QpuConstraint, TwoQSynthesisResult, synthesize_2q_matrix,
 };
-use crate::target::{Qargs, Target, TargetOperation};
-use crate::TranspilerError;
-use qiskit_circuit::getenv_use_multiple_threads;
+use crate::passes::{UnitarySynthesisConfig, UnitarySynthesisState};
+use crate::target::Target;
 use qiskit_circuit::PhysicalQubit;
+use qiskit_circuit::getenv_use_multiple_threads;
 use qiskit_quantum_info::convert_2q_block_matrix::blocks_to_matrix;
-use qiskit_synthesis::euler_one_qubit_decomposer::{
-    EulerBasis, EulerBasisSet, EULER_BASES, EULER_BASIS_NAMES,
-};
-use qiskit_synthesis::two_qubit_decompose::{
-    RXXEquivalent, TwoQubitBasisDecomposer, TwoQubitControlledUDecomposer,
-    TwoQubitWeylDecomposition,
-};
 
-fn get_decomposers_from_target(
-    target: &Target,
-    qubits: &[Qubit],
-    fidelity: f64,
-) -> PyResult<Vec<DecomposerElement>> {
-    let physical_qubits: SmallVec<[PhysicalQubit; 2]> =
-        smallvec![PhysicalQubit(qubits[0].0), PhysicalQubit(qubits[1].0)];
-    let reverse_qubits: SmallVec<[PhysicalQubit; 2]> =
-        physical_qubits.iter().rev().copied().collect();
-    let mut reverse_used = false;
-    let mut gate_names: HashSet<&str> = match target.operation_names_for_qargs(&physical_qubits) {
-        Ok(names) => names.into_iter().collect(),
-        Err(err) => {
-            reverse_used = true;
-            target
-                .operation_names_for_qargs(&reverse_qubits)
-                .map_err(|_| TranspilerError::new_err(err.to_string()))?
-                .into_iter()
-                .collect()
-        }
-    };
-    if !reverse_used {
-        if let Ok(reverse_names) = target.operation_names_for_qargs(&reverse_qubits) {
-            if !reverse_names.is_empty() {
-                for name in reverse_names {
-                    gate_names.insert(name);
-                }
-            }
-        }
-    }
-    let available_kak_gate: Vec<(&str, &PackedOperation, &[Param])> = gate_names
-        .iter()
-        .filter_map(|name| match target.operation_from_name(name) {
-            Some(raw_op) => {
-                if let TargetOperation::Normal(op) = raw_op {
-                    match op.operation.view() {
-                        OperationRef::StandardGate(gate) => {
-                            if matches!(
-                                gate,
-                                StandardGate::CX | StandardGate::CZ | StandardGate::ECR
-                            ) {
-                                Some((*name, &op.operation, op.params.as_slice()))
-                            } else if let Some(matrix) = gate.matrix(&op.params) {
-                                if let Ok(weyl) =
-                                    TwoQubitWeylDecomposition::new_inner(matrix.view(), None, None)
-                                {
-                                    if weyl.is_supercontrolled() {
-                                        Some((*name, &op.operation, op.params.as_slice()))
-                                    } else {
-                                        None
-                                    }
-                                } else {
-                                    None
-                                }
-                            } else {
-                                None
-                            }
-                        }
-                        OperationRef::Gate(gate) => {
-                            if let Some(matrix) = gate.matrix(&op.params) {
-                                let weyl =
-                                    TwoQubitWeylDecomposition::new_inner(matrix.view(), None, None)
-                                        .unwrap();
-                                if weyl.is_supercontrolled() {
-                                    Some((*name, &op.operation, op.params.as_slice()))
-                                } else {
-                                    None
-                                }
-                            } else {
-                                None
-                            }
-                        }
-                        _ => None,
-                    }
-                } else {
-                    None
-                }
-            }
-            None => None,
-        })
-        .collect();
+type MappingIterItem = Option<(TwoQSynthesisResult, [Qubit; 2])>;
 
-    let single_qubit_basis_list = target.operation_names_for_qargs(&[physical_qubits[0]]);
-    let mut target_basis_set = EulerBasisSet::new();
-    match single_qubit_basis_list {
-        Ok(basis_list) => {
-            EULER_BASES
-                .iter()
-                .enumerate()
-                .filter_map(|(idx, gates)| {
-                    if !gates.iter().all(|gate| basis_list.contains(gate)) {
-                        return None;
-                    }
-                    let basis = EULER_BASIS_NAMES[idx];
-                    Some(basis)
-                })
-                .for_each(|basis| target_basis_set.add_basis(basis));
-        }
-        Err(_) => target_basis_set.support_all(),
-    }
-    if target_basis_set.basis_supported(EulerBasis::U3)
-        && target_basis_set.basis_supported(EulerBasis::U321)
-    {
-        target_basis_set.remove(EulerBasis::U3);
-    }
-    if target_basis_set.basis_supported(EulerBasis::ZSX)
-        && target_basis_set.basis_supported(EulerBasis::ZSXX)
-    {
-        target_basis_set.remove(EulerBasis::ZSX);
-    }
-
-    let decomposers: PyResult<Vec<DecomposerElement>> = available_kak_gate
-        .iter()
-        .filter_map(|(two_qubit_name, two_qubit_gate, params)| {
-            let matrix = two_qubit_gate.matrix(params);
-            matrix.map(|matrix| {
-                target_basis_set.get_bases().filter_map(move |euler_basis| {
-                    TwoQubitBasisDecomposer::new_inner(
-                        two_qubit_name.to_string(),
-                        matrix.view(),
-                        fidelity,
-                        euler_basis,
-                        None,
-                    )
-                    .map(|decomp| {
-                        if !decomp.super_controlled() {
-                            None
-                        } else {
-                            Some(DecomposerElement {
-                                decomposer: DecomposerType::TwoQubitBasis(Box::new(decomp)),
-                                packed_op: (*two_qubit_gate).clone(),
-                                params: params.iter().cloned().collect(),
-                                target_name: two_qubit_name.to_string(),
-                            })
-                        }
-                    })
-                    .transpose()
-                })
-            })
-        })
-        .flatten()
-        .collect();
-    let mut decomposers = decomposers?;
-    for gate in [
-        StandardGate::RXX,
-        StandardGate::RZZ,
-        StandardGate::RYY,
-        StandardGate::RZX,
-    ] {
-        if gate_names.contains(gate.name()) {
-            let op = target.operation_from_name(gate.name()).unwrap();
-            if op
-                .params()
-                .iter()
-                .all(|x| matches!(x, Param::ParameterExpression(_)))
-            {
-                for euler_basis in target_basis_set.get_bases() {
-                    decomposers.push(DecomposerElement {
-                        decomposer: DecomposerType::TwoQubitControlledU(Box::new(
-                            TwoQubitControlledUDecomposer::new(
-                                RXXEquivalent::Standard(gate),
-                                euler_basis.as_str(),
-                            )?,
-                        )),
-                        packed_op: gate.into(),
-                        // TODO: Add param when ParameterExpression doesn't
-                        // need python. This is a corrupt param for the gates
-                        // here, but it unused in the passes and needs to be
-                        // an unbound  parameter. Do not use this value for
-                        // constructing a circuit.
-                        params: smallvec![],
-                        target_name: gate.name().to_string(),
-                    });
-                }
-            }
-        }
-    }
-    Ok(decomposers)
+// This is a separate function in case we need to handle any Python synchronization in the future
+// (such as releasing the GIL). For right now this doesn't seem to be necessary, but keeping it
+// separate enables any manipulation of the Py handle in the future.
+#[pyfunction(name = "two_qubit_unitary_peephole_optimize")]
+pub fn py_two_qubit_unitary_peephole_optimize(
+    dag: &DAGCircuit,
+    target: &Target,
+    approximation_degree: Option<f64>,
+) -> PyResult<Option<DAGCircuit>> {
+    two_qubit_unitary_peephole_optimize(dag, target, approximation_degree)
 }
 
-/// Score a given sequence using the error rate reported in the target
+/// This function runs the two qubit unitary peephole optimization pass
 ///
-/// Return a tuple of the predicted fidelity and the number of 2q gates in the sequence
-#[inline]
-fn score_sequence<'a>(
-    target: &'a Target,
-    kak_gate_name: &str,
-    sequence: impl Iterator<Item = (Option<StandardGate>, SmallVec<[Qubit; 2]>)> + 'a,
-) -> (usize, f64, usize) {
-    let mut two_gate_count = 0;
-    let mut total_gate_count = 0;
-    let fidelity = sequence
-        .filter_map(|(gate, local_qubits)| {
-            total_gate_count += 1;
-            let qubits = local_qubits
-                .iter()
-                .map(|qubit| PhysicalQubit(qubit.0))
-                .collect::<Vec<_>>();
-            if qubits.len() == 2 {
-                two_gate_count += 1;
-            }
-            let name = match gate.as_ref() {
-                Some(g) => g.name(),
-                None => kak_gate_name,
-            };
-            let error = target.get_error(name, qubits.as_slice());
-            error.map(|error| 1. - error)
-        })
-        .product::<f64>();
-    (two_gate_count, 1. - fidelity, total_gate_count)
-}
-
-type MappingIterItem = Option<(TwoQubitUnitarySequence, [Qubit; 2])>;
-
-/// This transpiler pass can only run in a context where we've translated the circuit gates (or
-/// where we know all gates have a matrix). If any gate identified in the run fails to have a
-/// matrix defined (either in rust or python) it will be skipped
-#[pyfunction]
+/// It returns `None` if no modifications/optimizations were made to the input dag, in which case
+/// the pass function calling this should just return the input dag from the pass.
 pub fn two_qubit_unitary_peephole_optimize(
-    py: Python,
     dag: &DAGCircuit,
     target: &Target,
-    fidelity: f64,
-) -> PyResult<DAGCircuit> {
+    approximation_degree: Option<f64>,
+) -> PyResult<Option<DAGCircuit>> {
     let runs: Vec<Vec<NodeIndex>> = dag.collect_2q_runs().unwrap();
+    if runs.is_empty() {
+        return Ok(None);
+    }
     let node_mapping: HashMap<NodeIndex, usize> =
         HashMap::with_capacity(runs.iter().map(|run| run.len()).sum());
     let locked_node_mapping = Mutex::new(node_mapping);
-    let coupling_edges = match target.qargs() {
-        Some(qargs) => qargs
-            .filter_map(|qargs| match qargs {
-                Qargs::Concrete(qargs) => {
-                    if qargs.len() == 2 {
-                        Some([qargs[0], qargs[1]])
-                    } else {
-                        None
-                    }
-                }
-                Qargs::Global => None,
-            })
-            .collect(),
-        None => HashSet::new(),
-    };
+    let physical_qubits = (0..dag.num_qubits() as u32)
+        .map(PhysicalQubit::new)
+        .collect::<Vec<_>>();
+    let approximation = Approximation::from_py_approximation_degree(approximation_degree);
+
     let find_best_sequence =
         |run_index: usize, node_indices: &[NodeIndex]| -> PyResult<MappingIterItem> {
-            let block_qubit_map = node_indices
+            let q_virt = node_indices
                 .iter()
                 .find_map(|node_index| {
                     let inst = dag.dag()[*node_index].unwrap_operation();
@@ -302,87 +92,24 @@ pub fn two_qubit_unitary_peephole_optimize(
                     }
                 })
                 .unwrap();
-            let matrix = blocks_to_matrix(dag, node_indices, block_qubit_map)?;
-            let decomposers = get_decomposers_from_target(target, &block_qubit_map, fidelity)?;
-            let mut decomposer_scores: Vec<Option<(usize, f64, usize)>> =
-                vec![None; decomposers.len()];
-
-            let order_sequence =
-                |(index_a, sequence_a): &(usize, TwoQubitUnitarySequence),
-                 (index_b, sequence_b): &(usize, TwoQubitUnitarySequence)| {
-                    let score_a = (
-                        match decomposer_scores[*index_a] {
-                            Some(score) => score,
-                            None => {
-                                let score: (usize, f64, usize) = score_sequence(
-                                    target,
-                                    sequence_a.target_name.as_str(),
-                                    sequence_a.gate_sequence.gates.iter().map(
-                                        |(gate, _params, local_qubits)| {
-                                            let qubits = local_qubits
-                                                .iter()
-                                                .map(|qubit| block_qubit_map[*qubit as usize])
-                                                .collect();
-                                            (*gate, qubits)
-                                        },
-                                    ),
-                                );
-                                decomposer_scores[*index_a] = Some(score);
-                                score
-                            }
-                        },
-                        index_a,
-                    );
+            let q_phys = q_virt.map(|q| physical_qubits[q.index()]);
+            let matrix = blocks_to_matrix(dag, node_indices, q_virt)?;
+            let unitary_synthesis_config = UnitarySynthesisConfig {
+                approximation,
+                ..Default::default()
+            };
+            let mut synthesis_state = UnitarySynthesisState::new(unitary_synthesis_config);
 
-                    let score_b = (
-                        match decomposer_scores[*index_b] {
-                            Some(score) => score,
-                            None => {
-                                let score: (usize, f64, usize) = score_sequence(
-                                    target,
-                                    sequence_b.target_name.as_str(),
-                                    sequence_b.gate_sequence.gates.iter().map(
-                                        |(gate, _params, local_qubits)| {
-                                            let qubits = local_qubits
-                                                .iter()
-                                                .map(|qubit| block_qubit_map[*qubit as usize])
-                                                .collect();
-                                            (*gate, qubits)
-                                        },
-                                    ),
-                                );
-                                decomposer_scores[*index_b] = Some(score);
-                                score
-                            }
-                        },
-                        index_b,
-                    );
-                    score_a.partial_cmp(&score_b).unwrap_or(Ordering::Equal)
-                };
-            let sequence = decomposers
-                .iter()
-                .map(|decomposer| {
-                    let physical_block_qubit_map: [PhysicalQubit; 2] = [
-                        PhysicalQubit(block_qubit_map[0].0),
-                        PhysicalQubit(block_qubit_map[1].0),
-                    ];
-                    let dir = preferred_direction(
-                        &physical_block_qubit_map,
-                        Some(true),
-                        &coupling_edges,
-                        Some(target),
-                        decomposer,
-                    )
-                    .ok()
-                    .flatten();
-                    synth_su4_sequence(matrix.view(), decomposer, dir, Some(fidelity)).unwrap()
-                })
-                .enumerate()
-                .min_by(order_sequence);
-            if sequence.is_none() {
+            let result = synthesize_2q_matrix(
+                matrix.into(),
+                q_phys,
+                &mut synthesis_state,
+                QpuConstraint::Target(target),
+            )?;
+            if result.is_none() {
                 return Ok(None);
             }
-            let sequence = sequence.unwrap();
+            let result = result.unwrap();
             let mut original_fidelity: f64 = 1.;
             let mut original_2q_count: usize = 0;
             let original_total_count: usize = node_indices.len();
@@ -407,7 +134,7 @@ pub fn two_qubit_unitary_peephole_optimize(
                         // in the target or the gate is ideal. We need to do a second lookup
                         // to determine if the gate is supported, and if it isn't we don't need
                         // to finish scoring because we know we'll use the synthesis output
-                        if !target.instruction_supported(name, &qubits) {
+                        if !target.instruction_supported(name, &qubits, inst.params_view(), true) {
                             outside_target = true;
                             break;
                         }
@@ -421,23 +148,14 @@ pub fn two_qubit_unitary_peephole_optimize(
                 1. - original_fidelity,
                 original_total_count,
             );
-            let new_score: (usize, f64, usize) =
-                match decomposer_scores[sequence.0] {
-                    Some(score) => score,
-                    None => score_sequence(
-                        target,
-                        sequence.1.target_name.as_str(),
-                        sequence.1.gate_sequence.gates.iter().map(
-                            |(gate, _params, local_qubits)| {
-                                let qubits = local_qubits
-                                    .iter()
-                                    .map(|qubit| block_qubit_map[*qubit as usize])
-                                    .collect();
-                                (*gate, qubits)
-                            },
-                        ),
-                    ),
-                };
+            let new_2q_count = result
+                .sequence
+                .gates
+                .iter()
+                .filter(|x| x.0.num_qubits() == 2)
+                .count();
+            let new_gate_count = result.sequence.gates.len();
+            let new_score = (new_2q_count, result.fidelity.unwrap_or(1.), new_gate_count);
             // If the we are not outside the target and the new score isn't any better just use the
             // original (this includes a tie).
             if !outside_target && new_score >= original_score {
@@ -450,80 +168,96 @@ pub fn two_qubit_unitary_peephole_optimize(
             for node in node_indices {
                 node_mapping.insert(*node, run_index);
             }
-            Ok(Some((sequence.1, block_qubit_map)))
+            Ok(Some((result, q_virt)))
         };
 
     let run_mapping: PyResult<Vec<MappingIterItem>> = if getenv_use_multiple_threads() {
-        py.allow_threads(|| {
-            // Build a vec of all the best synthesized two qubit gate sequences from the collected runs.
-            // This is done in parallel
-            runs.par_iter()
-                .enumerate()
-                .map(|(index, sequence)| find_best_sequence(index, sequence.as_slice()))
-                .collect()
-        })
+        // Build a vec of all the best synthesized two qubit gate sequences from the collected runs.
+        // This is done in parallel
+        runs.par_iter()
+            .enumerate()
+            .map(|(index, sequence)| find_best_sequence(index, sequence.as_slice()))
+            .collect()
     } else {
         runs.iter()
             .enumerate()
             .map(|(index, sequence)| find_best_sequence(index, sequence.as_slice()))
             .collect()
     };
-
     let run_mapping = run_mapping?;
     // After we've computed all the sequences to execute now serially build up a new dag.
     let mut processed_runs: HashSet<usize> = HashSet::with_capacity(run_mapping.len());
-    let out_dag = dag.copy_empty_like(VarsMode::Alike)?;
+    let out_dag = dag.copy_empty_like_with_same_capacity(VarsMode::Alike, BlocksMode::Keep)?;
     let mut out_dag_builder = out_dag.into_builder();
     let node_mapping = locked_node_mapping.into_inner().unwrap();
-    for node in dag.topological_op_nodes()? {
+    if node_mapping.is_empty() {
+        return Ok(None);
+    }
+    for node in dag.topological_op_nodes(false) {
         match node_mapping.get(&node) {
             Some(run_index) => {
                 if processed_runs.contains(run_index) {
                     continue;
                 }
-                let run = run_mapping[*run_index].as_ref();
-                if run.is_none() {
+                // A None is stored in run_mapping as the value for a run that was identified
+                // but that we don't substitute; the explicit None preserves the indexing with
+                // the vec.
+                let Some((result, qargs_virt)) = run_mapping[*run_index].as_ref() else {
                     let NodeType::Operation(ref instr) = dag.dag()[node] else {
                         unreachable!("Must be an op node")
                     };
                     out_dag_builder.push_back(instr.clone())?;
                     continue;
-                }
-                let (sequence, qubit_map) = run.unwrap();
-                for (gate, params, local_qubits) in &sequence.gate_sequence.gates {
-                    let qubits: Vec<Qubit> = local_qubits
-                        .iter()
-                        .map(|index| qubit_map[*index as usize])
-                        .collect();
+                };
+                let order = result.dir.as_indices();
+                let out_qargs = [qargs_virt[order[0] as usize], qargs_virt[order[1] as usize]];
+                let qubit_keys = [
+                    out_dag_builder.insert_qargs(&[out_qargs[0]]),
+                    out_dag_builder.insert_qargs(&[out_qargs[1]]),
+                    out_dag_builder.insert_qargs(&[out_qargs[0], out_qargs[1]]),
+                    out_dag_builder.insert_qargs(&[out_qargs[1], out_qargs[0]]),
+                ];
 
-                    let out_params = if params.is_empty() {
-                        None
-                    } else {
-                        Some(params.into_iter().map(|val| Param::Float(*val)).collect())
-                    };
-                    match gate {
-                        Some(gate) => out_dag_builder.apply_operation_back(
-                            PackedOperation::from_standard_gate(*gate),
-                            &qubits,
-                            &[],
-                            out_params,
-                            None,
-                            #[cfg(feature = "cache_pygates")]
-                            None,
+                for (gate, params, local_qubits) in &result.sequence.gates {
+                    let qubits = match local_qubits.as_slice() {
+                        [0] => qubit_keys[0],
+                        [1] => qubit_keys[1],
+                        [0, 1] => qubit_keys[2],
+                        [1, 0] => qubit_keys[3],
+                        _ => panic!(
+                            "internal logic error: decomposed sequence contained unexpected qargs"
                         ),
-                        None => out_dag_builder.apply_operation_back(
-                            sequence.decomp_op.clone(),
-                            &qubits,
-                            &[],
-                            Some(out_params.unwrap_or(sequence.decomp_params.clone())),
-                            None,
-                            #[cfg(feature = "cache_pygates")]
-                            None,
-                        ),
-                    }?;
+                    };
+                    let op = match gate.view() {
+                        OperationRef::StandardGate(gate) => PackedOperation::from(gate),
+                        OperationRef::Gate(py_gate) => Python::attach(|py| -> PyResult<_> {
+                            let gate = py_gate.py_copy(py)?;
+                            gate.instruction
+                                .setattr(py, intern!(py, "params"), params)?;
+                            Ok(PackedOperation::from(Box::new(PyOperationTypes::Gate(
+                                gate,
+                            ))))
+                        })?,
+                        _ => {
+                            panic!("internal logic error: decomposed sequence contains a non-gate")
+                        }
+                    };
+                    let params = (!params.is_empty()).then(|| {
+                        Box::new(Parameters::Params(
+                            params.iter().copied().map(Param::Float).collect(),
+                        ))
+                    });
+                    out_dag_builder.push_back(PackedInstruction {
+                      op,
+                      qubits,
+                      clbits: Default::default(),
+                      params,
+                      label: None,
+                      #[cfg(feature = "cache_pygates")]
+                      py_op: OnceLock::new(),
+                    })?;
                 }
-                out_dag_builder
-                    .add_global_phase(&Param::Float(sequence.gate_sequence.global_phase))?;
+                out_dag_builder.add_global_phase(&Param::Float(result.sequence.global_phase))?;
                 processed_runs.insert(*run_index);
             }
             None => {
@@ -534,10 +268,10 @@ pub fn two_qubit_unitary_peephole_optimize(
             }
         }
     }
-    Ok(out_dag_builder.build())
+    Ok(Some(out_dag_builder.build()))
 }
 
 pub fn two_qubit_peephole_mod(m: &Bound<PyModule>) -> PyResult<()> {
-    m.add_wrapped(wrap_pyfunction!(two_qubit_unitary_peephole_optimize))?;
+    m.add_wrapped(wrap_pyfunction!(py_two_qubit_unitary_peephole_optimize))?;
     Ok(())
 }
diff --git a/crates/transpiler/src/passes/two_qubit_unitary_synthesis_utils.rs b/crates/transpiler/src/passes/two_qubit_unitary_synthesis_utils.rs
deleted file mode 100644
index 596ba5fd8882..000000000000
--- a/crates/transpiler/src/passes/two_qubit_unitary_synthesis_utils.rs
+++ /dev/null
@@ -1,248 +0,0 @@
-// This code is part of Qiskit.
-//
-// (C) Copyright IBM 2024
-//
-// This code is licensed under the Apache License, Version 2.0. You may
-// obtain a copy of this license in the LICENSE.txt file in the root directory
-// of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
-//
-// Any modifications or derivative works of this code must retain this
-// copyright notice, and modified files need to carry a notice indicating
-// that they have been altered from the originals.
-#![allow(clippy::too_many_arguments)]
-
-use hashbrown::HashSet;
-use ndarray::prelude::*;
-use num_complex::Complex64;
-use smallvec::SmallVec;
-
-use pyo3::prelude::*;
-
-use qiskit_circuit::operations::{Operation, Param};
-use qiskit_circuit::packed_instruction::PackedOperation;
-
-use crate::target::Qargs;
-use crate::target::Target;
-use crate::QiskitError;
-use qiskit_circuit::PhysicalQubit;
-use qiskit_synthesis::two_qubit_decompose::{
-    TwoQubitBasisDecomposer, TwoQubitControlledUDecomposer, TwoQubitGateSequence,
-};
-
-#[derive(Clone, Debug)]
-pub(crate) enum DecomposerType {
-    TwoQubitBasis(Box<TwoQubitBasisDecomposer>),
-    TwoQubitControlledU(Box<TwoQubitControlledUDecomposer>),
-    XX(PyObject),
-}
-
-#[derive(Clone, Debug)]
-pub(crate) struct DecomposerElement {
-    pub(crate) decomposer: DecomposerType,
-    pub(crate) packed_op: PackedOperation,
-    pub(crate) params: SmallVec<[Param; 3]>,
-    pub(crate) target_name: String,
-}
-
-#[derive(Clone, Debug)]
-pub(crate) struct TwoQubitUnitarySequence {
-    pub(crate) gate_sequence: TwoQubitGateSequence,
-    pub(crate) decomp_op: PackedOperation,
-    pub(crate) decomp_params: SmallVec<[Param; 3]>,
-    pub(crate) target_name: String,
-}
-
-/// Function to evaluate hardware-native direction, this allows to correct
-/// the synthesis output to match the target constraints.
-/// Returns:
-///     * `true` if gate qubits are in the hardware-native direction
-///     * `false` if gate qubits must be flipped to match hardware-native direction
-#[inline]
-pub(crate) fn preferred_direction(
-    ref_qubits: &[PhysicalQubit; 2],
-    natural_direction: Option<bool>,
-    coupling_edges: &HashSet<[PhysicalQubit; 2]>,
-    target: Option<&Target>,
-    decomposer: &DecomposerElement,
-) -> PyResult<Option<bool>> {
-    let qubits: [PhysicalQubit; 2] = *ref_qubits;
-    let mut reverse_qubits: [PhysicalQubit; 2] = qubits;
-    reverse_qubits.reverse();
-
-    let preferred_direction = match natural_direction {
-        Some(false) => None,
-        _ => {
-            // None or Some(true)
-            let zero_one = coupling_edges.contains(&qubits);
-            let one_zero = coupling_edges.contains(&[qubits[1], qubits[0]]);
-
-            match (zero_one, one_zero) {
-                (true, false) => Some(true),
-                (false, true) => Some(false),
-                _ => {
-                    match target {
-                        Some(target) => {
-                            let mut cost_0_1: f64 = f64::INFINITY;
-                            let mut cost_1_0: f64 = f64::INFINITY;
-
-                            let compute_cost = |lengths: bool,
-                                                q_tuple: [PhysicalQubit; 2],
-                                                in_cost: f64|
-                             -> PyResult<f64> {
-                                let cost = match target
-                                    .qargs_for_operation_name(decomposer.packed_op.name())
-                                {
-                                    Ok(_) => match target[decomposer.packed_op.name()]
-                                        .get(&Qargs::from(q_tuple))
-                                    {
-                                        Some(Some(_props)) => {
-                                            if lengths {
-                                                _props.duration.unwrap_or(in_cost)
-                                            } else {
-                                                _props.error.unwrap_or(in_cost)
-                                            }
-                                        }
-                                        _ => in_cost,
-                                    },
-                                    Err(_) => in_cost,
-                                };
-                                Ok(cost)
-                            };
-                            // Try to find the cost in gate_lengths
-                            cost_0_1 = compute_cost(true, qubits, cost_0_1)?;
-                            cost_1_0 = compute_cost(true, reverse_qubits, cost_1_0)?;
-
-                            // If no valid cost was found in gate_lengths, check gate_errors
-                            if !(cost_0_1 < f64::INFINITY || cost_1_0 < f64::INFINITY) {
-                                cost_0_1 = compute_cost(false, qubits, cost_0_1)?;
-                                cost_1_0 = compute_cost(false, reverse_qubits, cost_1_0)?;
-                            }
-
-                            if cost_0_1 < cost_1_0 {
-                                Some(true)
-                            } else if cost_1_0 < cost_0_1 {
-                                Some(false)
-                            } else {
-                                None
-                            }
-                        }
-                        None => None,
-                    }
-                }
-            }
-        }
-    };
-    if natural_direction == Some(true) && preferred_direction.is_none() {
-        return Err(QiskitError::new_err(format!(
-            concat!(
-                "No preferred direction of gate on qubits {:?} ",
-                "could be determined from coupling map or gate lengths / gate errors."
-            ),
-            qubits
-        )));
-    }
-    Ok(preferred_direction)
-}
-
-/// Apply synthesis for decomposers that return a SEQUENCE (TwoQubitBasis and TwoQubitControlledU).
-#[inline]
-pub(crate) fn synth_su4_sequence(
-    su4_mat: ArrayView2<Complex64>,
-    decomposer_2q: &DecomposerElement,
-    preferred_direction: Option<bool>,
-    approximation_degree: Option<f64>,
-) -> PyResult<TwoQubitUnitarySequence> {
-    let is_approximate = approximation_degree.is_none() || approximation_degree.unwrap() != 1.0;
-    let synth = if let DecomposerType::TwoQubitBasis(decomp) = &decomposer_2q.decomposer {
-        decomp.call_inner(su4_mat.view(), None, is_approximate, None)?
-    } else if let DecomposerType::TwoQubitControlledU(decomp) = &decomposer_2q.decomposer {
-        decomp.call_inner(su4_mat.view(), None)?
-    } else {
-        unreachable!("synth_su4_sequence should only be called for TwoQubitBasisDecomposer or TwoQubitControlledUDecomposer.")
-    };
-    let sequence = TwoQubitUnitarySequence {
-        gate_sequence: synth,
-        decomp_op: decomposer_2q.packed_op.clone(),
-        decomp_params: decomposer_2q.params.clone(),
-        target_name: decomposer_2q.target_name.clone(),
-    };
-    match preferred_direction {
-        None => Ok(sequence),
-        Some(preferred_dir) => {
-            let mut synth_direction: Option<SmallVec<[u8; 2]>> = None;
-            // if the gates in synthesis are in the opposite direction of the preferred direction
-            // resynthesize a new operator which is the original conjugated by swaps.
-            // this new operator is doubly mirrored from the original and is locally equivalent.
-            for (gate, _, qubits) in sequence.gate_sequence.gates() {
-                if gate.is_none() || gate.unwrap().name() == "cx" {
-                    synth_direction = Some(qubits.clone());
-                }
-            }
-            match synth_direction {
-                None => Ok(sequence),
-                Some(synth_direction) => {
-                    let synth_dir = match synth_direction.as_slice() {
-                        [0, 1] => true,
-                        [1, 0] => false,
-                        _ => unreachable!(),
-                    };
-                    if synth_dir != preferred_dir {
-                        reversed_synth_su4_sequence(
-                            su4_mat.to_owned(),
-                            decomposer_2q,
-                            approximation_degree,
-                        )
-                    } else {
-                        Ok(sequence)
-                    }
-                }
-            }
-        }
-    }
-}
-
-/// Apply reverse synthesis for decomposers that return a SEQUENCE (TwoQubitBasis and TwoQubitControlledU).
-/// This function is called by `synth_su4_sequence`` if the "direct" synthesis
-/// doesn't match the hardware restrictions.
-fn reversed_synth_su4_sequence(
-    mut su4_mat: Array2<Complex64>,
-    decomposer_2q: &DecomposerElement,
-    approximation_degree: Option<f64>,
-) -> PyResult<TwoQubitUnitarySequence> {
-    let is_approximate = approximation_degree.is_none() || approximation_degree.unwrap() != 1.0;
-    // Swap rows 1 and 2
-    let (mut row_1, mut row_2) = su4_mat.multi_slice_mut((s![1, ..], s![2, ..]));
-    azip!((x in &mut row_1, y in &mut row_2) (*x, *y) = (*y, *x));
-
-    // Swap columns 1 and 2
-    let (mut col_1, mut col_2) = su4_mat.multi_slice_mut((s![.., 1], s![.., 2]));
-    azip!((x in &mut col_1, y in &mut col_2) (*x, *y) = (*y, *x));
-
-    let synth = if let DecomposerType::TwoQubitBasis(decomp) = &decomposer_2q.decomposer {
-        decomp.call_inner(su4_mat.view(), None, is_approximate, None)?
-    } else if let DecomposerType::TwoQubitControlledU(decomp) = &decomposer_2q.decomposer {
-        decomp.call_inner(su4_mat.view(), None)?
-    } else {
-        unreachable!(
-            "reversed_synth_su4_sequence should only be called for TwoQubitBasisDecomposer."
-        )
-    };
-    let flip_bits: [u8; 2] = [1, 0];
-    let mut reversed_gates = Vec::with_capacity(synth.gates().len());
-    for (gate, params, qubit_ids) in synth.gates() {
-        let new_qubit_ids = qubit_ids
-            .into_iter()
-            .map(|x| flip_bits[*x as usize])
-            .collect::<SmallVec<[u8; 2]>>();
-        reversed_gates.push((*gate, params.clone(), new_qubit_ids.clone()));
-    }
-    let mut reversed_synth: TwoQubitGateSequence = TwoQubitGateSequence::new();
-    reversed_synth.set_state((reversed_gates, synth.global_phase()));
-    let sequence = TwoQubitUnitarySequence {
-        gate_sequence: reversed_synth,
-        decomp_op: decomposer_2q.packed_op.clone(),
-        decomp_params: decomposer_2q.params.clone(),
-        target_name: decomposer_2q.target_name.clone(),
-    };
-    Ok(sequence)
-}
diff --git a/crates/transpiler/src/passes/unitary_synthesis/decomposers.rs b/crates/transpiler/src/passes/unitary_synthesis/decomposers.rs
index b16d527dcd13..f4868ef3037e 100644
--- a/crates/transpiler/src/passes/unitary_synthesis/decomposers.rs
+++ b/crates/transpiler/src/passes/unitary_synthesis/decomposers.rs
@@ -203,7 +203,7 @@ impl StaticKakConstructor {
             self.source.params.clone(),
             matrix.view(),
             self.fidelity.get(),
-            self.euler.as_str(),
+            self.euler,
             self.use_pulse_optimizer.to_py_pulse_optimize(),
         )
     }
diff --git a/crates/transpiler/src/passes/unitary_synthesis/mod.rs b/crates/transpiler/src/passes/unitary_synthesis/mod.rs
index 3890aec6df0d..f1af04540457 100644
--- a/crates/transpiler/src/passes/unitary_synthesis/mod.rs
+++ b/crates/transpiler/src/passes/unitary_synthesis/mod.rs
@@ -482,14 +482,47 @@ fn synthesize_1q_matrix_onto(
     Ok(true)
 }
 
-fn synthesize_2q_matrix_onto(
-    out: &mut DAGCircuitBuilder,
+pub struct TwoQSynthesisResult {
+    pub sequence: TwoQubitGateSequence,
+    pub dir: Direction2q,
+    pub fidelity: Option<f64>,
+}
+
+#[inline]
+pub fn fidelity_2q_sequence(
+    pair: &(Direction2q, TwoQubitGateSequence),
+    constraint: &QpuConstraint,
+    qargs_phys: [PhysicalQubit; 2],
+) -> f64 {
+    let QpuConstraint::Target(target) = &constraint else {
+        return 1.;
+    };
+    let (dir, sequence) = pair;
+    let order = dir.as_indices();
+    let phys = [qargs_phys[order[0] as usize], qargs_phys[order[1] as usize]];
+    sequence
+        .gates()
+        .iter()
+        .map(|(op, _, qubits)| {
+            let qargs: &[_] = match *qubits.as_slice() {
+                [q] => &[phys[q as usize]],
+                [q1, q2] => &[phys[q1 as usize], phys[q2 as usize]],
+                _ => panic!("sequences should only contain 1q and 2q gates"),
+            };
+            // TODO: this does not handle the possibility of a 2q decomposer (like the
+            // XXDecomposer) using specialised instructions whose operation names do not match
+            // their target key.
+            1. - target.get_error(op.name(), qargs).unwrap_or(0.)
+        })
+        .product()
+}
+
+pub fn synthesize_2q_matrix(
     mut unitary: CowArray<Complex64, Ix2>,
     qargs_phys: [PhysicalQubit; 2],
-    qargs_virt: [Qubit; 2],
     state: &mut UnitarySynthesisState,
     constraint: QpuConstraint,
-) -> PyResult<bool> {
+) -> PyResult<Option<TwoQSynthesisResult>> {
     let decomposer_cache = &mut state.cache;
     let config = &state.config;
 
@@ -542,31 +575,7 @@ fn synthesize_2q_matrix_onto(
         // inconsistent; either it should be an error in _all_ circumstances if synthesis fails or
         // in _none_.  It's tricky to recreate the pre-Qiskit-2.4 behaviour bug-for-bug in the new
         // refactor because of how the split between decomposer construction and use works now.
-        return Ok(false);
-    };
-
-    let fidelity = |pair: &(Direction2q, TwoQubitGateSequence)| -> f64 {
-        let QpuConstraint::Target(target) = &constraint else {
-            return 1.;
-        };
-        let (dir, sequence) = pair;
-        let order = dir.as_indices();
-        let phys = [qargs_phys[order[0] as usize], qargs_phys[order[1] as usize]];
-        sequence
-            .gates()
-            .iter()
-            .map(|(op, _, qubits)| {
-                let qargs: &[_] = match *qubits.as_slice() {
-                    [q] => &[phys[q as usize]],
-                    [q1, q2] => &[phys[q1 as usize], phys[q2 as usize]],
-                    _ => panic!("sequences should only contain 1q and 2q gates"),
-                };
-                // TODO: this does not handle the possibility of a 2q decomposer (like the
-                // XXDecomposer) using specialised instructions whose operation names do not match
-                // their target key.
-                1. - target.get_error(op.name(), qargs).unwrap_or(0.)
-            })
-            .product()
+        return Ok(None);
     };
 
     // We only need to calculate the best score if there's more than one sequence.
@@ -574,8 +583,9 @@ fn synthesize_2q_matrix_onto(
     let mut best_pair = first;
     for sequence in sequences {
         let sequence = sequence?;
-        let prev_fidelity = best_fidelity.unwrap_or_else(|| fidelity(&best_pair));
-        let this_fidelity = fidelity(&sequence);
+        let prev_fidelity = best_fidelity
+            .unwrap_or_else(|| fidelity_2q_sequence(&best_pair, &constraint, qargs_phys));
+        let this_fidelity = fidelity_2q_sequence(&sequence, &constraint, qargs_phys);
         if this_fidelity > prev_fidelity {
             best_fidelity = Some(this_fidelity);
             best_pair = sequence;
@@ -583,10 +593,26 @@ fn synthesize_2q_matrix_onto(
             best_fidelity = Some(prev_fidelity);
         }
     }
+    Ok(Some(TwoQSynthesisResult {
+        sequence: best_pair.1,
+        dir: best_pair.0,
+        fidelity: best_fidelity,
+    }))
+}
 
+fn synthesize_2q_matrix_onto(
+    out: &mut DAGCircuitBuilder,
+    unitary: CowArray<Complex64, Ix2>,
+    qargs_phys: [PhysicalQubit; 2],
+    qargs_virt: [Qubit; 2],
+    state: &mut UnitarySynthesisState,
+    constraint: QpuConstraint,
+) -> PyResult<bool> {
+    let Some(result) = synthesize_2q_matrix(unitary, qargs_phys, state, constraint)? else {
+        return Ok(false);
+    };
     // ... now apply the best sequence.
-    let (dir, sequence) = best_pair;
-    let order = dir.as_indices();
+    let order = result.dir.as_indices();
     let out_qargs = [qargs_virt[order[0] as usize], qargs_virt[order[1] as usize]];
     let qubit_keys = [
         out.insert_qargs(&[out_qargs[0]]),
@@ -594,8 +620,8 @@ fn synthesize_2q_matrix_onto(
         out.insert_qargs(&[out_qargs[0], out_qargs[1]]),
         out.insert_qargs(&[out_qargs[1], out_qargs[0]]),
     ];
-    out.add_global_phase(&Param::Float(sequence.global_phase()))?;
-    for (gate, params, qubits) in sequence.gates() {
+    out.add_global_phase(&Param::Float(result.sequence.global_phase()))?;
+    for (gate, params, qubits) in result.sequence.gates() {
         let qubits = match qubits.as_slice() {
             [0] => qubit_keys[0],
             [1] => qubit_keys[1],
diff --git a/qiskit/transpiler/passes/optimization/two_qubit_peephole.py b/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
index 6dafe944a0f3..d40ab39fede4 100644
--- a/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
+++ b/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
@@ -29,12 +29,27 @@ class TwoQubitPeepholeOptimization(TransformationPass):
     If the synthesized two qubit unitary is "better" than the original
     subcircuit that subcircuit is used to replace the original. The heuristic
     used to determine if it's better first looks at the two qubit gate count
-    in the circuit, and prefers the synthesis with fewer two qubit gates.
+    in the circuit, and prefers the synthesis with fewer two qubit gates, if
+    the two qubit gate counts are the same then it looks at the estimated
+    fidelity of the circuit and picks the subcircuit with higher estimated
+    fidelity, and finally if needed it picks the subcircuit with the fewest
+    total gates.
 
     In case the target is overcomplete the pass will try all the
     decomposers supported for all the gates supported on a given qubit.
-    The decomposition that has the best expected performance will be selected
-    and used to replace the block.
+    The decomposition that has the best expected performance using the above
+    heuristic will be selected and used to replace the block.
+
+    This pass is designed to be run on a physical circuit and the details of
+    operations on a given qubit is assumed to be the hardware qubit from the
+    target. However, the output of the pass might not use hardware operations,
+    specifically single qubit gates might be emitted outside the target's supported
+    operations, typically only if a parameterized gate supported by the
+    :class:`.TwoQubitControlledUDecomposer` is used for synthesis. As such if running
+    this pass in a physical optimization stage (such as :ref:`transpiler-preset-stage-optimization`)
+    this should be paired with passes such as :class:`.BasisTranslator` and/or
+    :class:`.Optimize1qGatesDecomposition` to ensure that these errant single qubit
+    gates are replaced with hardware supported operations prior to exiting the stage.
 
     This pass is multithreaded, and will perform the analysis in parallel
     and use all the cores available on your local system. You can refer to
@@ -72,4 +87,7 @@ def __init__(
         self._approximation_degree = approximation_degree
 
     def run(self, dag: DAGCircuit) -> DAGCircuit:
-        return two_qubit_unitary_peephole_optimize(dag, self._target, self._approximation_degree)
+        result = two_qubit_unitary_peephole_optimize(dag, self._target, self._approximation_degree)
+        if result is None:
+            return dag
+        return result
diff --git a/test/python/transpiler/test_two_qubit_peephole.py b/test/python/transpiler/test_two_qubit_peephole.py
index 5f4fa07e8176..9c17a74a6264 100644
--- a/test/python/transpiler/test_two_qubit_peephole.py
+++ b/test/python/transpiler/test_two_qubit_peephole.py
@@ -23,6 +23,7 @@
 from qiskit import generate_preset_pass_manager, transpile
 from qiskit.providers.fake_provider import GenericBackendV2
 from qiskit.circuit import QuantumCircuit, QuantumRegister
+from qiskit.circuit.random import random_circuit
 from qiskit.circuit.parameterexpression import ParameterValueType
 from qiskit.converters import circuit_to_dag, dag_to_circuit
 from qiskit.quantum_info.operators import Operator
@@ -170,13 +171,7 @@ def test_fractional_cx_with_backendv2(self):
         backend = FakeMumbaiFractionalCX()
         synth_pass = TwoQubitPeepholeOptimization(target=backend.target)
         tqc = synth_pass(circ)
-        tqc_index = {qubit: index for index, qubit in enumerate(tqc.qubits)}
-        # RZX with discrete angles would be lower error but because XX Decomposer is not
-        # supported/availble in rust we can't synthesize to it so CX ends up being used
-        self.assertGreaterEqual(len(tqc.get_instructions("rzx")), 0)
-        self.assertEqual(len(tqc.get_instructions("cx")), 3)
-        for instr in tqc.get_instructions("cx"):
-            self.assertEqual((0, 1), (tqc_index[instr.qubits[0]], tqc_index[instr.qubits[1]]))
+        np.testing.assert_allclose(Operator.from_circuit(tqc), Operator.from_circuit(circ))
 
     def test_reverse_direction(self):
         target = Target(2)
@@ -486,7 +481,7 @@ def test_swap_on_cz_target(self):
         qc.swap(0, 1)
         qc = transpile(qc, target=target, seed_transpiler=1234, optimization_level=0)
         res = peephole(qc)
-        self.assertEqual(res, qc)
+        np.testing.assert_allclose(Operator(qc), Operator(res))
         # Check run of swaps
         qc_duplicated = QuantumCircuit(2)
         for _ in range(100):
@@ -564,14 +559,24 @@ def test_pass_respects_directionality(self):
         self.assertTrue(self.all_inst_in_target(res, target))
         self.assertEqual(Operator(res), Operator(qc_duplicated))
 
-    def all_inst_in_target(self, circuit: QuantumCircuit, target: Target):
+    def all_inst_in_target(self, circuit: QuantumCircuit, target: Target, allow_inverse=False):
         for inst in circuit.data:
             if not target.instruction_supported(
                 inst.name, tuple(circuit.find_bit(x).index for x in inst.qubits)
             ):
-                raise self.fail(
-                    f"{inst.name} {tuple(circuit.find_bit(x).index for x in inst.qubits)} not supported"
-                )
+                if allow_inverse:
+                    if not target.instruction_supported(
+                        inst.operation.inverse().name,
+                        tuple(circuit.find_bit(x).index for x in inst.qubits),
+                    ):
+                        raise self.fail(
+                            f"{inst.name} {tuple(circuit.find_bit(x).index for x in inst.qubits)} not supported"
+                        )
+
+                else:
+                    raise self.fail(
+                        f"{inst.name} {tuple(circuit.find_bit(x).index for x in inst.qubits)} not supported"
+                    )
         return True
 
     @combine(
@@ -626,10 +631,9 @@ def test_two_qubit_parametrized_gates_basis_decomp_target(self, gate, target_gat
                 ),
             ]
         )
-
         legacy = legacy_path.run(qc)
         self.all_inst_in_target(transpiled_circuit, target)
-        self.assertEqual(Operator(transpiled_circuit), Operator(qc))
+        np.testing.assert_allclose(Operator(transpiled_circuit), Operator(qc), atol=1e-12, rtol=0)
         self.assertDictEqual(
             dict(sorted(transpiled_circuit.count_ops().items())),
             dict(sorted(legacy.count_ops().items())),
@@ -724,3 +728,47 @@ def test_two_qubit_rzz_cz_gates_rzz_target(self):
         self.assertEqual(Operator(transpiled_circuit), Operator(qc))
         self.assertTrue(set(transpiled_circuit.count_ops()).issubset({"rz", "rx", "rzz"}))
         self.assertEqual(transpiled_circuit.count_ops()["rzz"], 1)
+
+    @combine(
+        target_2q_gates=[
+            ["cx"],
+            ["cz"],
+            ["ecr"],
+            ["cx", "cz", "ecr"],
+            ["rxx", "cx"],
+            ["rzz", "cz"],
+            ["rzx", "ecr"],
+            ["rzz", "rzx", "rxx", "ecr", "cx", "cz"],
+        ],
+        target_1q_gates=[
+            ["rz", "sx"],
+            ["rz", "sx", "x"],
+            ["rz", "rx", "sx", "x"],
+            ["ry", "rx"],
+            ["ry", "rx", "rz"],
+            ["rz", "rx"],
+            ["u"],
+            ["p", "sx"],
+            ["u", "u1", "u2", "u3", "rz", "sx", "x", "ry", "rx", "r", "p"],
+        ],
+        name="{target_2q_gates}_{target_1q_gates}",
+    )
+    def test_random_circuit(self, target_2q_gates, target_1q_gates):
+        cmap = CouplingMap.from_grid(2, 3)
+        basis_gates = target_1q_gates + target_2q_gates
+        backend = GenericBackendV2(
+            cmap.size(), basis_gates=basis_gates, seed=2024, coupling_map=cmap
+        )
+        peephole = TwoQubitPeepholeOptimization(backend.target)
+        qc = random_circuit(cmap.size(), 2, max_operands=2, seed=12345_42)
+        qc = transpile(qc, target=backend.target, optimization_level=0, seed_transpiler=2025)
+        result = peephole(qc)
+        if "rzx" in target_2q_gates or "rzz" in target_2q_gates or "rxx" in target_2q_gates:
+            self.all_inst_in_target(result, backend.target, allow_inverse=True)
+        else:
+            self.all_inst_in_target(result, backend.target)
+        print("Source")
+        print(qc)
+        print("Result")
+        print(result)
+        np.testing.assert_allclose(Operator(result), Operator(qc), atol=1e-12, rtol=0)

From 785a02b68a8c94b2cb26f99c71cd113165d6c0b9 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Tue, 17 Mar 2026 09:45:12 -0400
Subject: [PATCH 36/64] Fix directionality test

---
 test/python/transpiler/test_two_qubit_peephole.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/test/python/transpiler/test_two_qubit_peephole.py b/test/python/transpiler/test_two_qubit_peephole.py
index 9c17a74a6264..3800278d0a3e 100644
--- a/test/python/transpiler/test_two_qubit_peephole.py
+++ b/test/python/transpiler/test_two_qubit_peephole.py
@@ -508,6 +508,7 @@ def test_pass_respects_directionality(self):
             CXGate(),
             {
                 (1, 0): InstructionProperties(error=3.058e-3, duration=6.8e-8),
+                (0, 1): InstructionProperties(error=.99999),
             },
         )
         target.add_instruction(
@@ -534,10 +535,12 @@ def test_pass_respects_directionality(self):
         peephole = TwoQubitPeepholeOptimization(target)
         qc = QuantumCircuit(2)
         qc.swap(0, 1)
-        qc = transpile(qc, target=target, seed_transpiler=1234, optimization_level=2)
         res = peephole(qc)
         self.assertTrue(self.all_inst_in_target(res, target))
-        self.assertEqual(res, qc)
+        self.assertEqual(Operator(res), Operator(qc))
+        for inst in res.data:
+            if len(inst.qubits) == 2:
+                self.assertEqual(inst.qubits, (res.qubits[1], res.qubits[0]))
         # Check run of swaps
         qc_duplicated = QuantumCircuit(2)
         for _ in range(100):
@@ -546,8 +549,9 @@ def test_pass_respects_directionality(self):
             qc_duplicated, target=target, seed_transpiler=1234, optimization_level=0
         )
         res = peephole(qc_duplicated)
+        self.assertEqual(Operator(res), Operator(qc_duplicated))
+        self.assertEqual(Operator(res), Operator(np.eye(4, dtype=complex)))
         self.assertTrue(self.all_inst_in_target(res, target))
-        self.assertEqual(Operator(res), Operator(QuantumCircuit(2)))
 
         qc_duplicated = QuantumCircuit(2)
         for _ in range(101):
@@ -558,6 +562,9 @@ def test_pass_respects_directionality(self):
         res = peephole(qc_duplicated)
         self.assertTrue(self.all_inst_in_target(res, target))
         self.assertEqual(Operator(res), Operator(qc_duplicated))
+        for inst in res.data:
+            if len(inst.qubits) == 2:
+                self.assertEqual(inst.qubits, (res.qubits[1], res.qubits[0]))
 
     def all_inst_in_target(self, circuit: QuantumCircuit, target: Target, allow_inverse=False):
         for inst in circuit.data:

From 2f682d5255b338f2b785b799278a5c8fd325ec61 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Tue, 17 Mar 2026 12:30:17 -0400
Subject: [PATCH 37/64] Fix dag reconstruction bug on insertion too early

Prior to this commit there was a bug in the dag reconstruction logic for
specific subcircuit formations that could cause the synthesized block to
be inserted before its parent nodes in the dag. Specifically, if you
think of a circuit such as:

0: h(1)
1: cx(3, 2)
2: cx(1, 2)
3: cx(2, 1)

(not an exact example but showcases it well). Nodes 0, 2, and 3 form a
block that can be optimized through peephole resynthesis. The previous
logic for reconstruction could encounter node 0 first when iterating
over the nodes in topological order and then would proceed to insert
the entire contents of the synthesized block before node 1. This would
be invalid because node 1 is a parent of 2 and 3. This commit fixes this
by holding the block insertion until the first 2q node instead of the
first node.
---
 .../src/passes/two_qubit_peephole.rs          | 27 ++++++++++---------
 .../src/passes/unitary_synthesis/mod.rs       |  1 +
 .../transpiler/test_two_qubit_peephole.py     |  8 ++----
 3 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/crates/transpiler/src/passes/two_qubit_peephole.rs b/crates/transpiler/src/passes/two_qubit_peephole.rs
index a0119c80c99f..1d838562405a 100644
--- a/crates/transpiler/src/passes/two_qubit_peephole.rs
+++ b/crates/transpiler/src/passes/two_qubit_peephole.rs
@@ -82,11 +82,7 @@ pub fn two_qubit_unitary_peephole_optimize(
                     let inst = dag.dag()[*node_index].unwrap_operation();
                     let qubits = dag.get_qargs(inst.qubits);
                     if qubits.len() == 2 {
-                        if qubits[0] > qubits[1] {
-                            Some([qubits[1], qubits[0]])
-                        } else {
-                            Some([qubits[0], qubits[1]])
-                        }
+                        Some([qubits[0], qubits[1]])
                     } else {
                         None
                     }
@@ -199,15 +195,21 @@ pub fn two_qubit_unitary_peephole_optimize(
                 if processed_runs.contains(run_index) {
                     continue;
                 }
-                // A None is inserted into the node_mapping as the value for a run that we don't
+                // If this is not a two qubit gate then there is a chance this will cause the
+                // insertion to happen too early. We skip the nodes in a run until we encounter
+                // a 2q gate which ensure the block is inserted into the correct location in the
+                // circuit.
+                if dag.dag()[node].unwrap_operation().op.num_qubits() != 2 {
+                    continue;
+                }
+                // A None is inserted into the run_mapping as the value for a run that we don't
                 // substitute but was identified so we added an explicit None to preserve the
-                // indexing with the vec.
+                // indexing with the vec. This shouldn't be possible to hit the else condition, but
+                // it's left in
                 let Some((result, qargs_virt)) = run_mapping[*run_index].as_ref() else {
-                    let NodeType::Operation(ref instr) = dag.dag()[node] else {
-                        unreachable!("Must be an op node")
-                    };
-                    out_dag_builder.push_back(instr.clone())?;
-                    continue;
+                    unreachable!(
+                        "node_mapping can't contain a value pointing to an unpoluated run in run_mapping"
+                    );
                 };
                 let order = result.dir.as_indices();
                 let out_qargs = [qargs_virt[order[0] as usize], qargs_virt[order[1] as usize]];
@@ -217,7 +219,6 @@ pub fn two_qubit_unitary_peephole_optimize(
                     out_dag_builder.insert_qargs(&[out_qargs[0], out_qargs[1]]),
                     out_dag_builder.insert_qargs(&[out_qargs[1], out_qargs[0]]),
                 ];
-
                 for (gate, params, local_qubits) in &result.sequence.gates {
                     let qubits = match local_qubits.as_slice() {
                         [0] => qubit_keys[0],
diff --git a/crates/transpiler/src/passes/unitary_synthesis/mod.rs b/crates/transpiler/src/passes/unitary_synthesis/mod.rs
index aa88a28f47a9..70886e4ee7ee 100644
--- a/crates/transpiler/src/passes/unitary_synthesis/mod.rs
+++ b/crates/transpiler/src/passes/unitary_synthesis/mod.rs
@@ -488,6 +488,7 @@ fn synthesize_1q_matrix_onto(
     Ok(true)
 }
 
+#[derive(Debug)]
 pub struct TwoQSynthesisResult {
     pub sequence: TwoQubitGateSequence,
     pub dir: Direction2q,
diff --git a/test/python/transpiler/test_two_qubit_peephole.py b/test/python/transpiler/test_two_qubit_peephole.py
index 3800278d0a3e..c357fa9db645 100644
--- a/test/python/transpiler/test_two_qubit_peephole.py
+++ b/test/python/transpiler/test_two_qubit_peephole.py
@@ -508,7 +508,7 @@ def test_pass_respects_directionality(self):
             CXGate(),
             {
                 (1, 0): InstructionProperties(error=3.058e-3, duration=6.8e-8),
-                (0, 1): InstructionProperties(error=.99999),
+                (0, 1): InstructionProperties(error=0.99999),
             },
         )
         target.add_instruction(
@@ -767,15 +767,11 @@ def test_random_circuit(self, target_2q_gates, target_1q_gates):
             cmap.size(), basis_gates=basis_gates, seed=2024, coupling_map=cmap
         )
         peephole = TwoQubitPeepholeOptimization(backend.target)
-        qc = random_circuit(cmap.size(), 2, max_operands=2, seed=12345_42)
+        qc = random_circuit(cmap.size(), 79, max_operands=2, seed=12345_42)
         qc = transpile(qc, target=backend.target, optimization_level=0, seed_transpiler=2025)
         result = peephole(qc)
         if "rzx" in target_2q_gates or "rzz" in target_2q_gates or "rxx" in target_2q_gates:
             self.all_inst_in_target(result, backend.target, allow_inverse=True)
         else:
             self.all_inst_in_target(result, backend.target)
-        print("Source")
-        print(qc)
-        print("Result")
-        print(result)
         np.testing.assert_allclose(Operator(result), Operator(qc), atol=1e-12, rtol=0)

From 79f2466784f9a0e57f3218c1170def4bf844f81e Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Tue, 17 Mar 2026 12:44:02 -0400
Subject: [PATCH 38/64] Remove smallvec usage for original sequence scoring

---
 .../src/passes/two_qubit_peephole.rs          | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/crates/transpiler/src/passes/two_qubit_peephole.rs b/crates/transpiler/src/passes/two_qubit_peephole.rs
index 1d838562405a..67ddd8b17a96 100644
--- a/crates/transpiler/src/passes/two_qubit_peephole.rs
+++ b/crates/transpiler/src/passes/two_qubit_peephole.rs
@@ -20,7 +20,6 @@ use pyo3::intern;
 use pyo3::prelude::*;
 use rayon::prelude::*;
 use rustworkx_core::petgraph::stable_graph::NodeIndex;
-use smallvec::SmallVec;
 
 use qiskit_circuit::dag_circuit::{DAGCircuit, NodeType};
 use qiskit_circuit::instruction::Parameters;
@@ -114,23 +113,23 @@ pub fn two_qubit_unitary_peephole_optimize(
                 let NodeType::Operation(ref inst) = dag.dag()[*node_index] else {
                     unreachable!("All run nodes will be ops")
                 };
-                let qubits: SmallVec<[PhysicalQubit; 2]> = dag
-                    .get_qargs(inst.qubits)
-                    .iter()
-                    .map(|qubit| PhysicalQubit(qubit.0))
-                    .collect();
+                let qubits: &[_] = match dag.get_qargs(inst.qubits) {
+                    [q] => &[PhysicalQubit(q.0)],
+                    [q0, q1] => &[PhysicalQubit(q0.0), PhysicalQubit(q1.0)],
+                    _ => panic!("Runs should only contain 1q and 2q gates"),
+                };
                 if qubits.len() == 2 {
                     original_2q_count += 1;
                 }
                 let name = inst.op.name();
-                let gate_fidelity = match target.get_error(name, qubits.as_slice()) {
+                let gate_fidelity = match target.get_error(name, qubits) {
                     Some(err) => 1. - err,
                     None => {
                         // If error rate is None this can mean either the gate is not supported
                         // in the target or the gate is ideal. We need to do a second lookup
                         // to determine if the gate is supported, and if it isn't we don't need
                         // to finish scoring because we know we'll use the synthesis output
-                        if !target.instruction_supported(name, &qubits, inst.params_view(), true) {
+                        if !target.instruction_supported(name, qubits, inst.params_view(), true) {
                             outside_target = true;
                             break;
                         }
@@ -204,8 +203,9 @@ pub fn two_qubit_unitary_peephole_optimize(
                 }
                 // A None is inserted into the run_mapping as the value for a run that we don't
                 // substitute but was identified so we added an explicit None to preserve the
-                // indexing with the vec. This shouldn't be possible to hit the else condition, but
-                // it's left in
+                // indexing with the vec. This shouldn't be possible to hit the else condition
+                // since node mapping will never contain a value for a run_mapping index that
+                // is set to None.
                 let Some((result, qargs_virt)) = run_mapping[*run_index].as_ref() else {
                     unreachable!(
                         "node_mapping can't contain a value pointing to an unpoluated run in run_mapping"

From 1cdaa610e485dcb972d2bf56a8ecc25118141956 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Tue, 17 Mar 2026 12:58:52 -0400
Subject: [PATCH 39/64] Fix original score creation

---
 crates/transpiler/src/passes/two_qubit_peephole.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/transpiler/src/passes/two_qubit_peephole.rs b/crates/transpiler/src/passes/two_qubit_peephole.rs
index 67ddd8b17a96..151f19fa29af 100644
--- a/crates/transpiler/src/passes/two_qubit_peephole.rs
+++ b/crates/transpiler/src/passes/two_qubit_peephole.rs
@@ -140,7 +140,7 @@ pub fn two_qubit_unitary_peephole_optimize(
             }
             let original_score = (
                 original_2q_count,
-                1. - original_fidelity,
+                original_fidelity,
                 original_total_count,
             );
             let new_2q_count = result

From a7794ffb9a00b8f16a2d572810368423d3c5604c Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Tue, 17 Mar 2026 15:36:10 -0400
Subject: [PATCH 40/64] Fix comparison between original score and new score

The score needs to be lower to count as an improvement. Before this
commit we were comparing fidelities as the second field, but a lower
fidelity is worse and we shouldn't prefer that. This switches to using
error for the second field in the score comparison to ensure that less
is better.
---
 crates/transpiler/src/passes/two_qubit_peephole.rs | 12 ++++++++++--
 test/python/transpiler/test_two_qubit_peephole.py  |  5 +----
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/crates/transpiler/src/passes/two_qubit_peephole.rs b/crates/transpiler/src/passes/two_qubit_peephole.rs
index 7cadc78b7abd..4a9daf3ae92a 100644
--- a/crates/transpiler/src/passes/two_qubit_peephole.rs
+++ b/crates/transpiler/src/passes/two_qubit_peephole.rs
@@ -138,7 +138,11 @@ pub fn two_qubit_unitary_peephole_optimize(
                 };
                 original_fidelity *= gate_fidelity;
             }
-            let original_score = (original_2q_count, original_fidelity, original_total_count);
+            let original_score = (
+                original_2q_count,
+                1. - original_fidelity,
+                original_total_count,
+            );
             let new_2q_count = result
                 .sequence
                 .gates
@@ -146,7 +150,11 @@ pub fn two_qubit_unitary_peephole_optimize(
                 .filter(|x| x.0.num_qubits() == 2)
                 .count();
             let new_gate_count = result.sequence.gates.len();
-            let new_score = (new_2q_count, result.fidelity.unwrap_or(1.), new_gate_count);
+            let new_score = (
+                new_2q_count,
+                1. - result.fidelity.unwrap_or(1.),
+                new_gate_count,
+            );
             // If the we are not outside the target and the new score isn't any better just use the
             // original (this includes a tie).
             if !outside_target && new_score >= original_score {
diff --git a/test/python/transpiler/test_two_qubit_peephole.py b/test/python/transpiler/test_two_qubit_peephole.py
index c357fa9db645..e57cb2558eee 100644
--- a/test/python/transpiler/test_two_qubit_peephole.py
+++ b/test/python/transpiler/test_two_qubit_peephole.py
@@ -481,7 +481,7 @@ def test_swap_on_cz_target(self):
         qc.swap(0, 1)
         qc = transpile(qc, target=target, seed_transpiler=1234, optimization_level=0)
         res = peephole(qc)
-        np.testing.assert_allclose(Operator(qc), Operator(res))
+        np.testing.assert_allclose(Operator(qc), Operator(res), atol=1e-12, rtol=0)
         # Check run of swaps
         qc_duplicated = QuantumCircuit(2)
         for _ in range(100):
@@ -689,13 +689,10 @@ def test_two_qubit_parametrized_gates_controlled_u_target(
             target.add_instruction(RXGate(lam))
             target.add_instruction(RZGate(theta))
             target.add_instruction(target_gate)
-
         qc = QuantumCircuit(2)
         qc.append(gate, [0, 1])
-
         peephole = TwoQubitPeepholeOptimization(target)
         transpiled_circuit = peephole(qc)
-
         legacy_path = PassManager(
             [
                 ConsolidateBlocks(target=target),

From 8bcc573bf2fa910090a3569a74f2dd1b7bce514b Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Tue, 17 Mar 2026 17:23:25 -0400
Subject: [PATCH 41/64] Use Dashmap<NodeIndex, usize> instead of
 Mutex<HashMap<NodeIndex, usize>>

---
 Cargo.lock                                    | 52 +++++++++++++++++++
 crates/transpiler/Cargo.toml                  |  1 +
 .../src/passes/two_qubit_peephole.rs          | 24 ++++-----
 3 files changed, 65 insertions(+), 12 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index d5027ddd7b10..cec9b7cfc7b5 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -477,6 +477,20 @@ dependencies = [
  "typenum",
 ]
 
+[[package]]
+name = "dashmap"
+version = "6.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf"
+dependencies = [
+ "cfg-if",
+ "crossbeam-utils",
+ "hashbrown 0.14.5",
+ "lock_api",
+ "once_cell",
+ "parking_lot_core",
+]
+
 [[package]]
 name = "dashu"
 version = "0.4.2"
@@ -1178,6 +1192,15 @@ version = "0.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039"
 
+[[package]]
+name = "lock_api"
+version = "0.4.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965"
+dependencies = [
+ "scopeguard",
+]
+
 [[package]]
 name = "log"
 version = "0.4.28"
@@ -1606,6 +1629,19 @@ version = "4.2.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9c6901729fa79e91a0913333229e9ca5dc725089d1c363b2f4b4760709dc4a52"
 
+[[package]]
+name = "parking_lot_core"
+version = "0.9.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "redox_syscall",
+ "smallvec",
+ "windows-link 0.2.0",
+]
+
 [[package]]
 name = "paste"
 version = "1.0.15"
@@ -2150,6 +2186,7 @@ dependencies = [
  "anyhow",
  "approx",
  "bytemuck",
+ "dashmap",
  "fixedbitset 0.5.7",
  "hashbrown 0.15.5",
  "indexmap",
@@ -2340,6 +2377,15 @@ version = "0.5.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "03251193000f4bd3b042892be858ee50e8b3719f2b08e5833ac4353724632430"
 
+[[package]]
+name = "redox_syscall"
+version = "0.5.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d"
+dependencies = [
+ "bitflags 2.9.4",
+]
+
 [[package]]
 name = "regex"
 version = "1.12.3"
@@ -2500,6 +2546,12 @@ version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e1cf6437eb19a8f4a6cc0f7dca544973b0b78843adbfeb3683d1a94a0024a294"
 
+[[package]]
+name = "scopeguard"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
+
 [[package]]
 name = "semver"
 version = "1.0.27"
diff --git a/crates/transpiler/Cargo.toml b/crates/transpiler/Cargo.toml
index e8639fcd9ed3..23839e28209c 100644
--- a/crates/transpiler/Cargo.toml
+++ b/crates/transpiler/Cargo.toml
@@ -30,6 +30,7 @@ thiserror.workspace = true
 bytemuck.workspace = true
 fixedbitset.workspace = true
 anyhow.workspace = true
+dashmap = "6.1.0"
 
 [dependencies.uuid]
 workspace = true
diff --git a/crates/transpiler/src/passes/two_qubit_peephole.rs b/crates/transpiler/src/passes/two_qubit_peephole.rs
index 4a9daf3ae92a..2743b5a23285 100644
--- a/crates/transpiler/src/passes/two_qubit_peephole.rs
+++ b/crates/transpiler/src/passes/two_qubit_peephole.rs
@@ -10,11 +10,11 @@
 // copyright notice, and modified files need to carry a notice indicating
 // that they have been altered from the originals.
 
-use std::sync::Mutex;
 #[cfg(feature = "cache_pygates")]
 use std::sync::OnceLock;
 
-use hashbrown::{HashMap, HashSet};
+use dashmap::DashMap;
+use hashbrown::HashSet;
 use pyo3::Python;
 use pyo3::intern;
 use pyo3::prelude::*;
@@ -65,13 +65,19 @@ pub fn two_qubit_unitary_peephole_optimize(
     if runs.is_empty() {
         return Ok(None);
     }
-    let node_mapping: HashMap<NodeIndex, usize> =
-        HashMap::with_capacity(runs.iter().map(|run| run.len()).sum());
-    let locked_node_mapping = Mutex::new(node_mapping);
+    let node_mapping: DashMap<NodeIndex, usize, ahash::RandomState> =
+        DashMap::with_capacity_and_hasher(
+            runs.iter().map(|run| run.len()).sum(),
+            ahash::RandomState::default(),
+        );
     let physical_qubits = (0..dag.num_qubits() as u32)
         .map(PhysicalQubit::new)
         .collect::<Vec<_>>();
     let approximation = Approximation::from_py_approximation_degree(approximation_degree);
+    let unitary_synthesis_config = UnitarySynthesisConfig {
+        approximation,
+        ..Default::default()
+    };
 
     let find_best_sequence =
         |run_index: usize, node_indices: &[NodeIndex]| -> PyResult<MappingIterItem> {
@@ -89,10 +95,6 @@ pub fn two_qubit_unitary_peephole_optimize(
                 .unwrap();
             let q_phys = q_virt.map(|q| physical_qubits[q.index()]);
             let matrix = blocks_to_matrix(dag, node_indices, q_virt)?;
-            let unitary_synthesis_config = UnitarySynthesisConfig {
-                approximation,
-                ..Default::default()
-            };
             let mut synthesis_state = UnitarySynthesisState::new(unitary_synthesis_config);
 
             let result = synthesize_2q_matrix(
@@ -163,7 +165,6 @@ pub fn two_qubit_unitary_peephole_optimize(
             // This is done at the end of the map in some attempt to minimize
             // lock contention. If this were serial code it'd make more sense
             // to do this as part of the iteration building the
-            let mut node_mapping = locked_node_mapping.lock().unwrap();
             for node in node_indices {
                 node_mapping.insert(*node, run_index);
             }
@@ -188,14 +189,13 @@ pub fn two_qubit_unitary_peephole_optimize(
     let mut processed_runs: HashSet<usize> = HashSet::with_capacity(run_mapping.len());
     let out_dag = dag.copy_empty_like_with_same_capacity(VarsMode::Alike, BlocksMode::Keep)?;
     let mut out_dag_builder = out_dag.into_builder();
-    let node_mapping = locked_node_mapping.into_inner().unwrap();
     if node_mapping.is_empty() {
         return Ok(None);
     }
     for node in dag.topological_op_nodes(false) {
         match node_mapping.get(&node) {
             Some(run_index) => {
-                if processed_runs.contains(run_index) {
+                if processed_runs.contains(run_index.value()) {
                     continue;
                 }
                 // If this is not a two qubit gate then there is a chance this will cause the

From 4ecb01ec074fea72035a3b23389fca4429e620fe Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Tue, 17 Mar 2026 23:10:42 -0400
Subject: [PATCH 42/64] Use thread local storage for decomposer cache

This commit moves away from using a unique UnitarySynthesisState
instance for each block run in the parallel iterator to using a thread
local cache. This means we'll have n caches that are reused per thread.
The downside with this is potentially re-computing the same set of
decomposers between threads. The other tradeoff is this requires a
RefCell for dynamic borrow checking as the thread local storage only
returns `&` references and not mutable ones. The dynamic borrow
checking will have a runtime overhead to access the cache. In practice
the overhead of this approach should be minimal, especially compared to
some of the other options, and this approach speeds the pass up about
7%.

The cache struct used in the state object is not thread safe, as at
its core it's based around an IndexMap to map the qubits to the
decomposers to use. While we could look at refactoring the struct to
be internally thread safe, this would rely on some form of
synchronization which has the potential for contention between
threads. Either we have to internally use an RwLock
or mutex around the internal fields on the cache struct or we could look
at using dashmap in place of hashmap (but not the indexmap) to move the
synchronization to the shards/chunks internally to reduce contention.
But the locking necessary around the Decomposer2qCacheInner would cause
a large amount of contention because that's the core structure that will
be held by a thread for the most expensive operation in a thread,
running the decomposition.

Another option would be to use a per-qarg cache so we have
an outer dashmap instance that is keyed on the qargs array and the value
is the decomposer cache instance for that qarg. This would work fine and
would avoid the overhead of having to potentially compute the decomposers
multiple times, but it would use slightly more memory (assuming there are
more qargs in the target than threads). There is still some
synchronization if two threads are trying to compute decomposition on
the same qargs at the same time (or if the qargs are on the same shard). This
would probably be a viable alternative (I tested it locally but didn't
get accurate benchmarks before switching to thread local storage).

There is also the option to put the state object behind a mutex, but this
has the same issues as making the cache thread safe: the decomposer cache
will effectively serialize the unitary synthesis calls because the lock
will be held by a thread for the duration of the decomposition.
---
 Cargo.lock                                         | 1 +
 crates/transpiler/Cargo.toml                       | 1 +
 crates/transpiler/src/passes/two_qubit_peephole.rs | 9 ++++++---
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index cec9b7cfc7b5..79b4d00627df 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2206,6 +2206,7 @@ dependencies = [
  "rustworkx-core",
  "smallvec",
  "thiserror 2.0.18",
+ "thread_local",
  "uuid",
 ]
 
diff --git a/crates/transpiler/Cargo.toml b/crates/transpiler/Cargo.toml
index 23839e28209c..2e4062e959a3 100644
--- a/crates/transpiler/Cargo.toml
+++ b/crates/transpiler/Cargo.toml
@@ -31,6 +31,7 @@ bytemuck.workspace = true
 fixedbitset.workspace = true
 anyhow.workspace = true
 dashmap = "6.1.0"
+thread_local = "1.1"
 
 [dependencies.uuid]
 workspace = true
diff --git a/crates/transpiler/src/passes/two_qubit_peephole.rs b/crates/transpiler/src/passes/two_qubit_peephole.rs
index 2743b5a23285..874124ae3fc6 100644
--- a/crates/transpiler/src/passes/two_qubit_peephole.rs
+++ b/crates/transpiler/src/passes/two_qubit_peephole.rs
@@ -10,6 +10,7 @@
 // copyright notice, and modified files need to carry a notice indicating
 // that they have been altered from the originals.
 
+use std::cell::RefCell;
 #[cfg(feature = "cache_pygates")]
 use std::sync::OnceLock;
 
@@ -37,6 +38,7 @@ use crate::target::Target;
 use qiskit_circuit::PhysicalQubit;
 use qiskit_circuit::getenv_use_multiple_threads;
 use qiskit_quantum_info::convert_2q_block_matrix::blocks_to_matrix;
+use thread_local::ThreadLocal;
 
 type MappingIterItem = Option<(TwoQSynthesisResult, [Qubit; 2])>;
 
@@ -78,7 +80,7 @@ pub fn two_qubit_unitary_peephole_optimize(
         approximation,
         ..Default::default()
     };
-
+    let thread_local_states = ThreadLocal::new();
     let find_best_sequence =
         |run_index: usize, node_indices: &[NodeIndex]| -> PyResult<MappingIterItem> {
             let q_virt = node_indices
@@ -95,12 +97,13 @@ pub fn two_qubit_unitary_peephole_optimize(
                 .unwrap();
             let q_phys = q_virt.map(|q| physical_qubits[q.index()]);
             let matrix = blocks_to_matrix(dag, node_indices, q_virt)?;
-            let mut synthesis_state = UnitarySynthesisState::new(unitary_synthesis_config);
+            let synthesis_state: &RefCell<UnitarySynthesisState> = thread_local_states
+                .get_or(|| RefCell::new(UnitarySynthesisState::new(unitary_synthesis_config)));
 
             let result = synthesize_2q_matrix(
                 matrix.into(),
                 q_phys,
-                &mut synthesis_state,
+                &mut synthesis_state.borrow_mut(),
                 QpuConstraint::Target(target),
             )?;
             if result.is_none() {

From 76b40abd72f4d34793c6aea27f2b431941a8369c Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Wed, 18 Mar 2026 00:20:35 -0400
Subject: [PATCH 43/64] Fix doc indent

---
 qiskit/transpiler/passes/optimization/two_qubit_peephole.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/qiskit/transpiler/passes/optimization/two_qubit_peephole.py b/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
index d40ab39fede4..bf5e15b320d0 100644
--- a/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
+++ b/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
@@ -75,7 +75,7 @@ def __init__(
 
         Args:
             target: The target to run the pass for
-                approximation_degree: heuristic dial used for circuit approximation (1.0=no
+            approximation_degree: heuristic dial used for circuit approximation (1.0=no
                 approximation, 0.0=maximal approximation). Approximation can decrease the number
                 of gates used in the synthesized unitaries smaller at the cost of straying from the
                 original unitary. If ``None``, approximation is done based on gate fidelities

From 76f593543ae0f79259c79053e0c7418e8bdd8a2d Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Wed, 18 Mar 2026 12:05:48 -0400
Subject: [PATCH 44/64] Appease the lint check changing http->https in the
 license header

---
 crates/transpiler/src/passes/two_qubit_peephole.rs          | 2 +-
 qiskit/transpiler/passes/optimization/two_qubit_peephole.py | 2 +-
 test/python/transpiler/test_two_qubit_peephole.py           | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/crates/transpiler/src/passes/two_qubit_peephole.rs b/crates/transpiler/src/passes/two_qubit_peephole.rs
index 874124ae3fc6..41604f17730f 100644
--- a/crates/transpiler/src/passes/two_qubit_peephole.rs
+++ b/crates/transpiler/src/passes/two_qubit_peephole.rs
@@ -4,7 +4,7 @@
 //
 // This code is licensed under the Apache License, Version 2.0. You may
 // obtain a copy of this license in the LICENSE.txt file in the root directory
-// of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+// of this source tree or at https://www.apache.org/licenses/LICENSE-2.0.
 //
 // Any modifications or derivative works of this code must retain this
 // copyright notice, and modified files need to carry a notice indicating
diff --git a/qiskit/transpiler/passes/optimization/two_qubit_peephole.py b/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
index bf5e15b320d0..ff0c2ac5941d 100644
--- a/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
+++ b/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
@@ -4,7 +4,7 @@
 #
 # This code is licensed under the Apache License, Version 2.0. You may
 # obtain a copy of this license in the LICENSE.txt file in the root directory
-# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+# of this source tree or at https://www.apache.org/licenses/LICENSE-2.0.
 #
 # Any modifications or derivative works of this code must retain this
 # copyright notice, and modified files need to carry a notice indicating
diff --git a/test/python/transpiler/test_two_qubit_peephole.py b/test/python/transpiler/test_two_qubit_peephole.py
index e57cb2558eee..9f0cded532a5 100644
--- a/test/python/transpiler/test_two_qubit_peephole.py
+++ b/test/python/transpiler/test_two_qubit_peephole.py
@@ -4,7 +4,7 @@
 #
 # This code is licensed under the Apache License, Version 2.0. You may
 # obtain a copy of this license in the LICENSE.txt file in the root directory
-# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+# of this source tree or at https://www.apache.org/licenses/LICENSE-2.0.
 #
 # Any modifications or derivative works of this code must retain this
 # copyright notice, and modified files need to carry a notice indicating

From 2d40a186f5f1963bda45cdffd7b5a6734c34bedb Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Wed, 18 Mar 2026 12:19:53 -0400
Subject: [PATCH 45/64] Fix release note

---
 .../two_qubit_peephole-de6d3438ed7df6a9.yaml  | 22 ++++++++++---------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/releasenotes/notes/two_qubit_peephole-de6d3438ed7df6a9.yaml b/releasenotes/notes/two_qubit_peephole-de6d3438ed7df6a9.yaml
index 7acb7cb5f614..27891fbed377 100644
--- a/releasenotes/notes/two_qubit_peephole-de6d3438ed7df6a9.yaml
+++ b/releasenotes/notes/two_qubit_peephole-de6d3438ed7df6a9.yaml
@@ -2,10 +2,10 @@
 features_transpiler:
   - |
     Added a new transpiler pass :class:`.TwoQubitPeepholeOptimization` which
-    is intended to perform two qubit unitary peephole optimization, where it
-    analyzes the circuit to find two qubit blocks in the circuit, compute
-    the unitary of that subcircuit, and then replace the original block with
-    the synthesized unitary which uses fewer operations. For example:
+    is intended to perform two qubit unitary peephole optimization. The pass
+    analyzes the circuit to find two qubit blocks in the circuit, it then
+    computes the unitary of that subcircuit, and will replace the original
+    block with the synthesized unitary which uses fewer operations. For example:
 
     .. plot::
 
@@ -22,15 +22,17 @@ features_transpiler:
             unoptimized.cx(1, 0)
 
         # Generate a target with random error rates
-        target = GenericBackendV2(2, ["u", "cx"], coupling_map=[0, 1]).target
+        target = GenericBackendV2(2, ["u", "cx"], coupling_map=[[0, 1]]).target
         # Instantiate pass
         peephole_pass = TwoQubitPeepholeOptimization(target)
         # Run pass and visualize output
         optimized = peephole_pass(unoptimized)
         optimized.draw("mpl")
 
-    This functionality can be perfomed by running
-    :class:`.Collect2qBlocks`, :class:`.ConsolidateBlocks`, and
-    :class:`.UnitarySynthesis` sequentially. However this new pass offers
-    improved runtime performance and also better quality output in cases
-    of overcomplete and hetergeneous targets.
+    This functionality could previously have been perfomed by running
+    :class:`.ConsolidateBlocks`, and:class:`.UnitarySynthesis` sequentially
+    in your pass manager. However, this new pass offers improved runtime
+    performance by performing the synthesis in parallel. It also has improved
+    heuristics enabled by doing the optimization in a single step which can
+    result in better quality output, especially in cases of overcomplete and/or
+    hetergeneous targets.

From 158725f2502e65fc33c38bc63ad5af8d6808e5c1 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Thu, 19 Mar 2026 12:32:42 -0400
Subject: [PATCH 46/64] Return a user error from a decomposer build with an
 incorrect string instead of panic

---
 crates/synthesis/src/two_qubit_decompose.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/synthesis/src/two_qubit_decompose.rs b/crates/synthesis/src/two_qubit_decompose.rs
index 9e5dbb787dcd..7886f08b4ebc 100644
--- a/crates/synthesis/src/two_qubit_decompose.rs
+++ b/crates/synthesis/src/two_qubit_decompose.rs
@@ -2032,7 +2032,7 @@ impl TwoQubitBasisDecomposer {
             gate_params?,
             gate_matrix.as_array(),
             basis_fidelity,
-            EulerBasis::from_str(euler_basis).unwrap(),
+            EulerBasis::from_str(euler_basis).map_err(|err| PyValueError::new_err(err))?,
             pulse_optimize,
         )
     }

From a1643a8f2316f355d23bbe163dff43af0ff74916 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Mon, 23 Mar 2026 14:21:37 -0400
Subject: [PATCH 47/64] Fix scoring of single decomposition

This commit fixes an edge case in the scoring when there was only a
single decomposition available. In UnitarySynthesis for these cases
the fidelity calculation is not run as part of the synthesis because there
is no reason to compute the estimated score when it's not being compared
to anything. However, for the new peephole pass we need to run the
scoring because the estimated fidelity is used to compare against the
original circuit's block to determine whether to replace it or not.
Previously, the new peephole pass treated the lack of a score as it
being ideal, which was incorrect and led to the pass replacing every
block unless the original circuit's block had more 2q gates. This fixes
this and now runs the scoring manually if there is not one returned by
the synthesis.
---
 crates/transpiler/src/passes/two_qubit_peephole.rs    | 11 +++++++++--
 crates/transpiler/src/passes/unitary_synthesis/mod.rs | 11 ++++++-----
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/crates/transpiler/src/passes/two_qubit_peephole.rs b/crates/transpiler/src/passes/two_qubit_peephole.rs
index d3a97f3246fa..7de811cc112d 100644
--- a/crates/transpiler/src/passes/two_qubit_peephole.rs
+++ b/crates/transpiler/src/passes/two_qubit_peephole.rs
@@ -31,7 +31,7 @@ use qiskit_circuit::packed_instruction::{PackedInstruction, PackedOperation};
 use qiskit_circuit::{BlocksMode, Qubit, VarsMode};
 
 use crate::passes::unitary_synthesis::{
-    Approximation, QpuConstraint, TwoQSynthesisResult, synthesize_2q_matrix,
+    Approximation, QpuConstraint, TwoQSynthesisResult, fidelity_2q_sequence, synthesize_2q_matrix,
 };
 use crate::passes::{UnitarySynthesisConfig, UnitarySynthesisState};
 use crate::target::Target;
@@ -157,7 +157,14 @@ pub fn two_qubit_unitary_peephole_optimize(
             let new_gate_count = result.sequence.gates.len();
             let new_score = (
                 new_2q_count,
-                1. - result.fidelity.unwrap_or(1.),
+                1. - result.fidelity.unwrap_or_else(|| {
+                    fidelity_2q_sequence(
+                        &result.dir,
+                        &result.sequence,
+                        &QpuConstraint::Target(target),
+                        q_phys,
+                    )
+                }),
                 new_gate_count,
             );
             // If the we are not outside the target and the new score isn't any better just use the
diff --git a/crates/transpiler/src/passes/unitary_synthesis/mod.rs b/crates/transpiler/src/passes/unitary_synthesis/mod.rs
index 70886e4ee7ee..fc1989043f42 100644
--- a/crates/transpiler/src/passes/unitary_synthesis/mod.rs
+++ b/crates/transpiler/src/passes/unitary_synthesis/mod.rs
@@ -497,14 +497,14 @@ pub struct TwoQSynthesisResult {
 
 #[inline]
 pub fn fidelity_2q_sequence(
-    pair: &(Direction2q, TwoQubitGateSequence),
+    dir: &Direction2q,
+    sequence: &TwoQubitGateSequence,
     constraint: &QpuConstraint,
     qargs_phys: [PhysicalQubit; 2],
 ) -> f64 {
     let QpuConstraint::Target(target) = &constraint else {
         return 1.;
     };
-    let (dir, sequence) = pair;
     let order = dir.as_indices();
     let phys = [qargs_phys[order[0] as usize], qargs_phys[order[1] as usize]];
     sequence
@@ -590,9 +590,10 @@ pub fn synthesize_2q_matrix(
     let mut best_pair = first;
     for sequence in sequences {
         let sequence = sequence?;
-        let prev_fidelity = best_fidelity
-            .unwrap_or_else(|| fidelity_2q_sequence(&best_pair, &constraint, qargs_phys));
-        let this_fidelity = fidelity_2q_sequence(&sequence, &constraint, qargs_phys);
+        let prev_fidelity = best_fidelity.unwrap_or_else(|| {
+            fidelity_2q_sequence(&best_pair.0, &best_pair.1, &constraint, qargs_phys)
+        });
+        let this_fidelity = fidelity_2q_sequence(&sequence.0, &sequence.1, &constraint, qargs_phys);
         if this_fidelity > prev_fidelity {
             best_fidelity = Some(this_fidelity);
             best_pair = sequence;

From dfbe65d136f92ed21cc4040c0a8868a709c26a00 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Thu, 26 Mar 2026 09:03:17 -0400
Subject: [PATCH 48/64] Performance tuning

This commit makes a couple of small adjustments to the pass
implementation to improve the runtime performance. The first is it
switches the topological sort function used for the final iteration to
rebuild the dag. The lexicographical topological sort this was using
before had extra overhead which wasn't necessary for what we were doing.
The output from a slightly different ordering doesn't change the
structure of the dag so using this new function will be slightly faster.
The other key change is moving the node mapping away from dashmap to use
a `Mutex<Vec<usize>>`. This will require a slightly larger object in
memory but it is much faster to access and write to because there is no
hashing or other indirection once the lock is acquired by a thread.
---
 Cargo.lock                                    |  15 --
 crates/transpiler/Cargo.toml                  |   1 -
 .../src/passes/two_qubit_peephole.rs          | 172 +++++++++---------
 3 files changed, 87 insertions(+), 101 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 16c0c4c6d383..398444940761 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -513,20 +513,6 @@ dependencies = [
  "typenum",
 ]
 
-[[package]]
-name = "dashmap"
-version = "6.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf"
-dependencies = [
- "cfg-if",
- "crossbeam-utils",
- "hashbrown 0.14.5",
- "lock_api",
- "once_cell",
- "parking_lot_core",
-]
-
 [[package]]
 name = "dashu"
 version = "0.4.2"
@@ -2295,7 +2281,6 @@ dependencies = [
  "anyhow",
  "approx",
  "bytemuck",
- "dashmap",
  "fixedbitset 0.5.7",
  "hashbrown 0.15.5",
  "indexmap",
diff --git a/crates/transpiler/Cargo.toml b/crates/transpiler/Cargo.toml
index 2e4062e959a3..ae201320ce1c 100644
--- a/crates/transpiler/Cargo.toml
+++ b/crates/transpiler/Cargo.toml
@@ -30,7 +30,6 @@ thiserror.workspace = true
 bytemuck.workspace = true
 fixedbitset.workspace = true
 anyhow.workspace = true
-dashmap = "6.1.0"
 thread_local = "1.1"
 
 [dependencies.uuid]
diff --git a/crates/transpiler/src/passes/two_qubit_peephole.rs b/crates/transpiler/src/passes/two_qubit_peephole.rs
index a1b0d5fb146b..84947aa1ae39 100644
--- a/crates/transpiler/src/passes/two_qubit_peephole.rs
+++ b/crates/transpiler/src/passes/two_qubit_peephole.rs
@@ -11,18 +11,20 @@
 // that they have been altered from the originals.
 
 use std::cell::RefCell;
+use std::sync::Mutex;
 #[cfg(feature = "cache_pygates")]
 use std::sync::OnceLock;
 
-use dashmap::DashMap;
 use hashbrown::HashSet;
+use nalgebra::U4;
+use num_complex::Complex64;
 use pyo3::Python;
 use pyo3::intern;
 use pyo3::prelude::*;
 use rayon::prelude::*;
+use rustworkx_core::petgraph::algo::toposort;
 use rustworkx_core::petgraph::stable_graph::NodeIndex;
-use nalgebra::U4;
-use num_complex::Complex64;
+use rustworkx_core::petgraph::visit::NodeIndexable;
 
 use qiskit_circuit::dag_circuit::{DAGCircuit, NodeType};
 use qiskit_circuit::instruction::Parameters;
@@ -70,11 +72,8 @@ pub fn two_qubit_unitary_peephole_optimize(
     if runs.is_empty() {
         return Ok(None);
     }
-    let node_mapping: DashMap<NodeIndex, usize, ahash::RandomState> =
-        DashMap::with_capacity_and_hasher(
-            runs.iter().map(|run| run.len()).sum(),
-            ahash::RandomState::default(),
-        );
+    let node_mapping: Vec<usize> = vec![usize::MAX; dag.dag().node_bound()];
+    let locked_node_mapping = Mutex::new(node_mapping);
     let physical_qubits = (0..dag.num_qubits() as u32)
         .map(PhysicalQubit::new)
         .collect::<Vec<_>>();
@@ -178,8 +177,9 @@ pub fn two_qubit_unitary_peephole_optimize(
             // This is done at the end of the map in some attempt to minimize
             // lock contention. If this were serial code it'd make more sense
             // to do this as part of the iteration building the
+            let mut node_mapping = locked_node_mapping.lock().unwrap();
             for node in node_indices {
-                node_mapping.insert(*node, run_index);
+                node_mapping[node.index()] = run_index;
             }
             Ok(Some((result, q_virt)))
         };
@@ -202,88 +202,90 @@ pub fn two_qubit_unitary_peephole_optimize(
     let mut processed_runs: HashSet<usize> = HashSet::with_capacity(run_mapping.len());
     let out_dag = dag.copy_empty_like_with_same_capacity(VarsMode::Alike, BlocksMode::Keep)?;
     let mut out_dag_builder = out_dag.into_builder();
+    let node_mapping = locked_node_mapping.lock().unwrap();
     if node_mapping.is_empty() {
         return Ok(None);
     }
-    for node in dag.topological_op_nodes(false) {
-        match node_mapping.get(&node) {
-            Some(run_index) => {
-                if processed_runs.contains(run_index.value()) {
-                    continue;
-                }
-                // If this is not a two qubit gate then there is a chance this will cause the
-                // insertion to happen too early. We skip the nodes in a run until we encounter
-                // a 2q gate which ensure the block is inserted into the correct location in the
-                // circuit.
-                if dag.dag()[node].unwrap_operation().op.num_qubits() != 2 {
-                    continue;
-                }
-                // A None is inserted into the run_mapping as the value for a run that we don't
-                // substitute but was identified so we added an explicit None to preserve the
-                // indexing with the vec. This shouldn't be possible to hit the else condition
-                // since node mapping will never contain a value for a run_mapping index that
-                // is set to None.
-                let Some((result, qargs_virt)) = run_mapping[*run_index].as_ref() else {
-                    unreachable!(
-                        "node_mapping can't contain a value pointing to an unpoluated run in run_mapping"
-                    );
-                };
-                let order = result.dir.as_indices();
-                let out_qargs = [qargs_virt[order[0] as usize], qargs_virt[order[1] as usize]];
-                let qubit_keys = [
-                    out_dag_builder.insert_qargs(&[out_qargs[0]]),
-                    out_dag_builder.insert_qargs(&[out_qargs[1]]),
-                    out_dag_builder.insert_qargs(&[out_qargs[0], out_qargs[1]]),
-                    out_dag_builder.insert_qargs(&[out_qargs[1], out_qargs[0]]),
-                ];
-                for (gate, params, local_qubits) in &result.sequence.gates {
-                    let qubits = match local_qubits.as_slice() {
-                        [0] => qubit_keys[0],
-                        [1] => qubit_keys[1],
-                        [0, 1] => qubit_keys[2],
-                        [1, 0] => qubit_keys[3],
-                        _ => panic!(
-                            "internal logic error: decomposed sequence contained unexpected qargs"
-                        ),
-                    };
-                    let op = match gate.view() {
-                        OperationRef::StandardGate(gate) => PackedOperation::from(gate),
-                        OperationRef::Gate(py_gate) => Python::attach(|py| -> PyResult<_> {
-                            let gate = py_gate.py_copy(py)?;
-                            gate.instruction
-                                .setattr(py, intern!(py, "params"), params)?;
-                            Ok(PackedOperation::from(Box::new(PyOperationTypes::Gate(
-                                gate,
-                            ))))
-                        })?,
-                        _ => {
-                            panic!("internal logic error: decomposed sequence contains a non-gate")
-                        }
-                    };
-                    let params = (!params.is_empty()).then(|| {
-                        Box::new(Parameters::Params(
-                            params.iter().copied().map(Param::Float).collect(),
-                        ))
-                    });
-                    out_dag_builder.push_back(PackedInstruction {
-                      op,
-                      qubits,
-                      clbits: Default::default(),
-                      params,
-                      label: None,
-                      #[cfg(feature = "cache_pygates")] // W: code is inactive due to #[cfg] directives: feature …
-                      py_op: OnceLock::new(),
-                    })?;
-                }
-                out_dag_builder.add_global_phase(&Param::Float(result.sequence.global_phase()))?;
-                processed_runs.insert(*run_index);
+    for node in toposort(dag.dag(), None).unwrap() {
+        if !matches!(dag.dag()[node], NodeType::Operation(_)) {
+            continue;
+        }
+        let run_index = node_mapping[node.index()];
+        if run_index != usize::MAX {
+            if processed_runs.contains(&run_index) {
+                continue;
             }
-            None => {
-                let NodeType::Operation(ref instr) = dag.dag()[node] else {
-                    unreachable!("Must be an op node")
+            // If this is not a two qubit gate then there is a chance this will cause the
+            // insertion to happen too early. We skip the nodes in a run until we encounter
+            // a 2q gate which ensure the block is inserted into the correct location in the
+            // circuit.
+            if dag.dag()[node].unwrap_operation().op.num_qubits() != 2 {
+                continue;
+            }
+            // A None is inserted into the run_mapping as the value for a run that we don't
+            // substitute but was identified so we added an explicit None to preserve the
+            // indexing with the vec. This shouldn't be possible to hit the else condition
+            // since node mapping will never contain a value for a run_mapping index that
+            // is set to None.
+            let Some((result, qargs_virt)) = run_mapping[run_index].as_ref() else {
+                unreachable!(
+                    "node_mapping can't contain a value pointing to an unpoluated run in run_mapping"
+                );
+            };
+            let order = result.dir.as_indices();
+            let out_qargs = [qargs_virt[order[0] as usize], qargs_virt[order[1] as usize]];
+            let qubit_keys = [
+                out_dag_builder.insert_qargs(&[out_qargs[0]]),
+                out_dag_builder.insert_qargs(&[out_qargs[1]]),
+                out_dag_builder.insert_qargs(&[out_qargs[0], out_qargs[1]]),
+                out_dag_builder.insert_qargs(&[out_qargs[1], out_qargs[0]]),
+            ];
+            for (gate, params, local_qubits) in &result.sequence.gates {
+                let qubits = match local_qubits.as_slice() {
+                    [0] => qubit_keys[0],
+                    [1] => qubit_keys[1],
+                    [0, 1] => qubit_keys[2],
+                    [1, 0] => qubit_keys[3],
+                    _ => panic!(
+                        "internal logic error: decomposed sequence contained unexpected qargs"
+                    ),
+                };
+                let op = match gate.view() {
+                    OperationRef::StandardGate(gate) => PackedOperation::from(gate),
+                    OperationRef::Gate(py_gate) => Python::attach(|py| -> PyResult<_> {
+                        let gate = py_gate.py_copy(py)?;
+                        gate.instruction
+                            .setattr(py, intern!(py, "params"), params)?;
+                        Ok(PackedOperation::from(Box::new(PyOperationTypes::Gate(
+                            gate,
+                        ))))
+                    })?,
+                    _ => {
+                        panic!("internal logic error: decomposed sequence contains a non-gate")
+                    }
                 };
-                out_dag_builder.push_back(instr.clone())?;
+                let params = (!params.is_empty()).then(|| {
+                    Box::new(Parameters::Params(
+                        params.iter().copied().map(Param::Float).collect(),
+                    ))
+                });
+                out_dag_builder.push_back(PackedInstruction {
+                  op,
+                  qubits,
+                  clbits: Default::default(),
+                  params,
+                  label: None,
+                  #[cfg(feature = "cache_pygates")] // W: code is inactive due to #[cfg] directives: feature …
+                  py_op: OnceLock::new(),
+                })?;
             }
+            out_dag_builder.add_global_phase(&Param::Float(result.sequence.global_phase()))?;
+            processed_runs.insert(run_index);
+        } else {
+            let NodeType::Operation(ref instr) = dag.dag()[node] else {
+                unreachable!("Must be an op node")
+            };
+            out_dag_builder.push_back(instr.clone())?;
         }
     }
     Ok(Some(out_dag_builder.build()))

From 9e3a651fb600df43f77dd2d91fdf5ebe7fa9e8b8 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Tue, 31 Mar 2026 14:33:12 -0400
Subject: [PATCH 49/64] Don't use a hashset for tracking processed runs

---
 crates/transpiler/src/passes/two_qubit_peephole.rs | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/crates/transpiler/src/passes/two_qubit_peephole.rs b/crates/transpiler/src/passes/two_qubit_peephole.rs
index 4d8e3ddb3a0a..5c19ad145904 100644
--- a/crates/transpiler/src/passes/two_qubit_peephole.rs
+++ b/crates/transpiler/src/passes/two_qubit_peephole.rs
@@ -15,7 +15,6 @@ use std::sync::Mutex;
 #[cfg(feature = "cache_pygates")]
 use std::sync::OnceLock;
 
-use hashbrown::HashSet;
 use nalgebra::U4;
 use num_complex::Complex64;
 use pyo3::Python;
@@ -40,9 +39,9 @@ use crate::passes::unitary_synthesis::{
 use crate::passes::{UnitarySynthesisConfig, UnitarySynthesisState};
 use crate::target::Target;
 use qiskit_circuit::PhysicalQubit;
+use qiskit_synthesis::linalg::nalgebra_array_view;
 use qiskit_synthesis::matrix::two_qubit::blocks_to_matrix;
 use qiskit_util::getenv_use_multiple_threads;
-use qiskit_synthesis::linalg::nalgebra_array_view;
 use thread_local::ThreadLocal;
 
 type MappingIterItem = Option<(TwoQSynthesisResult, [Qubit; 2])>;
@@ -199,7 +198,7 @@ pub fn two_qubit_unitary_peephole_optimize(
     };
     let run_mapping = run_mapping?;
     // After we've computed all the sequences to execute now serially build up a new dag.
-    let mut processed_runs: HashSet<usize> = HashSet::with_capacity(run_mapping.len());
+    let mut processed_runs: Vec<bool> = vec![false; run_mapping.len()];
     let out_dag = dag.copy_empty_like_with_same_capacity(VarsMode::Alike, BlocksMode::Keep)?;
     let mut out_dag_builder = out_dag.into_builder();
     let node_mapping = locked_node_mapping.lock().unwrap();
@@ -212,7 +211,7 @@ pub fn two_qubit_unitary_peephole_optimize(
         }
         let run_index = node_mapping[node.index()];
         if run_index != usize::MAX {
-            if processed_runs.contains(&run_index) {
+            if processed_runs[run_index] {
                 continue;
             }
             // If this is not a two qubit gate then there is a chance this will cause the
@@ -280,7 +279,7 @@ pub fn two_qubit_unitary_peephole_optimize(
                 })?;
             }
             out_dag_builder.add_global_phase(&Param::Float(result.sequence.global_phase()))?;
-            processed_runs.insert(run_index);
+            processed_runs[run_index] = true;
         } else {
             let NodeType::Operation(ref instr) = dag.dag()[node] else {
                 unreachable!("Must be an op node")

From a8dce96f81bfe26f35f191596018335a2150fd43 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Thu, 2 Apr 2026 09:06:32 -0400
Subject: [PATCH 50/64] Remove unnecessary empty check and use
 Mutex::into_inner() when no more locking is needed

---
 crates/transpiler/src/passes/two_qubit_peephole.rs | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/crates/transpiler/src/passes/two_qubit_peephole.rs b/crates/transpiler/src/passes/two_qubit_peephole.rs
index 5c19ad145904..0f6545279b5d 100644
--- a/crates/transpiler/src/passes/two_qubit_peephole.rs
+++ b/crates/transpiler/src/passes/two_qubit_peephole.rs
@@ -201,10 +201,7 @@ pub fn two_qubit_unitary_peephole_optimize(
     let mut processed_runs: Vec<bool> = vec![false; run_mapping.len()];
     let out_dag = dag.copy_empty_like_with_same_capacity(VarsMode::Alike, BlocksMode::Keep)?;
     let mut out_dag_builder = out_dag.into_builder();
-    let node_mapping = locked_node_mapping.lock().unwrap();
-    if node_mapping.is_empty() {
-        return Ok(None);
-    }
+    let node_mapping = locked_node_mapping.into_inner().unwrap();
     for node in toposort(dag.dag(), None).unwrap() {
         if !matches!(dag.dag()[node], NodeType::Operation(_)) {
             continue;

From 27842179f66023350ed00f65a31b9df5558ab145 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Thu, 2 Apr 2026 09:19:51 -0400
Subject: [PATCH 51/64] Add atomic bool to track whether we've made a
 substitution

We don't want to spend time reconstructing an exact copy of the dag if
there are no substitutions needed. Prior to using a vec for tracking the
run indices that nodes are part of we would check if that map was empty.
The vec is always populated and to determine if there are no entries
we'd have to do a worst case O(n) lookup to determine if any entries are
set. To avoid this overhead while keeping the check this adds an atomic
bool that is used to track whether we've substituted any blocks. If this
is not set to true we can just exit early since there are no
substitutions to make.
---
 crates/transpiler/src/passes/two_qubit_peephole.rs | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/crates/transpiler/src/passes/two_qubit_peephole.rs b/crates/transpiler/src/passes/two_qubit_peephole.rs
index 0f6545279b5d..8a747cc2ce8f 100644
--- a/crates/transpiler/src/passes/two_qubit_peephole.rs
+++ b/crates/transpiler/src/passes/two_qubit_peephole.rs
@@ -14,6 +14,7 @@ use std::cell::RefCell;
 use std::sync::Mutex;
 #[cfg(feature = "cache_pygates")]
 use std::sync::OnceLock;
+use std::sync::atomic::AtomicBool;
 
 use nalgebra::U4;
 use num_complex::Complex64;
@@ -73,6 +74,7 @@ pub fn two_qubit_unitary_peephole_optimize(
     }
     let node_mapping: Vec<usize> = vec![usize::MAX; dag.dag().node_bound()];
     let locked_node_mapping = Mutex::new(node_mapping);
+    let substitution_made = AtomicBool::new(false);
     let physical_qubits = (0..dag.num_qubits() as u32)
         .map(PhysicalQubit::new)
         .collect::<Vec<_>>();
@@ -180,6 +182,7 @@ pub fn two_qubit_unitary_peephole_optimize(
             for node in node_indices {
                 node_mapping[node.index()] = run_index;
             }
+            substitution_made.store(true, std::sync::atomic::Ordering::Relaxed);
             Ok(Some((result, q_virt)))
         };
 
@@ -197,6 +200,9 @@ pub fn two_qubit_unitary_peephole_optimize(
             .collect()
     };
     let run_mapping = run_mapping?;
+    if !substitution_made.into_inner() {
+        return Ok(None);
+    }
     // After we've computed all the sequences to execute now serially build up a new dag.
     let mut processed_runs: Vec<bool> = vec![false; run_mapping.len()];
     let out_dag = dag.copy_empty_like_with_same_capacity(VarsMode::Alike, BlocksMode::Keep)?;

From b9ce054fa2840ca3e3c03cfe38487ec1d47a7600 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Sat, 25 Apr 2026 07:50:31 -0400
Subject: [PATCH 52/64] Add release note details to the pass docstring

---
 .../passes/optimization/two_qubit_peephole.py | 36 +++++++++++++++++--
 1 file changed, 33 insertions(+), 3 deletions(-)

diff --git a/qiskit/transpiler/passes/optimization/two_qubit_peephole.py b/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
index ff0c2ac5941d..d2d73a103901 100644
--- a/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
+++ b/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
@@ -55,15 +55,45 @@ class TwoQubitPeepholeOptimization(TransformationPass):
     and use all the cores available on your local system. You can refer to
     the `configuration guide <https://docs.quantum.ibm.com/guides/configure-qiskit-local>`__
     for details on how to control the threading behavior for Qiskit more broadly
-    which will also control this pass.
+    which will also control this pass
 
-    Unlike :class:`.UnitarySynthesis` pass this does not use the :ref`unitary-synth-plugin`.
+    This pass is similar in functionality to running :class:`.ConsolidateBlocks`
+    and :class:`.UnitarySynthesis` sequentially in your pass manager. However,
+    this pass offers improved runtime performance by performing the synthesis in
+    parallel. It also has improved heuristics enabled by doing the optimization in
+    a single step which can result in better quality output, especially in cases
+    of overcomplete and/or hetergeneous targets. However, these heuristics and this
+    pass as a whole are only valid for physical circuits. Additionally, unlike
+    :class:`.UnitarySynthesis` pass this does not use the :ref:`unitary-synth-plugin`.
     This is a tradeoff for performance and it forgoes the pluggability exposed
     via that interface. Internally it currently only uses the :class:`.TwoQubitBasisDecomposer`
     and :class:`.TwoQubitControlledUDecomposer` for synthesizing the two qubit unitaries.
     You should not use this pass if you need to use the pluggable interface and the ability
     to use different synthesis algorithms, instead you should use a combination of
-    :class:`.ConsolidateBlocks` and :class:`.UnitarySynthesis` to use the plugin mechanism
+    :class:`.ConsolidateBlocks` and :class:`.UnitarySynthesis` to leverage the plugin
+    mechanism in :class:`.UnitarySynthesis`.
+
+    .. plot::
+      :include-source:
+
+      from qiskit.circuit import QuantumCircuit
+      from qiskit.transpiler.passes import TwoQubitPeepholeOptimization
+      from qiskit.providers.fake_provider import GenericBackendV2
+
+      # Build an unoptimized 2 qubit circuit
+      unoptimized = QuantumCircuit(2)
+      for i in range(10):
+        if i % 2:
+          unoptimized.cx(0, 1)
+        else:
+          unoptimized.cx(1, 0)
+      # Generate a target with random error rates
+      backend = GenericBackendV2(2, ["u", "cx"], coupling_map=[[0, 1], [1, 0]])
+      # Instantiate pass
+      peephole_pass = TwoQubitPeepholeOptimization(backend.target)
+      # Run pass and visualize output
+      optimized = peephole_pass(unoptimized)
+      optimized.draw("mpl")
     """
 
     def __init__(

From 12b14eee493d05bf499896b5aee30eea04058565 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Sat, 25 Apr 2026 07:52:33 -0400
Subject: [PATCH 53/64] Update test module docstring

---
 test/python/transpiler/test_two_qubit_peephole.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/python/transpiler/test_two_qubit_peephole.py b/test/python/transpiler/test_two_qubit_peephole.py
index 9f0cded532a5..97059fbb55a7 100644
--- a/test/python/transpiler/test_two_qubit_peephole.py
+++ b/test/python/transpiler/test_two_qubit_peephole.py
@@ -13,7 +13,7 @@
 # pylint: disable=missing-function-docstring
 
 """
-Tests for the default UnitarySynthesis transpiler pass.
+Tests for the TwoQubitPeepholeOptimization transpiler pass.
 """
 
 import math

From f80db21f475e67eb72b93dbd56364ef24dc0ae55 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Sat, 25 Apr 2026 07:55:16 -0400
Subject: [PATCH 54/64] Update pass module docstring

---
 qiskit/transpiler/passes/optimization/two_qubit_peephole.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/qiskit/transpiler/passes/optimization/two_qubit_peephole.py b/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
index d2d73a103901..8160e8949ca2 100644
--- a/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
+++ b/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
@@ -10,7 +10,7 @@
 # copyright notice, and modified files need to carry a notice indicating
 # that they have been altered from the originals.
 
-"""Splits each two-qubit gate in the `dag` into two single-qubit gates, if possible without error."""
+"""A transpiler pass to optimize 2q blocks in a circuit."""
 
 from __future__ import annotations
 

From 8c03af39c88729dad61feb012aca414b4f95e6a7 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Sat, 25 Apr 2026 07:58:29 -0400
Subject: [PATCH 55/64] Fix typo in code comment

---
 crates/transpiler/src/passes/two_qubit_peephole.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/transpiler/src/passes/two_qubit_peephole.rs b/crates/transpiler/src/passes/two_qubit_peephole.rs
index 8a747cc2ce8f..6c9850564ac2 100644
--- a/crates/transpiler/src/passes/two_qubit_peephole.rs
+++ b/crates/transpiler/src/passes/two_qubit_peephole.rs
@@ -170,7 +170,7 @@ pub fn two_qubit_unitary_peephole_optimize(
                 }),
                 new_gate_count,
             );
-            // If the we are not outside the target and the new score isn't any better just use the
+            // If we are not outside the target and the new score isn't any better just use the
             // original (this includes a tie).
             if !outside_target && new_score >= original_score {
                 return Ok(None);

From 819110d24bbd64427f2192a7519431d1983a1398 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Sat, 25 Apr 2026 08:15:47 -0400
Subject: [PATCH 56/64] Add docstrings for the new UnitarySynthesis helper
 methods

---
 .../src/passes/unitary_synthesis/mod.rs       | 26 +++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/crates/transpiler/src/passes/unitary_synthesis/mod.rs b/crates/transpiler/src/passes/unitary_synthesis/mod.rs
index b03bbda2d0ca..525a13f6cba8 100644
--- a/crates/transpiler/src/passes/unitary_synthesis/mod.rs
+++ b/crates/transpiler/src/passes/unitary_synthesis/mod.rs
@@ -493,8 +493,20 @@ pub struct TwoQSynthesisResult {
     pub fidelity: Option<f64>,
 }
 
+/// Estimate the fidelity of a synthesized two qubit unitary synthesis output
+///
+/// The estimated fidelity is computed by taking the error rate from the target
+/// and computing the product of the fidelities of each gate in the circuit. In
+/// the absence of error rates (either QpuConstraint doesn't have a target or
+/// the target contains no error rates) then a fidelity of 1 is returned.
+///
+/// # Args
+/// `dir` - The direction of the synthesis (forward or reverse)
+/// `sequence` - The synthesis sequence
+/// `constraint` - The qpu constraints used for the synthesis, typically just a Target
+/// `qargs_phys` - The qpu qargs the unitary is run on
 #[inline]
-pub fn fidelity_2q_sequence(
+pub(crate) fn fidelity_2q_sequence(
     dir: &Direction2q,
     sequence: &TwoQubitGateSequence,
     constraint: &QpuConstraint,
@@ -522,7 +534,17 @@ pub fn fidelity_2q_sequence(
         .product()
 }
 
-pub fn synthesize_2q_matrix(
+/// Synthesize a given two qubit unitary matrix into a gate sequence.
+///
+/// For overcomplete targets this will run all compatible decomposers and pick the output synthesis
+/// with the highest estimated fidelity.
+///
+/// # Args
+/// `unitary` - The unitary to synthesize
+/// `qargs_phys` - The physical qubits the unitary is being applied to
+/// `state` - The internal state of the pass, this includes the configured synthesizers
+/// `constraint` - The qpu constraints used for the synthesis, typically just a Target
+pub(crate) fn synthesize_2q_matrix(
     mut unitary: CowArray<Complex64, Ix2>,
     qargs_phys: [PhysicalQubit; 2],
     state: &mut UnitarySynthesisState,

From 56ea5aa51171efdb545184028cd38a1f8bc4ce00 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Sat, 25 Apr 2026 08:22:06 -0400
Subject: [PATCH 57/64] Expand 2q basis for new tests

---
 test/python/transpiler/test_two_qubit_peephole.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/python/transpiler/test_two_qubit_peephole.py b/test/python/transpiler/test_two_qubit_peephole.py
index 97059fbb55a7..3f7ffcd08638 100644
--- a/test/python/transpiler/test_two_qubit_peephole.py
+++ b/test/python/transpiler/test_two_qubit_peephole.py
@@ -743,6 +743,8 @@ def test_two_qubit_rzz_cz_gates_rzz_target(self):
             ["rzz", "cz"],
             ["rzx", "ecr"],
             ["rzz", "rzx", "rxx", "ecr", "cx", "cz"],
+            ["cz", "swap"],
+            ["cx", "swap"],
         ],
         target_1q_gates=[
             ["rz", "sx"],

From 31f616f69c7f34d835521ad2dec4605893bb98d1 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Sat, 25 Apr 2026 09:14:43 -0400
Subject: [PATCH 58/64] Use the peephole pass heuristic for best synthesis
 selection

Previously there was a mismatch between the scoring of synthesis
results and the peephole pass's comparison with the original block. The
pass is documented as using the tuple (num_2q_gates, error, num_gates)
and picking the min of all the choices. But, when we called the unitary
synthesis function that selects the best synthesis outcome it was
maximizing the estimated fidelity but not considering the gate counts
like the pass is documented as doing. This corrects this mismatch by
updating the function doing the synthesis to be generic on score type
and taking a scorer callback. This lets the peephole pass control the
heuristic used for selecting the best score.
---
 .../src/passes/two_qubit_peephole.rs          | 45 ++++++++++++++++---
 .../src/passes/unitary_synthesis/mod.rs       | 31 +++++++++----
 2 files changed, 60 insertions(+), 16 deletions(-)

diff --git a/crates/transpiler/src/passes/two_qubit_peephole.rs b/crates/transpiler/src/passes/two_qubit_peephole.rs
index 6c9850564ac2..5eff949d89b2 100644
--- a/crates/transpiler/src/passes/two_qubit_peephole.rs
+++ b/crates/transpiler/src/passes/two_qubit_peephole.rs
@@ -34,6 +34,7 @@ use qiskit_circuit::operations::{
 use qiskit_circuit::packed_instruction::{PackedInstruction, PackedOperation};
 use qiskit_circuit::{BlocksMode, Qubit, VarsMode};
 
+use super::unitary_synthesis::Direction2q;
 use crate::passes::unitary_synthesis::{
     Approximation, QpuConstraint, TwoQSynthesisResult, fidelity_2q_sequence, synthesize_2q_matrix,
 };
@@ -42,10 +43,11 @@ use crate::target::Target;
 use qiskit_circuit::PhysicalQubit;
 use qiskit_synthesis::linalg::nalgebra_array_view;
 use qiskit_synthesis::matrix::two_qubit::blocks_to_matrix;
+use qiskit_synthesis::two_qubit_decompose::TwoQubitGateSequence;
 use qiskit_util::getenv_use_multiple_threads;
 use thread_local::ThreadLocal;
 
-type MappingIterItem = Option<(TwoQSynthesisResult, [Qubit; 2])>;
+type MappingIterItem = Option<(TwoQSynthesisResult<f64>, [Qubit; 2])>;
 
 // This is a separate function in case we need to handle any Python synchronization in the future
 // (such as releasing the GIL). For right now this doesn't seem to be necessary, but keeping it
@@ -59,6 +61,24 @@ pub fn py_two_qubit_unitary_peephole_optimize(
     two_qubit_unitary_peephole_optimize(dag, target, approximation_degree)
 }
 
+fn score_sequence(
+    dir: &Direction2q,
+    sequence: &TwoQubitGateSequence,
+    constraint: &QpuConstraint,
+    qargs: [PhysicalQubit; 2],
+) -> (i64, f64, i64) {
+    let fidelity = fidelity_2q_sequence(dir, sequence, constraint, qargs);
+    // Make the gate counts negative because synthesize_2q_matrix picks the largest value
+    // we want to minimize the gate counts.
+    let gate_count = -(sequence.gates.len() as i64);
+    let twoq_gate_count = -(sequence
+        .gates
+        .iter()
+        .filter(|x| x.0.num_qubits() == 2)
+        .count() as i64);
+    (twoq_gate_count, fidelity, gate_count)
+}
+
 /// This function runs the two qubit unitary peephole optimization pass
 ///
 /// It returns None if there is no modifications/optimiations made to the input dag and the pass
@@ -108,6 +128,7 @@ pub fn two_qubit_unitary_peephole_optimize(
                 q_phys,
                 &mut synthesis_state.borrow_mut(),
                 QpuConstraint::Target(target),
+                score_sequence,
             )?;
             if result.is_none() {
                 return Ok(None);
@@ -152,15 +173,20 @@ pub fn two_qubit_unitary_peephole_optimize(
                 original_total_count,
             );
             let new_2q_count = result
-                .sequence
-                .gates
-                .iter()
-                .filter(|x| x.0.num_qubits() == 2)
-                .count();
+                .score
+                .map(|score| -score.0 as usize)
+                .unwrap_or_else(|| {
+                    result
+                        .sequence
+                        .gates
+                        .iter()
+                        .filter(|x| x.0.num_qubits() == 2)
+                        .count()
+                });
             let new_gate_count = result.sequence.gates.len();
             let new_score = (
                 new_2q_count,
-                1. - result.fidelity.unwrap_or_else(|| {
+                1. - result.score.map(|score| score.1).unwrap_or_else(|| {
                     fidelity_2q_sequence(
                         &result.dir,
                         &result.sequence,
@@ -183,6 +209,11 @@ pub fn two_qubit_unitary_peephole_optimize(
                 node_mapping[node.index()] = run_index;
             }
             substitution_made.store(true, std::sync::atomic::Ordering::Relaxed);
+            let result = TwoQSynthesisResult {
+                sequence: result.sequence,
+                dir: result.dir,
+                score: result.score.map(|score| score.1),
+            };
             Ok(Some((result, q_virt)))
         };
 
diff --git a/crates/transpiler/src/passes/unitary_synthesis/mod.rs b/crates/transpiler/src/passes/unitary_synthesis/mod.rs
index 525a13f6cba8..c84a9deb4e21 100644
--- a/crates/transpiler/src/passes/unitary_synthesis/mod.rs
+++ b/crates/transpiler/src/passes/unitary_synthesis/mod.rs
@@ -23,7 +23,9 @@ use numpy::PyReadonlyArray2;
 use pyo3::prelude::*;
 use pyo3::{intern, wrap_pyfunction};
 
-use self::decomposers::{Decomposer2q, DecomposerCache, Direction2q, FlipDirection};
+pub(crate) use self::decomposers::Direction2q;
+
+use self::decomposers::{Decomposer2q, DecomposerCache, FlipDirection};
 use crate::QiskitError;
 use crate::target::Target;
 use qiskit_circuit::bit::QuantumRegister;
@@ -487,10 +489,10 @@ fn synthesize_1q_matrix_onto(
 }
 
 #[derive(Debug)]
-pub struct TwoQSynthesisResult {
+pub struct TwoQSynthesisResult<S> {
     pub sequence: TwoQubitGateSequence,
     pub dir: Direction2q,
-    pub fidelity: Option<f64>,
+    pub score: Option<S>,
 }
 
 /// Estimate the fidelity of a synthesized two qubit unitary synthesis output
@@ -544,12 +546,21 @@ pub(crate) fn fidelity_2q_sequence(
 /// `qargs_phys` - The physical qubits the unitary is being applied to
 /// `state` - The internal state of the pass, this includes the configured synthesizers
 /// `constraint` - The qpu constraints used for the synthesis, typically just a Target
-pub(crate) fn synthesize_2q_matrix(
+/// `fidelity_calculation` - A callable that is called with a two qubit synthesis output sequence and
+///   the context around it. It is expected to return a fidelity score to compare that synthesis output
+///   against other decomposers. The synthesis output with the maximum score is selected and is
+///   what is returned by the `synthesize_2q_matrix`.
+pub(crate) fn synthesize_2q_matrix<F, S>(
     mut unitary: CowArray<Complex64, Ix2>,
     qargs_phys: [PhysicalQubit; 2],
     state: &mut UnitarySynthesisState,
     constraint: QpuConstraint,
-) -> PyResult<Option<TwoQSynthesisResult>> {
+    mut fidelity_calculation: F,
+) -> PyResult<Option<TwoQSynthesisResult<S>>>
+where
+    F: FnMut(&Direction2q, &TwoQubitGateSequence, &QpuConstraint, [PhysicalQubit; 2]) -> S,
+    S: PartialOrd,
+{
     let decomposer_cache = &mut state.cache;
     let config = &state.config;
 
@@ -611,9 +622,9 @@ pub(crate) fn synthesize_2q_matrix(
     for sequence in sequences {
         let sequence = sequence?;
         let prev_fidelity = best_fidelity.unwrap_or_else(|| {
-            fidelity_2q_sequence(&best_pair.0, &best_pair.1, &constraint, qargs_phys)
+            fidelity_calculation(&best_pair.0, &best_pair.1, &constraint, qargs_phys)
         });
-        let this_fidelity = fidelity_2q_sequence(&sequence.0, &sequence.1, &constraint, qargs_phys);
+        let this_fidelity = fidelity_calculation(&sequence.0, &sequence.1, &constraint, qargs_phys);
         if this_fidelity > prev_fidelity {
             best_fidelity = Some(this_fidelity);
             best_pair = sequence;
@@ -624,7 +635,7 @@ pub(crate) fn synthesize_2q_matrix(
     Ok(Some(TwoQSynthesisResult {
         sequence: best_pair.1,
         dir: best_pair.0,
-        fidelity: best_fidelity,
+        score: best_fidelity,
     }))
 }
 
@@ -636,7 +647,9 @@ fn synthesize_2q_matrix_onto(
     state: &mut UnitarySynthesisState,
     constraint: QpuConstraint,
 ) -> PyResult<bool> {
-    let Some(result) = synthesize_2q_matrix(unitary, qargs_phys, state, constraint)? else {
+    let Some(result) =
+        synthesize_2q_matrix(unitary, qargs_phys, state, constraint, fidelity_2q_sequence)?
+    else {
         return Ok(false);
     };
     // ... now apply the best sequence.

From ddd8fdd703c98382dfb31cbbdec798e412895689 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Sat, 25 Apr 2026 20:19:43 -0400
Subject: [PATCH 59/64] Add alt text to plot

---
 qiskit/transpiler/passes/optimization/two_qubit_peephole.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/qiskit/transpiler/passes/optimization/two_qubit_peephole.py b/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
index 8160e8949ca2..698913650f40 100644
--- a/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
+++ b/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
@@ -75,6 +75,7 @@ class TwoQubitPeepholeOptimization(TransformationPass):
 
     .. plot::
       :include-source:
+      :alt: Optimized circuit
 
       from qiskit.circuit import QuantumCircuit
       from qiskit.transpiler.passes import TwoQubitPeepholeOptimization

From 7c2f549ebb5144772dd70c39c181b9c6b46ece64 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Sat, 25 Apr 2026 21:11:16 -0400
Subject: [PATCH 60/64] Add missing docs ref label

---
 qiskit/transpiler/passes/optimization/two_qubit_peephole.py | 4 ++--
 qiskit/transpiler/passes/synthesis/plugin.py                | 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/qiskit/transpiler/passes/optimization/two_qubit_peephole.py b/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
index 698913650f40..9d5b381d0663 100644
--- a/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
+++ b/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
@@ -62,9 +62,9 @@ class TwoQubitPeepholeOptimization(TransformationPass):
     this pass offers improved runtime performance by performing the synthesis in
     parallel. It also has improved heuristics enabled by doing the optimization in
     a single step which can result in better quality output, especially in cases
-    of overcomplete and/or hetergeneous targets. However, these heuristics and this
+    of overcomplete and/or heterogeneous targets. However, these heuristics and this
     pass as a whole are only valid for physical circuits. Additionally, unlike
-    :class:`.UnitarySynthesis` pass this does not use the :ref:`unitary-synth-plugin`.
+    :class:`.UnitarySynthesis` pass this does not use :ref:`unitary-synth-plugin`.
     This is a tradeoff for performance and it forgoes the pluggability exposed
     via that interface. Internally it currently only uses the :class:`.TwoQubitBasisDecomposer`
     and :class:`.TwoQubitControlledUDecomposer` for synthesizing the two qubit unitaries.
diff --git a/qiskit/transpiler/passes/synthesis/plugin.py b/qiskit/transpiler/passes/synthesis/plugin.py
index 11e6c180036a..1f41837e7a3b 100644
--- a/qiskit/transpiler/passes/synthesis/plugin.py
+++ b/qiskit/transpiler/passes/synthesis/plugin.py
@@ -254,6 +254,8 @@ def run(self, high_level_object, coupling_map=None, target=None, qubits=None, **
 Using Plugins
 =============
 
+.. _unitary-synth-plugin:
+
 Unitary Synthesis Plugins
 -------------------------
 

From 5c6d6509ff567c4c0e336a1e528297d16d75b00b Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Sun, 26 Apr 2026 09:16:58 -0400
Subject: [PATCH 61/64] Add xx +/- yy gates to test matrix

---
 test/python/transpiler/test_two_qubit_peephole.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test/python/transpiler/test_two_qubit_peephole.py b/test/python/transpiler/test_two_qubit_peephole.py
index 3f7ffcd08638..bfad640ba113 100644
--- a/test/python/transpiler/test_two_qubit_peephole.py
+++ b/test/python/transpiler/test_two_qubit_peephole.py
@@ -56,6 +56,8 @@
     CRXGate,
     CRYGate,
     CUGate,
+    XXPlusYYGate,
+    XXMinusYYGate,
 )
 from qiskit.circuit import Measure
 from qiskit.circuit.controlflow import IfElseOp
@@ -597,6 +599,8 @@ def all_inst_in_target(self, circuit: QuantumCircuit, target: Target, allow_inve
             CRXGate(0.1),
             CRYGate(0.1),
             CUGate(0.1, 0.2, 0.3, 0.4),
+            XXPlusYYGate(0.1),
+            XXMinusYYGate(0.1),
         ],
         target_gate=[
             CXGate(),

From 66a90de48e60301fe4c9613570e42f931b15a183 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Sun, 26 Apr 2026 09:17:26 -0400
Subject: [PATCH 62/64] Assert a single 2q gate on controlled u decomposition

---
 test/python/transpiler/test_two_qubit_peephole.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/python/transpiler/test_two_qubit_peephole.py b/test/python/transpiler/test_two_qubit_peephole.py
index bfad640ba113..b66d1dccd9e3 100644
--- a/test/python/transpiler/test_two_qubit_peephole.py
+++ b/test/python/transpiler/test_two_qubit_peephole.py
@@ -713,6 +713,7 @@ def test_two_qubit_parametrized_gates_controlled_u_target(
             dict(sorted(transpiled_circuit.count_ops().items())),
             dict(sorted(legacy.count_ops().items())),
         )
+        self.assertEqual(transpiled_circuit.size(lambda x: x.operation.num_qubits == 2), 1)
 
     def test_two_qubit_rzz_cz_gates_rzz_target(self):
         """Test the synthesis of a circuit containing a RZZ and CZ gates

From 3dc56ea26d9c7474a13d2f857729cbaf26c4c35f Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Mon, 4 May 2026 15:50:30 -0400
Subject: [PATCH 63/64] Rework python function split

In testing the pass in the full pass manager there is an underlying
issue with Python defined gates in the circuit and the GIL handling. In
the presence of those gates the thread doing the synthesis and analysis
will need the GIL to get the matrix of the Python gate. But in the
previous version of this pass the parent thread retained the GIL while
the parallel workers ran. This would cause a deadlock because the
worker threads would never be able to acquire the GIL when they tried to
do so during the execution of the synthesis. This commit attempts to fix
this by splitting out the parallel portion from the serial portion of
the function. The serial portion rebuilds the dag from the analysis
results and needs the GIL to copy any python operations in the circuit
as it's rebuilding. So we re-attach the GIL prior to running the serial
portion.

The unitary synthesis decomposer handling needed to be updated as well
because there was implicit usage of the GIL from the py-clone feature
around handling custom rxx equivalent gates for the controlled u
decomposer. This was not correct in a threaded context where the GIL
might be released and would cause a panic. This is updated to explicitly
attach to the python interpreter and handle the python copy with the py
token explicitly. There were places already doing this in the decomposer
handling code but because of the py-clone feature the clone() calls were
missed.
---
 crates/transpiler/src/passes/mod.rs           |  5 +-
 .../src/passes/two_qubit_peephole.rs          | 69 +++++++++++++++----
 .../passes/unitary_synthesis/decomposers.rs   | 21 +++++-
 3 files changed, 78 insertions(+), 17 deletions(-)

diff --git a/crates/transpiler/src/passes/mod.rs b/crates/transpiler/src/passes/mod.rs
index 328c2ce931c6..d1432e773b17 100644
--- a/crates/transpiler/src/passes/mod.rs
+++ b/crates/transpiler/src/passes/mod.rs
@@ -105,7 +105,10 @@ pub use schedule_analysis::scheduling_mod;
 pub use split_2q_unitaries::{run_split_2q_unitaries, split_2q_unitaries_mod};
 pub use substitute_pi4_rotations::{py_run_substitute_pi4_rotations, substitute_pi4_rotations_mod};
 pub use synthesize_rz_rotations::{py_run_synthesize_rz_rotations, synthesize_rz_rotations_mod};
-pub use two_qubit_peephole::{two_qubit_peephole_mod, two_qubit_unitary_peephole_optimize};
+pub use two_qubit_peephole::{
+    py_two_qubit_unitary_peephole_optimize, two_qubit_peephole_mod,
+    two_qubit_unitary_peephole_optimize,
+};
 pub use unitary_synthesis::{
     UnitarySynthesisConfig, UnitarySynthesisState, run_unitary_synthesis, unitary_synthesis_mod,
 };
diff --git a/crates/transpiler/src/passes/two_qubit_peephole.rs b/crates/transpiler/src/passes/two_qubit_peephole.rs
index 5eff949d89b2..1049db006561 100644
--- a/crates/transpiler/src/passes/two_qubit_peephole.rs
+++ b/crates/transpiler/src/passes/two_qubit_peephole.rs
@@ -49,16 +49,43 @@ use thread_local::ThreadLocal;
 
 type MappingIterItem = Option<(TwoQSynthesisResult<f64>, [Qubit; 2])>;
 
-// This is a separate function in case we need to handle any Python synchronization in the future
-// (such as releasing the GIL). For right now this doesn't seem to be necessary, but keeping it
-// separate enables any manipulation of the Py handle in the future.
+/// A python entry-point to the pass function
+///
+/// This function explicitly releases the GIL prior to entering the parallel portion of the pass.
+/// This is necessary because if there are any Python defined and owned objects in the circuit
+/// the pass will need GIL access to interact with that object in parallel.
 #[pyfunction(name = "two_qubit_unitary_peephole_optimize")]
 pub fn py_two_qubit_unitary_peephole_optimize(
+    py: Python,
     dag: &DAGCircuit,
     target: &Target,
     approximation_degree: Option<f64>,
 ) -> PyResult<Option<DAGCircuit>> {
-    two_qubit_unitary_peephole_optimize(dag, target, approximation_degree)
+    let result = py.detach(move || {
+        two_qubit_unitary_peephole_optimize_analysis(dag, target, approximation_degree)
+    })?;
+    let Some(result) = result else {
+        return Ok(None);
+    };
+    two_qubit_unitary_peephole_optimize_apply(dag, result)
+}
+
+/// A non-python entry-point to the pass function.
+///
+/// This function is not safe in the context where Python owned objects are in the circuit.
+/// It will hang/deadlock on the GIL if called in these contexts and should not be used if
+/// there are Python owned objects in the circuit. If you're using this from python you should call
+/// `py_two_qubit_unitary_peephole_optimize` instead.
+pub fn two_qubit_unitary_peephole_optimize(
+    dag: &DAGCircuit,
+    target: &Target,
+    approximation_degree: Option<f64>,
+) -> PyResult<Option<DAGCircuit>> {
+    let result = two_qubit_unitary_peephole_optimize_analysis(dag, target, approximation_degree)?;
+    let Some(result) = result else {
+        return Ok(None);
+    };
+    two_qubit_unitary_peephole_optimize_apply(dag, result)
 }
 
 fn score_sequence(
@@ -79,15 +106,16 @@ fn score_sequence(
     (twoq_gate_count, fidelity, gate_count)
 }
 
-/// This function runs the two qubit unitary peephole optimization pass
-///
-/// It returns None if there is no modifications/optimiations made to the input dag and the pass
-/// function calling this should just return the input dag from the pass.
-pub fn two_qubit_unitary_peephole_optimize(
+struct PeepholeResult {
+    run_mapping: Vec<MappingIterItem>,
+    node_mapping: Vec<usize>,
+}
+
+fn two_qubit_unitary_peephole_optimize_analysis(
     dag: &DAGCircuit,
     target: &Target,
     approximation_degree: Option<f64>,
-) -> PyResult<Option<DAGCircuit>> {
+) -> PyResult<Option<PeepholeResult>> {
     let runs: Vec<Vec<NodeIndex>> = dag.collect_2q_runs().unwrap();
     if runs.is_empty() {
         return Ok(None);
@@ -234,16 +262,29 @@ pub fn two_qubit_unitary_peephole_optimize(
     if !substitution_made.into_inner() {
         return Ok(None);
     }
+    Ok(Some(PeepholeResult {
+        node_mapping: locked_node_mapping.into_inner().unwrap(),
+        run_mapping,
+    }))
+}
+
+/// This function runs the two qubit unitary peephole optimization pass
+///
+/// It returns None if there are no modifications/optimizations made to the input dag and the pass
+/// function calling this should just return the input dag from the pass.
+fn two_qubit_unitary_peephole_optimize_apply(
+    dag: &DAGCircuit,
+    result: PeepholeResult,
+) -> PyResult<Option<DAGCircuit>> {
     // After we've computed all the sequences to execute now serially build up a new dag.
-    let mut processed_runs: Vec<bool> = vec![false; run_mapping.len()];
+    let mut processed_runs: Vec<bool> = vec![false; result.run_mapping.len()];
     let out_dag = dag.copy_empty_like_with_same_capacity(VarsMode::Alike, BlocksMode::Keep)?;
     let mut out_dag_builder = out_dag.into_builder();
-    let node_mapping = locked_node_mapping.into_inner().unwrap();
     for node in toposort(dag.dag(), None).unwrap() {
         if !matches!(dag.dag()[node], NodeType::Operation(_)) {
             continue;
         }
-        let run_index = node_mapping[node.index()];
+        let run_index = result.node_mapping[node.index()];
         if run_index != usize::MAX {
             if processed_runs[run_index] {
                 continue;
@@ -260,7 +301,7 @@ pub fn two_qubit_unitary_peephole_optimize(
             // indexing with the vec. This shouldn't be possible to hit the else condition
             // since node mapping will never contain a value for a run_mapping index that
             // is set to None.
-            let Some((result, qargs_virt)) = run_mapping[run_index].as_ref() else {
+            let Some((result, qargs_virt)) = result.run_mapping[run_index].as_ref() else {
                 unreachable!(
                     "node_mapping can't contain a value pointing to an unpoluated run in run_mapping"
                 );
diff --git a/crates/transpiler/src/passes/unitary_synthesis/decomposers.rs b/crates/transpiler/src/passes/unitary_synthesis/decomposers.rs
index f4868ef3037e..e7535bf6ecae 100644
--- a/crates/transpiler/src/passes/unitary_synthesis/decomposers.rs
+++ b/crates/transpiler/src/passes/unitary_synthesis/decomposers.rs
@@ -94,7 +94,18 @@ impl hash::Hash for ContinuousPauliConstructor {
 }
 impl ContinuousPauliConstructor {
     fn try_build(&self) -> PyResult<TwoQubitControlledUDecomposer> {
-        TwoQubitControlledUDecomposer::new_inner(self.source.clone(), self.euler.as_str())
+        match self.source {
+            RXXEquivalent::Standard(gate) => TwoQubitControlledUDecomposer::new_inner(
+                RXXEquivalent::Standard(gate),
+                self.euler.as_str(),
+            ),
+            RXXEquivalent::CustomPython(ref gate) => Python::attach(|py| {
+                TwoQubitControlledUDecomposer::new_inner(
+                    RXXEquivalent::CustomPython(gate.clone_ref(py)),
+                    self.euler.as_str(),
+                )
+            }),
+        }
     }
 }
 
@@ -650,9 +661,15 @@ fn get_2q_decomposers(
                 // _all_ of the 1q bases simultaneously without further decompositions, but don't
                 // expose that functionality.  This wastes huge amounts of time and needs a fix.
                 for euler in euler_bases.get_bases() {
+                    let rxx_copy = match rxx_equivalent {
+                        RXXEquivalent::Standard(gate) => RXXEquivalent::Standard(gate),
+                        RXXEquivalent::CustomPython(ref gate) => {
+                            RXXEquivalent::CustomPython(Python::attach(|py| gate.clone_ref(py)))
+                        }
+                    };
                     let constructor =
                         Decomposer2qConstructor::ContinuousPauli(ContinuousPauliConstructor {
-                            source: rxx_equivalent.clone(),
+                            source: rxx_copy,
                             euler,
                         });
                     let flip = choose_flip(candidate.direction, &constructor);

From 471d1995accb457ba1e87e24312296f573dceac9 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Mon, 4 May 2026 17:09:11 -0400
Subject: [PATCH 64/64] Use TwoQubitPeepholeOptimization in preset pass
 managers

This follows on from #13419, which added a new optimization pass,
TwoQubitPeepholeOptimization, designed to replace the pair of
ConsolidateBlocks and UnitarySynthesis in the optimization stage after
we have a physical circuit. That PR, however, did not update the preset
pass managers, in order to concentrate the review on just adding the new
pass. This commit continues from there by updating the preset pass
managers to use the new pass in optimization levels 2 and 3 replacing
those levels' optimization stage's previous usage of ConsolidateBlocks
and UnitarySynthesis to achieve the same goal. This should result in
both a runtime performance and transpilation quality improvement as the
new pass is both faster and should produce better fidelity circuits
than the previous peephole optimization.

The test updates made in this PR are needed because the peephole
optimization changes the transpilation output of various test
circuits. These were all verified to be valid outputs and in all cases
a "better" output than before. Specifically, for the updated tests,
these were the changes in output and why they occurred:

* In the two tests in
  test.python.circuit.test_scheduled_circuit.TestScheduledCircuit
  the single CX gate in the output circuit was flipped from (0, 1) to
  (1, 0) because in the target the error rate for the (0, 1) direction
  was higher than the extra error cost of 3 sx gates (the rz gates have
  0 error).
* In test_unroll_only_if_not_gates_in_basis from
  test.python.transpiler.test_preset_passmanagers.TestPresetPassManager
  we no longer run ConsolidateBlocks in the optimization loop so we no
  longer need to add the 2 executions from the init and translation
  stages. The test is updated to count the new peephole pass which is
  the intent of the count check, to check the pass in the optimization
  loop.
* test_2q_circuit_5q_backend_v2 from
  test.python.transpiler.test_vf2_post_layout.TestVF2PostLayoutUndirected
  had the same cx gate flipping because the error rate in the original
  layout for the reverse direction was 0.000779905 vs 0.00163587 in the
  original direction. So the new pass was correctly flipping the cx gate,
  resulting in a different circuit that vf2 couldn't place anywhere
  better. To fix this, the test sets a fixed layout on worse qubits so
  that vf2 will have to place it somewhere better.
* For test_layout_tokyo_fully_connected_cx_4_3 from
  test.python.transpiler.test_preset_passmanagers.TestFinalLayouts the
  output circuit has a better estimated fidelity (although more gates in
  general). The transpiler output goes from an estimated fidelity of
  0.9526614226294913 before the new pass was used to an estimated fidelity
  of 0.961996188569715 after the new pass is used. This new circuit with a
  better fidelity has a different initial layout set now, so the test
  is updated to use the new layout.
---
 .../preset_passmanagers/builtin_plugins.py    | 27 ++++---------------
 test/python/circuit/test_scheduled_circuit.py | 12 +++++++--
 .../transpiler/test_preset_passmanagers.py    | 16 +++++------
 .../python/transpiler/test_vf2_post_layout.py |  3 +--
 4 files changed, 24 insertions(+), 34 deletions(-)

diff --git a/qiskit/transpiler/preset_passmanagers/builtin_plugins.py b/qiskit/transpiler/preset_passmanagers/builtin_plugins.py
index b53a6ff39e22..fc60ef6ba0f8 100644
--- a/qiskit/transpiler/preset_passmanagers/builtin_plugins.py
+++ b/qiskit/transpiler/preset_passmanagers/builtin_plugins.py
@@ -34,6 +34,7 @@
 from qiskit.transpiler.passes import ElidePermutations
 from qiskit.transpiler.passes import RemoveDiagonalGatesBeforeMeasure
 from qiskit.transpiler.passes import CommutativeOptimization
+from qiskit.transpiler.passes import TwoQubitPeepholeOptimization
 from qiskit.transpiler.passes import BasisTranslator
 from qiskit.transpiler.passes import SynthesizeRZRotations
 from qiskit.transpiler.passes import OptimizeCliffordT
@@ -504,18 +505,9 @@ def pass_manager(self, pass_manager_config, optimization_level=None):
                 loop_check, continue_loop = _optimization_check_fixed_point()
             case 2:
                 pre_loop = [
-                    ConsolidateBlocks(
-                        basis_gates=pass_manager_config.basis_gates,
-                        target=pass_manager_config.target,
-                        approximation_degree=pass_manager_config.approximation_degree,
-                    ),
-                    UnitarySynthesis(
-                        pass_manager_config.basis_gates,
+                    TwoQubitPeepholeOptimization(
+                        pass_manager_config.target,
                         approximation_degree=pass_manager_config.approximation_degree,
-                        coupling_map=pass_manager_config.coupling_map,
-                        method=pass_manager_config.unitary_synthesis_method,
-                        plugin_config=pass_manager_config.unitary_synthesis_plugin_config,
-                        target=pass_manager_config.target,
                     ),
                 ]
                 loop = [
@@ -534,18 +526,9 @@ def pass_manager(self, pass_manager_config, optimization_level=None):
             case 3:
                 pre_loop = []
                 loop = [
-                    ConsolidateBlocks(
-                        basis_gates=pass_manager_config.basis_gates,
-                        target=pass_manager_config.target,
-                        approximation_degree=pass_manager_config.approximation_degree,
-                    ),
-                    UnitarySynthesis(
-                        pass_manager_config.basis_gates,
+                    TwoQubitPeepholeOptimization(
+                        pass_manager_config.target,
                         approximation_degree=pass_manager_config.approximation_degree,
-                        coupling_map=pass_manager_config.coupling_map,
-                        method=pass_manager_config.unitary_synthesis_method,
-                        plugin_config=pass_manager_config.unitary_synthesis_plugin_config,
-                        target=pass_manager_config.target,
                     ),
                     RemoveIdentityEquivalent(
                         approximation_degree=pass_manager_config.approximation_degree,
diff --git a/test/python/circuit/test_scheduled_circuit.py b/test/python/circuit/test_scheduled_circuit.py
index 3972f0a681f0..3a6d1a794002 100644
--- a/test/python/circuit/test_scheduled_circuit.py
+++ b/test/python/circuit/test_scheduled_circuit.py
@@ -157,8 +157,13 @@ def test_transpile_delay_circuit_with_backend(self):
             qc, backend=self.backend_with_dt, scheduling_method="alap", layout_method="trivial"
         )
         target_durations = self.backend_with_dt.target.durations()
+        # The longest path in the output is 3 sx gates, 1 cx(1, 0), and two delays for
+        # alignment
+        cx_duration = target_durations.get("cx", (1, 0))
+        sx_duration = target_durations.get("sx", (0,))
+        expected_duration = sx_duration * 3 + cx_duration + 270 + 15
         with self.assertWarns(DeprecationWarning):
-            self.assertEqual(scheduled.duration, target_durations.get("cx", (0, 1)) + 450)
+            self.assertEqual(scheduled.duration, expected_duration)
 
     def test_transpile_circuit_with_custom_instruction(self):
         """See: https://github.com/Qiskit/qiskit-terra/issues/5154"""
@@ -239,8 +244,11 @@ def test_unit_seconds_when_using_backend_durations(self):
             qc, backend=self.backend_with_dt, scheduling_method="alap", layout_method="trivial"
         )
         target_durations = self.backend_with_dt.target.durations()
+        cx_duration = target_durations.get("cx", (1, 0))
+        sx_duration = target_durations.get("sx", (0,))
+        expected_duration = cx_duration + 3 * sx_duration + 320 + 15
         with self.assertWarns(DeprecationWarning):
-            self.assertEqual(scheduled.duration, target_durations.get("cx", (0, 1)) + 500)
+            self.assertEqual(scheduled.duration, expected_duration)
 
     def test_per_qubit_durations(self):
         """Test target with custom instruction_durations"""
diff --git a/test/python/transpiler/test_preset_passmanagers.py b/test/python/transpiler/test_preset_passmanagers.py
index fcf0df5dd4c2..789e1200f51b 100644
--- a/test/python/transpiler/test_preset_passmanagers.py
+++ b/test/python/transpiler/test_preset_passmanagers.py
@@ -48,10 +48,10 @@
 from qiskit.providers.fake_provider import GenericBackendV2
 from qiskit.converters import circuit_to_dag
 from qiskit.circuit.library import GraphStateGate, UnitaryGate
-from qiskit.quantum_info import random_unitary
+from qiskit.quantum_info import random_unitary, Operator
 from qiskit.transpiler.preset_passmanagers import generate_preset_pass_manager
 from qiskit.transpiler.preset_passmanagers import level0, level1, level2, level3
-from qiskit.transpiler.passes import ConsolidateBlocks, GatesInBasis
+from qiskit.transpiler.passes import ConsolidateBlocks, GatesInBasis, TwoQubitPeepholeOptimization
 from qiskit.transpiler.passes.scheduling.alignments.check_durations import InstructionDurationCheck
 from qiskit.transpiler.preset_passmanagers.builtin_plugins import OptimizationPassManager
 from qiskit.transpiler.timing_constraints import TimingConstraints
@@ -291,15 +291,15 @@ def test_unroll_only_if_not_gates_in_basis(self):
         )
         qv_circuit = quantum_volume(3)
         gates_in_basis_true_count = 0
-        consolidate_blocks_count = 0
+        peephole_count = 0
 
         def counting_callback_func(pass_, dag, time, property_set, count):
             nonlocal gates_in_basis_true_count
-            nonlocal consolidate_blocks_count
+            nonlocal peephole_count
             if isinstance(pass_, GatesInBasis) and property_set["all_gates_in_basis"]:
                 gates_in_basis_true_count += 1
-            if isinstance(pass_, ConsolidateBlocks):
-                consolidate_blocks_count += 1
+            if isinstance(pass_, TwoQubitPeepholeOptimization):
+                peephole_count += 1
 
         transpile(
             qv_circuit,
@@ -308,7 +308,7 @@ def counting_callback_func(pass_, dag, time, property_set, count):
             callback=counting_callback_func,
             translation_method="synthesis",
         )
-        self.assertEqual(gates_in_basis_true_count + 2, consolidate_blocks_count)
+        self.assertEqual(gates_in_basis_true_count, peephole_count)
 
     @data(0, 1, 2, 3)
     def test_layout_registers_preserved(self, optimization_level):
@@ -1059,7 +1059,7 @@ def test_layout_tokyo_fully_connected_cx(self, level):
             [0, 1, 2, 3, 4],
             [5, 6, 10, 0, 11],
             [5, 6, 10, 0, 11],
-            [5, 11, 6, 0, 10],
+            [6, 7, 1, 5, 2],
         ]
         backend = GenericBackendV2(num_qubits=20, coupling_map=TOKYO_CMAP, seed=42)
         result = transpile(qc, backend, optimization_level=level, seed_transpiler=42)
diff --git a/test/python/transpiler/test_vf2_post_layout.py b/test/python/transpiler/test_vf2_post_layout.py
index ea5c5f15bf2a..1e6bb82af604 100644
--- a/test/python/transpiler/test_vf2_post_layout.py
+++ b/test/python/transpiler/test_vf2_post_layout.py
@@ -567,11 +567,10 @@ def test_2q_circuit_5q_backend_v2(self):
             coupling_map=YORKTOWN_CMAP,
             seed=self.seed,
         )
-
         qr = QuantumRegister(2, "qr")
         circuit = QuantumCircuit(qr)
         circuit.cx(qr[1], qr[0])  # qr1 -> qr0
-        tqc = transpile(circuit, backend, layout_method="dense")
+        tqc = transpile(circuit, backend, initial_layout=[1, 0])
         initial_layout = tqc._layout
         dag = circuit_to_dag(tqc)
         pass_ = VF2PostLayout(target=backend.target, seed=self.seed, strict_direction=False)