22 changes: 11 additions & 11 deletions lib/op-attrs/include/op-attrs/pcg_operator_attrs.dtg.toml
@@ -11,13 +11,13 @@ features = [
 ]
 
 includes = [
-"op-attrs/ops/attention_attrs.dtg.h",
-"op-attrs/ops/batch_matmul_attrs.dtg.h",
-"op-attrs/ops/batch_norm_attrs.dtg.h",
-"op-attrs/ops/broadcast_attrs.dtg.h",
-"op-attrs/ops/cast_attrs.dtg.h",
-"op-attrs/ops/combine_attrs.dtg.h",
-"op-attrs/ops/concat_attrs.dtg.h",
+"op-attrs/ops/attention_attrs.dtg.h",
+"op-attrs/ops/batch_matmul_attrs.dtg.h",
+"op-attrs/ops/batch_norm_attrs.dtg.h",
+"op-attrs/ops/broadcast_attrs.dtg.h",
+"op-attrs/ops/cast_attrs.dtg.h",
+"op-attrs/ops/combine_attrs.dtg.h",
+"op-attrs/ops/concat_attrs.dtg.h",
 "op-attrs/ops/conv_2d_attrs.dtg.h",
 "op-attrs/ops/dropout_attrs.dtg.h",
 "op-attrs/ops/element_binary_attrs.dtg.h",
@@ -61,7 +61,7 @@ key = "cast"
 
 [[values]]
 type = "::FlexFlow::CombineAttrs"
-key = "combine_distributed"
+key = "parallel_combine"
 
 [[values]]
 type = "::FlexFlow::ConcatAttrs"
@@ -125,15 +125,15 @@ key = "reduce"
 
 [[values]]
 type = "::FlexFlow::ReductionAttrs"
-key = "reduce_distributed"
+key = "parallel_reduce"
 
 [[values]]
 type = "::FlexFlow::RepartitionAttrs"
-key = "partition_distributed"
+key = "parallel_partition"
 
 [[values]]
 type = "::FlexFlow::ReplicateAttrs"
-key = "replicate_distributed"
+key = "parallel_replicate"
 
 [[values]]
 type = "::FlexFlow::ReverseAttrs"
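Worth noting: these [[values]] keys are presumably the variant tags baked into the dtgen-generated code for PCGOperatorAttrs (including any JSON (de)serialization), so renaming the *_distributed keys to the parallel_* forms would break round-tripping of previously serialized attrs if anything persists the old spellings.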
1 change: 0 additions & 1 deletion lib/op-attrs/src/op-attrs/ops/element_unary.cc
@@ -35,7 +35,6 @@ ParallelTensorDimDegrees get_output_parallel_dim_degrees(
     ElementUnaryAttrs const &attrs,
     ParallelTensorDimDegrees const &input_degrees) {
   ASSERT(input_degrees.sum_degree.value == 1);
-  ASSERT(input_degrees.discard_copy_degree.value == 1);
 
   return input_degrees;
 }
19 changes: 12 additions & 7 deletions lib/op-attrs/test/src/op-attrs/ops/element_unary.cc
@@ -53,22 +53,27 @@ TEST_SUITE(FF_TEST_SUITE) {
       CHECK(result == correct);
     }
 
-    SUBCASE("sum degree > 1") {
+    SUBCASE("discard copy degree > 1") {
       positive_int degree = 2_p;
 
-      CHECK_THROWS(get_output_shape(
-          attrs,
-          make_input(
-              SumDegree{degree}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p)));
+      ParallelTensorShape par_input = make_input(
+          SumDegree{1_p}, DiscardCopyDegree{degree}, 1_p, 1_p, 1_p);
+
+      tl::expected<ParallelTensorShape, std::string> result =
+          get_output_shape(attrs, par_input);
+      tl::expected<ParallelTensorShape, std::string> correct = par_input;
+
+      CHECK(result == correct);
     }
 
-    SUBCASE("discard copy degree > 1") {
+    SUBCASE("sum degree > 1") {
       positive_int degree = 2_p;
 
       CHECK_THROWS(get_output_shape(
           attrs,
           make_input(
-              SumDegree{1_p}, DiscardCopyDegree{degree}, 1_p, 1_p, 1_p)));
+              SumDegree{degree}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p)));
     }
+
   }
 }
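A brief note on the asymmetry these tests now encode (my reading; the PR does not state it): a discard-copy degree > 1 merely replicates the tensor, so an element-wise unary op can be applied to each replica independently and the shape passes through unchanged, while a sum degree > 1 means the true value is a pending sum of partials, which a nonlinear unary op does not commute with (e.g. relu(1) + relu(-1) = 1, but relu(1 + (-1)) = 0), so that case must still throw.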
@@ -8,15 +8,45 @@ namespace FlexFlow {
 
 std::unordered_set<parallel_layer_guid_t>
     mpcg_get_parallel_layers(MappedParallelComputationGraph const &);
 
 MappedOperatorTaskGroup
     mpcg_get_mapping_for_layer(MappedParallelComputationGraph const &,
                                parallel_layer_guid_t);
 
 ParallelComputationGraph pcg_from_mpcg(MappedParallelComputationGraph const &);
 
+parallel_layer_guid_t mpcg_get_source_layer(MappedParallelComputationGraph const &,
+                                            parallel_tensor_guid_t const &);
+
+PCGOperatorAttrs mpcg_get_pcg_op_attrs(MappedParallelComputationGraph const &,
+                                       parallel_layer_guid_t const &);
+
+ParallelTensorAttrs mpcg_get_parallel_tensor_attrs(MappedParallelComputationGraph const &,
+                                                   parallel_tensor_guid_t const &);
+
+std::unordered_map<TensorSlotName, ParallelComputationGraphEdge>
+    mpcg_get_incoming_edges(MappedParallelComputationGraph const &,
+                            parallel_layer_guid_t const &);
+
+std::unordered_set<ParallelComputationGraphEdge>
+    mpcg_get_outgoing_edges(MappedParallelComputationGraph const &,
+                            parallel_layer_guid_t const &);
+
+ManyToOne<TensorSlotName, parallel_tensor_guid_t>
+    mpcg_get_incoming_tensors(MappedParallelComputationGraph const &,
+                              parallel_layer_guid_t const &);
+
+bidict<TensorSlotName, parallel_tensor_guid_t>
+    mpcg_get_outgoing_tensors(MappedParallelComputationGraph const &,
+                              parallel_layer_guid_t const &);
+
+std::unordered_set<ParallelComputationGraphEdge>
+    mpcg_get_edges(MappedParallelComputationGraph const &);
+
+std::unordered_set<parallel_tensor_use_t>
+    mpcg_get_parallel_tensor_uses(MappedParallelComputationGraph const &,
+                                  parallel_tensor_guid_t const &);
+
 MappedParallelComputationGraph mapped_pcg_from_pcg_and_mapped_op_task_groups(
     ParallelComputationGraph const &pcg,
     std::unordered_map<parallel_layer_guid_t, MappedOperatorTaskGroup> const
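The hunk above adds read-only queries over MappedParallelComputationGraph that mirror the existing ParallelComputationGraph accessors. A minimal usage sketch, assuming only the declarations above and an existing MappedParallelComputationGraph value mpcg (the loop-body names are illustrative, not from this PR):

    // Walk each layer of a mapped PCG and inspect its operator attrs,
    // incoming tensors, and task-group mapping via the new queries.
    for (parallel_layer_guid_t const &layer : mpcg_get_parallel_layers(mpcg)) {
      PCGOperatorAttrs op_attrs = mpcg_get_pcg_op_attrs(mpcg, layer);
      ManyToOne<TensorSlotName, parallel_tensor_guid_t> inputs =
          mpcg_get_incoming_tensors(mpcg, layer);
      MappedOperatorTaskGroup mapping = mpcg_get_mapping_for_layer(mpcg, layer);
      // ... consume op_attrs, inputs, and mapping ...
    }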
@@ -11,6 +11,7 @@
 #include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h"
 #include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h"
 #include <unordered_set>
+#include "pcg/parallel_computation_graph/parallel_tensor_use_t.dtg.h"
 
 namespace FlexFlow {
 
@@ -53,6 +54,10 @@ std::unordered_map<TensorSlotName, ParallelComputationGraphEdge>
     get_incoming_edges(ParallelComputationGraph const &,
                        parallel_layer_guid_t const &);
 
+std::unordered_set<parallel_tensor_use_t>
+    pcg_get_parallel_tensor_uses(ParallelComputationGraph const &,
+                                 parallel_tensor_guid_t const &);
+
 std::unordered_set<parallel_layer_guid_t>
     get_initial_layers(ParallelComputationGraph const &);
 
@@ -0,0 +1,14 @@
+#ifndef _FLEXFLOW_LIB_PCG_INCLUDE_PCG_PARALLEL_COMPUTATION_GRAPH_PARALLEL_TENSOR_USE_T_H
+#define _FLEXFLOW_LIB_PCG_INCLUDE_PCG_PARALLEL_COMPUTATION_GRAPH_PARALLEL_TENSOR_USE_T_H
+
+#include "pcg/parallel_computation_graph/parallel_tensor_use_t.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h"
+
+namespace FlexFlow {
+
+parallel_layer_guid_t parallel_tensor_use_get_layer(parallel_tensor_use_t const &);
+TensorSlotName parallel_tensor_use_get_slot(parallel_tensor_use_t const &);
+
+} // namespace FlexFlow
+
+#endif
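Taken together with pcg_get_parallel_tensor_uses above, these accessors let a caller enumerate every consumer of a tensor and resolve each use to its (layer, slot) pair. A small sketch, assuming an existing ParallelComputationGraph pcg and parallel_tensor_guid_t t:

    // Resolve each recorded use of tensor `t` to the consuming layer
    // and the input slot it feeds.
    for (parallel_tensor_use_t const &use : pcg_get_parallel_tensor_uses(pcg, t)) {
      parallel_layer_guid_t consumer = parallel_tensor_use_get_layer(use);
      TensorSlotName slot = parallel_tensor_use_get_slot(use);
      // ... e.g., record the edge (t -> consumer, slot) ...
    }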
@@ -8,6 +8,8 @@
 #include "utils/graph/labelled_kwarg_dataflow_graph/algorithms/labelled_kwarg_dataflow_graph_view_as_dot.h"
 #include "utils/graph/labelled_kwarg_dataflow_graph/algorithms/materialize_labelled_kwarg_dataflow_graph_view.h"
 #include "utils/graph/labelled_kwarg_dataflow_graph/algorithms/rewrite_labelled_kwarg_dataflow_graph_node_labels.h"
+#include "utils/bidict/algorithms/bidict_from_map.h"
+#include "utils/many_to_one/many_to_one_from_map.h"
 
 namespace FlexFlow {
 
@@ -46,6 +48,66 @@ ParallelComputationGraph
   };
 }
 
+parallel_layer_guid_t mpcg_get_source_layer(MappedParallelComputationGraph const &mpcg,
+                                            parallel_tensor_guid_t const &t)
+{
+  return get_source_layer(pcg_from_mpcg(mpcg), t);
+}
+
+PCGOperatorAttrs mpcg_get_pcg_op_attrs(MappedParallelComputationGraph const &mpcg,
+                                       parallel_layer_guid_t const &l)
+{
+  return pcg_get_op_attrs(pcg_from_mpcg(mpcg), l);
+}
+
+ParallelTensorAttrs mpcg_get_parallel_tensor_attrs(MappedParallelComputationGraph const &mpcg,
+                                                   parallel_tensor_guid_t const &t)
+{
+  return get_parallel_tensor_attrs(pcg_from_mpcg(mpcg), t);
+}
+
+std::unordered_map<TensorSlotName, ParallelComputationGraphEdge>
+    mpcg_get_incoming_edges(MappedParallelComputationGraph const &mpcg,
+                            parallel_layer_guid_t const &l)
+{
+  return get_incoming_edges(pcg_from_mpcg(mpcg), l);
+}
+
+std::unordered_set<ParallelComputationGraphEdge>
+    mpcg_get_outgoing_edges(MappedParallelComputationGraph const &mpcg,
+                            parallel_layer_guid_t const &l)
+{
+  return get_outgoing_edges(pcg_from_mpcg(mpcg), l);
+}
+
+ManyToOne<TensorSlotName, parallel_tensor_guid_t>
+    mpcg_get_incoming_tensors(MappedParallelComputationGraph const &mpcg,
+                              parallel_layer_guid_t const &l)
+{
+  return many_to_one_from_map(get_incoming_tensors(pcg_from_mpcg(mpcg), l));
+}
+
+
+bidict<TensorSlotName, parallel_tensor_guid_t>
+    mpcg_get_outgoing_tensors(MappedParallelComputationGraph const &mpcg,
+                              parallel_layer_guid_t const &l)
+{
+  return bidict_from_map(get_outgoing_tensors(pcg_from_mpcg(mpcg), l));
+}
+
+std::unordered_set<ParallelComputationGraphEdge>
+    mpcg_get_edges(MappedParallelComputationGraph const &mpcg)
+{
+  return get_edges(pcg_from_mpcg(mpcg));
+}
+
+std::unordered_set<parallel_tensor_use_t>
+    mpcg_get_parallel_tensor_uses(MappedParallelComputationGraph const &mpcg,
+                                  parallel_tensor_guid_t const &t)
+{
+  return pcg_get_parallel_tensor_uses(pcg_from_mpcg(mpcg), t);
+}
+
 MappedParallelComputationGraph mapped_pcg_from_pcg_and_mapped_op_task_groups(
     ParallelComputationGraph const &pcg,
     std::unordered_map<parallel_layer_guid_t, MappedOperatorTaskGroup> const
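One observation on the wrappers above: each call materializes a fresh ParallelComputationGraph via pcg_from_mpcg. If that conversion is not trivially cheap, callers issuing many queries may prefer to convert once and use the plain PCG accessors; a hypothetical caller-side sketch, assuming pcg_from_mpcg is a pure conversion:

    ParallelComputationGraph pcg = pcg_from_mpcg(mpcg); // convert once
    for (parallel_layer_guid_t const &l : mpcg_get_parallel_layers(mpcg)) {
      PCGOperatorAttrs attrs = pcg_get_op_attrs(pcg, l); // query the cached PCG
      // ... further queries such as get_incoming_edges(pcg, l), etc. ...
    }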
@@ -36,6 +36,7 @@
 #include "utils/graph/node/node.dtg.h"
 #include "utils/record_formatter.h"
 #include <unordered_set>
+#include "utils/graph/kwarg_dataflow_graph/algorithms/get_kwarg_dataflow_value_uses.h"
 
 namespace FlexFlow {
 
@@ -206,6 +207,20 @@ std::unordered_map<TensorSlotName, ParallelComputationGraphEdge>
   });
 }
 
+std::unordered_set<parallel_tensor_use_t>
+    pcg_get_parallel_tensor_uses(ParallelComputationGraph const &pcg,
+                                 parallel_tensor_guid_t const &t)
+{
+  std::unordered_set<KwargDataflowInput<TensorSlotName>> raw_uses =
+      get_kwarg_dataflow_value_uses(pcg.raw_graph,
+                                    t.raw_graph_output);
+
+  return transform(raw_uses, [](KwargDataflowInput<TensorSlotName> const &i) {
+    return parallel_tensor_use_t{i};
+  });
+}
+
+
 std::unordered_set<parallel_layer_guid_t>
     get_initial_layers(ParallelComputationGraph const &pcg) {
   std::unordered_set<Node> raw_sources = get_initial_nodes(pcg.raw_graph);
@@ -0,0 +1,13 @@
+#include "pcg/parallel_computation_graph/parallel_tensor_use_t.h"
+
+namespace FlexFlow {
+
+parallel_layer_guid_t parallel_tensor_use_get_layer(parallel_tensor_use_t const &u) {
+  return parallel_layer_guid_t{u.raw_dataflow_input.node};
+}
+
+TensorSlotName parallel_tensor_use_get_slot(parallel_tensor_use_t const &u) {
+  return u.raw_dataflow_input.slot_name;
+}
+
+} // namespace FlexFlow
19 changes: 11 additions & 8 deletions lib/realm-execution/include/realm-execution/realm_context.h
@@ -63,15 +63,18 @@ struct RealmContext {
                  int priority = 0);
   ///\}
 
-  /** \name Data movement */
+  /** \name Data movement and reduction */
   ///\{
-  Realm::Event issue_copy(ParallelTensorShape const &src_shape,
-                          Realm::RegionInstance src_inst,
-                          ParallelTensorShape const &dst_shape,
-                          Realm::RegionInstance dst_inst,
-                          Realm::ProfilingRequestSet const &requests,
-                          Realm::Event wait_on = Realm::Event::NO_EVENT,
-                          int priority = 0);
+  Realm::Event
+      issue_copy(ParallelTensorShape const &src_shape,
+                 Realm::RegionInstance src_inst,
+                 ParallelTensorShape const &dst_shape,
+                 Realm::RegionInstance dst_inst,
+                 Realm::ProfilingRequestSet const &requests,
+                 Realm::Event wait_on = Realm::Event::NO_EVENT,
+                 int priority = 0,
+                 std::optional<Realm::ReductionOpID> redop_id = std::nullopt,
+                 bool exclusive = false);
   ///\}
 
   /** \name Instance management */
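A hedged call-site sketch for the widened signature (ctx, the shapes and instances, prior, and REDOP_SUM are hypothetical; the PR only shows the declaration, so the semantics of redop_id and exclusive are assumptions based on Realm's usual copy-with-reduction behavior):

    // Accumulate src into dst via a reduction op rather than overwriting;
    // `exclusive = false` presumably requests non-exclusive (atomic) apply.
    Realm::Event done = ctx.issue_copy(src_shape,
                                       src_inst,
                                       dst_shape,
                                       dst_inst,
                                       Realm::ProfilingRequestSet{},
                                       /*wait_on=*/prior,
                                       /*priority=*/0,
                                       /*redop_id=*/REDOP_SUM,
                                       /*exclusive=*/false);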