diff --git a/.gitignore b/.gitignore
index 8d3c051bae..112fc29b95 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
+# tag files
+tags
+
 # gdb history
 .gdb_history
 
@@ -176,7 +179,7 @@ cython_debug/
 # IDE
 .vscode/*
 
-# Docker container 
+# Docker container
 docker/config
 
 # yarn prettier module (for linting workflows)
diff --git a/bin/sp-ization-benchmarking/include/sp-ization-benchmarking/nasnet_bench_graph_generator.h b/bin/sp-ization-benchmarking/include/sp-ization-benchmarking/nasnet_bench_graph_generator.h
index a331676401..eaea06b1a5 100644
--- a/bin/sp-ization-benchmarking/include/sp-ization-benchmarking/nasnet_bench_graph_generator.h
+++ b/bin/sp-ization-benchmarking/include/sp-ization-benchmarking/nasnet_bench_graph_generator.h
@@ -9,133 +9,29 @@
  * https://github.com/google-research/nasbench/blob/b94247037ee470418a3e56dcb83814e9be83f3a8/nasbench/api.py
  */
 
-#include "utils/containers/all_of.h"
-#include "utils/containers/repeat.h"
-#include "utils/containers/transform.h"
-#include "utils/graph/algorithms.h"
-#include "utils/graph/digraph/algorithms/get_edges.h"
-#include "utils/graph/digraph/algorithms/get_initial_nodes.h"
-#include "utils/graph/digraph/algorithms/get_terminal_nodes.h"
-#include "utils/graph/digraph/algorithms/is_acyclic.h"
-#include "utils/graph/digraph/algorithms/materialize_digraph_view.h"
-#include "utils/graph/digraph/algorithms/transitive_reduction.h"
-#include "utils/graph/instances/adjacency_digraph.h"
-#include "utils/graph/node/algorithms.h"
-#include "utils/graph/series_parallel/digraph_generation.h"
+#include "utils/graph/digraph/digraph.h"
+#include "utils/graph/digraph/digraph_view.h"
 #include "utils/nonnegative_int/nonnegative_int.h"
 #include <optional>
 #include <vector>
 
-using AdjacencyMatrix = std::vector<std::vector<bool>>;
 namespace FlexFlow {
 
-const nonnegative_int MIN_NODES = nonnegative_int{6};
-const nonnegative_int MAX_NODES = nonnegative_int{8};
-const nonnegative_int MIN_EDGES = nonnegative_int{8};
-const nonnegative_int MAX_EDGES = nonnegative_int{11};
-const nonnegative_int NUM_CELLS = nonnegative_int{9};
-
 struct NasNetBenchConfig {
-  AdjacencyMatrix adjacency_matrix;
+  std::vector<std::vector<bool>> adjacency_matrix;
 };
 
-bool is_valid_config(NasNetBenchConfig const &config) {
-  AdjacencyMatrix const &matrix = config.adjacency_matrix;
-  const size_t size = matrix.size();
-
-  auto is_valid_size = [](nonnegative_int s) {
-    return s >= MIN_NODES && s <= MAX_NODES;
-  };
-
-  auto is_square_matrix = [&](auto const &m) {
-    return all_of(m, [&](const auto &row) { return row.size() == size; });
-  };
-
-  auto is_upper_triangular = [&](auto const &m) {
-    for (size_t i = 0; i < size; ++i) {
-      for (size_t j = 0; j <= i; ++j) {
-        if (matrix[i][j]) {
-          return false;
-        }
-      }
-    }
-    return true;
-  };
-
-  return is_valid_size(nonnegative_int{size}) && is_square_matrix(matrix) &&
-         is_upper_triangular(matrix);
-}
-
-bool is_valid_cell(DiGraphView const &g) {
-  nonnegative_int n_edges = nonnegative_int{get_edges(g).size()};
-  nonnegative_int n_nodes = nonnegative_int{num_nodes(g)};
-  return (is_acyclic(g)) && (get_initial_nodes(g).size() == 1) &&
-         (get_terminal_nodes(g).size() == 1) && (n_edges <= MAX_EDGES) &&
-         (n_edges >= MIN_EDGES) && (n_nodes <= MAX_NODES) &&
-         (n_nodes >= MIN_NODES) &&
-         (n_edges > n_nodes); // filter linear cell and diamond cell
-}
+bool is_valid_config(NasNetBenchConfig const &config);
 
-NasNetBenchConfig generate_random_config() {
-  static std::uniform_int_distribution<> size_dist(
-      MIN_NODES.unwrap_nonnegative(), MAX_NODES.unwrap_nonnegative());
-  Binary bin = Binary(0, 1);
+bool is_valid_cell(DiGraphView const &g);
 
-  size_t num_nodes =
-      Uniform(MIN_NODES.unwrap_nonnegative(), MAX_NODES.unwrap_nonnegative())();
-  std::vector<std::vector<bool>> matrix(num_nodes,
-                                        std::vector<bool>(num_nodes, false));
-
-  for (size_t i = 0; i < num_nodes; ++i) {
-    for (size_t j = i + 1; j < num_nodes; ++j) {
-      matrix[i][j] = bin();
-    }
-  }
-
-  return {matrix};
-}
+NasNetBenchConfig generate_random_config();
 
 std::optional<DiGraph>
-    maybe_generate_nasnet_bench_cell(NasNetBenchConfig const &config) {
-  if (!is_valid_config(config)) {
-    return std::nullopt;
-  }
-
-  DiGraph g = DiGraph::create<AdjacencyDiGraph>();
-  std::vector<Node> nodes = add_nodes(g, config.adjacency_matrix.size());
-
-  for (size_t i = 0; i < nodes.size(); ++i) {
-    for (size_t j = i + 1; j < nodes.size(); ++j) {
-      if (config.adjacency_matrix[i][j]) {
-        g.add_edge(DirectedEdge{nodes[i], nodes[j]});
-      }
-    }
-  }
-
-  g = materialize_digraph_view<AdjacencyDiGraph>(transitive_reduction(g));
-
-  if (!is_valid_cell(g)) {
-    return std::nullopt;
-  }
+    maybe_generate_nasnet_bench_cell(NasNetBenchConfig const &config);
 
-  return g;
-}
+DiGraph generate_nasnet_bench_cell();
 
-DiGraph generate_nasnet_bench_cell() {
-  while (true) {
-    NasNetBenchConfig config = generate_random_config();
-    std::optional<DiGraph> maybe_cell =
-        maybe_generate_nasnet_bench_cell(config);
-    if (maybe_cell) {
-      return maybe_cell.value();
-    }
-  }
-}
+DiGraph generate_nasnet_bench_network();
 
-DiGraph generate_nasnet_bench_network() {
-  DiGraph g = series_composition(
-      transform(repeat(NUM_CELLS, generate_nasnet_bench_cell),
-                [](DiGraph const &cell) -> DiGraphView { return cell; }));
-  return g;
-}
 } // namespace FlexFlow
diff --git a/bin/sp-ization-benchmarking/src/sp-ization-benchmarking/nasnet_bench_graph_generator.cc b/bin/sp-ization-benchmarking/src/sp-ization-benchmarking/nasnet_bench_graph_generator.cc
new file mode 100644
index 0000000000..125759b0de
--- /dev/null
+++ b/bin/sp-ization-benchmarking/src/sp-ization-benchmarking/nasnet_bench_graph_generator.cc
@@ -0,0 +1,137 @@
+#include "sp-ization-benchmarking/nasnet_bench_graph_generator.h"
+#include "utils/containers/all_of.h"
+#include "utils/containers/repeat.h"
+#include "utils/containers/transform.h"
+#include "utils/graph/algorithms.h"
+#include "utils/graph/digraph/algorithms/get_edges.h"
+#include "utils/graph/digraph/algorithms/get_initial_nodes.h"
+#include "utils/graph/digraph/algorithms/get_terminal_nodes.h"
+#include "utils/graph/digraph/algorithms/is_acyclic.h"
+#include "utils/graph/digraph/algorithms/materialize_digraph_view.h"
+#include "utils/graph/digraph/algorithms/transitive_reduction.h"
+#include "utils/graph/instances/adjacency_digraph.h"
+#include "utils/graph/node/algorithms.h"
+#include "utils/graph/series_parallel/digraph_generation.h"
+
+namespace FlexFlow {
+
+static const nonnegative_int MIN_NODES = nonnegative_int{6}; // cell node min
+static const nonnegative_int MAX_NODES = nonnegative_int{8}; // cell node max
+static const nonnegative_int MIN_EDGES = nonnegative_int{8}; // cell edge min
+static const nonnegative_int MAX_EDGES = nonnegative_int{11}; // cell edge max
+static const nonnegative_int NUM_CELLS = nonnegative_int{9}; // cells/network
+
+using AdjacencyMatrix = std::vector<std::vector<bool>>; // [i][j]: edge (i, j)
+
+// A config is valid when its matrix has between MIN_NODES and MAX_NODES rows,
+// is square, and has no true entry on or below the diagonal.
+bool is_valid_config(NasNetBenchConfig const &config) {
+  AdjacencyMatrix const &adjacency = config.adjacency_matrix;
+  size_t const dim = adjacency.size();
+
+  nonnegative_int const node_count = nonnegative_int{dim};
+  if (!(node_count >= MIN_NODES && node_count <= MAX_NODES)) {
+    return false;
+  }
+
+  // Squareness is checked first so the indexing below stays in bounds.
+  if (!all_of(adjacency, [&](auto const &row) { return row.size() == dim; })) {
+    return false;
+  }
+
+  for (size_t row = 0; row < dim; ++row) {
+    for (size_t col = 0; col <= row; ++col) {
+      if (adjacency[row][col]) {
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+bool is_valid_cell(DiGraphView const &g) {
+  nonnegative_int edge_count = nonnegative_int{get_edges(g).size()};
+  nonnegative_int node_count = nonnegative_int{num_nodes(g)};
+  // Single-source/sink DAG in bounds; edges > nodes excludes linear/diamond.
+  return is_acyclic(g) && get_initial_nodes(g).size() == 1 &&
+         get_terminal_nodes(g).size() == 1 && edge_count >= MIN_EDGES &&
+         edge_count <= MAX_EDGES && node_count >= MIN_NODES &&
+         node_count <= MAX_NODES && edge_count > node_count;
+}
+
+// Samples a random candidate cell configuration. The node count comes from
+// Uniform(MIN_NODES, MAX_NODES) and each entry above the diagonal is filled
+// from a Binary(0, 1) sampler; entries on or below the diagonal stay false.
+NasNetBenchConfig generate_random_config() {
+  Binary coin = Binary(0, 1);
+
+  size_t node_count =
+      Uniform(MIN_NODES.unwrap_nonnegative(), MAX_NODES.unwrap_nonnegative())();
+  AdjacencyMatrix matrix(node_count, std::vector<bool>(node_count, false));
+
+  for (size_t row = 0; row < node_count; ++row) {
+    // Starting at row + 1 keeps the matrix strictly upper triangular.
+    for (size_t col = row + 1; col < node_count; ++col) {
+      matrix[row][col] = coin();
+    }
+  }
+  return {matrix};
+}
+
+// Builds the cell described by `config`, or returns std::nullopt if either
+// the config or the transitively-reduced graph fails validation.
+std::optional<DiGraph>
+    maybe_generate_nasnet_bench_cell(NasNetBenchConfig const &config) {
+  if (!is_valid_config(config)) {
+    return std::nullopt;
+  }
+
+  AdjacencyMatrix const &adjacency = config.adjacency_matrix;
+  DiGraph cell = DiGraph::create<AdjacencyDiGraph>();
+  std::vector<Node> const vertices = add_nodes(cell, adjacency.size());
+  for (size_t src = 0; src < vertices.size(); ++src) {
+    for (size_t dst = src + 1; dst < vertices.size(); ++dst) {
+      if (adjacency[src][dst]) {
+        cell.add_edge(DirectedEdge{vertices[src], vertices[dst]});
+      }
+    }
+  }
+
+  // Validity is judged on the transitive reduction, not on the raw matrix.
+  cell = materialize_digraph_view<AdjacencyDiGraph>(transitive_reduction(cell));
+  if (!is_valid_cell(cell)) {
+    return std::nullopt;
+  }
+  return cell;
+}
+
+// Repeatedly samples random configurations until one yields a valid cell.
+//
+// Each iteration draws a fresh config from generate_random_config() and hands
+// it to maybe_generate_nasnet_bench_cell(); the first graph that comes back is
+// returned to the caller.
+//
+// There is no retry limit: the loop only ends once a sampled config passes
+// both is_valid_config and is_valid_cell, so this function never returns an
+// invalid cell.
+DiGraph generate_nasnet_bench_cell() {
+  while (true) {
+    NasNetBenchConfig config = generate_random_config();
+    std::optional<DiGraph> maybe_cell =
+        maybe_generate_nasnet_bench_cell(config);
+
+    // std::nullopt means the sample was rejected; draw again.
+    if (maybe_cell.has_value()) {
+      return maybe_cell.value();
+    }
+  }
+}
+
+// Feeds NUM_CELLS cells from generate_nasnet_bench_cell to series_composition.
+DiGraph generate_nasnet_bench_network() {
+  auto as_view = [](DiGraph const &cell) -> DiGraphView { return cell; };
+  return series_composition(
+      transform(repeat(NUM_CELLS, generate_nasnet_bench_cell), as_view));
+}
+
+} // namespace FlexFlow
diff --git a/bin/sp-ization-benchmarking/src/sp-ization-benchmarking/sample_graphs.cc b/bin/sp-ization-benchmarking/src/sp-ization-benchmarking/sample_graphs.cc
new file mode 100644
index 0000000000..bdd2525a4d
--- /dev/null
+++ b/bin/sp-ization-benchmarking/src/sp-ization-benchmarking/sample_graphs.cc
@@ -0,0 +1 @@
+#include "sp-ization-benchmarking/sample_graphs.h"
diff --git a/cmake/flexflow-utils.cmake b/cmake/flexflow-utils.cmake
index 795668e32a..7d5d189b1c 100644
--- a/cmake/flexflow-utils.cmake
+++ b/cmake/flexflow-utils.cmake
@@ -8,7 +8,7 @@ macro(ff_parse_args)
 endmacro()
 
 function(define_ff_vars target)
-  target_compile_definitions(${target} PRIVATE 
+  target_compile_definitions(${target} PRIVATE
     MAX_OPNAME=${FF_MAX_OPNAME}
     MAX_NUM_OUTPUTS=${FF_MAX_NUM_OUTPUTS}
     MAX_NUM_INPUTS=${FF_MAX_NUM_INPUTS}
@@ -41,24 +41,24 @@ function(ff_set_cxx_properties target)
       CXX_EXTENSIONS NO
   )
   target_compile_options(${target}
-    PUBLIC 
-    $<$<COMPILE_LANGUAGE:CXX>:> 
-    "-ffile-prefix-map=${CMAKE_SOURCE_DIR}=." 
-    "-fsanitize=undefined" 
+    PUBLIC
+    $<$<COMPILE_LANGUAGE:CXX>:>
+    "-ffile-prefix-map=${CMAKE_SOURCE_DIR}=."
+    "-fsanitize=undefined"
     "-fno-sanitize-recover=all"
     # add C++ compile flags here
   )
   target_link_options(${target}
-    PUBLIC 
-    $<$<COMPILE_LANGUAGE:CXX>:> 
-    "-fsanitize=undefined" 
+    PUBLIC
+    $<$<COMPILE_LANGUAGE:CXX>:>
+    "-fsanitize=undefined"
     "-fno-sanitize-recover=all"
   )
 endfunction()
 
 function(ff_add_library)
   ff_parse_args(
-    PREFIX 
+    PREFIX
       FF_LIBRARY
     ARGS
       NAME
@@ -71,10 +71,10 @@ function(ff_add_library)
     PARSE
       ${ARGN}
   )
-  
+
   project(${FF_LIBRARY_NAME})
   file(GLOB_RECURSE SRC
-       CONFIGURE_DEPENDS 
+       CONFIGURE_DEPENDS
        LIST_DIRECTORIES False
        ${FF_LIBRARY_SRC_PATTERNS})
 
@@ -103,7 +103,7 @@ endfunction()
 
 function(ff_add_test_executable)
   ff_parse_args(
-    PREFIX 
+    PREFIX
       FF_TEST_EXEC
     ARGS
       NAME
@@ -145,7 +145,7 @@ endfunction()
 
 function(ff_add_benchmark_executable)
   ff_parse_args(
-    PREFIX 
+    PREFIX
       FF_BENCHMARK_EXEC
     ARGS
       NAME
@@ -172,6 +172,11 @@ function(ff_add_benchmark_executable)
     ${FF_BENCHMARK_EXEC_NAME}
     ${SRC})
 
+  target_include_directories(
+    ${FF_BENCHMARK_EXEC_NAME}
+    PRIVATE
+    ${FF_BENCHMARK_EXEC_PRIVATE_INCLUDE})
+
   target_link_libraries(
     ${FF_BENCHMARK_EXEC_NAME}
     ${FF_BENCHMARK_EXEC_DEPS}
@@ -184,7 +189,7 @@ endfunction()
 
 function(ff_add_executable)
   ff_parse_args(
-    PREFIX 
+    PREFIX
       FF_EXEC
     ARGS
       NAME
diff --git a/flake.lock b/flake.lock
index ca71a446a9..e52833e4ed 100644
--- a/flake.lock
+++ b/flake.lock
@@ -66,11 +66,11 @@
         ]
       },
       "locked": {
-        "lastModified": 1773786960,
-        "narHash": "sha256-XGta5Z2idBD9bAvdmx+6kN0GQpNruwNYq1BSONH1Sgo=",
+        "lastModified": 1778104328,
+        "narHash": "sha256-bn0G8xDqBrVjp5htw1i3u8fPdPMVtoZXFzX7hJ6m9YY=",
         "ref": "refs/heads/master",
-        "rev": "da1097f7ef7ecc659a2ed740203c1be8262de7fa",
-        "revCount": 147,
+        "rev": "535ac756b2674dc10051b37be978b9d2cb9f817d",
+        "revCount": 156,
         "type": "git",
         "url": "https://git.sr.ht/~lockshaw/proj"
       },
diff --git a/lib/compiler/benchmark/src/compiler/series_parallel/computation_graph/get_computation_graph_series_parallel_decomposition.cc b/lib/compiler/benchmark/src/compiler/series_parallel/computation_graph/get_computation_graph_series_parallel_decomposition.cc
index 1b548b0e0e..d6bf636822 100644
--- a/lib/compiler/benchmark/src/compiler/series_parallel/computation_graph/get_computation_graph_series_parallel_decomposition.cc
+++ b/lib/compiler/benchmark/src/compiler/series_parallel/computation_graph/get_computation_graph_series_parallel_decomposition.cc
@@ -17,7 +17,7 @@ static void benchmark_get_computation_graph_series_parallel_decomposition(
 
 BENCHMARK_CAPTURE(benchmark_get_computation_graph_series_parallel_decomposition,
                   split_test,
-                  get_split_test_computation_graph(/*batch_size=*/8_n));
+                  get_split_test_computation_graph(/*batch_size=*/8_p));
 
 BENCHMARK_CAPTURE(
     benchmark_get_computation_graph_series_parallel_decomposition,
diff --git a/lib/compiler/include/compiler/search_result.struct.toml b/lib/compiler/include/compiler/search_result.struct.toml
deleted file mode 100644
index 7e7e59d7c9..0000000000
--- a/lib/compiler/include/compiler/search_result.struct.toml
+++ /dev/null
@@ -1,17 +0,0 @@
-namespace = "FlexFlow"
-name = "SearchResult"
-features = [
-]
-
-includes = [
-  "pcg/parallel_computation_graph/parallel_computation_graph.h",
-  "compiler/machine_mapping/machine_mapping.h",
-]
-
-[[fields]]
-name = "pcg"
-type = "::FlexFlow::ParallelComputationGraph"
-
-[[fields]]
-name = "machine_mapping"
-type = "::FlexFlow::MachineMapping"
\ No newline at end of file
diff --git a/lib/local-execution/include/local-execution/computation_graph_instance/computation_graph_instance.h b/lib/local-execution/include/local-execution/computation_graph_instance.h
similarity index 100%
rename from lib/local-execution/include/local-execution/computation_graph_instance/computation_graph_instance.h
rename to lib/local-execution/include/local-execution/computation_graph_instance.h
diff --git a/lib/local-execution/src/local-execution/computation_graph_instance/computation_graph_instance.cc b/lib/local-execution/src/local-execution/computation_graph_instance.cc
similarity index 99%
rename from lib/local-execution/src/local-execution/computation_graph_instance/computation_graph_instance.cc
rename to lib/local-execution/src/local-execution/computation_graph_instance.cc
index 961dfae3f1..ae4e4ada0e 100644
--- a/lib/local-execution/src/local-execution/computation_graph_instance/computation_graph_instance.cc
+++ b/lib/local-execution/src/local-execution/computation_graph_instance.cc
@@ -1,4 +1,4 @@
-#include "local-execution/computation_graph_instance/computation_graph_instance.h"
+#include "local-execution/computation_graph_instance.h"
 #include "local-execution/per_device_op_state_initialization.h"
 #include "local-execution/task_execution.h"
 #include "local-execution/tensor_allocation.h"
diff --git a/lib/local-execution/src/local-execution/cost_estimator/local_cost_estimator.cc b/lib/local-execution/src/local-execution/cost_estimator/local_cost_estimator.cc
index 89010c543e..e3e88c7eca 100644
--- a/lib/local-execution/src/local-execution/cost_estimator/local_cost_estimator.cc
+++ b/lib/local-execution/src/local-execution/cost_estimator/local_cost_estimator.cc
@@ -4,7 +4,7 @@
 #include "kernels/device.h"
 #include "kernels/local_cpu_allocator.h"
 #include "kernels/local_cuda_allocator.h"
-#include "local-execution/computation_graph_instance/computation_graph_instance.h"
+#include "local-execution/computation_graph_instance.h"
 #include "local-execution/cost_estimator/tracked_allocator.h"
 #include "op-attrs/computation_graph_op_attrs.h"
 #include "op-attrs/pcg_operator_attrs.h"
diff --git a/lib/local-execution/test/src/local-execution/test_e2e.cc b/lib/local-execution/test/src/local-execution/computation_graph_instance.cc
similarity index 71%
rename from lib/local-execution/test/src/local-execution/test_e2e.cc
rename to lib/local-execution/test/src/local-execution/computation_graph_instance.cc
index da62d22071..aaeb253b5b 100644
--- a/lib/local-execution/test/src/local-execution/test_e2e.cc
+++ b/lib/local-execution/test/src/local-execution/computation_graph_instance.cc
@@ -1,3 +1,4 @@
+#include "local-execution/computation_graph_instance.h"
 #include "kernels/compare_tensor_accessors.h"
 #include "kernels/copy_tensor_accessor.h"
 #include "kernels/device_handle_t.h"
@@ -7,7 +8,6 @@
 #include "kernels/managed_ff_stream.h"
 #include "kernels/managed_per_device_ff_handle.h"
 #include "kernels/tensor_accessor_reductions.h"
-#include "local-execution/computation_graph_instance/computation_graph_instance.h"
 #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h"
 #include "pcg/computation_graph.h"
 #include "pcg/computation_graph_builder.h"
@@ -357,4 +357,149 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
     GenericTensorAccessorR last_epoch = loss_values.back();
     CHECK(did_loss_decrease(first_epoch_loss, last_epoch));
   }
+
+  TEST_CASE("LossFunctions") {
+    // initialize runtime: stream, single-GPU handle, CUDA memory allocator
+    ManagedFFStream managed_stream{};
+    ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(
+        /*workSpaceSize=*/1024 * 1024,
+        /*allowTensorOpMathConversion=*/true);
+
+    Allocator allocator = create_local_cuda_memory_allocator();
+
+    positive_int batch_size = 10_p;
+    positive_int data_dim = 16_p;
+    positive_int output_dim = 32_p;
+
+    // construct computation graph: input and zero-init weight feed a linear op
+    ComputationGraph computation_graph = make_empty_computation_graph();
+
+    TensorShape input_tensor_shape = TensorShape{
+        TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT};
+
+    TensorShape weight_shape = TensorShape{
+        TensorDims{FFOrdered{data_dim, output_dim}}, DataType::FLOAT};
+
+    LayerAddedResult inputs_layer =
+        add_input_layer(computation_graph, input_tensor_shape);
+    tensor_guid_t inputs_tensor =
+        require_only_key(inputs_layer.outputs, TensorSlotName::OUTPUT);
+
+    LayerAddedResult weights_layer = add_layer(
+        computation_graph,
+        LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{
+                       weight_shape, InitializerAttrs{ZeroInitializerAttrs{}}}},
+                   std::nullopt},
+        {},
+        {});
+    tensor_guid_t weights_tensor =
+        require_only_key(weights_layer.outputs, TensorSlotName::OUTPUT);
+
+    LayerAddedResult linear_operator = add_layer(
+        computation_graph,
+        LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim,
+                                                       /*use_bias=*/false,
+                                                       DataType::FLOAT,
+                                                       Activation::RELU,
+                                                       std::nullopt}},
+                   std::nullopt},
+        {
+            {
+                TensorSlotName::INPUT,
+                inputs_tensor,
+            },
+        },
+        {
+            {
+                TensorSlotName::WEIGHT,
+                weights_tensor,
+            },
+        });
+    tensor_guid_t logit_tensor =
+        require_only_key(linear_operator.outputs, TensorSlotName::OUTPUT);
+
+    OptimizerAttrs optimizer_attrs = OptimizerAttrs{
+        SGDOptimizerAttrs{
+            /*lr=*/0.0,
+            /*momentum=*/0.0,
+            /*nesterov=*/false,
+            /*weight_decay=*/0.0,
+        },
+    };
+
+    device_id_t device_idx =
+        make_device_id_t_from_idx(nonnegative_int{0}, DeviceType::GPU);
+    device_handle_t ff_handle =
+        gpu_make_device_handle_t(managed_handle.raw_handle());
+
+    std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor> input_tensors;
+    // Runs every pass for one loss config and requires a loss tensor accessor.
+    auto compute_loss = [&](LossAttrs const &loss_attrs,
+                            GenericTensorAccessorR label_tensor) {
+      ComputationGraphInstance computation_graph_instance =
+          create_computation_graph_instance(
+              /*cg=*/computation_graph,
+              /*optimizer=*/optimizer_attrs,
+              /*loss=*/
+              LossConfig{
+                  /*loss_attrs=*/loss_attrs,
+                  /*label_tensor=*/label_tensor,
+                  /*logit_tensor=*/logit_tensor,
+              },
+              /*input_tensors=*/input_tensors,
+              /*allocator=*/allocator,
+              /*profiling_settings=*/ProfilingSettings{0, 1},
+              /*device_handle=*/ff_handle,
+              /*iteration_config=*/FFIterationConfig{1_p},
+              /*device_idx=*/device_idx);
+
+      perform_all_passes_for_computation_graph_instance(
+          /*instance=*/computation_graph_instance,
+          /*profiling_settings=*/ProfilingSettings{0, 0},
+          /*ff_handle=*/ff_handle,
+          /*iteration_config=*/FFIterationConfig{1_p},
+          /*device_idx=*/device_idx);
+      assert_unwrap(computation_graph_instance.get_loss_tensor_accessor());
+    };
+
+    SUBCASE("SparseCategoricalCrossEntropyLossAttrs") { // label dims: batch x 1
+      TensorShape label_tensor_shape =
+          TensorShape{TensorDims{FFOrdered{batch_size, 1_p}}, DataType::FLOAT};
+      GenericTensorAccessorW label_tensor =
+          allocator.allocate_tensor(label_tensor_shape);
+
+      LossAttrs loss_attrs = LossAttrs{
+          SparseCategoricalCrossEntropyLossAttrs{/*replace_labels=*/false}};
+
+      compute_loss(loss_attrs, label_tensor);
+    }
+
+    SUBCASE("NonconfigurableLossAttrs") { // label dims: batch x output_dim
+      TensorShape label_tensor_shape = TensorShape{
+          TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT};
+      GenericTensorAccessorW label_tensor =
+          allocator.allocate_tensor(label_tensor_shape);
+
+      SUBCASE("LossFunction::CATEGORICAL_CROSSENTROPY") {
+        LossAttrs loss_attrs = LossAttrs{
+            NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}};
+
+        compute_loss(loss_attrs, label_tensor);
+      }
+
+      SUBCASE("LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE") {
+        LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{
+            LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}};
+
+        compute_loss(loss_attrs, label_tensor);
+      }
+
+      SUBCASE("LossFunction::IDENTITY") {
+        LossAttrs loss_attrs =
+            LossAttrs{NonconfigurableLossAttrs{LossFunction::IDENTITY}};
+
+        compute_loss(loss_attrs, label_tensor);
+      }
+    }
+  }
 }
diff --git a/lib/local-execution/test/src/local-execution/local_cost_estimator.cc b/lib/local-execution/test/src/local-execution/cost_estimator/local_cost_estimator.cc
similarity index 100%
rename from lib/local-execution/test/src/local-execution/local_cost_estimator.cc
rename to lib/local-execution/test/src/local-execution/cost_estimator/local_cost_estimator.cc
diff --git a/lib/local-execution/test/src/local-execution/loss_functions.cc b/lib/local-execution/test/src/local-execution/loss_functions.cc
deleted file mode 100644
index 39aa5f138a..0000000000
--- a/lib/local-execution/test/src/local-execution/loss_functions.cc
+++ /dev/null
@@ -1,162 +0,0 @@
-#include "kernels/device_handle_t.h"
-#include "kernels/local_cuda_allocator.h"
-#include "kernels/managed_ff_stream.h"
-#include "kernels/managed_per_device_ff_handle.h"
-#include "local-execution/computation_graph_instance/computation_graph_instance.h"
-#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h"
-#include "pcg/computation_graph.h"
-#include "pcg/computation_graph_builder.h"
-#include "pcg/device_id_t.h"
-#include "pcg/optimizer_attrs.dtg.h"
-#include "utils/containers/require_only_key.h"
-#include "utils/optional.h"
-#include <doctest/doctest.h>
-
-using namespace ::FlexFlow;
-
-TEST_SUITE(FF_CUDA_TEST_SUITE) {
-  TEST_CASE("LossFunctions") {
-    // initialize runtime
-    ManagedFFStream managed_stream{};
-    ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(
-        /*workSpaceSize=*/1024 * 1024,
-        /*allowTensorOpMathConversion=*/true);
-
-    Allocator allocator = create_local_cuda_memory_allocator();
-
-    positive_int batch_size = 10_p;
-    positive_int data_dim = 16_p;
-    positive_int output_dim = 32_p;
-
-    // construct computation graph
-    ComputationGraph computation_graph = make_empty_computation_graph();
-
-    TensorShape input_tensor_shape = TensorShape{
-        TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT};
-
-    TensorShape weight_shape = TensorShape{
-        TensorDims{FFOrdered{data_dim, output_dim}}, DataType::FLOAT};
-
-    LayerAddedResult inputs_layer =
-        add_input_layer(computation_graph, input_tensor_shape);
-    tensor_guid_t inputs_tensor =
-        require_only_key(inputs_layer.outputs, TensorSlotName::OUTPUT);
-
-    LayerAddedResult weights_layer = add_layer(
-        computation_graph,
-        LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{
-                       weight_shape, InitializerAttrs{ZeroInitializerAttrs{}}}},
-                   std::nullopt},
-        {},
-        {});
-    tensor_guid_t weights_tensor =
-        require_only_key(weights_layer.outputs, TensorSlotName::OUTPUT);
-
-    LayerAddedResult linear_operator = add_layer(
-        computation_graph,
-        LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim,
-                                                       /*use_bias=*/false,
-                                                       DataType::FLOAT,
-                                                       Activation::RELU,
-                                                       std::nullopt}},
-                   std::nullopt},
-        {
-            {
-                TensorSlotName::INPUT,
-                inputs_tensor,
-            },
-        },
-        {
-            {
-                TensorSlotName::WEIGHT,
-                weights_tensor,
-            },
-        });
-    tensor_guid_t logit_tensor =
-        require_only_key(linear_operator.outputs, TensorSlotName::OUTPUT);
-
-    OptimizerAttrs optimizer_attrs = OptimizerAttrs{
-        SGDOptimizerAttrs{
-            /*lr=*/0.0,
-            /*momentum=*/0.0,
-            /*nesterov=*/false,
-            /*weight_decay=*/0.0,
-        },
-    };
-
-    device_id_t device_idx =
-        make_device_id_t_from_idx(nonnegative_int{0}, DeviceType::GPU);
-    device_handle_t ff_handle =
-        gpu_make_device_handle_t(managed_handle.raw_handle());
-
-    std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor> input_tensors;
-
-    auto compute_loss = [&](LossAttrs const &loss_attrs,
-                            GenericTensorAccessorR label_tensor) {
-      ComputationGraphInstance computation_graph_instance =
-          create_computation_graph_instance(
-              /*cg=*/computation_graph,
-              /*optimizer=*/optimizer_attrs,
-              /*loss=*/
-              LossConfig{
-                  /*loss_attrs=*/loss_attrs,
-                  /*label_tensor=*/label_tensor,
-                  /*logit_tensor=*/logit_tensor,
-              },
-              /*input_tensors=*/input_tensors,
-              /*allocator=*/allocator,
-              /*profiling_settings=*/ProfilingSettings{0, 1},
-              /*device_handle=*/ff_handle,
-              /*iteration_config=*/FFIterationConfig{1_p},
-              /*device_idx=*/device_idx);
-
-      perform_all_passes_for_computation_graph_instance(
-          /*instance=*/computation_graph_instance,
-          /*profiling_settings=*/ProfilingSettings{0, 0},
-          /*ff_handle=*/ff_handle,
-          /*iteration_config=*/FFIterationConfig{1_p},
-          /*device_idx=*/device_idx);
-      assert_unwrap(computation_graph_instance.get_loss_tensor_accessor());
-    };
-
-    SUBCASE("SparseCategoricalCrossEntropyLossAttrs") {
-      TensorShape label_tensor_shape =
-          TensorShape{TensorDims{FFOrdered{batch_size, 1_p}}, DataType::FLOAT};
-      GenericTensorAccessorW label_tensor =
-          allocator.allocate_tensor(label_tensor_shape);
-
-      LossAttrs loss_attrs = LossAttrs{
-          SparseCategoricalCrossEntropyLossAttrs{/*replace_labels=*/false}};
-
-      compute_loss(loss_attrs, label_tensor);
-    }
-
-    SUBCASE("NonconfigurableLossAttrs") {
-      TensorShape label_tensor_shape = TensorShape{
-          TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT};
-      GenericTensorAccessorW label_tensor =
-          allocator.allocate_tensor(label_tensor_shape);
-
-      SUBCASE("LossFunction::CATEGORICAL_CROSSENTROPY") {
-        LossAttrs loss_attrs = LossAttrs{
-            NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}};
-
-        compute_loss(loss_attrs, label_tensor);
-      }
-
-      SUBCASE("LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE") {
-        LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{
-            LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}};
-
-        compute_loss(loss_attrs, label_tensor);
-      }
-
-      SUBCASE("LossFunction::IDENTITY") {
-        LossAttrs loss_attrs =
-            LossAttrs{NonconfigurableLossAttrs{LossFunction::IDENTITY}};
-
-        compute_loss(loss_attrs, label_tensor);
-      }
-    }
-  }
-}
diff --git a/lib/local-pcg-execution/include/local-pcg-execution/parallel_model_training_instance.h b/lib/local-pcg-execution/include/local-pcg-execution/parallel_model_training_instance.h
deleted file mode 100644
index 8cfc261774..0000000000
--- a/lib/local-pcg-execution/include/local-pcg-execution/parallel_model_training_instance.h
+++ /dev/null
@@ -1,47 +0,0 @@
-#ifndef _FLEXFLOW_LIB_LOCAL_PCG_EXECUTION_INCLUDE_LOCAL_PCG_EXECUTION_PARALLEL_MODEL_TRAINING_INSTANCE_H
-#define _FLEXFLOW_LIB_LOCAL_PCG_EXECUTION_INCLUDE_LOCAL_PCG_EXECUTION_PARALLEL_MODEL_TRAINING_INSTANCE_H
-
-#include "compiler/mapped_parallel_computation_graph.dtg.h"
-#include "kernels/allocation.h"
-#include "local-execution/local_atomic_tensor_backing.dtg.h"
-#include "local-execution/local_task_registry.dtg.h"
-#include "local-pcg-execution/local_parallel_tensor_backing.dtg.h"
-#include "local-pcg-execution/task_group_execution_times.dtg.h"
-#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h"
-#include "pcg/optimizer_attrs.dtg.h"
-#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h"
-#include "task-spec/runtime_task_invocation/runtime_arg_config.dtg.h"
-#include "task-spec/symbolic/training_symbolic_computation_graph_from_pcg_conversion.dtg.h"
-
-namespace FlexFlow {
-
-struct ParallelModelTrainingInstance {
-  ParallelModelTrainingInstance(Allocator const &,
-                                LossAttrs const &,
-                                OptimizerAttrs const &);
-
-public:
-  std::unordered_map<parallel_layer_guid_t,
-                     std::optional<TaskGroupExecutionTimes>>
-      forward();
-  std::unordered_map<parallel_layer_guid_t,
-                     std::optional<TaskGroupExecutionTimes>>
-      backward();
-  void update();
-  GenericTensorAccessorR get_loss_tensor_accessor() const;
-
-private:
-  Allocator allocator;
-  LossAttrs loss_attrs;
-  OptimizerAttrs optimizer_attrs;
-  TrainingSymbolicComputationGraphFromPcgConversion symbolic_cg;
-  MappedParallelComputationGraph mapped_pcg;
-  LocalParallelTensorBacking local_tensor_backing;
-  LocalAtomicTensorBacking local_atomic_tensor_backing;
-  LocalTaskRegistry local_task_registry;
-  RuntimeArgConfig runtime_arg_config;
-};
-
-} // namespace FlexFlow
-
-#endif
diff --git a/lib/op-attrs/include/op-attrs/num_ptensor_shard_dims_t.dtg.toml b/lib/op-attrs/include/op-attrs/num_ptensor_shard_dims_t.dtg.toml
index 45372cf7e8..8ca055441d 100644
--- a/lib/op-attrs/include/op-attrs/num_ptensor_shard_dims_t.dtg.toml
+++ b/lib/op-attrs/include/op-attrs/num_ptensor_shard_dims_t.dtg.toml
@@ -9,11 +9,11 @@ features = [
   "json",
 ]
 
-doctstring = """\
-A wrapper type describing the number of shard dims (i.e., not including replia dims) in a parallel tensor, 
+docstring = """\
+A wrapper type describing the number of shard dims (i.e., not including replica dims) in a parallel tensor,
 to prevent accidentally confusing the number of shard dims and the total number of parallel dims.
 
-The conversion to/from @ref num_ptensor_parallel_dims_t is trivial, and provided by the 
+The conversion to/from @ref num_ptensor_parallel_dims_t is trivial, and provided by the
 functions @ref num_ptensor_parallel_dims_from_shard_dims and @ref num_ptensor_shard_dims_from_parallel_dims.
 """
 
diff --git a/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_device_specific_ptr.h b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_device_specific_ptr.h
index 726aef84ba..db5c4a399b 100644
--- a/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_device_specific_ptr.h
+++ b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_device_specific_ptr.h
@@ -3,6 +3,7 @@
 
 #include "realm-execution/device_specific_ptr.h"
 #include "realm-execution/tasks/serializer/serializable_device_specific_ptr.dtg.h"
+#include "utils/containers/transform.h"
 
 namespace FlexFlow {
 
diff --git a/lib/realm-execution/src/realm-execution/device_specific_ptr.cc b/lib/realm-execution/src/realm-execution/device_specific_ptr.cc
new file mode 100644
index 0000000000..977f57555d
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/device_specific_ptr.cc
@@ -0,0 +1,10 @@
+#include "realm-execution/device_specific_ptr.h"
+#include "utils/archetypes/value_type.h"
+
+namespace FlexFlow {
+
+using T = value_type<0>;
+
+template struct DeviceSpecificPtr<T>;
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/realm.cc b/lib/realm-execution/src/realm-execution/realm.cc
new file mode 100644
index 0000000000..38b3281f8b
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/realm.cc
@@ -0,0 +1 @@
+#include "realm-execution/realm.h"
diff --git a/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_device_specific_ptr.cc b/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_device_specific_ptr.cc
new file mode 100644
index 0000000000..13ea814889
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_device_specific_ptr.cc
@@ -0,0 +1,14 @@
+#include "realm-execution/tasks/serializer/serializable_device_specific_ptr.h"
+#include "utils/archetypes/value_type.h"
+
+namespace FlexFlow {
+
+using T = value_type<0>;
+
+template SerializableDeviceSpecificPtr
+    device_specific_ptr_to_serializable(DeviceSpecificPtr<T> const &);
+
+template DeviceSpecificPtr<T> device_specific_ptr_from_serializable(
+    SerializableDeviceSpecificPtr const &);
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/tasks/serializer/task_arg_serializer.cc b/lib/realm-execution/src/realm-execution/tasks/serializer/task_arg_serializer.cc
new file mode 100644
index 0000000000..e17e24ba68
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/tasks/serializer/task_arg_serializer.cc
@@ -0,0 +1,12 @@
+#include "realm-execution/tasks/serializer/task_arg_serializer.h"
+#include "utils/archetypes/jsonable_value_type.h"
+
+namespace FlexFlow {
+
+using T = jsonable_value_type<0>;
+
+template std::string serialize_task_args(T const &);
+
+template T deserialize_task_args(void const *, size_t);
+
+} // namespace FlexFlow
diff --git a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_node_invocation.h b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_node_invocation.h
deleted file mode 100644
index 94a4886b49..0000000000
--- a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_node_invocation.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_DYNAMIC_NODE_INVOCATION_H
-#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_DYNAMIC_NODE_INVOCATION_H
-
-#include "task-spec/dynamic_graph/dynamic_node_attrs.dtg.h"
-
-namespace FlexFlow {
-
-bool invocation_fully_satisfies_expansion_conditions(
-    std::function<bool(DynamicNodeAttrs const &)> const &node_condition,
-    std::function<bool(DynamicTensorSlot const &)> const &slot_condition,
-    std::function<bool(DynamicTensorSlotArguments const &)> const &) {
-
-]
-
-} // namespace FlexFlow
-
-#endif
diff --git a/lib/task-spec/include/task-spec/ops/impl/parallel_op.h b/lib/task-spec/include/task-spec/ops/impl/parallel_op.h
deleted file mode 100644
index 7061821b62..0000000000
--- a/lib/task-spec/include/task-spec/ops/impl/parallel_op.h
+++ /dev/null
@@ -1,40 +0,0 @@
-#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_IMPL_PARALLEL_OP_H
-#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_IMPL_PARALLEL_OP_H
-
-#include "parallel_op_info.h"
-#include "utils/optional.h"
-
-namespace FlexFlow {
-
-struct ParallelOpJoinResult {
-  std::optional<ParallelOpInfo> op = std::nullopt;
-  bool join_did_succeed = false;
-};
-
-ParallelOpJoinResult try_join_parallel_ops(ParallelOpInfo const &,
-                                           ParallelOpInfo const &);
-
-/* class ParallelOp : public Op { */
-/* public: */
-/*   ParallelOp(FFModel &model, */
-/*              OperatorType type, */
-/*              char const *_name, */
-/*              const ParallelTensor input); */
-/*   virtual void init(FFModel const &) = 0; */
-/*   virtual void forward(FFModel const &) = 0; */
-/*   virtual void backward(FFModel const &) = 0; */
-/*   virtual void create_input_partition(FFModel &model) = 0; */
-/*   virtual bool measure_operator_cost(Simulator *sim, */
-/*                                      MachineView const &pc, */
-/*                                      CostMetrics &cost_metrics) const = 0; */
-/*   virtual bool append_parallel_op_info( */
-/*       std::vector<ParallelOpInfo> &parallel_ops) const = 0; */
-/*   virtual bool is_parallel_op() const; */
-
-/* public: */
-/*   Legion::LogicalPartition input_lp, output_grad_lp; */
-/* }; */
-
-} // namespace FlexFlow
-
-#endif
diff --git a/lib/task-spec/src/task-spec/serialization.cc b/lib/task-spec/src/task-spec/serialization.cc
new file mode 100644
index 0000000000..a2ad6eabfa
--- /dev/null
+++ b/lib/task-spec/src/task-spec/serialization.cc
@@ -0,0 +1 @@
+#include "task-spec/serialization.h"
diff --git a/lib/task-spec/test/CMakeLists.txt b/lib/task-spec/test/CMakeLists.txt
index 354d9358a5..9665dba88e 100644
--- a/lib/task-spec/test/CMakeLists.txt
+++ b/lib/task-spec/test/CMakeLists.txt
@@ -2,8 +2,8 @@ ff_add_test_executable(
   NAME
     task-spec-tests
   SRC_PATTERNS
-    src/task-spec/dynamic_graph/*.cc
-  PRIVATE_INCLUDE 
+    src/*.cc
+  PRIVATE_INCLUDE
     src/
   DEPS
     doctest
diff --git a/lib/task-spec/test/src/task-spec/device_specific.cc b/lib/task-spec/test/src/task-spec/device_specific.cc
index b5ee11d109..34ef9b2bef 100644
--- a/lib/task-spec/test/src/task-spec/device_specific.cc
+++ b/lib/task-spec/test/src/task-spec/device_specific.cc
@@ -5,13 +5,17 @@ using namespace ::FlexFlow;
 
 TEST_SUITE(FF_TEST_SUITE) {
   TEST_CASE("DeviceSpecific") {
-    DeviceSpecific<std::string> device_specific =
+    DeviceSpecific<std::string> device_specific1 =
+        DeviceSpecific<std::string>::create(device_id_t{gpu_id_t{0_n}},
+                                            "hello world");
+
+    DeviceSpecific<std::string> device_specific2 =
         DeviceSpecific<std::string>::create(device_id_t{gpu_id_t{1_n}},
                                             "hello world");
 
-    std::string result = fmt::to_string(device_specific);
-    std::string correct = "hi";
+    std::string result1 = fmt::to_string(device_specific1);
+    std::string result2 = fmt::to_string(device_specific2);
 
-    ASSERT(result == correct);
+    CHECK(result1 != result2);
   }
 }
diff --git a/lib/task-spec/test/src/task-spec/op_ordered_slot_signature.cc b/lib/task-spec/test/src/task-spec/op_ordered_slot_signature.cc
deleted file mode 100644
index c9da5953da..0000000000
--- a/lib/task-spec/test/src/task-spec/op_ordered_slot_signature.cc
+++ /dev/null
@@ -1,10 +0,0 @@
-#include "task-spec/op_ordered_slot_signature.h"
-#include <doctest/doctest.h>
-
-using namespace ::FlexFlow;
-
-TEST_SUITE(FF_TEST_SUITE) {
-  TEST_CASE("get_op_ordered_slot_signature_for_binding") {
-    CHECK_MESSAGE(false, "TODO: get_op_ordered_slot_signature_for_binding");
-  }
-}
diff --git a/lib/task-spec/test/src/task-spec/training_tensor_group.cc b/lib/task-spec/test/src/task-spec/training_tensor_group.cc
deleted file mode 100644
index b40c38ce69..0000000000
--- a/lib/task-spec/test/src/task-spec/training_tensor_group.cc
+++ /dev/null
@@ -1,36 +0,0 @@
-#include "task-spec/training_tensor_group.h"
-#include "test/utils/doctest/fmt/unordered_set.h"
-#include <doctest/doctest.h>
-
-using namespace ::FlexFlow;
-
-TEST_SUITE(FF_TEST_SUITE) {
-  TEST_CASE("get_all_training_tensors_in_tensor_group") {
-    forward_tensor_guid_t forward_tensor = forward_tensor_guid_t{3};
-    gradient_tensor_guid_t gradient_tensor = gradient_tensor_guid_t{5};
-    optimizer_tensor_guid_t optimizer_tensor1 = optimizer_tensor_guid_t{8};
-    optimizer_tensor_guid_t optimizer_tensor2 = optimizer_tensor_guid_t{3};
-
-    std::vector<optimizer_tensor_guid_t> optimizer_tensors = {
-        optimizer_tensor1,
-        optimizer_tensor2,
-    };
-
-    TrainingTensorGroup training_tensor_group = TrainingTensorGroup{
-        /*forward_tensor=*/forward_tensor,
-        /*gradient_tensor=*/gradient_tensor,
-        /*optimizer_tensors=*/optimizer_tensors,
-    };
-
-    std::unordered_set<training_tensor_guid_t> result =
-        get_all_training_tensors_in_tensor_group(training_tensor_group);
-    std::unordered_set<training_tensor_guid_t> correct = {
-        training_tensor_guid_t{forward_tensor},
-        training_tensor_guid_t{gradient_tensor},
-        training_tensor_guid_t{optimizer_tensor1},
-        training_tensor_guid_t{optimizer_tensor2},
-    };
-
-    CHECK(result == correct);
-  }
-}
diff --git a/lib/task-spec/test/src/task-spec/training_tensor_group_with_attrs.cc b/lib/task-spec/test/src/task-spec/training_tensor_group_with_attrs.cc
deleted file mode 100644
index f769a877ad..0000000000
--- a/lib/task-spec/test/src/task-spec/training_tensor_group_with_attrs.cc
+++ /dev/null
@@ -1,84 +0,0 @@
-#include "task-spec/training_tensor_group_with_attrs.h"
-#include <doctest/doctest.h>
-
-using namespace ::FlexFlow;
-
-TEST_SUITE(FF_TEST_SUITE) {
-  TEST_CASE("make_training_tensor_group_with_attrs_from_group_and_attrs") {
-    TensorAttrs tensor_attrs = TensorAttrs{
-        /*shape=*/TensorShape{
-            /*dims=*/TensorDims{FFOrdered{
-                8_p,
-                2_p,
-                3_p,
-            }},
-            /*data_type=*/DataType::FLOAT,
-        },
-        /*create_grad=*/CreateGrad::YES,
-    };
-
-    forward_tensor_guid_t forward_tensor = forward_tensor_guid_t{3};
-    gradient_tensor_guid_t gradient_tensor = gradient_tensor_guid_t{5};
-    std::vector<optimizer_tensor_guid_t> optimizer_tensors = {
-        optimizer_tensor_guid_t{8},
-        optimizer_tensor_guid_t{3},
-    };
-
-    TrainingTensorGroup training_tensor_group = TrainingTensorGroup{
-        /*forward_tensor=*/forward_tensor,
-        /*gradient_tensor=*/gradient_tensor,
-        /*optimizer_tensors=*/optimizer_tensors,
-    };
-
-    TrainingTensorGroupWithAttrs result =
-        make_training_tensor_group_with_attrs_from_group_and_attrs(
-            training_tensor_group, tensor_attrs);
-    TrainingTensorGroupWithAttrs correct = TrainingTensorGroupWithAttrs{
-        /*tensor_attrs=*/tensor_attrs,
-        /*forward_tensor=*/forward_tensor,
-        /*gradient_tensor=*/gradient_tensor,
-        /*optimizer_tensors=*/optimizer_tensors,
-    };
-
-    CHECK(result == correct);
-  }
-
-  TEST_CASE("tensor_group_without_attrs") {
-    TensorAttrs tensor_attrs = TensorAttrs{
-        /*shape=*/TensorShape{
-            /*dims=*/TensorDims{FFOrdered{
-                8_p,
-                2_p,
-                3_p,
-            }},
-            /*data_type=*/DataType::FLOAT,
-        },
-        /*create_grad=*/CreateGrad::YES,
-    };
-
-    forward_tensor_guid_t forward_tensor = forward_tensor_guid_t{3};
-    gradient_tensor_guid_t gradient_tensor = gradient_tensor_guid_t{5};
-    std::vector<optimizer_tensor_guid_t> optimizer_tensors = {
-        optimizer_tensor_guid_t{8},
-        optimizer_tensor_guid_t{3},
-    };
-
-    TrainingTensorGroupWithAttrs tensor_group_with_attrs =
-        TrainingTensorGroupWithAttrs{
-            /*tensor_attrs=*/tensor_attrs,
-            /*forward_tensor=*/forward_tensor,
-            /*gradient_tensor=*/gradient_tensor,
-            /*optimizer_tensors=*/optimizer_tensors,
-        };
-
-    TrainingTensorGroup result =
-        tensor_group_without_attrs(tensor_group_with_attrs);
-    TrainingTensorGroup correct = TrainingTensorGroup{
-        /*forward_tensor=*/forward_tensor,
-        /*gradient_tensor=*/gradient_tensor,
-        /*optimizer_tensors=*/optimizer_tensors,
-    };
-
-    CHECK(result == correct);
-  }
-}
diff --git a/lib/utils/benchmark/src/utils/graph/digraph/algorithms/random_dag.cc b/lib/utils/benchmark/src/internal/random_dag.cc
similarity index 100%
rename from lib/utils/benchmark/src/utils/graph/digraph/algorithms/random_dag.cc
rename to lib/utils/benchmark/src/internal/random_dag.cc
diff --git a/lib/utils/benchmark/src/utils/graph/digraph/algorithms/random_dag.h b/lib/utils/benchmark/src/internal/random_dag.h
similarity index 100%
rename from lib/utils/benchmark/src/utils/graph/digraph/algorithms/random_dag.h
rename to lib/utils/benchmark/src/internal/random_dag.h
diff --git a/lib/utils/benchmark/src/utils/graph/digraph/algorithms/transitive_closure.cc b/lib/utils/benchmark/src/utils/graph/digraph/algorithms/transitive_closure.cc
index a22b41ee6c..80f393eccd 100644
--- a/lib/utils/benchmark/src/utils/graph/digraph/algorithms/transitive_closure.cc
+++ b/lib/utils/benchmark/src/utils/graph/digraph/algorithms/transitive_closure.cc
@@ -1,5 +1,5 @@
 #include "utils/graph/digraph/algorithms/transitive_closure.h"
-#include "./random_dag.h"
+#include "internal/random_dag.h"
 #include <benchmark/benchmark.h>
 
 using namespace ::FlexFlow;
diff --git a/lib/utils/benchmark/src/utils/graph/digraph/algorithms/transitive_reduction.cc b/lib/utils/benchmark/src/utils/graph/digraph/algorithms/transitive_reduction.cc
index bbb3d238a2..f72b0908ac 100644
--- a/lib/utils/benchmark/src/utils/graph/digraph/algorithms/transitive_reduction.cc
+++ b/lib/utils/benchmark/src/utils/graph/digraph/algorithms/transitive_reduction.cc
@@ -1,5 +1,5 @@
 #include "utils/graph/digraph/algorithms/transitive_reduction.h"
-#include "./random_dag.h"
+#include "internal/random_dag.h"
 #include <benchmark/benchmark.h>
 
 using namespace ::FlexFlow;
diff --git a/lib/utils/include/utils/full_binary_tree/binary_tree_path_entry.dtg.toml b/lib/utils/include/utils/full_binary_tree/binary_tree_path_entry.dtg.toml
index c4567a0e87..84f95c0d2e 100644
--- a/lib/utils/include/utils/full_binary_tree/binary_tree_path_entry.dtg.toml
+++ b/lib/utils/include/utils/full_binary_tree/binary_tree_path_entry.dtg.toml
@@ -10,8 +10,6 @@ features = [
 
 [[values]]
 name = "LEFT_CHILD"
-key = "left"
 
 [[values]]
 name = "RIGHT_CHILD"
-key = "right"
diff --git a/lib/utils/include/utils/full_binary_tree/full_binary_tree_node_type.dtg.toml b/lib/utils/include/utils/full_binary_tree/full_binary_tree_node_type.dtg.toml
index dc49c0b696..e9148a6506 100644
--- a/lib/utils/include/utils/full_binary_tree/full_binary_tree_node_type.dtg.toml
+++ b/lib/utils/include/utils/full_binary_tree/full_binary_tree_node_type.dtg.toml
@@ -10,8 +10,6 @@ features = [
 
 [[values]]
 name = "PARENT"
-key = "parent"
 
 [[values]]
 name = "LEAF"
-key = "leaf"
diff --git a/lib/utils/src/utils/graph/open_kwarg_dataflow_graph/open_kwarg_dataflow_graph.cc b/lib/utils/src/utils/graph/open_kwarg_dataflow_graph/open_kwarg_dataflow_graph.cc
new file mode 100644
index 0000000000..23908cf784
--- /dev/null
+++ b/lib/utils/src/utils/graph/open_kwarg_dataflow_graph/open_kwarg_dataflow_graph.cc
@@ -0,0 +1,11 @@
+#include "utils/graph/open_kwarg_dataflow_graph/open_kwarg_dataflow_graph.h"
+#include "utils/archetypes/ordered_value_type.h"
+
+namespace FlexFlow {
+
+using GraphInputName = ordered_value_type<0>;
+using SlotName = ordered_value_type<1>;
+
+template struct OpenKwargDataflowGraph<GraphInputName, SlotName>;
+
+} // namespace FlexFlow