diff --git a/.gitignore b/.gitignore
index 5e135d16..728bdc1a 100755
--- a/.gitignore
+++ b/.gitignore
@@ -63,3 +63,23 @@ env-fast
 
 # TLS certificates — local only, never commit (paths to certs are in .env)
 .certs/
+
+# Benchmark simulation output files
+sim_*.tsv
+sim_*.tsv.zst
+
+# Sweep run logs and results (local benchmark output)
+sweep_logs/
+sweep_flux_master.log
+results/
+
+# Test scripts and helpers not part of the benchmark suite
+test_s3dlio_gen_direct.py
+
+# Hydra runtime output (created in cwd when running workloads with hydra config)
+hydra_log/
+
+# Timestamped sweep run logs written to repo root by sweep_*.sh scripts
+sweep_unet3d_*.log
+sweep_dlrm_*.log
+sweep_flux_*.log
diff --git a/configs/dlio/workload/dlrm_b200.yaml b/configs/dlio/workload/dlrm_b200.yaml
index 51341d5b..13eedb68 100644
--- a/configs/dlio/workload/dlrm_b200.yaml
+++ b/configs/dlio/workload/dlrm_b200.yaml
@@ -12,7 +12,7 @@ dataset:
   data_folder: data/dlrm/
   format: parquet
   num_files_train: 1024        # Number of training files to generate
-  num_samples_per_file: 4718592    # Samples per parquet file
+  num_samples_per_file: 1536000    # 250 RGs × 6144 → ~3.1 MiB footer (under s3-ultra 4 MiB limit)
   record_length_bytes: 761
   compression: none          # Options: snappy, gzip, lz4, zstd, none
   
@@ -627,12 +627,12 @@ dataset:
 reader:
   data_loader: pytorch
   batch_size: 12288
-  prefetch_size: 2  # Increase from default 2 for better I/O overlap
-  read_threads: 4   # Increase parallelism
+  prefetch_size: 0
+  read_threads: 0   # single-process, no IPC overhead; ThreadPoolExecutor handles I/O
   file_shuffle: seed
 
 train:
-  epochs: 1
+  epochs: 2
   computation_time: 0.000375
 
 metric:
diff --git a/configs/dlio/workload/dlrm_datagen.yaml b/configs/dlio/workload/dlrm_datagen.yaml
index 46eb1533..28944102 100755
--- a/configs/dlio/workload/dlrm_datagen.yaml
+++ b/configs/dlio/workload/dlrm_datagen.yaml
@@ -13,13 +13,14 @@ dataset:
   data_folder: data/dlrm/
   format: parquet
   num_files_train: 1024        # Number of training files to generate
-  num_samples_per_file: 4718592    # Samples per parquet file
+  num_samples_per_file: 1536000    # Samples per parquet file (250 RGs × 6144 → ~3.1 MiB footer, under s3-ultra 4 MiB limit)
   record_length_bytes: 761
   compression: none          # Options: snappy, gzip, lz4, zstd, none
   
   # Parquet-specific configuration
   parquet:
-    row_group_size: 8192
+    use_s3dlio_gen: true
+    row_group_size: 6144  # Match batch_size for optimal caching
     read_mode: row_group
     
     columns:
diff --git a/configs/dlio/workload/flux_datagen.yaml b/configs/dlio/workload/flux_datagen.yaml
index 001d4ae4..6cc1dd0d 100755
--- a/configs/dlio/workload/flux_datagen.yaml
+++ b/configs/dlio/workload/flux_datagen.yaml
@@ -17,6 +17,8 @@ dataset:
   record_length: 2164832
 
   parquet:
+    use_s3dlio_gen: true
+    row_group_size: 48
     # Parquet-specific field specifications
     columns:
       - name: t5_encodings
diff --git a/configs/dlio/workload/unet3d_b200.yaml b/configs/dlio/workload/unet3d_b200.yaml
new file mode 100644
index 00000000..2db0b8cd
--- /dev/null
+++ b/configs/dlio/workload/unet3d_b200.yaml
@@ -0,0 +1,40 @@
+model: 
+  name: unet3d
+  type: cnn
+  model_size: 499153191
+
+framework: pytorch
+
+workflow:
+  generate_data: False
+  train: True
+  checkpoint: False
+
+dataset: 
+  data_folder: data/unet3d/
+  format: npz
+  num_files_train: 7200        # ~984 GiB: 7200 × ~140 MiB avg file size
+  num_samples_per_file: 1
+  record_length_bytes: 146600628
+  record_length_bytes_stdev: 68341808
+  record_length_bytes_resize: 2097152
+
+reader: 
+  data_loader: pytorch
+  batch_size: 7
+  read_threads: 4
+  file_shuffle: seed
+  sample_shuffle: seed
+
+train:
+  epochs: 5
+  # B200 computation_time = H100 (0.323 s) ÷ 2 (B200 is ~2× faster than H100)
+  computation_time: 0.162
+
+checkpoint:
+  checkpoint_folder: checkpoints/unet3d
+  checkpoint_after_epoch: 5
+  epochs_between_checkpoints: 2
+
+metric:
+  au: 0.90
diff --git a/docs/DATALOADER_ARCHITECTURE.md b/docs/DATALOADER_ARCHITECTURE.md
new file mode 100644
index 00000000..122d7b87
--- /dev/null
+++ b/docs/DATALOADER_ARCHITECTURE.md
@@ -0,0 +1,310 @@
+# Data Loader Architecture: Map-Style vs. Iterable-Style
+
+**Status**: Implemented in `dlio_benchmark/dlio_benchmark/data_loader/torch_data_loader.py`.
+**Relevant workloads**: UNet3D (NPZ), RetinaNet (JPEG), and any NPY/PNG workload on S3 or POSIX storage.
+
+---
+
+## Table of Contents
+
+1. [Background: The Conventional Wisdom](#background-the-conventional-wisdom)
+2. [What Actually Matters for Object Storage](#what-actually-matters-for-object-storage)
+3. [Implementation: TorchIterableDatasetSimple](#implementation-torchiterabledatasetsimple)
+4. [Known Limitations and Future Work](#known-limitations-and-future-work)
+5. [Summary](#summary)
+6. [Related Documents](#related-documents)
+7. [O_DIRECT Local Storage: Two Independent Paths](#o_direct-local-storage-two-independent-paths)
+   - [Why O_DIRECT Matters for NVMe Benchmarks](#why-o_direct-matters-for-nvme-benchmarks)
+   - [Path 1: `odirect: true` — Python O_DIRECT (legacy map-style)](#path-1-odirect-true--python-o_direct-legacy-map-style)
+   - [Path 2: `storage_library: direct` — Rust/Tokio O_DIRECT (new async path)](#path-2-storage_library-direct--rusttokio-o_direct-new-async-path)
+   - [Comparison](#comparison)
+   - [Which Path to Use](#which-path-to-use)
+
+---
+
+## Background: The Conventional Wisdom
+
+A common recommendation is that **iterable-style data loaders are better for large datasets**.
+This advice is correct in its original context — local filesystem reads on spinning disk — but the
+reasoning does *not* transfer directly to object storage. Understanding *why* iterable can be better
+(and when it is not) is critical for choosing the right approach.
+
+The original case for iterable:
+
+- **Map-style requires a full index upfront** — you must know `len(dataset)` to build a sampler.
+- **Map-style with shuffled indices causes random seeks** — on HDDs, jumping around the dataset
+  produces catastrophically bad throughput.
+- **Iterable-style reads sequentially** — the iterator delivers samples in whatever order it
+  generates them, which aligns naturally with sequential disk I/O.
+
+For object storage, the seek-related arguments do not apply. There is no seek penalty — an S3 GET for
+object #7,199 costs the same as a GET for object #0. The raw "iterable is better" rule does not
+carry over.
+
+---
+
+## What Actually Matters for Object Storage
+
+The real performance argument for iterable-style on object storage is about
+**concurrency pipeline depth**, not seek patterns.
+
+### Previous path — map-style TorchDataset, 4 workers (replaced)
+
+```
+Worker 0: __getitem__(idx_0) → read_index() → _s3_ensure_cached() → get_many([1 object])
+Worker 1: __getitem__(idx_1) → read_index() → _s3_ensure_cached() → get_many([1 object])
+Worker 2: __getitem__(idx_2) → read_index() → _s3_ensure_cached() → get_many([1 object])
+Worker 3: __getitem__(idx_3) → read_index() → _s3_ensure_cached() → get_many([1 object])
+```
+
+Total in-flight S3 requests: **4** (one per DataLoader worker). Map-style is still used for
+format types that do not have iterator-based readers (e.g. SYNTHETIC, HDF5 without S3 backend).
+
+### Current path — TorchIterableDatasetSimple, 4 workers (implemented)
+
+```
+Worker 0: next() → _s3_prefetch_all() → get_many([~1800 objects, max_in_flight=64])
+Worker 1: next() → _s3_prefetch_all() → get_many([~1800 objects, max_in_flight=64])
+Worker 2: next() → _s3_prefetch_all() → get_many([~1800 objects, max_in_flight=64])
+Worker 3: next() → _s3_prefetch_all() → get_many([~1800 objects, max_in_flight=64])
+```
+
+For local / POSIX storage, `_localfs_prefetch_all()` is used instead:
+```
+Worker k: next() → _localfs_prefetch_all() → ThreadPoolExecutor(64 threads) → pread(1 file each)
+```
+
+Total in-flight: up to **64 objects per worker × 4 workers = 256 concurrent S3 GETs**
+(or 256 concurrent `pread` calls for local FS).
+
+While the compute side is processing one object, up to 63 more are already being fetched for that
+worker alone. This keeps the network link and storage server fully utilized even when individual
+GETs have variable latency.
+
+---
+
+## Implementation: TorchIterableDatasetSimple
+
+The fix is `TorchIterableDatasetSimple` in `torch_data_loader.py`, which activates for all
+`_simple_iterable_formats = (NPZ, NPY, JPEG, PNG)` on both S3 and local FS.
+
+Key mechanics:
+
+1. **File sharding** — `__iter__` computes `my_files = all_files[worker_id::num_workers]`,
+   giving each PyTorch worker a distinct non-overlapping file subset.
+
+2. **file_map installation** — the shard is installed as
+   `reader.file_map[thread_index] = [(global_idx, filename, sample_in_file), ...]`
+   so that `reader.next()` (which reads `file_map[thread_index]`) picks it up.
+
+3. **Bulk prefetch** — `reader.next()` calls `_s3_prefetch_all()` (S3) or
+   `_localfs_prefetch_all()` (local FS) before starting iteration. All files for this
+   worker's shard are fetched in parallel (up to 64 in-flight) before any sample is yielded.
+
+4. **Yield** — one dummy item is yielded per complete batch, consistent with the Parquet
+   `TorchIterableDataset` pattern. `batch_size=None` in the DataLoader passes items through
+   unchanged. FormatReader.next() handles drop-last internally.
+
+The DLIO log now prints `TorchIterableDatasetSimple(bulk-prefetch, N workers)` for these
+formats instead of `TorchDataset(map-style, N workers)`.
+
+---
+
+## Known Limitations and Future Work
+
+### 1. Per-epoch file shuffle in workers
+
+PyTorch DataLoader workers are spawned with a pickled snapshot of `ConfigArguments`.
+When the main process calls `reconfigure(epoch+1)`, the shuffled `file_list_train` is
+not propagated to persistent workers. Each worker's `_file_list` reflects the epoch-1
+ordering for all subsequent epochs.
+
+For a **storage I/O benchmark**, this is acceptable: throughput and latency measurements
+are not affected by file ordering on object storage (no HDD seek penalty). File order
+does not affect whether all files are read.
+
+For **ML training correctness**, per-epoch reshuffling matters. A future improvement:
+pass an epoch seed into `TorchIterableDatasetSimple` and shuffle `all_files` with
+`np.random.default_rng(seed + epoch)` inside `__iter__`.
+
+### 2. Prefetch memory for small objects
+
+`_s3_prefetch_all()` issues GETs for all objects in a worker's slice (up to ~1,800 with 4 workers)
+with 64 in-flight. The cache stores `{key: byte_count}` only — actual bytes are consumed
+by s3dlio's callback immediately after transfer. Memory footprint is bounded by the
+in-flight window size (64 × object_size), not the full epoch size.
+
+For UNet3D (140 MiB objects): 64 × 140 MiB ≈ 9 GiB peak per worker.
+For RetinaNet (315 KB objects): 64 × 315 KB ≈ 20 MiB peak per worker — negligible.
+
+### 3. Drop-last behavior
+
+`FormatReader.next()` drops the final partial batch if `len(shard) % batch_size != 0`.
+This matches the map-style `drop_last=True` behavior. No action needed.
+
+---
+
+## Summary
+
+| Property | Map-style (old) | TorchIterableDatasetSimple (current) |
+|---|---|---|
+| Formats | All | NPZ, NPY, JPEG, PNG |
+| Storage backends | All | S3 (s3dlio/minio/s3torch) **and** POSIX/local FS |
+| In-flight S3 requests | `1 × num_workers` | `64 × num_workers` |
+| In-flight local reads | `1 × num_workers` | `64 × num_workers` (ThreadPool) |
+| Per-object bandwidth | Good (s3dlio byte-range) | Same |
+| Worker file partitioning | Automatic via Sampler | `all_files[worker_id::num_workers]` |
+| Per-epoch file shuffle | Via VirtualIndexMap | `_file_list` as-is (epoch 1 order) |
+| Implementation status | Retired for NPZ/NPY/JPEG/PNG | **Active** |
+
+The most important validation step: a side-by-side benchmark sweep (UNet3D and RetinaNet,
+identical NP/config) measuring `train_throughput_MB_per_second` with the new vs. old path.
+Expected improvement is largest for small objects (RetinaNet 315 KB: no byte-range splitting,
+pipeline depth was 1 per worker, now 64 per worker).
+
+---
+
+## Related Documents
+
+- [UNet3D_NP_Scaling_Results.md](UNet3D_NP_Scaling_Results.md) — benchmark results where this
+  architectural choice is most relevant
+- [ARCHITECTURE.md](ARCHITECTURE.md) — overall system architecture
+- [STORAGE_LIBRARIES.md](STORAGE_LIBRARIES.md) — s3dlio capabilities (get_many, byte-range GETs,
+  ObjectSizeCache)
+- [PARQUET_FORMATS.md](PARQUET_FORMATS.md) — the Parquet iterable reader that already uses the
+  `TorchIterableDataset` path
+
+---
+
+# O_DIRECT Local Storage: Two Independent Paths
+
+DLIO has **two separate mechanisms** for bypassing the Linux page cache when reading local
+(POSIX/NVMe) files. Both are preserved and intentionally kept distinct so they can be compared
+against each other directly.
+
+---
+
+## Why O_DIRECT Matters for NVMe Benchmarks
+
+The Linux page cache caches file data in DRAM. After the first read pass, subsequent reads of the
+same files are served entirely from memory, not from the storage device. For an I/O benchmark
+intended to stress NVMe drives this is fatal: repeated runs measure DRAM bandwidth (40–60 GB/s
+on a modern server) rather than NVMe device bandwidth (3–15 GB/s per drive). The numbers are
+plausible-looking but completely wrong.
+
+Opening a file with the `O_DIRECT` flag instructs the kernel to transfer data
+directly between the storage device and a userspace buffer, bypassing the page cache entirely.
+Cold-run and warm-run throughput become essentially identical, accurately reflecting the hardware.
+The tradeoff: userspace buffers must be 4 KiB-aligned and reads must be a multiple of the block
+size (512 B or 4096 B depending on the device).
+
+---
+
+## Path 1: `odirect: true` — Python O_DIRECT (legacy map-style)
+
+Activated by setting the top-level `odirect: true` flag in the DLIO YAML config:
+
+```yaml
+odirect: true
+```
+
+**Implementation**: `reader_factory.py` detects `odirect == True` and routes to
+`NPZReaderODIRECT` / `NPYReaderODirect` instead of the default readers.
+
+**How it works** (`npy_reader_odirect.py`, `npz_reader_odirect.py`):
+
+1. `os.open(filepath, os.O_RDONLY | os.O_DIRECT)` — opens the file with O_DIRECT in Python.
+2. A 4 KiB-aligned buffer is manually allocated with `ctypes` + `bytearray` arithmetic.
+3. `os.readv(fd, [mem_view])` — single synchronous read into the aligned buffer.
+4. `parse_npy()` / `parse_npz()` — full NPY/NPZ format decode in Python: `struct.unpack` header
+   parsing, optional `zlib.decompress()` (NPZ), and `np.ndarray()` construction from the
+   in-memory buffer (zero-copy array view).
+
+**Concurrency model**: map-style `__getitem__` path. Each PyTorch DataLoader worker calls
+`odirect_read()` once per sample index, synchronously. There is no prefetch, no concurrency
+within a worker, and no inter-worker coordination. Concurrency is provided only by the number of
+DataLoader workers (`num_workers` in `torch.utils.data.DataLoader`).
+
+**PyTorch involvement**: PyTorch provides the outer loop (the DataLoader process pool and
+`__getitem__` dispatch). PyTorch does **not** issue any I/O itself — all reads are done by the
+Python `os.open` + `os.readv` path above. The term "PyTorch O_DIRECT" would be misleading;
+this is purely Python-level O_DIRECT wired into the PyTorch DataLoader's index-based interface.
+
+---
+
+## Path 2: `storage_library: direct` — Rust/Tokio O_DIRECT (new async path)
+
+Activated by setting `storage_library: direct` inside `storage_options` in the DLIO YAML config:
+
+```yaml
+storage:
+  storage_type: local_fs
+  storage_root: /mnt/nvme/dataset
+  storage_options:
+    storage_library: direct   # activates Rust async O_DIRECT
+```
+
+**Implementation**: `_LocalFSIterableMixin._localfs_init()` reads `storage_options.storage_library`.
+When set to `"direct"`, it sets `self._use_direct = True` and validates that `s3dlio` is
+importable. `_localfs_prefetch_all()` then dispatches to `_prefetch_direct()` instead of
+`_prefetch_buffered()`.
+
+**How it works** (`_local_fs_iterable_mixin.py`):
+
+1. Converts each local path to a `direct://` URI: `f"direct://{os.path.abspath(path)}"`.
+2. Calls `s3dlio.get_many(uris, max_in_flight=min(64, len(uris)))`.
+3. s3dlio's Rust backend (`file_store_direct.rs`) opens each file with `libc::O_DIRECT`,
+   allocates 4 KiB-aligned buffers in Rust, and reads via Tokio async I/O. The GIL is fully
+   released for all I/O.
+4. `_prefetch_direct()` collects byte counts from `BytesView` objects (O(1), no Python copy).
+5. Byte counts are accumulated into `_total_bytes_read` / `_total_objects_read` for
+   `finalize_local_bytes()` reporting.
+
+**Concurrency model**: iterable-style `TorchIterableDatasetSimple` path. Each worker calls
+`_localfs_prefetch_all()` once per shard, submitting up to 64 O_DIRECT reads concurrently into
+the Tokio runtime. Results are streamed back as they complete (not in submission order).
+Total concurrency: `64 × num_workers` simultaneous O_DIRECT reads.
+
+---
+
+## Comparison
+
+| Property | `odirect: true` (Path 1) | `storage_library: direct` (Path 2) |
+|---|---|---|
+| Config key | `odirect: true` (top-level) | `storage_options.storage_library: direct` |
+| I/O syscall | `os.open + os.readv` (Python) | `libc::open + O_DIRECT` (Rust, Tokio) |
+| Alignment | Python `ctypes` manual alignment | Rust automatic 4 KiB alignment |
+| GIL behavior | Released only around the `os.readv` syscall; Python-side parsing runs under the GIL | Released for all I/O |
+| Prefetch depth | 1 per DataLoader worker | 64 per DataLoader worker |
+| DataLoader style | Map-style (`__getitem__`) | Iterable-style (`__iter__`) |
+| Concurrency | `1 × num_workers` | `64 × num_workers` |
+| NPY/NPZ decode | Full in-Python decode per file | None (byte count only, decode deferred) |
+| Page cache bypass | Yes (`O_DIRECT`) | Yes (`O_DIRECT` via `direct://` URI) |
+| s3dlio dependency | No | Yes (must be installed) |
+| Formats | NPZ, NPY | NPZ, NPY, JPEG, PNG |
+| Status | Preserved (comparison baseline) | Implemented (high-concurrency path) |
+
+---
+
+## Which Path to Use
+
+Both paths are intentionally preserved. Neither removes the other.
+
+- **Use `odirect: true`** as a baseline. It provides the simplest possible O_DIRECT
+  implementation: one synchronous Python read per file per worker. If this path achieves the
+  same throughput as Path 2, it means the bottleneck is not I/O concurrency (perhaps it is
+  CPU-side decode or tensor construction).
+
+- **Use `storage_library: direct`** when you want maximum I/O concurrency on NVMe. The Rust
+  async path with 64 in-flight reads per worker is the correct model for high-queue-depth NVMe
+  drives, which perform best when saturated with many parallel requests (QD=32–128 is typical
+  for NVMe SSDs). Python map-style with 1 read per worker cannot saturate a modern NVMe device
+  regardless of the number of DataLoader workers.
+
+- **Comparing the two** directly — identical config except swapping `odirect: true` vs.
+  `storage_library: direct` — isolates the contribution of:
+  1. I/O concurrency depth (1 vs. 64 per worker)
+  2. GIL involvement (released only around the `os.readv` syscall, with Python-side parsing under the GIL, vs. fully released in Rust)
+  3. Prefetch pipelining (none vs. up to 64 in-flight while compute processes the previous batch)
+
+This comparison is one of the primary intended use cases for keeping both paths available.
diff --git a/docs/DLRM_NP_Scaling_Results.md b/docs/DLRM_NP_Scaling_Results.md
new file mode 100644
index 00000000..e5500c3a
--- /dev/null
+++ b/docs/DLRM_NP_Scaling_Results.md
@@ -0,0 +1,222 @@
+# DLRM Training — Compute Time & NP Scaling Study
+
+---
+
+## Test Environment
+
+| Parameter | Value |
+|-----------|-------|
+| Host | 24 vCPU VM (with hyperthreading), 48 GB RAM |
+| Object storage | s3-ultra (`http://127.0.0.1:9000`, co-located on test host) |
+| Bucket / path | `mlp-dlrm / data/dlrm` |
+| Dataset | 200 files × 1,536,000 samples/file |
+| Record length | 761 bytes |
+| Batch size | 12,288 |
+| `decode_mode` | `none` |
+| Epochs | 2 |
+| Steps per epoch | 25,000 ÷ NP |
+| Model config | `dlrm_b200.yaml` |
+| MPI invocation | `mpirun -n NP -host 127.0.0.1:NP` |
+
+> **⚠️ Co-located test configuration.** The s3-ultra storage server and all benchmark processes run on the
+> **same** 24 vCPU / 48 GB RAM host, sharing CPU cores, memory, and the loopback network interface.
+> In a real deployment the storage target would be a dedicated remote system, and the CPU/memory
+> pressure that limits scaling here (particularly at NP ≥ 4) would not apply to the test processes.
+> The resource constraints described in this document are a property of this co-located setup, not
+> of the storage technology itself.
+
+**AU (Accelerator Utilization)** — fraction of wall time the simulated GPU was computing rather than waiting for I/O. AU ≥ 70% is the target threshold for a "pass." Below that, the workload is I/O-bound and the storage system cannot keep the accelerator fed.
+
+---
+
+## Phase 1 — Compute Time Sweep (NP = 1)
+
+Objective: find the `computation_time` at which the DLRM workload transitions from I/O-bound to
+compute-bound on a single accelerator. Four values were tested: 375 µs, 1 ms, 5 ms, and 10 ms.
+
+### Phase 1 — Summary Table
+
+| `computation_time` | AU% (avg) | Samples/s | I/O MiB/s | AU ≥ 70%? |
+|--------------------|-----------|-----------|-----------|-----------|
+| 375 µs | 7.88% | 2,053,984 | 1,490.7 | ❌ FAIL |
+| 1 ms | 19.59% | 2,178,529 | 1,581.1 | ❌ FAIL |
+| 5 ms | 78.69% | 1,877,874 | 1,362.9 | ✅ PASS |
+| 10 ms | 87.71% | 1,060,327 | 769.5 | ✅ PASS |
+
+### Phase 1 — Per-Epoch Detail
+
+| `computation_time` | Epoch | Wall (s) | Samples/s | AU% |
+|--------------------|-------|----------|-----------|-----|
+| 375 µs | 1 | 182.21 | 1,729,031 | 6.65% |
+| 375 µs | 2 | 129.39 | 2,378,938 | 9.10% |
+| 1 ms | 1 | 168.65 | 1,869,815 | 16.85% |
+| 1 ms | 2 | 123.55 | 2,487,243 | 22.33% |
+| 5 ms | 1 | 162.86 | 1,940,250 | 81.25% |
+| 5 ms | 2 | 169.51 | 1,815,498 | 76.13% |
+| 10 ms | 1 | 291.79 | 1,068,892 | 88.44% |
+| 10 ms | 2 | 292.12 | 1,051,762 | 86.97% |
+
+### Phase 1 — Key Observations
+
+- **The AU knee lies between 1 ms and 5 ms.** At 1 ms the workload is severely I/O-bound (AU ≈ 20%);
+  at 5 ms it passes the 70% threshold (AU ≈ 79%).
+- **Peak I/O throughput occurs in the 375 µs – 1 ms range** (~1,500–1,580 MiB/s), where the
+  simulated GPU is nearly always waiting and the pipeline is fully storage-saturated.
+- **Epoch 2 is consistently faster than Epoch 1** at low compute times — page-cache warming and
+  S3 connection reuse reduce cold-start overhead on the second pass.
+- **At ct = 10 ms the workload is strongly compute-bound** (AU ≈ 88%) and I/O throughput drops to
+  ~770 MiB/s because the GPU consumes data more slowly than storage can deliver it.
+
+---
+
+## Phase 2 — NP Scaling Sweep (ct = 1 ms and ct = 5 ms)
+
+Objective: determine how aggregate throughput and per-accelerator AU scale as NP grows from 1 to 8,
+at two operating points: one I/O-bound (ct = 1 ms) and one near the AU threshold (ct = 5 ms).
+
+Each NP rank was mapped to the same host: `mpirun -n NP -host 127.0.0.1:NP`.
+
+### Phase 2 — Summary Table
+
+| ct | NP | AU% (avg) | Samples/s | I/O MiB/s | Scaling vs NP=1 | AU ≥ 70%? |
+|----|----|-----------|-----------|-----------|-----------------|-----------|
+| 1 ms | 1 | 17.77% | 1,972,511 | 1,431.5 | 1.00× | ❌ FAIL |
+| 1 ms | 2 | 17.65% | 3,968,010 | 2,879.8 | 2.01× | ❌ FAIL |
+| 1 ms | 4 | 15.02% | 6,784,287 | 4,923.7 | 3.44× | ❌ FAIL |
+| 1 ms | 8 | — | — | — | — | 💥 CRASH (OOM) |
+| 5 ms | 1 | 80.91% | 1,933,857 | 1,403.5 | 1.00× | ✅ PASS |
+| 5 ms | 2 | 71.79% | 3,418,977 | 2,481.3 | 1.77× | ✅ PASS |
+| 5 ms | 4 | 68.67% | 6,545,863 | 4,750.6 | 3.39× | ❌ FAIL |
+| 5 ms | 8 | — | — | — | — | 💥 CRASH (OOM) |
+
+**Scaling vs NP=1**: ratio of aggregate `samples/s` at NP=N to NP=1 within the same ct group.
+Perfect linear scaling would yield 2.00×, 4.00×, 8.00× for NP=2, 4, 8.
+
+### Phase 2 — Per-Epoch Detail
+
+| ct | NP | Epoch | Wall (s) | Samples/s | AU% |
+|----|----|-------|----------|-----------|-----|
+| 1 ms | 1 | 1 | 179.15 | 1,754,308 | 15.66% |
+| 1 ms | 1 | 2 | 140.46 | 2,190,715 | 19.88% |
+| 5 ms | 1 | 1 | 165.13 | 1,911,922 | 80.19% |
+| 5 ms | 1 | 2 | 157.51 | 1,955,793 | 81.63% |
+| 1 ms | 2 | 1 | 95.23 | 3,384,832 | 14.97% |
+| 1 ms | 2 | 2 | 67.83 | 4,567,957 | 20.90% |
+| 5 ms | 2 | 1 | 94.48 | 3,414,248 | 71.64% |
+| 5 ms | 2 | 2 | 89.93 | 3,421,878 | 71.80% |
+| 1 ms | 4 | 1 | 50.28 | 6,716,084 | 14.77% |
+| 1 ms | 4 | 2 | 45.27 | 6,891,347 | 16.23% |
+| 5 ms | 4 | 1 | 52.55 | 6,424,380 | 67.64% |
+| 5 ms | 4 | 2 | 46.49 | 6,708,777 | 70.49% |
+| 1 ms | 8 | — | — | — | 💥 OOM (SIGKILL rank 4) |
+| 5 ms | 8 | — | — | — | 💥 OOM (SIGKILL rank 3) |
+
+---
+
+## Scaling Analysis
+
+### Aggregate Throughput Scaling (ct = 1 ms)
+
+| NP | Samples/s | vs NP=1 | Efficiency |
+|----|-----------|---------|------------|
+| 1 | 1,972,511 | 1.00× | 100% |
+| 2 | 3,968,010 | 2.01× | 100.5% |
+| 4 | 6,784,287 | 3.44× | 86.0% |
+
+Near-linear scaling to NP=2 (2.01× vs ideal 2.00×). At NP=4, efficiency drops to 86% — the storage
+backend is saturating at ~4,924 MiB/s and cannot maintain linear per-rank delivery.
+
+### Aggregate Throughput Scaling (ct = 5 ms)
+
+| NP | Samples/s | vs NP=1 | Efficiency |
+|----|-----------|---------|------------|
+| 1 | 1,933,857 | 1.00× | 100% |
+| 2 | 3,418,977 | 1.77× | 88.3% |
+| 4 | 6,545,863 | 3.39× | 84.7% |
+| 8 | — (CRASH) | — | — |
+
+At ct = 5 ms the workload is already near-AU-threshold at NP=1, so adding ranks increases I/O
+pressure while the per-rank compute budget remains fixed. AU degrades monotonically:
+80.91% → 71.79% → 68.67%, crossing below the 70% pass threshold at NP=4.
+
+### I/O Throughput Scaling
+
+| NP | ct=1ms I/O (MiB/s) | ct=5ms I/O (MiB/s) |
+|----|-------------------|-------------------|
+| 1 | 1,431.5 | 1,403.5 |
+| 2 | 2,879.8 | 2,481.3 |
+| 4 | 4,923.7 | 4,750.6 |
+
+I/O scales well through NP=4, with the two ct groups converging toward a similar ceiling near
+~4,750–4,924 MiB/s. This suggests the loopback s3-ultra instance is approaching its throughput limit
+at ~5 GB/s when 4 concurrent s3dlio processes are active.
+
+### Per-Accelerator (per-rank) Samples/s
+
+| ct | NP=1 | NP=2 | NP=4 | NP=8 |
+|----|------|------|------|------|
+| 1 ms | 1,972,511 | 1,984,005 | 1,696,072 | — |
+| 5 ms | 1,933,857 | 1,709,489 | 1,636,466 | — |
+
+At ct = 1 ms, per-rank throughput is nearly constant from NP=1 to NP=2, then drops ~15% at NP=4
+as I/O contention grows. At ct = 5 ms, per-rank throughput drops earlier because the workload is
+already closer to the storage saturation point at NP=1.
+
+---
+
+## NP = 8 Failure Analysis
+
+Both ct = 1 ms and ct = 5 ms runs at NP = 8 crashed before completing any training steps.
+
+**Root causes:**
+
+1. **OOM — kernel SIGKILL.** Each MPI rank spawns a Python process. At NP = 8, the combined memory
+   footprint (Python interpreter, DLIO data buffers, s3dlio connection pool, prefetch queues,
+   MPI runtime) exceeded the 48 GB RAM limit. The kernel OOM killer sent SIGKILL to rank 3 or 4.
+   - `mpirun noticed that process rank N exited on signal 9 (Killed)`
+
+2. **S3 TCP connection exhaustion.** 8 concurrent s3dlio processes each attempted to open
+   connection pools to s3-ultra on loopback. The aggregate connection demand — combined with
+   s3-ultra itself consuming CPU on the same host — overwhelmed the server's listener backlog,
+   causing TCP connection rejection errors on all ranks before the OOM fired on some runs.
+
+**Conclusion:** NP = 8 is not viable on this co-located 24 vCPU / 48 GB RAM setup. Maximum usable
+NP = 4. In a real deployment where s3-ultra runs on a dedicated remote system, NP = 8 would have
+the full 48 GB and all 24 vCPUs available exclusively for the benchmark processes, making this
+limitation irrelevant.
+
+---
+
+## Overall Key Findings
+
+1. **The AU knee for DLRM on this storage stack is between ct = 1 ms and ct = 5 ms.**
+   - At ct ≤ 1 ms: severely I/O-bound (AU ≈ 7–20%); storage cannot keep up regardless of NP.
+   - At ct = 5 ms: marginally passes at NP=1 and NP=2 (AU ≈ 71–81%); fails at NP=4 (AU = 68.7%).
+   - At ct = 10 ms: comfortably passes (AU ≈ 88%); workload is strongly compute-bound.
+
+2. **Storage saturates near 5 GB/s on this co-located setup.** Both ct groups hit ~4,750–4,924 MiB/s
+   (≈ 5.0–5.2 GB/s) at NP=4, and AU begins degrading. This ceiling reflects the shared CPU/memory budget — s3-ultra
+   and the benchmark processes are competing for the same resources. On a dedicated remote storage
+   system, this throughput ceiling would be significantly higher.
+
+3. **Aggregate throughput scales near-linearly to NP=4 in the I/O-bound regime (ct = 1 ms).**
+   3.44× aggregate throughput at NP=4 (86% efficiency) reflects good parallelism up to the
+   storage bandwidth limit.
+
+4. **AU degrades with NP even when compute time is fixed.** Each additional rank increases
+   per-step I/O demand without increasing the per-step compute budget, so the storage-to-compute
+   ratio worsens. At ct = 5 ms, NP=4 drops just below the 70% threshold.
+
+5. **Epoch 2 is consistently faster than Epoch 1** at low compute times. Page-cache warming and
+   persistent S3 connections from epoch 1 reduce cold-start cost in epoch 2.
+
+6. **NP = 8 is not viable on this VM** due to OOM and S3 TCP exhaustion. Maximum recommended
+   NP for this host configuration: **4**.
+
+---
+
+---
+
+*Benchmark date: May 12, 2026*  
+*Host: loki-russ*  
+*s3-ultra (localhost:9000, co-located on test host)*
diff --git a/docs/Flux_NP_ReadThreads_Scaling_Results.md b/docs/Flux_NP_ReadThreads_Scaling_Results.md
new file mode 100644
index 00000000..069856b4
--- /dev/null
+++ b/docs/Flux_NP_ReadThreads_Scaling_Results.md
@@ -0,0 +1,158 @@
+# Flux Training — NP × Read-Threads Scaling Study
+
+---
+
+> ## ⚠️ **NON-STANDARD `computation_time` — RESULTS ARE NOT REPRESENTATIVE OF REAL TRAINING**
+>
+> **All runs in this study used `computation_time = 0.05 s` — the simulated GPU compute sleep per step.**
+>
+> **The production default for Flux (flux_b200.yaml) is `computation_time = 1.35 s`.**
+>
+> **This 27× reduction was intentional — it stress-tests the storage stack by making I/O the
+> dominant cost — but it means AU numbers and samples/s figures cannot be directly compared
+> to a real Flux training job or to any benchmark run with default settings.**
+>
+> **Do not cite these AU numbers as "Flux training performance." They are I/O-stress results only.**
+
+---
+
+## Test Environment
+
+| Parameter | Value |
+|-----------|-------|
+| Host | 24 vCPU VM (with hyperthreading), 48 GB RAM |
+| Object storage | s3-ultra (`localhost:9000`, co-located on test host) |
+| Dataset | 500 Parquet files, ~595 MiB each, 6 row groups × 99 MiB |
+| Samples/file | 288 (batch_size=48) |
+| `computation_time` | 0.05 s (fixed — stress I/O, not compute) |
+| `coalesce_rgs` | 1 (99 MiB per GET) |
+| `prefetch_workers` | 2 |
+| Model config | flux\_b200.yaml |
+
+> **⚠️ Co-located test configuration.** The s3-ultra storage server and all benchmark processes run on the
+> **same** 24 vCPU / 48 GB RAM host, sharing CPU cores, memory, and the loopback network interface.
+> In a real deployment the storage target would be a dedicated remote system, and the CPU/memory
+> pressure that limits scaling here (particularly at NP ≥ 4) would not apply to the test processes.
+> The resource constraints described in this document are a property of this co-located setup, not
+> of the storage technology itself.
+
+## Results
+
+| NP | RT | AU% | samples/s | **samp/s/GPU** | I/O MiB/s | Wall (s) | Steps | Notes |
+|----|----|-----|-----------|---------------|-----------|----------|-------|-------|
+| 1 | 1 | 96.8 | 926 | **926** | 1,911 | 188 | 3000 | |
+| 1 | 2 | 96.7 | 925 | **925** | 1,911 | 174 | 3000 | |
+| 1 | 4 | 96.7 | 925 | **925** | 1,911 | 178 | 3000 | |
+| 1 | 8 | 96.7 | 925 | **925** | 1,911 | 188 | 3000 | |
+| 2 | 1 | 96.7 | 1,849 | **925** | 3,818 | 110 | 1500 | |
+| 2 | 2 | 96.7 | 1,850 | **925** | 3,820 | 95 | 1500 | |
+| 2 | 4 | 96.4 | 1,844 | **922** | 3,807 | 102 | 1500 | |
+| 2 | 8 | 96.7 | 1,849 | **925** | 3,818 | 111 | 1500 | |
+| 4 | 1 | 91.7 | 3,496 | **874** | 7,217 | 73 | 750 | |
+| 4 | 2 | 93.2 | 3,557 | **889** | 7,343 | 60 | 750 | |
+| 4 | 4 | 92.4 | 3,526 | **882** | 7,279 | 64 | 750 | |
+| 4 | 8 | 91.7 | 3,496 | **874** | 7,217 | 76 | 750 | CPU constrained (NP×RT=32) |
+| 8 | 1 | 59.9 | 4,477 | **560** | 9,244 | 55 | 375 | |
+| 8 | 2 | 57.2 | 4,316 | **540** | 8,910 | 53 | 375 | |
+| 8 | 4 | 61.0 | 4,532 | **567** | 9,356 | 58 | 375 | CPU constrained (NP×RT=32) |
+| 8 | 8 | — | — | **—** | — | — | — | OOM — worker killed (SIGKILL); NP×RT=64 |
+
+**NP** = number of MPI ranks (`--num-accelerators`).  
+**RT** = `reader.read_threads` (Torch DataLoader workers per rank).  
+**AU** = Accelerator Utilization — fraction of time the simulated GPU was computing rather than waiting for data.  
+**samp/s/GPU** = `samples/s ÷ NP` — per-GPU throughput; the key scaling efficiency metric. Perfect linear scaling would hold this constant as NP grows. The drop from ~925 at NP=1–2 to ~540–567 at NP=8 shows the storage system losing ~40% per-GPU efficiency at 8 ranks.
+
+## CPU Constraint Threshold
+
+On this 24 vCPU (hyperthreaded) host, the practical CPU budget **shared between the benchmark
+processes and the co-located s3-ultra server** is:
+
+> **NP × RT ≤ 8 — sufficient CPU; NP × RT > 8 — CPU constrained**
+
+All combinations with NP ≤ 4 and NP×RT ≤ 8 ran with high AU (91–97%) and consistent throughput; the one exception at the threshold is NP=8, RT=1 (NP×RT=8), where AU fell to 59.9%.
+The highest NP×RT products showed either degraded AU or outright failure:
+
+- **NP=4, RT=8 (NP×RT=32)** and **NP=8, RT=4 (NP×RT=32)**: AU dropped; more threads competing for 24 vCPUs than the host can efficiently schedule — and s3-ultra is consuming a share of those vCPUs on the same machine.
+- **NP=8, RT=8 (NP×RT=64)**: OOM. 8 MPI ranks × 8 DataLoader workers × 2 prefetch buffers × 99 MiB/GET ≈ 12+ GB I/O buffer pressure on a 48 GB host, combined with Python process overhead per rank and s3-ultra's own memory footprint — the kernel OOM killer fired.
+
+**In a real deployment** with s3-ultra on a dedicated remote server, all 24 vCPUs and 48 GB RAM
+would be available exclusively to the benchmark processes, and these specific constraints would
+not apply.
+
+## Key Observations
+
+1. **`read_threads` has negligible effect at NP=1 and NP=2.** AU is flat at ~96.7% across RT=1–8. With only 1–2 ranks and 0.05 s compute, a single reader thread can keep the pipeline fed. This is a storage benchmark and storage is not the bottleneck at low NP.
+
+2. **NP=4 is where storage starts to bite.** AU falls to 91.7–93.2%; throughput nearly doubles vs NP=2 (≈1.9×) but AU drops ~3.5–5 points. RT=2 is the sweet spot here (93.2% AU, 7,343 MiB/s).
+
+3. **NP=8 makes storage the clear bottleneck.** AU falls to 57–61% — ranks are spending ~40% of their time waiting for I/O. Peak observed throughput was ~9,356 MiB/s (NP=8, RT=4). RT=4 outperforms RT=1 and RT=2 here because more concurrent reader threads help overlap I/O with the pipeline.
+
+4. **The co-located setup is the limiting factor at high NP×RT, not the storage stack itself.**
+   s3-ultra and the benchmark processes share the same CPU and memory. On a system where s3-ultra
+   is deployed on a dedicated remote server, the full host resources would be available to the
+   benchmark, and the configurations with higher NP×RT products would be expected to perform
+   significantly better.
+
+## Impact of `computation_time` on AU and Throughput
+
+### Background: How AU is Computed
+
+$$AU = \frac{t_{compute}}{t_{compute} + t_{io\_wait}}$$
+
+The I/O wait per step is a property of the **storage system only** — it does not change when
+the sleep time changes. From the measured AU values at `computation_time = 0.05 s` we can
+back-calculate the actual I/O wait the storage imposed on each configuration:
+
+| NP | RT | Measured AU (0.05s) | Implied I/O wait/step |
+|----|----|--------------------|-----------------------|
+| 1 | 1–8 | ~96.8% | ~1.7 ms |
+| 2 | 1–8 | ~96.6% | ~1.7 ms |
+| 4 | 2 | 93.2% | ~3.7 ms |
+| 4 | 1,4,8 | ~91.7–92.4% | ~4–5 ms |
+| 8 | 4 | 61.0% | ~32 ms |
+| 8 | 1 | 59.9% | ~33 ms |
+| 8 | 2 | 57.2% | ~37 ms |
+
+### Projected AU at Higher Sleep Values
+
+Plugging those I/O wait numbers into the AU formula at `0.5 s` and `1.35 s` (the production
+default):
+
+| NP | RT | AU at 0.05 s (actual) | AU at 0.5 s (projected) | AU at 1.35 s (projected) |
+|----|----|-----------------------|--------------------------|--------------------------|
+| 1 | 1–8 | ~96.8% | ~99.7% | ~99.9% |
+| 2 | 1–8 | ~96.6% | ~99.7% | ~99.9% |
+| 4 | 2 | 93.2% | 99.3% | 99.7% |
+| 4 | 1,4,8 | 91.7–92.4% | 99.1–99.2% | 99.7% |
+| 8 | 4 | 61.0% | **94.0%** | **97.7%** |
+| 8 | 1 | 59.9% | **93.7%** | **97.6%** |
+| 8 | 2 | 57.2% | **93.0%** | **97.3%** |
+
+### What This Means
+
+1. **At 0.5 s sleep**, the storage bottleneck at NP=8 is still visible (AU ≈ 93–94%) but
+   much less alarming than the 57–61% we measured. All NP≤4 runs would look essentially
+   perfect (>99% AU), completely hiding any storage sensitivity.
+
+2. **At 1.35 s (production default)**, *every single configuration* — including NP=8 — would
+   report AU above 97%. The benchmark would appear to pass with flying colours and the storage
+   system would look like it is never the bottleneck, even though at NP=8 it is imposing
+   32–37 ms of wait per step.
+
+3. **The 0.05 s setting is the right choice for a storage benchmark.** It amplifies the
+   storage signal by a factor of ~27 relative to real training. The AU drop from ~97% (NP=1)
+   to 61% (NP=8) is the entire point — it reveals that the storage system has a real scaling
+   wall somewhere between NP=4 and NP=8 on this platform.
+
+4. **Per-step I/O cost is unaffected by the sleep value, but achieved throughput is not.** The
+   storage stack does the same work per step, yet samples/s and MiB/s fall as steps lengthen, so the
+   I/O MiB/s figures in the results table describe demand at 0.05 s, not at longer sleep settings.
+
+5. **To project to a real Flux B200 job** (1.35 s compute), the NP=8 results above suggest
+   AU ≈ 97–98%. That means storage would *just barely* keep up on real hardware at 8 GPUs —
+   which is still actionable: a faster or more parallel storage backend would meaningfully
+   improve training time at scale.
+
+## Date
+
+Run: 2026-05-11
diff --git a/docs/README.md b/docs/README.md
index 1f2b85dc..6751bfeb 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -30,6 +30,8 @@ mlp-storage hosts **four benchmark workloads**:
 | Set up object storage (S3 / MinIO / Azure / GCS) | [Object_Storage.md](Object_Storage.md) |
 | Install and configure an object storage library | [Object_Storage_Library_Setup.md](Object_Storage_Library_Setup.md) |
 | Compare object storage libraries (s3dlio, minio, s3torchconnector) | [STORAGE_LIBRARIES.md](STORAGE_LIBRARIES.md) |
+| Understand map-style vs. iterable DataLoader tradeoffs for S3 | [DATALOADER_ARCHITECTURE.md](DATALOADER_ARCHITECTURE.md) |
+| Benchmark NVMe with O_DIRECT (bypass page cache) | [DATALOADER_ARCHITECTURE.md — O_DIRECT section](DATALOADER_ARCHITECTURE.md#o_direct-local-storage-two-independent-paths) |
 | Understand AIStore gaps, reader/checkpoint issues, rationalization options | [dlio_benchmark/docs/AIStore_Analysis.md](../dlio_benchmark/docs/AIStore_Analysis.md) |
 | Test streaming checkpointing | [Streaming-Chkpt-Guide.md](Streaming-Chkpt-Guide.md) |
 | Configure multi-endpoint / load-balanced object storage | [MULTI_ENDPOINT_GUIDE.md](MULTI_ENDPOINT_GUIDE.md) |
@@ -205,6 +207,28 @@ Parquet format support via two new DLIO reader classes: `ParquetReader`
 `ParquetReaderS3Iterable` (S3 object storage, byte-range GETs, all three
 object storage libraries). Includes YAML config examples and unit test commands.
 
+#### [DATALOADER_ARCHITECTURE.md](DATALOADER_ARCHITECTURE.md)
+
+Architecture and tradeoff analysis for **map-style vs. iterable-style data loaders** on both
+object storage and local NVMe. Two major topics:
+
+**Part 1 — Map-style vs. iterable on S3** (implemented via `TorchIterableDatasetSimple`):
+Explains why the conventional "iterable is better for large datasets" advice originates from
+spinning-disk seek patterns and does *not* transfer directly to S3. Covers the real argument
+for iterable on object storage (pipeline depth: 64 in-flight GETs per worker, up to 256 total),
+the tradeoffs (shuffling, worker partitioning, prefetch memory), and current implementation
+status for NPZ/NPY/JPEG/PNG workloads.
+
+**Part 2 — O_DIRECT on local NVMe** ([two independent paths](DATALOADER_ARCHITECTURE.md#o_direct-local-storage-two-independent-paths)):
+Why O_DIRECT is required for accurate NVMe benchmarking (page cache bypass). Detailed comparison
+of both available O_DIRECT mechanisms:
+- `odirect: true` — legacy Python `os.open + os.readv`, map-style, 1 read/worker (baseline)
+- `storage_library: direct` — Rust/Tokio `libc::O_DIRECT`, iterable-style, 64 reads/worker
+
+Includes a full 12-property comparison table and guidance on when to use each path (and why
+keeping both enables a direct comparison isolating I/O concurrency depth and GIL contention).
+**Essential reading before any DataLoader refactor or NVMe benchmarking run.**
+
 ---
 
 ### Extending the Benchmark Suite
diff --git a/docs/RetinaNet_NP_Scaling_Results.md b/docs/RetinaNet_NP_Scaling_Results.md
new file mode 100644
index 00000000..bf91bcd9
--- /dev/null
+++ b/docs/RetinaNet_NP_Scaling_Results.md
@@ -0,0 +1,114 @@
+# RetinaNet NP Scaling Results
+
+**Sweep date**: 2026-05-12 17:39  
+**dlio_benchmark commit**: `fc92d7f` (feat/parquet-dgen-streaming)  
+**DataLoader path**: `TorchIterableDatasetSimple` + `_s3_stream_next()` pipelined chunking
+
+---
+
+## Test Environment
+
+| Parameter | Value |
+|-----------|-------|
+| Host | 24 vCPU (Cascade Lake, no SHA-NI), 48 GB RAM |
+| Object storage | s3-ultra (`http://127.0.0.1:9000`, co-located on test host) |
+| Bucket / path | `mlp-retinanet/data/retinanet` |
+| Dataset | 50,000 JPEG files × 1 sample/file (≈ 15,399 MiB / ~15 GiB) |
+| Record length | 322,957 bytes (~315 KiB / file) |
+| Batch size | 24 |
+| Read threads | 8 |
+| `computation_time` | 0.04755 s (B200) |
+| DataLoader | `TorchIterableDatasetSimple` — pipelined chunked GETs via `_s3_stream_next()` |
+| `prefetch_window` | 256 (default) — chunk N+1 fetched in background while yielding chunk N |
+| Epochs | 8 |
+| AU target | ≥ 85% |
+| Model config | `retinanet_b200.yaml` |
+| MPI invocation | `mpirun -n NP -host 127.0.0.1:NP` |
+
+> **⚠️ Co-located test configuration.** The s3-ultra storage server and all benchmark
+> processes run on the **same** 24 vCPU / 48 GB RAM host, sharing CPU cores, memory,
+> and the loopback network interface. In a real deployment storage would be a dedicated
+> remote system; the CPU/memory pressure that limits scaling here would not apply.
+>
+> **AU (Accelerator Utilization)** — fraction of wall time the simulated accelerator was
+> computing rather than waiting for I/O. AU ≥ 85% is the MLPerf Storage target for
+> retinanet.
+
+---
+
+## NP Scaling Results
+
+| NP | AU% (mean ± σ) | Samples/s (mean ± σ) | I/O MiB/s (mean ± σ) | Wall (s) | AU ≥ 85%? |
+|----|----------------|----------------------|----------------------|----------|-----------|
+| 1 | 96.48 ± 0.08 | 485.0 ± 0.4 | 149.4 ± 0.1 | 864 | ✅ PASS |
+| 2 | 95.88 ± 0.07 | 964.1 ± 0.8 | 296.9 ± 0.2 | 458 | ✅ PASS |
+| 4 | 95.43 ± 0.20 | 1918.9 ± 4.5 | 591.0 ± 1.4 | 252 | ✅ PASS |
+
+### Per-epoch AU% breakdown
+
+| Epoch | NP=1 | NP=2 | NP=4 |
+|-------|------|------|------|
+| 1 | 96.42 | 95.83 | 94.93 |
+| 2 | 96.41 | 96.00 | 95.65 |
+| 3 | 96.56 | 95.94 | 95.49 |
+| 4 | 96.60 | 95.84 | 95.54 |
+| 5 | 96.51 | 95.84 | 95.40 |
+| 6 | 96.53 | 95.94 | 95.45 |
+| 7 | 96.38 | 95.89 | 95.44 |
+| 8 | 96.41 | 95.79 | 95.53 |
+
+AU is extremely stable across epochs (σ ≤ 0.2% at all NP values), confirming the
+pipelined I/O path is not accumulating latency or drift between epochs.
+
+---
+
+## Scaling Analysis
+
+### Throughput Scaling Efficiency
+
+| Transition | Samples/s | Ideal | Efficiency |
+|------------|-----------|-------|------------|
+| NP=1 → NP=2 | 485.0 → 964.1 | 970.0 | **99.4%** |
+| NP=1 → NP=4 | 485.0 → 1918.9 | 1940.0 | **98.9%** |
+
+Near-perfect linear scaling through NP=4. The small efficiency loss at NP=4 is
+consistent with co-located SHA-256 signing load (no SHA-NI on this Cascade Lake
+host) competing for CPU cores with the benchmark processes.
+
+### I/O Throughput per NP
+
+| NP | I/O MiB/s | Per-accelerator MiB/s |
+|----|-----------|----------------------|
+| 1 | 149.4 | 149.4 |
+| 2 | 296.9 | 148.5 |
+| 4 | 591.0 | 147.8 |
+
+Per-accelerator I/O throughput is flat (within 1.1%) across all NP values —
+the storage backend is not the bottleneck, and adding accelerators does not
+degrade per-accelerator I/O bandwidth.
+
+### DataLoader Architecture Note
+
+RetinaNet (315 KiB × 50,000 files) is the most demanding small-object workload
+in the suite. Key design decisions that enable the above results:
+
+- **`TorchIterableDatasetSimple`** — file-sharded across workers, not map-style
+  `__getitem__`, eliminating per-sample Python dispatch overhead.
+- **`_s3_stream_next()` pipelined chunking** — chunk N+1 is submitted to a
+  background thread (via `_PREFETCH_POOL`) the instant the yield loop for chunk
+  N begins. Since s3dlio releases the GIL during Rust async I/O, fetch and
+  Python compute overlap truly concurrently. Peak concurrent GETs per worker:
+  `min(prefetch_window, 64) = 64`.
+- **Worker stagger** — worker `k` delays `k × computation_time` seconds before
+  its first chunk to spread startup I/O across one GPU-cycle window.
+
+---
+
+## Raw Results Location
+
+```
+results/retinanet_np_sweep/20260512_173956/
+├── NP1/training/retinanet/run/20260512_173956/summary.json
+├── NP2/training/retinanet/run/20260512_175421/summary.json
+└── NP4/training/retinanet/run/20260512_180159/summary.json
+```
diff --git a/docs/UNet3D_NP_Scaling_Results.md b/docs/UNet3D_NP_Scaling_Results.md
new file mode 100644
index 00000000..f8ec5b30
--- /dev/null
+++ b/docs/UNet3D_NP_Scaling_Results.md
@@ -0,0 +1,383 @@
+# UNet3D Training — NP Scaling Study
+
+**Date**: May 12, 2026  
+**Host**: loki-russ  
+**Storage**: s3-ultra (`http://127.0.0.1:9000`, co-located)  
+**Sweep ID**: `20260512_141130`
+
+---
+
+## Test Environment
+
+| Parameter | Value |
+|-----------|-------|
+| Host CPU | Intel Xeon Platinum 8280L @ 2.70 GHz, 28 vCPUs visible |
+| Host RAM | 47 GB |
+| Object storage | s3-ultra, co-located loopback (`http://127.0.0.1:9000`) |
+| Bucket / path | `s3://mlp-flux/data/unet3d/train/` |
+| Storage library | `s3dlio` |
+| `decode_mode` | `none` |
+| Batch size | 7 |
+| Read threads | 4 |
+| `computation_time` | 0.162 s (B200 = H100 0.323 s ÷ 2) |
+| Epochs | 5 |
+| AU target | ≥ 90% |
+| Model config | `unet3d_b200.yaml` |
+| MPI invocation | `mpirun -n NP -host 127.0.0.1:NP` |
+
+> **⚠️ Co-located test configuration.** The s3-ultra storage server and all benchmark
+> processes run on the **same** host, sharing CPU cores, memory, and the loopback network
+> interface. In a real deployment storage is a dedicated remote system; the CPU/memory
+> pressure that limits AU and throughput scaling here would not apply.
+>
+> **AU (Accelerator Utilization)** — fraction of wall time the simulated accelerator
+> spent computing rather than stalled waiting for I/O. AU ≥ 90% is the MLPerf Storage
+> pass threshold for UNet3D (compared to ≥ 70% for DLRM).
+>
+> **Note on DLIO I/O tracking.** `train_io_mean_MB_per_second` was near-zero for all
+> runs in the original sweep (May 12, 2026). Root cause: `config.py` unconditionally
+> executed `record_length = np.prod(record_dims) × element_bytes`. Because UNet3D sets
+> no `record_dims`, `np.prod([]) = 1.0`, silently overwriting the user-supplied
+> `record_length_bytes = 146,600,628` with `1 byte`. Fixed in `dlio_benchmark/utils/config.py`
+> by guarding the assignment with `if self.record_dims:`. From the next run onward,
+> `train_io_mean_MB_per_second` will report the correct value using `record_length_bytes`.
+> The **Derived IO** column below uses the original formula and remains accurate regardless.
+
+---
+
+## Dataset
+
+| Parameter | Value |
+|-----------|-------|
+| Format | NPZ |
+| Files | 7,200 |
+| Samples per file | 1 |
+| Avg file size | 146,600,628 bytes (139.8 MiB) |
+| Std dev file size | 68,341,808 bytes (65.2 MiB) |
+| Resize target | 2,097,152 bytes (2 MiB) |
+| Total dataset size | ≈ 983 GiB |
+
+### Dataset Context — UNet3D vs Other MLPerf Storage Workloads
+
+| Model | Files | Avg file size | Total | Format | Samples/file |
+|-------|-------|---------------|-------|--------|--------------|
+| DLRM  | 200   | 761 B × 1,536,000 samples | ~223 GiB | binary | 1,536,000 |
+| Flux  | 500   | ~50 MiB | ~25 GiB | Parquet | many |
+| **UNet3D** | **7,200** | **~140 MiB** | **~983 GiB** | **NPZ** | **1** |
+
+UNet3D is the most I/O-intensive workload: large random objects, 1 sample/file (no
+cross-sample batching), and a 983 GiB corpus to traverse each epoch. Compare to DLRM,
+where each file of the ~223 GiB dataset contains ~1.5 M samples that are read sequentially in one pass.
+
+### Data Generation Performance
+
+7,200 NPZ files generated using `gen_unet3d_npz.sh` (NP=4 datagen workers) in **10m 02s (602 s)**.
+
+| Metric | Value |
+|--------|-------|
+| Generator | `s3dlio.generate_npz_bytes()` — pure Rust, hardware CRC32, zero Python-side copies |
+| Files written | 7,200 |
+| Total data written | ~1,055 GB |
+| Wall time | 602 s (10m 02s) |
+| Write throughput | **1,753 MB/s (1.75 GB/s)** |
+
+---
+
+## NP Scaling Results
+
+> **Derived IO** = `train_throughput_mean_samples_per_second × 146,600,628 bytes ÷ 1,000,000`
+
+| NP | AU% (mean) | AU std | Samples/s (mean) | Derived IO | Wall time | AU ≥ 90%? |
+|----|-----------|--------|-----------------|-------------------------|-----------|-----------|
+| 1  | 53.73%    | ±1.86% | 23.18           | 3,398 MB/s (3.40 GB/s)  | 1584 s (26m 24s) | ❌ FAIL |
+| 2  | 42.95%    | ±0.38% | 37.03           | 5,429 MB/s (5.43 GB/s)  | 1003 s (16m 43s) | ❌ FAIL |
+| 4  | 28.24%    | ±0.10% | 48.55           | 7,116 MB/s (7.12 GB/s)  | 777 s  (12m 57s) | ❌ FAIL |
+
+---
+
+## Per-Epoch Detail
+
+### NP=1
+
+| Epoch | AU%   | Samples/s | Derived IO (MB/s) | Duration (s) |
+|-------|-------|-----------|-------------------|--------------|
+| 1     | 51.38% | 22.16    | 3,249             | 339.4        |
+| 2     | 51.94% | 22.41    | 3,285             | 322.1        |
+| 3     | 53.74% | 23.18    | 3,397             | 311.4        |
+| 4     | 55.81% | 24.08    | 3,529             | 299.8        |
+| 5     | 55.79% | 24.07    | 3,527             | 300.0        |
+
+_Warm-up effect visible: AU and throughput rise ~8% from E1 to E4–5. The primary
+mechanism is the **s3dlio `ObjectSizeCache`**: on epoch 1, every object requires a
+`HeadObject` call to determine size before issuing concurrent byte-range GETs. Those
+results are stored in a process-wide cache (`GLOBAL_SIZE_CACHE`, 1-hour TTL). From
+epoch 2 onward the cache is fully warm and HEAD calls are skipped entirely, reducing
+latency per object and freeing connection slots for data GETs._
+
+### NP=2
+
+| Epoch | AU%    | Samples/s | Derived IO (MB/s) | Duration (s) |
+|-------|--------|-----------|-------------------|--------------|
+| 1     | 42.22% | 36.40    | 5,334             | 212.2        |
+| 2     | 42.98% | 37.06    | 5,431             | 195.2        |
+| 3     | 43.13% | 37.19    | 5,450             | 194.5        |
+| 4     | 43.21% | 37.25    | 5,458             | 194.2        |
+| 5     | 43.24% | 37.27    | 5,462             | 194.1        |
+
+_Very stable after E1 (std dev 0.38%). E1 overhead (+~18 s): 2 workers × 7,200 objects
+= ~7,200 concurrent `HeadObject` calls to populate the `ObjectSizeCache`. Epochs 2–5
+skip all HEAD calls and settle tightly at ~194 s._
+
+### NP=4
+
+| Epoch | AU%    | Samples/s | Derived IO (MB/s) | Duration (s) |
+|-------|--------|-----------|-------------------|--------------|
+| 1     | 28.08% | 48.27    | 7,076             | 164.5        |
+| 2     | 28.19% | 48.50    | 7,109             | 150.0        |
+| 3     | 28.22% | 48.52    | 7,112             | 150.0        |
+| 4     | 28.33% | 48.71    | 7,139             | 149.4        |
+| 5     | 28.36% | 48.76    | 7,146             | 149.2        |
+
+_Extremely stable (std dev 0.10%). E1 overhead (+~15 s): 4 workers × 7,200 objects =
+~14,400+ `HeadObject` calls in parallel, all resolved before epoch 2. The `ObjectSizeCache`
+warms faster at NP=4 (more parallel HEAD calls) but the burst also creates more transient
+loopback pressure, explaining the slightly larger relative E1 gap (+10% vs +9% at NP=2)._
+
+---
+
+## Scaling Analysis
+
+### Aggregate Throughput Scaling
+
+| NP | Samples/s | Speedup vs NP=1 | Ideal | Efficiency |
+|----|-----------|-----------------|-------|------------|
+| 1  | 23.18     | 1.00×           | 1.00× | 100%       |
+| 2  | 37.03     | 1.597×          | 2.00× | **79.9%**  |
+| 4  | 48.55     | 2.094×          | 4.00× | **52.4%**  |
+
+### Derived I/O Throughput Scaling
+
+| NP | Derived IO   | Speedup vs NP=1 |
+|----|-------------|-----------------|
+| 1  | 3,398 MB/s  | 1.00×           |
+| 2  | 5,429 MB/s  | 1.597×          |
+| 4  | 7,116 MB/s  | 2.094×          |
+
+I/O throughput scaling is identical to sample throughput scaling (expected: fixed object
+size, 1 sample/file).
+
+### Per-Accelerator (per-rank) Throughput
+
+| NP | Samples/s per rank | Derived IO per rank (MB/s) |
+|----|-------------------|---------------------------|
+| 1  | 23.18             | 3,398                     |
+| 2  | 18.52             | 2,714                     |
+| 4  | 12.14             | 1,779                     |
+
+Per-rank throughput degrades monotonically as NP grows — each new worker competes
+with both the other workers and the co-located s3-ultra server for CPU and loopback
+bandwidth.
+
+### Warm-Up Epoch Overhead
+
+| NP | E1 duration (s) | Steady-state (s) | Warm-up overhead |
+|----|----------------|-----------------|-----------------|
+| 1  | 339.4          | ~300            | +39 s (+13%)    |
+| 2  | 212.2          | ~194            | +18 s (+9%)     |
+| 4  | 164.5          | ~150            | +15 s (+10%)    |
+
+The E1 penalty is caused by the **s3dlio `ObjectSizeCache`** being cold. The cache is
+implemented in `s3dlio/src/object_size_cache.rs` as an `Arc<RwLock<HashMap<String, CachedSize>>>`
+with a **1-hour TTL** (`GLOBAL_SIZE_CACHE` in `s3_utils.rs`). On first access to each
+object, `get_object_uri_optimized_async()` issues a `HeadObject` call to learn the
+object size, then stores it. From epoch 2 onward, every lookup is a cache hit and the
+HEAD call is skipped entirely — the benchmark only issues `GetObject` (with byte-range
+parts for large objects). This is consistent with observing a burst of HEAD operations
+at the s3-ultra server during epoch 1 that stops completely at the start of epoch 2.
+
+Absolute overhead decreases with NP (all ranks' 7,200 HEAD calls run in parallel,
+so they resolve faster), but relative overhead stays roughly constant at 9–13%.
+
+---
+
+## Key Findings
+
+1. **All NP configurations fail the 90% AU target.** This is expected in a co-located
+   setup: s3-ultra and all benchmark processes share the same CPU cores and loopback
+   interface. The 90% UNet3D threshold requires storage to deliver data fast enough
+   that the simulated accelerator is stalled for <10% of wall time — not achievable
+   when storage competes for the same CPU.
+
+2. **AU degrades sharply with NP.** 53.7% → 43.0% → 28.2% as NP doubles. Each doubling of NP
+   doubles the aggregate per-step I/O demand without changing s3-ultra's available CPU budget.
+   This is purely a co-located resource contention effect, not a storage technology
+   limitation.
+
+3. **Absolute I/O throughput scales well.** 3.40 → 5.43 → 7.12 GB/s (2.09× for 4×
+   workers). The storage server is not bandwidth-saturated; it is CPU-throttled by
+   competition. On a dedicated remote system the ceiling would be substantially higher.
+
+4. **Scaling efficiency drops from 80% (NP=2) to 52% (NP=4).** The efficiency drop
+   between NP=2 and NP=4 is larger than between NP=1 and NP=2, consistent with
+   progressive CPU saturation of the co-located s3-ultra process.
+
+5. **s3dlio `ObjectSizeCache` cold-start dominates E1.** The first epoch is 9–13%
+   slower because every one of the 7,200 objects requires a `HeadObject` call to learn
+   its size before the library can calculate byte-range GET boundaries. Results are
+   stored in a process-wide 1-hour-TTL cache (`GLOBAL_SIZE_CACHE`). From epoch 2 onward
+   the cache is fully warm: zero HEAD calls are issued, and the server shows no HEAD
+   traffic. This is directly observable by watching request logs on s3-ultra: a burst of
+   HEAD requests fires during E1 and then stops completely.
+
+   This effect is smaller in DLRM (small 761-byte objects, no multi-part range GETs
+   needed) and would shrink further in production where the s3dlio process persists
+   across runs (cache pre-warmed from a previous job).
+
+6. **NP=4 is the practical limit on this host.** At NP=4, all 4 DLIO workers plus
+   s3-ultra are sharing 28 vCPUs. NP=8 would likely OOM or saturate the loopback
+   listener (as observed with DLRM NP=8 on the same host).
+
+7. **On dedicated storage, NP=1 would likely pass.** A 3.40 GB/s single-rank read
+   rate is a strong baseline. With s3-ultra on a separate host (full CPU available for
+   both storage server and benchmark), AU at NP=1 would be expected to exceed 90%.
+
+---
+
+## Raw Results
+
+Full per-run output under:
+```
+results/unet3d_np_sweep/20260512_141130/
+    NP1/training/unet3d/run/20260512_141131/
+    NP2/training/unet3d/run/20260512_143754/
+    NP4/training/unet3d/run/20260512_145438/
+```
+Each directory contains `summary.json`, `*_per_epoch_stats.json`, `dlio.log`,
+`training_run.stdout.log`, and DLIO config snapshots.
+
+---
+
+## Running the Sweep
+
+```bash
+cd /home/eval/Documents/Code/mlp-storage
+
+# Full NP=1,2,4 sweep (auto-generates TSV + Markdown results):
+STORAGE_ROOT=mlp-flux bash tests/object-store/sweep_unet3d_np.sh 2>&1 | tee sweep_unet3d_$(date +%Y%m%d_%H%M%S).log
+
+# Quick NP=1 smoke test:
+STORAGE_ROOT=mlp-flux bash tests/object-store/test_unet3d.sh
+
+# Single run at a specific NP:
+STORAGE_ROOT=mlp-flux NP=2 bash tests/object-store/test_unet3d.sh
+```
+
+> Note: data currently lives in `s3://mlp-flux/data/unet3d/train/` (generated May 12, 2026).
+> Pass `STORAGE_ROOT=mlp-unet3d` once data is migrated to the canonical bucket.
+
+---
+
+*Benchmark date: May 12, 2026*  
+*Host: loki-russ*  
+*s3-ultra (localhost:9000, co-located)*
+
+
+---
+
+## Test Environment
+
+| Parameter | Value |
+|-----------|-------|
+| Host | 24 vCPU VM (with hyperthreading), 48 GB RAM |
+| Object storage | s3-ultra (`http://127.0.0.1:9000`, co-located on test host) |
+| Bucket / path | `mlp-unet3d / data/unet3d` |
+| Dataset | 7,200 NPZ files × 1 sample/file (≈ 984 GiB) |
+| Record length | 146,600,628 bytes avg (σ = 68,341,808, resize = 2,097,152) |
+| Batch size | 7 |
+| Read threads | 4 |
+| `computation_time` | 0.162 s  (B200 = H100 0.323 s ÷ 2) |
+| `decode_mode` | `none` |
+| Epochs | 5 |
+| AU target | ≥ 90% |
+| Model config | `unet3d_b200.yaml` |
+| MPI invocation | `mpirun -n NP -host 127.0.0.1:NP` |
+
+> **⚠️ Co-located test configuration.** The s3-ultra storage server and all benchmark
+> processes run on the **same** 24 vCPU / 48 GB RAM host, sharing CPU cores, memory,
+> and the loopback network interface. In a real deployment the storage target would be a
+> dedicated remote system, and the CPU/memory pressure that limits scaling here
+> (particularly at NP ≥ 4) would not apply to the test processes. The resource constraints
+> described in this document are a property of this co-located setup, not of the storage
+> technology itself.
+
+**AU (Accelerator Utilization)** — fraction of wall time the simulated GPU was computing
+rather than waiting for I/O. AU ≥ 90% is the target threshold for a "pass" on unet3d.
+
+---
+
+## NP Scaling Results
+
+| NP | AU% | Samples/s | I/O MiB/s | Wall time (s) | AU ≥ 90%? |
+|----|-----|-----------|-----------|---------------|-----------|
+| 1 | 53.73 | 23.18 | 3,241 | 1584 | ❌ FAIL |
+| 2 | 42.95 | 37.03 | 5,177 | 1003 | ❌ FAIL |
+| 4 | 28.24 | 48.55 | 6,788 | 777 | ❌ FAIL |
+
+---
+
+## Scaling Analysis
+
+*(Sweep `20260512_141130` has completed; the full analysis is in the "Scaling Analysis" section earlier in this document.)*
+
+### Throughput Scaling Efficiency
+
+| Transition | Samples/s | Ideal | Efficiency |
+|------------|-----------|-------|------------|
+| NP=1 → NP=2 | 23.18 → 37.03 | 46.36 | **79.9%** |
+| NP=1 → NP=4 | 23.18 → 48.55 | 92.72 | **52.4%** |
+
+### Key Observations
+
+*(Sweep complete — see the "Key Findings" section earlier in this document.)*
+
+---
+
+## Dataset Notes
+
+The dataset was generated on **May 12, 2026** using `gen_unet3d_npz.sh` (NP=4, 10m 02s wall time):
+- **Generator**: `s3dlio.generate_npz_bytes()` — pure Rust, hardware CRC32, zero Python-side copies
+- **Format**: NPZ (structured array, `float32`, shape varies per record)
+- **Avg file size**: ≈ 140 MiB  (σ ≈ 65 MiB)
+- **Total dataset**: 7,200 files ≈ 984 GiB
+
+### UNet3D vs Other Models
+
+| Model | Files | Avg file size | Total | Format |
+|-------|-------|---------------|-------|--------|
+| DLRM  | 200 | 761 B × 1,536,000 samples | ~223 GiB | binary |
+| Flux  | 500 | ~50 MiB | ~25 GiB | Parquet |
+| **UNet3D** | **7,200** | **~140 MiB** | **~984 GiB** | **NPZ** |
+
+UNet3D is the most I/O-intensive workload tested: large random files, 1 sample/file (no
+batching across samples), and a very large total dataset requiring sustained sequential reads
+across the full 984 GiB corpus each epoch.
+
+---
+
+## Running the Sweep
+
+```bash
+cd /home/eval/Documents/Code/mlp-storage
+
+# Full NP=1,2,4 sweep (recommended — auto-generates results doc):
+bash tests/object-store/sweep_unet3d_np.sh 2>&1 | tee sweep_unet3d_$(date +%Y%m%d_%H%M%S).log
+
+# Quick single NP=1 smoke test:
+bash tests/object-store/test_unet3d.sh
+
+# Single run at NP=2:
+NP=2 bash tests/object-store/test_unet3d.sh
+```
+
+The sweep writes per-run results to `results/unet3d_np_sweep/<timestamp>/NP{1,2,4}/`
+and auto-generates a populated Markdown doc alongside the TSV summary.
diff --git a/mlpstorage_py/benchmarks/base.py b/mlpstorage_py/benchmarks/base.py
index 17b983bf..22feb52d 100755
--- a/mlpstorage_py/benchmarks/base.py
+++ b/mlpstorage_py/benchmarks/base.py
@@ -294,6 +294,27 @@ def _execute_command(
 
             return stdout, stderr, return_code
 
+    @staticmethod
+    def _apply_dotted_overrides(params, overrides):
+        """Return a deep copy of ``params`` with dotted-key overrides merged in.
+
+        Fixes #365: overrides written to params_dict after __init__ (e.g. by
+        add_checkpoint_params()) never reach the frozen combined_params. Any
+        non-mapping intermediate (e.g. None from an empty YAML key) becomes {}.
+        """
+        import copy
+        from collections.abc import MutableMapping
+        out = {} if params is None else copy.deepcopy(params)
+        for dotted, value in (overrides or {}).items():
+            *parents, leaf = dotted.split('.')
+            cur = out
+            for p in parents:
+                if not isinstance(cur.get(p), MutableMapping):
+                    cur[p] = {}
+                cur = cur[p]
+            cur[leaf] = value
+        return out
+
     @property
     def metadata(self) -> Dict[str, Any]:
         """Generate metadata dict capturing the benchmark run configuration.
@@ -322,9 +343,12 @@ def metadata(self) -> Dict[str, Any]:
             'result_dir': self.run_result_output,
         }
 
-        # Parameters - prefer combined_params if available (includes YAML + overrides)
+        # Parameters - YAML defaults with CLI overrides folded in (fixes #365).
+        # combined_params alone omits overrides added after __init__ (e.g.
+        # checkpoint.num_checkpoints_*), causing split-phase runs to double-count.
         if hasattr(self, 'combined_params'):
-            metadata['parameters'] = self.combined_params
+            metadata['parameters'] = self._apply_dotted_overrides(
+                self.combined_params, getattr(self, 'params_dict', {}))
         else:
             metadata['parameters'] = {}
 
@@ -442,15 +466,22 @@ def _collect_cluster_information(self) -> 'ClusterInformation':
             mpi_bin = getattr(self.args, 'mpi_bin', 'mpirun')
             allow_run_as_root = getattr(self.args, 'allow_run_as_root', False)
             timeout = getattr(self.args, 'cluster_collection_timeout', 60)
+            ssh_username = getattr(self.args, 'ssh_username', None)
+            shared_staging_dir = getattr(self.args, 'shared_staging_dir', None)
 
-            # Collect cluster info
+            # Collect cluster info. ``results_dir`` is required by
+            # ``collect_cluster_info`` for staging the helper script under
+            # ``<results_dir>/collector-staging/`` (see issue #363).
             collected_data = collect_cluster_info(
                 hosts=self.args.hosts,
                 mpi_bin=mpi_bin,
                 logger=self.logger,
+                results_dir=self.run_result_output,
                 allow_run_as_root=allow_run_as_root,
                 timeout_seconds=timeout,
-                fallback_to_local=True
+                fallback_to_local=True,
+                shared_staging_dir=shared_staging_dir,
+                ssh_username=ssh_username,
             )
 
             # Create ClusterInformation from collected data
diff --git a/mlpstorage_py/benchmarks/dlio.py b/mlpstorage_py/benchmarks/dlio.py
index 0ed6c674..137eb82e 100755
--- a/mlpstorage_py/benchmarks/dlio.py
+++ b/mlpstorage_py/benchmarks/dlio.py
@@ -267,7 +267,8 @@ def generate_dlio_command(self):
             self.logger.debug(f'Generating MPI Command with binary "{self.args.mpi_bin}"')
             mpi_prefix = generate_mpi_prefix_cmd(self.args.mpi_bin, self.args.hosts, self.args.num_processes,
                                                  self.args.oversubscribe, self.args.allow_run_as_root,
-                                                 self.args.mpi_params, self.logger)
+                                                 self.args.mpi_params, self.logger,
+                                                 mpi_btl=getattr(self.args, 'mpi_btl', 'auto'))
             cmd = f"{mpi_prefix} {cmd}"
 
         return cmd
diff --git a/mlpstorage_py/cli/common_args.py b/mlpstorage_py/cli/common_args.py
index 398caf74..f04974e8 100755
--- a/mlpstorage_py/cli/common_args.py
+++ b/mlpstorage_py/cli/common_args.py
@@ -291,6 +291,18 @@ def add_mpi_arguments(parser):
         '--allow-run-as-root',
         action="store_true"
     )
+    mpi_options.add_argument(
+        '--mpi-btl',
+        choices=['auto', 'vader', 'tcp'],
+        default='auto',  # 'auto' adds no "--mca btl" flag; OpenMPI picks the transport itself
+        help=(
+            "MPI Byte Transfer Layer for single-host runs. "
+            "'auto' lets OpenMPI select automatically (default; works on most systems). "
+            "'vader' forces POSIX shared-memory transport (fast; may fail in containers or as root). "
+            "'tcp' forces TCP loopback transport (universally compatible; recommended for containers "
+            "and root environments). Has no effect on multi-host runs."
+        )
+    )
     mpi_options.add_argument(
         '--mpi-params',
         nargs="+",
diff --git a/mlpstorage_py/cli_parser.py b/mlpstorage_py/cli_parser.py
index af32669f..cafecb98 100755
--- a/mlpstorage_py/cli_parser.py
+++ b/mlpstorage_py/cli_parser.py
@@ -115,13 +115,20 @@ def parse_arguments():
     if hasattr(parsed_args, 'config_file') and parsed_args.config_file:
         parsed_args = apply_yaml_config_overrides(parsed_args)
 
-    # Consolidate the data access protocol into a single field
-    if parsed_args.file:
-        parsed_args.data_access_protocol = "file"
-    else:
-        parsed_args.data_access_protocol = parsed_args.object
-    del parsed_args.file
-    del parsed_args.object
+    # Consolidate the data access protocol into a single field.
+    # The --file / --object flags are only defined on benchmark subcommands
+    # that call add_storage_type_arguments() (training, checkpointing,
+    # vectordb, kvcache). Other subcommands (reports, history, lockfile)
+    # do not define them, so guard the consolidation on attribute presence.
+    if hasattr(parsed_args, "file") or hasattr(parsed_args, "object"):
+        if getattr(parsed_args, "file", False):
+            parsed_args.data_access_protocol = "file"
+        else:
+            parsed_args.data_access_protocol = getattr(parsed_args, "object", None)
+        # Clean up the raw flags so downstream code uses data_access_protocol.
+        for _attr in ("file", "object"):
+            if hasattr(parsed_args, _attr):
+                delattr(parsed_args, _attr)
 
     """
     print(f"Arguments found: {parsed_args}")
diff --git a/mlpstorage_py/rules/models.py b/mlpstorage_py/rules/models.py
index 50438da7..85addf44 100755
--- a/mlpstorage_py/rules/models.py
+++ b/mlpstorage_py/rules/models.py
@@ -745,7 +745,11 @@ def parse(self, result_dir: str, metadata: Optional[Dict] = None) -> BenchmarkRu
                     override_parameters[p[len('++workload.'):]] = v
 
         system_info = ClusterInformation.from_dlio_summary_json(summary, self.logger)
-
+        # Fallback to metadata cluster_information when DLIO summary lacks system info
+        if system_info is None and metadata:
+            ci_data = metadata.get('cluster_information')
+            if ci_data:
+                system_info = ClusterInformation.from_dict(ci_data, self.logger)
         return BenchmarkRunData(
             benchmark_type=benchmark_type,
             model=model,
@@ -890,7 +894,8 @@ def __init__(self, data: BenchmarkRunData = None, logger=None,
             self._data = BenchmarkInstanceExtractor.extract(benchmark_instance)
         elif benchmark_result:
             parser = DLIOResultParser(logger=logger)
-            self._data = parser.parse(benchmark_result.benchmark_result_root_dir)
+            metadata = getattr(benchmark_result, 'metadata', None)
+            self._data = parser.parse(benchmark_result.benchmark_result_root_dir, metadata=metadata)
 
         self._run_id = RunID(
             program=self._data.benchmark_type.name if self._data.benchmark_type else "",
diff --git a/mlpstorage_py/tests/test_benchmarks.py b/mlpstorage_py/tests/test_benchmarks.py
index e11e92ce..92a7beb3 100755
--- a/mlpstorage_py/tests/test_benchmarks.py
+++ b/mlpstorage_py/tests/test_benchmarks.py
@@ -218,20 +218,30 @@ def _run(self):
             benchmark = TestBenchmark.__new__(TestBenchmark)
             benchmark.args = base_args
             benchmark.logger = mock_logger
+            # ``run_result_output`` is normally set in ``Benchmark.__init__``
+            # via ``generate_output_location()``. We patched ``__init__``
+            # away, so set it explicitly so the call site has a results dir
+            # to forward to ``collect_cluster_info`` (issue #363).
+            benchmark.run_result_output = '/tmp/results/run-001'
 
             with patch('mlpstorage_py.benchmarks.base.collect_cluster_info') as mock_collect:
                 mock_collect.return_value = mock_collected_data
 
                 result = benchmark._collect_cluster_information()
 
-                # Verify collect_cluster_info was called with correct args
+                # Verify collect_cluster_info was called with correct args.
+                # ``results_dir`` is REQUIRED by collect_cluster_info; missing
+                # it was the root cause of issue #363.
                 mock_collect.assert_called_once_with(
                     hosts=['host1', 'host2'],
                     mpi_bin='mpirun',
                     logger=mock_logger,
+                    results_dir='/tmp/results/run-001',
                     allow_run_as_root=False,
                     timeout_seconds=60,
-                    fallback_to_local=True
+                    fallback_to_local=True,
+                    shared_staging_dir=None,
+                    ssh_username=None,
                 )
 
                 # Verify result is a ClusterInformation instance
@@ -260,6 +270,120 @@ def _run(self):
                 assert result is None
 
 
+# =============================================================================
+# Regression tests for issue #363
+# =============================================================================
+# The original bug was that ``Benchmark._collect_cluster_information`` called
+# ``collect_cluster_info`` without the required ``results_dir`` argument. Every
+# pre-existing test patched ``collect_cluster_info`` away, so the missing-arg
+# ``TypeError`` never surfaced. The tests below validate the call against the
+# *real* function signature so future signature drift is caught at unit-test
+# time.
+
+class TestCollectClusterInfoSignatureBinding:
+    """Issue #363: guard ``_collect_cluster_information`` against signature drift."""
+
+    def test_call_binds_to_real_collect_cluster_info_signature(
+        self, base_args, mock_logger
+    ):
+        """The kwargs passed by ``_collect_cluster_information`` must bind to
+        the real ``collect_cluster_info`` signature without raising
+        ``TypeError`` for missing required arguments.
+
+        This is what would have caught issue #363 before merge.
+        """
+        import inspect
+        from mlpstorage_py.benchmarks.base import Benchmark
+        from mlpstorage_py.cluster_collector import collect_cluster_info
+
+        class TestBenchmark(Benchmark):
+            BENCHMARK_TYPE = BENCHMARK_TYPES.training
+            def _run(self):
+                pass
+
+        sig = inspect.signature(collect_cluster_info)
+        captured_kwargs = {}
+
+        def capture(*args, **kwargs):
+            # Reject positional shadowing — the call site is keyword-only.
+            assert not args, "call site should use keyword arguments only"
+            captured_kwargs.update(kwargs)
+            # Validate against the REAL signature; this raises TypeError if
+            # any required parameter (e.g., ``results_dir``) is missing.
+            sig.bind(**kwargs)
+            return {
+                'host1': {'hostname': 'host1', 'meminfo': {'MemTotal': 16384000}},
+                '_metadata': {
+                    'collection_method': 'mpi',
+                    'collection_timestamp': '2024-01-01T00:00:00Z',
+                },
+            }
+
+        with patch.object(TestBenchmark, '__init__', lambda x, *a, **kw: None):
+            benchmark = TestBenchmark.__new__(TestBenchmark)
+            benchmark.args = base_args
+            benchmark.logger = mock_logger
+            benchmark.run_result_output = '/tmp/results/run-001'
+
+            with patch(
+                'mlpstorage_py.benchmarks.base.collect_cluster_info',
+                side_effect=capture,
+            ):
+                benchmark._collect_cluster_information()
+
+        # ``results_dir`` is the parameter that was missing in issue #363.
+        assert 'results_dir' in captured_kwargs
+        assert captured_kwargs['results_dir'] == '/tmp/results/run-001'
+
+    def test_warning_message_from_issue_363_is_not_emitted(
+        self, base_args, mock_logger
+    ):
+        """The exact warning ``MPI cluster info collection failed:
+        collect_cluster_info() missing 1 required positional argument:
+        'results_dir'`` must NOT appear after the fix.
+        """
+        from mlpstorage_py.benchmarks.base import Benchmark
+
+        class TestBenchmark(Benchmark):
+            BENCHMARK_TYPE = BENCHMARK_TYPES.training
+            def _run(self):
+                pass
+
+        warnings_seen = []
+
+        class CapturingLogger(MockLogger):
+            def warning(self, msg):
+                warnings_seen.append(msg)
+
+        with patch.object(TestBenchmark, '__init__', lambda x, *a, **kw: None):
+            benchmark = TestBenchmark.__new__(TestBenchmark)
+            benchmark.args = base_args
+            benchmark.logger = CapturingLogger()
+            benchmark.run_result_output = '/tmp/results/run-001'
+
+            # Use the REAL ``collect_cluster_info`` but stub out the heavy
+            # ``MPIClusterCollector`` so we don't need an actual cluster.
+            with patch(
+                'mlpstorage_py.cluster_collector.MPIClusterCollector'
+            ) as mock_collector_cls:
+                mock_instance = MagicMock()
+                mock_instance.collect.return_value = {
+                    'host1': {'hostname': 'host1', 'meminfo': {'MemTotal': 16384000}},
+                }
+                mock_collector_cls.return_value = mock_instance
+
+                benchmark._collect_cluster_information()
+
+        offending = [
+            w for w in warnings_seen
+            if 'missing 1 required positional argument' in w
+            and 'results_dir' in w
+        ]
+        assert offending == [], (
+            f"Issue #363 warning regressed: {offending}"
+        )
+
+
 # =============================================================================
 # Tests for DLIOBenchmark.accumulate_host_info
 # =============================================================================
diff --git a/mlpstorage_py/tests/test_cluster_collector.py b/mlpstorage_py/tests/test_cluster_collector.py
index 384f8ae9..dbac177c 100755
--- a/mlpstorage_py/tests/test_cluster_collector.py
+++ b/mlpstorage_py/tests/test_cluster_collector.py
@@ -520,7 +520,8 @@ def test_init(self, mock_logger):
         collector = MPIClusterCollector(
             hosts=["host1", "host2"],
             mpi_bin="mpirun",
-            logger=mock_logger
+            logger=mock_logger,
+            results_dir='/tmp'
         )
         assert collector.hosts == ["host1", "host2"]
         assert collector.mpi_bin == "mpirun"
@@ -531,7 +532,8 @@ def test_get_unique_hosts(self, mock_logger):
         collector = MPIClusterCollector(
             hosts=["host1:4", "host2:4", "host1:4"],
             mpi_bin="mpirun",
-            logger=mock_logger
+            logger=mock_logger,
+            results_dir='/tmp'
         )
         unique = collector._get_unique_hosts()
         assert len(unique) == 2
@@ -543,7 +545,8 @@ def test_generate_mpi_command(self, mock_logger):
         collector = MPIClusterCollector(
             hosts=["host1", "host2"],
             mpi_bin="mpirun",
-            logger=mock_logger
+            logger=mock_logger,
+            results_dir='/tmp'
         )
         cmd = collector._generate_mpi_command("/tmp/script.py", "/tmp/output.json")
         assert "mpirun" in cmd
@@ -559,6 +562,7 @@ def test_generate_mpi_command_with_root(self, mock_logger):
             hosts=["host1"],
             mpi_bin="mpirun",
             logger=mock_logger,
+            results_dir='/tmp',
             allow_run_as_root=True
         )
         cmd = collector._generate_mpi_command("/tmp/script.py", "/tmp/output.json")
@@ -569,7 +573,8 @@ def test_write_collector_script(self, mock_logger):
         collector = MPIClusterCollector(
             hosts=["host1"],
             mpi_bin="mpirun",
-            logger=mock_logger
+            logger=mock_logger,
+            results_dir='/tmp'
         )
         with tempfile.TemporaryDirectory() as tmpdir:
             script_path = os.path.join(tmpdir, "collector.py")
@@ -586,7 +591,8 @@ def test_collect_local_only(self, mock_logger):
         collector = MPIClusterCollector(
             hosts=["host1"],
             mpi_bin="mpirun",
-            logger=mock_logger
+            logger=mock_logger,
+            results_dir='/tmp'
         )
         result = collector.collect_local_only()
         assert isinstance(result, dict)
@@ -606,6 +612,7 @@ def test_collect_cluster_info_with_fallback(self, mock_logger):
             hosts=["localhost"],
             mpi_bin="mpirun",
             logger=mock_logger,
+            results_dir='/tmp',
             fallback_to_local=True,
             timeout_seconds=5
         )
@@ -619,6 +626,7 @@ def test_collect_cluster_info_metadata(self, mock_logger):
             hosts=["localhost"],
             mpi_bin="mpirun",
             logger=mock_logger,
+            results_dir='/tmp',
             fallback_to_local=True,
             timeout_seconds=5
         )
@@ -671,7 +679,8 @@ def test_collector_detects_mpi_import_error(self, mock_logger):
         collector = MPIClusterCollector(
             hosts=["host1"],
             mpi_bin="mpirun",
-            logger=mock_logger
+            logger=mock_logger,
+            results_dir='/tmp'
         )
 
         # Simulate what the script writes when mpi4py is missing
@@ -709,12 +718,6 @@ def test_collector_detects_mpi_import_error(self, mock_logger):
 
     def test_collector_returns_valid_data_without_error_marker(self, mock_logger):
         """Collector should return data normally when no error marker present."""
-        collector = MPIClusterCollector(
-            hosts=["host1"],
-            mpi_bin="mpirun",
-            logger=mock_logger
-        )
-
         # Valid output without error marker
         valid_output = {
             'host1': {
@@ -724,39 +727,39 @@ def test_collector_returns_valid_data_without_error_marker(self, mock_logger):
         }
 
         with tempfile.TemporaryDirectory() as tmpdir:
-            # Under the new implementation (issue #303 fix), the collector
-            # creates a uuid-named subdirectory inside its base tmp dir and
-            # writes cluster_info.json there. We exercise that path by
-            # supplying ``shared_tmp_dir`` and pinning the uuid so we know
-            # the final output path.
+            # Use shared_staging_dir so collect() stages everything under tmpdir.
+            # output_path = <shared_staging_dir>/cluster_info.json — pre-create
+            # it so collect() finds it after the (mocked) subprocess.run.
             import subprocess
             from unittest.mock import patch, MagicMock
 
+            collector = MPIClusterCollector(
+                hosts=["host1"],
+                mpi_bin="mpirun",
+                logger=mock_logger,
+                results_dir=tmpdir,
+                shared_staging_dir=tmpdir,
+            )
+
+            output_path = os.path.join(tmpdir, 'cluster_info.json')
+            with open(output_path, 'w') as f:
+                json.dump(valid_output, f)
+
             mock_result = MagicMock()
             mock_result.returncode = 0
             mock_result.stderr = ""
 
-            with patch('mlpstorage_py.cluster_collector.uuid.uuid4') as mock_uuid:
-                mock_uuid.return_value.hex = 'abcdef012345'
-                working_dir = os.path.join(tmpdir, 'mlps_collector_abcdef012345')
-                os.makedirs(working_dir, exist_ok=True)
-                output_path = os.path.join(working_dir, 'cluster_info.json')
-                with open(output_path, 'w') as f:
-                    json.dump(valid_output, f)
-
-                collector.shared_tmp_dir = tmpdir
-
-                with patch('mlpstorage_py.cluster_collector.subprocess.run',
-                           return_value=mock_result):
-                    with patch.object(collector, '_write_collector_script'):
-                        with patch.object(
-                            collector, '_generate_mpi_command',
-                            return_value="mpirun test",
-                        ):
-                            result = collector.collect()
-
-                            assert 'host1' in result
-                            assert result['host1']['hostname'] == 'host1'
+            with patch('mlpstorage_py.cluster_collector.subprocess.run',
+                       return_value=mock_result):
+                with patch.object(collector, '_write_collector_script'):
+                    with patch.object(
+                        collector, '_generate_mpi_command',
+                        return_value="mpirun test",
+                    ):
+                        result = collector.collect()
+
+                        assert 'host1' in result
+                        assert result['host1']['hostname'] == 'host1'
 
 
 # =============================================================================
diff --git a/mlpstorage_py/tests/test_rules.py b/mlpstorage_py/tests/test_rules.py
index 3637a170..d3fb1af5 100755
--- a/mlpstorage_py/tests/test_rules.py
+++ b/mlpstorage_py/tests/test_rules.py
@@ -11,7 +11,7 @@
 import logging
 from unittest.mock import MagicMock, patch
 
-from mlpstorage_py.rules import ClusterInformation, BenchmarkRun, BenchmarkResult
+from mlpstorage_py.rules import ClusterInformation, BenchmarkRun, BenchmarkResult, DLIOResultParser
 
 
 class MockLogger:
@@ -211,7 +211,11 @@ def test_system_info_from_metadata_when_dlio_summary_lacks_data(self, mock_logge
             'overrides.yaml': ['workload=training_gpu']
         }
 
-        benchmark_run = BenchmarkRun(benchmark_result=mock_benchmark_result, logger=mock_logger)
+        with patch.object(DLIOResultParser, '_load_summary',
+                          return_value=mock_benchmark_result.summary), \
+             patch.object(DLIOResultParser, '_load_hydra_configs',
+                          return_value=mock_benchmark_result.hydra_configs):
+            benchmark_run = BenchmarkRun(benchmark_result=mock_benchmark_result, logger=mock_logger)
 
         assert benchmark_run.system_info is not None
         assert benchmark_run.system_info.total_memory_bytes == 256 * 1024 * 1024 * 1024
@@ -246,7 +250,11 @@ def test_system_info_prefers_dlio_summary_when_available(self, mock_logger):
             'overrides.yaml': ['workload=training_gpu']
         }
 
-        benchmark_run = BenchmarkRun(benchmark_result=mock_benchmark_result, logger=mock_logger)
+        with patch.object(DLIOResultParser, '_load_summary',
+                          return_value=mock_benchmark_result.summary), \
+             patch.object(DLIOResultParser, '_load_hydra_configs',
+                          return_value=mock_benchmark_result.hydra_configs):
+            benchmark_run = BenchmarkRun(benchmark_result=mock_benchmark_result, logger=mock_logger)
 
         # Should use DLIO summary data (128GB), not metadata (256GB)
         expected_bytes = 128 * 1024 * 1024 * 1024
@@ -274,7 +282,11 @@ def test_system_info_none_when_no_data_available(self, mock_logger):
             'overrides.yaml': ['workload=training_gpu']
         }
 
-        benchmark_run = BenchmarkRun(benchmark_result=mock_benchmark_result, logger=mock_logger)
+        with patch.object(DLIOResultParser, '_load_summary',
+                          return_value=mock_benchmark_result.summary), \
+             patch.object(DLIOResultParser, '_load_hydra_configs',
+                          return_value=mock_benchmark_result.hydra_configs):
+            benchmark_run = BenchmarkRun(benchmark_result=mock_benchmark_result, logger=mock_logger)
 
         assert benchmark_run.system_info is None
 
diff --git a/mlpstorage_py/utils.py b/mlpstorage_py/utils.py
index 7c3a581f..7c4c80b8 100755
--- a/mlpstorage_py/utils.py
+++ b/mlpstorage_py/utils.py
@@ -472,7 +472,8 @@ def generate_mpi_prefix_cmd(
     oversubscribe: bool,
     allow_run_as_root: bool,
     params: Optional[List[str]],
-    logger: logging.Logger
+    logger: logging.Logger,
+    mpi_btl: str = "auto",
 ) -> str:
     """Generate MPI command prefix for distributed execution.
 
@@ -487,6 +488,11 @@ def generate_mpi_prefix_cmd(
         allow_run_as_root: Allow running MPI as root user.
         params: Additional MPI parameters to append.
         logger: Logger instance for debug output.
+        mpi_btl: Byte Transfer Layer for single-host runs. 'auto' lets
+            OpenMPI select automatically (default). 'vader' forces POSIX
+            shared-memory transport. 'tcp' forces TCP loopback (most
+            compatible; recommended for containers/root). Ignored for
+            multi-host runs.
 
     Returns:
         MPI command prefix string ready for command execution.
@@ -544,10 +550,18 @@ def generate_mpi_prefix_cmd(
     if len(unique_hosts) > 1:
         # Multi-host: prioritize even distribution across nodes
         prefix += " --bind-to none --map-by node"
+        logger.info("MPI BTL transport: auto (multi-host run; transport managed by network fabric)")
     else:
         # Single-host: optimize for NUMA domains
-        # Disable VADER shared-memory transport — causes segfaults on some kernels
-        prefix += " --bind-to none --map-by socket --mca btl ^vader"
+        prefix += " --bind-to none --map-by socket"
+        if mpi_btl == "vader":
+            prefix += " --mca btl vader,self"
+            logger.info("MPI BTL transport: vader (POSIX shared-memory)")
+        elif mpi_btl == "tcp":
+            prefix += " --mca btl tcp,self"
+            logger.info("MPI BTL transport: tcp (TCP loopback; recommended for containers/root)")
+        else:  # auto
+            logger.info("MPI BTL transport: auto (OpenMPI default selection)")
 
     if oversubscribe:
         prefix += " --oversubscribe"
diff --git a/pyproject.toml b/pyproject.toml
index 80545fdd..09680be5 100755
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "mlpstorage"
-version = "2.0.0b1"
+version = "3.0.2"
 description = "MLPerf Storage Benchmark Suite"
 readme = "README.md"
 license = {text = "Apache-2.0"}
@@ -21,8 +21,8 @@ dependencies = [
     "dlio-benchmark", # Required dependency
     "minio>=7.2.20",
     "s3torchconnector>=1.5.0",
-    "s3dlio>=0.9.95",
     "python-dotenv>=1.0.0",
+    "s3dlio>=0.9.100",
 ]
 
 [project.optional-dependencies]
@@ -84,8 +84,17 @@ name = "pytorch-cpu"
 url = "https://download.pytorch.org/whl/cpu"
 explicit = true
 
+[tool.uv]
+# s3dlio only ships Linux wheels — restrict resolution to Linux.
+environments = ["sys_platform == 'linux'"]
+
 [tool.uv.sources]
-dlio-benchmark = { git = "https://github.com/russfellows/dlio_benchmark.git", branch = "feat/parquet-dgen-streaming" }
 torch = [{ index = "pytorch-cpu" }]
 torchvision = [{ index = "pytorch-cpu" }]
 torchaudio = [{ index = "pytorch-cpu" }]
+dlio-benchmark = { git = "https://github.com/russfellows/dlio_benchmark.git", rev = "21c0723de897add728158943d369abd4b333f7dc" }
+
+[dependency-groups]
+dev = [
+    "pytest>=9.0.2",
+]
diff --git a/tests/DLRM_test_results.md b/tests/DLRM_test_results.md
index e8e98c71..9c4d8b70 100644
--- a/tests/DLRM_test_results.md
+++ b/tests/DLRM_test_results.md
@@ -158,3 +158,171 @@ cd /home/eval/Documents/Code/mlp-storage && uv run mlpstorage training run \
   Even NVMe may struggle to meet AU ≥ 90% at 12,288 samples/step × ~761 bytes = ~9.1 MB/step × 1302 steps/epoch ≈ 11.8 GB must be read at accelerator speed.
 - Parquet footer cache (`_pf_cache`) active in `parquet_reader.py` — same fix as Flux.
 - S3 row-group reads via byte-range GET using `parquet_reader_s3_iterable.py`.
+
+---
+
+## Direct DLIO Benchmark — Reader Library Comparison (2026-05-07)
+
+> These tests bypass `mlpstorage` and run `dlio_benchmark` directly to isolate storage library performance.
+>
+> **AU formula** (from `statscounter.py`):
+> `AU = (metric_steps × computation_time_per_step) / metric_window_wall_time`
+> where `metric_steps = total_steps − 1` (first step excluded by default `metric_exclude_start_steps=1`).
+> AU represents the fraction of time the simulated accelerator is computing vs. waiting for I/O.
+
+### Test Configuration
+
+| Parameter | Value |
+|-----------|-------|
+| Date | 2026-05-07 |
+| Benchmark | `dlio_benchmark.main` (direct, no `mlpstorage` wrapper) |
+| S3 server | s3-ultra at `127.0.0.1:9200` (synthetic, ~40 GB/s capable) |
+| S3 credentials | `minioadmin/minioadmin` |
+| File storage | `/mnt/test/dlrm/train/*.parquet` |
+| Dataset | 64 × ~971 MiB Parquet files, **~60.5 GiB total** |
+| Row groups / file | 123 RGs @ ~8 MiB/RG compressed |
+| DataLoader workers | 8 |
+| Prefetch threads/worker | 64 |
+| Prefetch window | 64 RGs |
+| I/O pattern | Sliding-window RG prefetch (`TorchIterableDataset`) |
+| Epochs | 1 |
+| Batch size | 2,048 samples |
+| Computation time | 0.770 ms/step |
+| `read_threads` | 8 |
+
+### Per-Worker I/O Timing (`[io_timing]` lines)
+
+| Reader | Data/worker | Per-worker elapsed | Per-worker throughput |
+|--------|------------|--------------------|-----------------------|
+| S3 + s3torchconnector | 7.562 GiB | ~63 s | ~121–131 MiB/s |
+| S3 + s3dlio | 7.562 GiB | ~48.5 s | ~159–160 MiB/s |
+| File posix (buffered) | 7.562 GiB | ~58–63 s | ~121–131 MiB/s |
+| File direct:// (O_DIRECT) | 7.562 GiB | ~49–51 s | ~151–158 MiB/s |
+
+### Epoch Results (NP=1, Single Rank)
+
+| Reader | Epoch wall time | Aggregate throughput (60.5 GiB) | AU (raw) | AU (corrected) |
+|--------|-----------------|---------------------------------|----------|----------------|
+| S3 + s3torchconnector | 107.79 s | ~575 MiB/s (~603 MB/s) | 22.3% | 35.7% |
+| **S3 + s3dlio** | **76.07 s** | **~814 MiB/s (~854 MB/s)** | **31.6%** | **50.6%** |
+| File posix (buffered) | 95.23 s | ~650 MiB/s (~682 MB/s) | 25.3% | 40.5% |
+| File direct:// (O_DIRECT) | 80.41 s | ~770 MiB/s (~808 MB/s) | 30.0% | 48.0% |
+| Dry-run (simulate, no I/O) | 38.51 s | — | 62.5% | 100% |
+
+> **AU (raw)** = `(steps − 1) × 0.000770031 s / epoch_wall_time` = `24.06 s / epoch_wall_time`.
+> Dry-run measured at 38.51 s → AU_dry = 24.06 / 38.51 = **62.5%** (framework overhead ceiling).
+> **AU (corrected)** = `AU_raw / AU_dry_run` — normalizes out unavoidable DataLoader/framework overhead,
+> expressing how much of the *achievable* compute time was actually utilized. 100% = no I/O stall beyond framework floor.
+> Aggregate throughput = 60.5 GiB ÷ epoch wall time (all 8 workers run in parallel).
+
+### Key Findings
+
+- **s3dlio is 41.7% faster than s3torchconnector** on S3 (76s vs 108s epoch; ~160 vs ~126 MiB/s per worker). Both use byte-range GETs; s3dlio benefits from its Rust async runtime vs CRT thread pool under this workload.
+- **File direct:// (O_DIRECT) is the fastest file reader** at 80.4s — slightly slower than s3dlio S3 (76.1s) and 15% faster than posix. O_DIRECT bypasses the page cache and exercises the NVMe bandwidth directly.
+- **File posix is comparable to s3torchconnector** (95s vs 108s), suggesting both are similarly bounded by concurrency or I/O queue depth.
+- **Dry-run floor is ~38.5s** — pure PyTorch DataLoader/compute overhead with no I/O. All configurations add meaningful I/O time on top.
+- DLIO's built-in I/O metric should be ignored — it reports ~0.84 MiB/s because it counts `get_sample()` calls × `record_length` (1024 bytes), not actual bytes transferred. Use `[io_timing]` lines for true throughput.
+
+### Run Commands
+
+```bash
+cd /home/eval/Documents/Code/dlio_benchmark
+
+# S3 + s3torchconnector
+AWS_ACCESS_KEY_ID=minioadmin AWS_SECRET_ACCESS_KEY=minioadmin AWS_ENDPOINT_URL=http://127.0.0.1:9200 \
+  uv run python -m dlio_benchmark.main workload=dlrm_s3dlio_s3 \
+  ++workload.storage.storage_options.storage_library=s3torchconnector
+
+# S3 + s3dlio
+AWS_ACCESS_KEY_ID=minioadmin AWS_SECRET_ACCESS_KEY=minioadmin AWS_ENDPOINT_URL=http://127.0.0.1:9200 \
+  uv run python -m dlio_benchmark.main workload=dlrm_s3dlio_s3 \
+  ++workload.storage.storage_options.storage_library=s3dlio
+
+# File direct:// (O_DIRECT via s3dlio)
+uv run python -m dlio_benchmark.main workload=dlrm_s3dlio_file \
+  ++workload.storage.storage_options.storage_library=direct
+
+# File posix (buffered)
+uv run python -m dlio_benchmark.main workload=dlrm_s3dlio_file
+
+# Dry-run (simulate, no I/O)
+uv run python -m dlio_benchmark.main workload=dlrm_s3dlio_file \
+  ++workload.storage.storage_options.simulate_io=true
+```
+
+---
+
+## Multi-Rank MPI Scaling — S3 + s3dlio (2026-05-07)
+
+> Same `dlrm_s3dlio_s3` workload as above, launched via `mpirun` to simulate multiple accelerator ranks.
+> All ranks share the same s3-ultra instance (127.0.0.1:9200). Each rank reads an equal share of the 64 files.
+
+### Configuration
+
+| Parameter | Value |
+|-----------|-------|
+| Date | 2026-05-07 |
+| Storage library | s3dlio |
+| S3 server | s3-ultra at `127.0.0.1:9200` |
+| Config | `dlrm_s3dlio_s3.yaml` (64 files, 2048 batch, 0.770031 ms compute) |
+| DataLoader workers/rank | 8 |
+| Prefetch threads/worker | 64 |
+
+### Dry-Run Baselines (framework overhead only, `simulate_io=true`)
+
+| NP | Dry-run epoch | Compute budget/rank | AU_dry (raw) |
+|----|--------------|--------------------|--------------|
+| 1 | 36.35 s | 24.06 s | 66.2% |
+| 2 | 20.61 s | 12.03 s | 58.4% |
+| 4 | 9.65 s | 6.01 s | 62.3% |
+
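+> AU_dry is not monotonic in NP (66.2% → 58.4% → 62.3%). Fewer steps per rank means less compute time relative to fixed per-rank DataLoader startup overhead, which is consistent with the drop at NP=2 but does not by itself explain the partial recovery at NP=4.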
+
+### Results
+
+| NP (MPI ranks) | Files/rank | Steps/rank | Epoch wall time | Aggregate throughput | AU (raw) | AU (corrected) |
+|----------------|-----------|-----------|-----------------|---------------------|----------|----------------|
+| 1 | 64 | 31,248 | 81.65 s | 759 MiB/s (796 MB/s) | 29.5% | 44.6% |
+| 2 | 32 | 15,625 | 56.67 s | 1,094 MiB/s (1,147 MB/s) | 21.2% | 36.3% |
+| 4 | 16 | 7,812 | 49.57 s | 1,250 MiB/s (1,311 MB/s) | 12.1% | 19.4% |
+
+> **Aggregate throughput** = 60.5 GiB ÷ epoch wall time (all NP ranks run in parallel on same dataset).
+>
+> **AU (raw)** = `(steps_per_rank − 1) × 0.000770031 s / epoch_wall_time`:
+> - NP=1: 24.06 / 81.65 = **29.5%** &nbsp; NP=2: 12.03 / 56.67 = **21.2%** &nbsp; NP=4: 6.01 / 49.57 = **12.1%**
+>
+> **AU (corrected)** = `AU_raw / AU_dry` using per-NP dry-run baselines above:
+> - NP=1: 29.5% / 66.2% = **44.6%** &nbsp; NP=2: 21.2% / 58.4% = **36.3%** &nbsp; NP=4: 12.1% / 62.3% = **19.4%**
+
+### Key Findings
+
+- **Throughput scales sub-linearly**: NP=1 → NP=2 gains +44% (linear scaling would be +100%), then flattens NP=2→NP=4 (+14%). Multiple ranks issue concurrent GETs that better saturate s3-ultra's async runtime.
+- **Raw AU decreases with more ranks**: each rank processes fewer steps (less compute time) while epoch wall time doesn't shrink proportionally. This is expected and not a storage deficiency.
+- **Corrected AU also decreases with NP** (44.6% → 36.3% → 19.4%): at NP=4, even the dry-run baseline is tighter (only 9.65s epoch), so the I/O stall takes a larger share of the available time. The benchmark is genuinely becoming more I/O-limited per rank as NP scales on a shared single-node server.
+- **Epoch wall time compresses** as NP increases (81.65 → 56.67 → 49.57 s), but with diminishing returns as all ranks compete for the same single-node S3 server.
+- On a real multi-node deployment with dedicated S3 bandwidth per node, both throughput and corrected AU would scale more linearly.
+
+### Run Commands
+
+```bash
+cd /home/eval/Documents/Code/dlio_benchmark
+export AWS_ACCESS_KEY_ID=minioadmin AWS_SECRET_ACCESS_KEY=minioadmin AWS_ENDPOINT_URL=http://127.0.0.1:9200
+
+# NP=1
+mpirun -n 1 -host 127.0.0.1:1 --bind-to none --map-by socket --mca btl ^vader --allow-run-as-root \
+  .venv/bin/dlio_benchmark workload=dlrm_s3dlio_s3 \
+  ++workload.storage.storage_options.storage_library=s3dlio \
+  --config-dir=dlio_benchmark/configs
+
+# NP=2
+mpirun -n 2 -host 127.0.0.1:2 --bind-to none --map-by socket --mca btl ^vader --allow-run-as-root \
+  .venv/bin/dlio_benchmark workload=dlrm_s3dlio_s3 \
+  ++workload.storage.storage_options.storage_library=s3dlio \
+  --config-dir=dlio_benchmark/configs
+
+# NP=4
+mpirun -n 4 -host 127.0.0.1:4 --bind-to none --map-by socket --mca btl ^vader --allow-run-as-root \
+  .venv/bin/dlio_benchmark workload=dlrm_s3dlio_s3 \
+  ++workload.storage.storage_options.storage_library=s3dlio \
+  --config-dir=dlio_benchmark/configs
+```
diff --git a/tests/conftest.py b/tests/conftest.py
index b2ad7ea5..283b9d05 100755
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -12,6 +12,7 @@
 collect_ignore_glob = [
     "integration/test_s3_connectivity.py",  # argparse.parse_args() at module level
     "integration/test_compat_runtime.py",   # full S3 smoke-test at module level
+    "integration/test_dlio_storage.py",     # standalone script; StorageType.S3DLIO not in installed package
 ]
 
 import json
diff --git a/tests/object-store/NPZ-OPTIMIZATION-ANALYSIS.md b/tests/object-store/NPZ-OPTIMIZATION-ANALYSIS.md
deleted file mode 100644
index 38172c11..00000000
--- a/tests/object-store/NPZ-OPTIMIZATION-ANALYSIS.md
+++ /dev/null
@@ -1,223 +0,0 @@
-# NPZ Datagen Optimization Analysis
-
-**Date:** 2026-04-25  
-**Goal:** Reach 8 GB/s aggregate throughput for unet3d NPZ datagen with NP=8
-
----
-
-## 1. Current Measured Performance
-
-| Run | Model | Storage Lib | Runtime | Throughput |
-|-----|-------|-------------|---------|------------|
-| 2026-04-25T12:16 | unet3d | s3dlio | 21.2 s | ~1.11 GB/s |
-| 2026-04-25T12:17 | unet3d | minio  | 24.7 s | ~0.95 GB/s |
-
-- 168 files × 8 MPI ranks = 21 files/rank
-- Each file: 139.8 MiB (shape `(6053, 6053, 1)` float32)
-- s3-ultra listening on `0.0.0.0:9101`
-
----
-
-## 2. Object and Array Size Derivation
-
-Config: `record_length_bytes=146600628`, `record_length_bytes_stdev=68341808`, dtype=float32
-
-```
-record_length (elements) = 146600628 / 4 = 36650157
-dimension = floor(sqrt(36650157)) = 6053
-Array shape: (6053, 6053, 1) float32
-Array size: 6053 × 6053 × 1 × 4 = 146,572,036 bytes = 139.8 MiB
-NPZ size (STORED, no compression): ≈ 139.9 MiB (header overhead ~100 bytes)
-```
-
----
-
-## 3. Critical Finding: Installed dlio_benchmark is STALE
-
-**mlp-storage uses a wheel installed from git, NOT our local modified source.**
-
-Evidence:
-```
-source file:    /home/eval/Documents/Code/dlio_benchmark/dlio_benchmark/utils/utility.py  (24879 bytes)
-installed file: ...site-packages/dlio_benchmark/utils/utility.py                          (19154 bytes)
-```
-
-The installed version is missing:
-- Singleton `_DGEN_PROC_GEN` pattern (avoids re-creating Rayon thread pool per file)
-- Async pipeline in `data_generator.py` (upload pool running while main thread generates)
-- `write_threads` floor=8 cap=32 in `config.py`
-- Raw-bytes dgen path in `gen_random_tensor()`
-
-**Impact:** Without the async pipeline, each file is: serialize (270ms) + upload (sequential, ~1s) = ~1.3s/file × 21 files = ~27s ≈ matches measured 21s.
-
-With the async pipeline correctly installed, expected: 21 files × 280ms generation = 5.9s dominated by serial generation, but uploads overlapped → should be much faster.
-
----
-
-## 4. Per-File Timing Breakdown
-
-### np.savez baseline (actual unet3d shape)
-
-```
-Shape: (6053, 6053, 1) float32 = 139.8 MiB
-  Run 0: 270 ms, 518 MB/s
-  Run 1: 270 ms, 518 MB/s
-  Run 2: 272 ms, 514 MB/s
-```
-
-np.savez cost: ~270 ms/file  
-dgen-py generation (BytesView from singleton): < 10 ms  
-Upload 140 MiB at ~140 MB/s per rank: ~1 s/file
-
-### Where 270ms goes in np.savez
-
-1. `ZipFile` object creation + internal buffer setup: ~1 ms
-2. NPY header write: ~0.1 ms
-3. Array data write to BytesIO (140 MiB memcpy): ~130 ms (at ~1 GB/s BytesIO write speed)
-4. ZIP local file header + CRC32 computation: ~140 ms (CRC32 at ~1 GB/s)
-
-Key observation: `np.savez` creates an uninitialized `BytesIO`, then grows it from 0 → 140 MiB via ZipFile writes. Python's `BytesIO` uses a `bytearray` internally that **doubles on reallocation** — this causes multiple 70+ MiB allocations and copies during the write.
-
----
-
-## 5. NPZ Format Structure
-
-NPZ = ZIP archive containing `.npy` files.
-
-NPY 1.0 format:
-```
-\x93NUMPY          (6 bytes magic)
-\x01\x00           (2 bytes: version 1.0)
-HLEN               (2 bytes LE: header data length)
-HEADER_DICT\n      (HLEN bytes: Python dict string, padded to 64-byte boundary)
-DATA               (raw array bytes, C-contiguous little-endian)
-```
-
-**Key insight from user:** The DATA bytes do NOT need to be valid float32 values. Any random bytes are acceptable since the training workload discards data after benchmarking. Only the NPY header (shape, dtype, format descriptors) needs to be correct.
-
----
-
-## 6. Optimization Strategy
-
-### Strategy A: Fix the Installation (IMMEDIATE — critical)
-
-Update mlp-storage's `uv.lock` to use local editable dlio_benchmark:
-```toml
-# pyproject.toml [tool.uv.sources]
-dlio-benchmark = { path = "/home/eval/Documents/Code/dlio_benchmark", editable = true }
-```
-
-**Expected impact:** Enables async pipeline + dgen singleton → likely ~3-4× speedup from 1.11 GB/s to 3-5 GB/s.
-
-### Strategy B: Bypass numpy for NPZ serialization
-
-Current path:
-```
-gen_random_tensor() → ndarray(6053,6053,1)  ~10ms
-np.savez(BytesIO, x=arr, y=[0])             ~270ms  (BytesIO growth + CRC32)
-put_data(path, BytesIO)                     ~1000ms
-```
-
-Optimized path:
-```
-dgen_py.generate_buffer(total_bytes)        ~10ms   (BytesView, no copy)
-build_npz_raw(BytesView, shape)             ~?ms    (manual ZIP+NPY, pre-alloc)
-put_data(path, BytesIO)                     ~?ms
-```
-
-Techniques:
-1. **Pre-allocate BytesIO** to exact NPZ size → avoid BytesIO reallocation overhead
-2. **Skip numpy array creation** — use `bytes(BytesView)` directly as NPY data
-3. **Stream-write via `zf.open()`** — avoids building combined `npy_header + data` bytes
-4. **Buffer protocol write** — `zf.open('x.npy','w').write(bytesview)` — zero extra copy if ZipFile accepts bytes-like objects
-
-### Strategy C: Rust NPZ generator in s3dlio
-
-Add Python-callable Rust function:
-```python
-s3dlio.generate_npz_bytes(shape=(6053,6053,1), dtype='<f4') -> bytes
-```
-
-Internally:
-- dgen-rs generates random bytes (Rayon parallel, ~15 GB/s)
-- NPY header built from shape/dtype parameters
-- ZIP STORED wrapper constructed without Python GIL
-- Returns `Bytes` zero-copy via PyO3
-
-**Expected impact:** ~500+ MB/s → 1+ GB/s per rank serialization (Rust memcpy vs Python BytesIO growth).
-
-### Strategy D: Direct scatter/gather PUT (longest-term)
-
-Use `s3dlio.put_many()` or multipart upload to stream NPY header + raw dgen bytes directly to S3 without any BytesIO intermediary. Eliminates all copying.
-
----
-
-## 7. Arithmetic: Path to 8 GB/s
-
-With NP=8 ranks:
-- Each rank needs: 8 GB/s ÷ 8 = 1 GB/s per rank
-- Each rank uploads 21 files × 139.8 MiB = 2936 MiB
-- At 1 GB/s: 2936 MiB / 1024 MB/GiB × 1 s/GB ≈ 2.9 s per rank
-
-For 2.9 s total per rank:
-- Async pipeline: generation of 21 files = 21 × 10ms (dgen) = 210ms (if savez removed)
-- 21 uploads, 8 concurrent: ceil(21/8) × upload_time_per_file ≤ 2.9s
-- Max upload time per file: 2.9s / 3 batches ≈ 970ms
-- Required per-file upload speed: 139.8 MiB / 970ms ≈ 144 MB/s per rank
-
-s3-ultra capability: 47,883 MB/s for 1 MiB on loopback, 49,926 MB/s for 8 MiB.
-With 8 concurrent ranks × 1 connection each: should be well above 144 MB/s/rank.
-
-**Bottleneck is likely the async pipeline not being used (installation bug), followed by np.savez overhead.**
-
----
-
-## 8. s3-ultra Large Object Note
-
-From Performance.md: "Objects > 32 MiB use streaming path — Chunked encoding, slightly higher overhead."
-
-Our 139.8 MiB files are 4× over the 32 MiB threshold. The PUT path uses chunked transfer encoding which:
-1. Doesn't send `Content-Length` upfront
-2. Requires chunked encoding overhead
-3. s3dlio may not pipeline chunks optimally
-
-Potential fix in s3-ultra: buffer large objects up to a threshold and use `Content-Length` response for GETs.
-
----
-
-## 9. Experiment Log
-
-### Experiment 1 — Baseline (2026-04-25)
-- **Config:** unet3d, NP=8, s3dlio, endpoint 127.0.0.1:9101
-- **Runtime:** 21.2 s, **Throughput:** 1.11 GB/s
-- **Note:** Using OLD installed dlio_benchmark (stale git wheel — async pipeline NOT active)
-
-### Experiment 2 — Baseline minio (2026-04-25)  
-- **Config:** unet3d, NP=8, minio, endpoint 127.0.0.1:9101
-- **Runtime:** 24.7 s, **Throughput:** 0.95 GB/s
-- **Note:** Same stale install issue
-
-### Experiment 3 — (PLANNED) Fix installation, re-run
-- Fix: `uv add --editable /home/eval/Documents/Code/dlio_benchmark` in mlp-storage
-- Expected: significant improvement from async pipeline
-
-### Experiment 4 — (PLANNED) Fast NPZ path
-- Bypass np.savez with raw-bytes NPZ builder
-- Expected: save ~260ms/file serialization overhead
-
-### Experiment 5 — (PLANNED) s3dlio Rust NPZ generator
-- Add `generate_npz_bytes()` to s3dlio Python API
-- Build/install new s3dlio wheel
-- Expected: eliminate Python overhead entirely for serialization
-
----
-
-## 10. Test Infrastructure Notes
-
-- s3-ultra: PID 3765782, `0.0.0.0:9101`, db `/tmp/s3-ultra-mlp-test`
-- Buckets: `mlp-s3dlio`, `mlp-minio`, `mlp-s3torch`
-- mlp-storage: `/home/eval/Documents/Code/mlp-storage/`, `uv run`
-- dlio_benchmark source: `/home/eval/Documents/Code/dlio_benchmark/` (our modified version)
-- s3dlio source: `/home/eval/Documents/Code/s3dlio/`
-- All commands via: `uv run mlpstorage training datagen ...`
-- NEVER use boto3 or aws-cli — always `s3-cli`
diff --git a/tests/object-store/README.md b/tests/object-store/README.md
index 0784055c..ad7b3541 100644
--- a/tests/object-store/README.md
+++ b/tests/object-store/README.md
@@ -2,9 +2,104 @@
 
 Tests for S3-compatible object storage backends used by `mlpstorage` and `dlio_benchmark`.
 
-All tests read credentials and runtime configuration from a `.env` file at the
+All scripts read credentials and runtime configuration from a `.env` file at the
 **project root** (`mlp-storage/.env`) — no credentials or site-specific values are
-embedded in any test script or config file.
+embedded in any script or config file.
+
+---
+
+## Recommended Hardware
+
+**Linux only** — macOS and Windows are not supported.
+
+These are minimum requirements per `NP` (number of simulated accelerators).
+Running below spec will likely cause OOM crashes:
+
+| NP | CPU cores (incl. threads) | RAM |
+|:---:|---:|---:|
+| 1 | 8 | 16 GB |
+| 2 | 16 | 32 GB |
+| 4 | 32 | 64 GB |
+| 8 | 64 | 128 GB |
+
+NP scales linearly — each doubling of NP requires 2× the CPU and RAM.
+You may be able to run some workloads below these numbers, but OOM crashes are expected.
+
+---
+
+## Structure
+
+```
+tests/object-store/
+│
+├── — Data Generators (run once, before benchmarking) ——————————————
+│   gen_retinanet_jpeg.sh   generate 50k JPEG files for RetinaNet (~15 GiB)
+│   gen_unet3d_npz.sh       generate 7,200 NPZ files for UNet3D   (~984 GiB)
+│                           (DLRM and Flux generate data inline via run_*_bench.sh)
+│
+├── — Benchmark Runners ————————————————————————————————————————————
+│   run_dlrm_bench.sh       DLRM:      Parquet, NP=1..8, prints AU + throughput
+│   run_flux_bench.sh       Flux:      Parquet, NP=1..8, prints AU + throughput
+│   test_retinanet.sh       RetinaNet: JPEG,    NP=1..4, smoke test + benchmark
+│   test_unet3d.sh          UNet3D:    NPZ,     NP=1..4, smoke test + benchmark
+│
+├── — Checkpointing ————————————————————————————————————————————————
+│   run_checkpointing.sh    LLaMA 3 8B checkpoint write + read (s3dlio/minio/s3torch)
+│
+├── — Utilities ————————————————————————————————————————————————————
+│   run_cleanup.sh          delete all objects written by tests above
+│   show_results.sh         print throughput summary from results/dlrm/
+│
+├── sweeps/                 NP and compute-time scaling studies (run after smoke tests)
+│   sweep_dlrm_compute.sh   DLRM:      computation_time sweep at NP=1
+│   sweep_dlrm_np.sh        DLRM:      NP scaling (1, 2, 4, 8)
+│   sweep_flux.sh           Flux:      NP × read_threads scaling
+│   sweep_retinanet_np.sh   RetinaNet: NP scaling (1, 2, 4)
+│   sweep_unet3d_np.sh      UNet3D:    NP scaling (1, 2, 4)
+│
+└── old-archive/            deprecated scripts kept for reference — not maintained
+
+Performance results and analysis live in docs/ (see Performance Results below).
+```
+
+### Four model types, one generator + one benchmark each
+
+| Model | Format | Generator | Benchmark |
+|---|---|---|---|
+| **DLRM** | Parquet | *(inline in run_dlrm_bench.sh)* | `run_dlrm_bench.sh` |
+| **Flux** | Parquet | *(inline in run_flux_bench.sh)* | `run_flux_bench.sh` |
+| **RetinaNet** | JPEG | `gen_retinanet_jpeg.sh` | `test_retinanet.sh` |
+| **UNet3D** | NPZ | `gen_unet3d_npz.sh` | `test_unet3d.sh` |
+
+**Checkpointing** is a separate workflow (`run_checkpointing.sh`) — it tests LLaMA 3 8B
+checkpoint write + read and is independent of the four model types above.
+
+---
+
+## Quick Start
+
+```bash
+# 1. Install dependencies
+cd /path/to/mlp-storage
+uv sync
+
+# 2. Create .env with your credentials (see Credential Setup below)
+cp .env.example .env
+
+# 3a. DLRM or Flux — data is generated inline, just run the benchmark
+NP=1 bash tests/object-store/run_dlrm_bench.sh
+NP=1 bash tests/object-store/run_flux_bench.sh
+
+# 3b. RetinaNet or UNet3D — generate data first, then benchmark
+bash tests/object-store/gen_retinanet_jpeg.sh
+bash tests/object-store/test_retinanet.sh
+
+bash tests/object-store/gen_unet3d_npz.sh
+bash tests/object-store/test_unet3d.sh
+
+# 3c. Checkpointing
+bash tests/object-store/run_checkpointing.sh
+```
 
 ---
 
@@ -69,96 +164,6 @@ uv run python -c "import s3dlio; print(s3dlio.list('s3://your-bucket/', recursiv
 
 ---
 
-## Tests
-
-Four shell scripts cover the complete test workflow. All runtime parameters come
-from `.env` (or environment variables) — no editing of scripts or config files is needed.
-
-```
-run_datagen.sh       — generate training dataset (run once)
-run_training.sh      — run training benchmark (run as many times as needed)
-run_checkpointing.sh — write + read LLaMA 3 8B checkpoints
-run_cleanup.sh       — delete all objects written by the tests above
-```
-
----
-
-### `run_datagen.sh` — Data generation
-
-Generates a synthetic training dataset and writes it to the object store.  Run
-this **once** before using `run_training.sh`.  The dataset can be reused for
-multiple training runs without re-generating.
-
-```bash
-cd /path/to/mlp-storage
-
-# s3dlio (default) — BUCKET auto-defaults to mlp-s3dlio
-bash tests/object-store/run_datagen.sh
-
-# minio — BUCKET auto-defaults to mlp-minio
-STORAGE_LIBRARY=minio bash tests/object-store/run_datagen.sh
-
-# s3torchconnector — BUCKET auto-defaults to mlp-s3torch
-STORAGE_LIBRARY=s3torchconnector bash tests/object-store/run_datagen.sh
-
-# Override bucket name explicitly
-BUCKET=my-bucket STORAGE_LIBRARY=s3dlio bash tests/object-store/run_datagen.sh
-
-# 8 parallel MPI processes for faster generation
-NP=8 bash tests/object-store/run_datagen.sh
-```
-
-**Runtime parameters:**
-
-| Variable | Default | Description |
-|---|---|---|
-| `BUCKET` | auto-derived | `mlp-s3dlio` / `mlp-minio` / `mlp-s3torch` based on `STORAGE_LIBRARY`; set explicitly to override |
-| `STORAGE_LIBRARY` | `s3dlio` | `s3dlio`, `minio`, or `s3torchconnector` |
-| `MODEL` | `unet3d` | mlpstorage model name |
-| `NP` | `1` | MPI process count for generation |
-| `DATA_DIR` | `test-run/` | Object prefix for the dataset |
-| `S3_PROFILE` | *(unset)* | AWS credential profile for s3torchconnector (default: `mlp-minio`) |
-
----
-
-### `run_training.sh` — Training
-
-Reads the dataset generated by `run_datagen.sh` and runs the MLPerf Storage
-training benchmark.  Can be run repeatedly against the same dataset.
-
-**DATA_DIR and MODEL must match what was used during datagen.**
-
-```bash
-cd /path/to/mlp-storage
-
-# s3dlio (default) — BUCKET auto-defaults to mlp-s3dlio
-bash tests/object-store/run_training.sh
-
-# minio, 8 simulated accelerators — BUCKET auto-defaults to mlp-minio
-STORAGE_LIBRARY=minio NP=8 bash tests/object-store/run_training.sh
-
-# s3torchconnector — BUCKET auto-defaults to mlp-s3torch
-STORAGE_LIBRARY=s3torchconnector bash tests/object-store/run_training.sh
-
-# bert model (must have been generated with MODEL=bert)
-MODEL=bert bash tests/object-store/run_training.sh
-```
-
-**Runtime parameters:**
-
-| Variable | Default | Description |
-|---|---|---|
-| `BUCKET` | auto-derived | `mlp-s3dlio` / `mlp-minio` / `mlp-s3torch` based on `STORAGE_LIBRARY`; set explicitly to override |
-| `STORAGE_LIBRARY` | `s3dlio` | `s3dlio`, `minio`, or `s3torchconnector` |
-| `MODEL` | `unet3d` | mlpstorage model name (must match datagen) |
-| `NP` | `1` | Number of simulated accelerators |
-| `DATA_DIR` | `test-run/` | Object prefix (must match datagen) |
-| `ACCELERATOR_TYPE` | `h100` | Accelerator to simulate (`h100`, `a100`, `b200`, `mi355`) |
-| `CLIENT_MEMORY_GB` | `512` | Client host memory in GB |
-| `S3_PROFILE` | *(unset)* | AWS credential profile for s3torchconnector (default: `mlp-minio`) |
-
----
-
 ### `run_checkpointing.sh` — Checkpoint write + read
 
 Runs a LLaMA 3 8B checkpoint cycle via `dlio_benchmark`:
@@ -303,6 +308,22 @@ curl -v https://your-minio-host:9000/
 
 ---
 
+## Performance Results
+
+Current benchmark results are in `docs/` — these are the authoritative numbers,
+updated as new sweeps are run:
+
+| Model | Results doc |
+|---|---|
+| DLRM | [docs/DLRM_NP_Scaling_Results.md](../../docs/DLRM_NP_Scaling_Results.md) |
+| Flux | [docs/Flux_NP_ReadThreads_Scaling_Results.md](../../docs/Flux_NP_ReadThreads_Scaling_Results.md) |
+| RetinaNet | [docs/RetinaNet_NP_Scaling_Results.md](../../docs/RetinaNet_NP_Scaling_Results.md) |
+| UNet3D | [docs/UNet3D_NP_Scaling_Results.md](../../docs/UNet3D_NP_Scaling_Results.md) |
+
+Sweep runs also write timestamped results to `results/<model>_np_sweep/<timestamp>/`.
+
+---
+
 ## Adding More Libraries
 
 Runtime parameters — library, bucket, endpoint, credentials — all flow from
@@ -310,13 +331,18 @@ environment variables. To test a new storage library:
 
 1. Add it to `mlpstorage_py/storage/` and register it in `obj_store_lib.py`
 2. Set `STORAGE_LIBRARY=<new-library>` in `.env`
-3. Run `run_datagen.sh` and `run_training.sh` without changing any test script
+3. Run the relevant benchmark script with `STORAGE_LIBRARY=<new-library>`
 
 ---
 
 ## Archived Tests
 
-Older per-library scripts (dlio\_s3dlio\_\*.sh, dlio\_minio\_\*.sh, etc.),
-per-library Python tests, library benchmark scripts, and historical result
-documents are preserved in `tests/object-store/old-archive/` for reference.
-They are **not maintained**.
+Older scripts and historical results are preserved in `tests/object-store/old-archive/`
+for reference. They are **not maintained** and may not work with current code.
+
+Notable reference files:
+- `test_s3dlio_direct.py`, `test_s3dlio_formats.py` — raw s3dlio API patterns
+- `test_s3lib_get_bench.py`, `test_direct_write_comparison.py` — library comparison methodology
+- `S3library_review_21-Mar.md` — analysis of library concurrency models
+- `bench_npz_build.py`, `bench_parquet_rg_flux.py` — format serialization benchmarks
+- `run_datagen.sh`, `run_training.sh` — old generic multi-model wrappers (replaced by model-specific scripts)
diff --git a/tests/object-store/gen_retinanet_jpeg.sh b/tests/object-store/gen_retinanet_jpeg.sh
new file mode 100755
index 00000000..4312f536
--- /dev/null
+++ b/tests/object-store/gen_retinanet_jpeg.sh
@@ -0,0 +1,126 @@
+#!/usr/bin/env bash
+# =============================================================================
+# gen_retinanet_jpeg.sh — Generate RetinaNet JPEG dataset on s3-ultra
+#
+# Generates synthetic JPEG files for RetinaNet benchmarking.
+#
+# Default: 50,000 files × ~323 KiB avg ≈ 15.4 GiB
+#   Suitable for functional testing and NP scaling sweeps on a co-located
+#   s3-ultra instance.
+#
+# Full MLPerf compliance requires 1,170,301 files (~361 GiB total).
+#   Override: NUM_FILES=1170301 bash gen_retinanet_jpeg.sh
+#
+# JPEG generation uses dlio_benchmark's standard Python generator (no Rust
+# fast path — JPEG does not have an equivalent to s3dlio.generate_npz_bytes()).
+# Each file contains one synthetic image of record_length_bytes ≈ 322,957 bytes.
+#
+# Destination: s3://mlp-retinanet/data/retinanet/
+#
+# Prerequisites:
+#   - s3-ultra running on localhost:9000  (bash s3-ultra/scripts/start_s3ultra2.sh)
+#   - mlp-retinanet bucket already exists (s3-cli create-bucket s3://mlp-retinanet)
+#   - mlp-storage .venv with s3dlio installed
+#
+# Usage:
+#   cd /path/to/mlp-storage   # any checkout; the repo root is derived from this script's location
+#   bash tests/object-store/gen_retinanet_jpeg.sh
+#
+#   # Use more MPI processes for faster generation:
+#   NP=4 bash tests/object-store/gen_retinanet_jpeg.sh
+#
+#   # Full MLPerf dataset (361 GiB — slow, ~10-30 min at 700 MiB/s):
+#   NUM_FILES=1170301 NP=4 bash tests/object-store/gen_retinanet_jpeg.sh
+# =============================================================================
+set -euo pipefail
+
+REPO="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"   # repo root: two levels above tests/object-store/
+VENV="${REPO}/.venv"
+PYTHON="${VENV}/bin/python3"
+
+# Number of MPI datagen workers.  Higher NP = faster generation.
+# Each rank generates a disjoint subset of files concurrently.
+NP="${NP:-4}"
+
+# Dataset parameters — must match retinanet_b200.yaml / retinanet_datagen.yaml
+# Default: 50,000 files for test/sweep use.  Full MLPerf: 1,170,301.
+NUM_FILES="${NUM_FILES:-50000}"
+DATA_FOLDER="data/retinanet"
+STORAGE_ROOT="${STORAGE_ROOT:-mlp-retinanet}"   # override: STORAGE_ROOT=other-bucket bash ...
+
+cd "${REPO}"
+
+# ── Load s3-ultra credentials from .env.s3-ultra ────────────────────────────
+# NOTE: We unset BUCKET so the env file's default does not override the
+# explicit storage.storage_root param we pass on the CLI.
+if [[ ! -f .env.s3-ultra ]]; then
+    echo "ERROR: .env.s3-ultra not found in ${REPO}" >&2
+    exit 1
+fi
+set -o allexport
+source .env.s3-ultra
+set +o allexport
+unset BUCKET   # prevent env BUCKET from controlling the target bucket
+
+# ── Activate virtual environment ─────────────────────────────────────────────
+if [[ ! -f "${VENV}/bin/activate" ]]; then
+    echo "ERROR: .venv not found — run: uv sync" >&2
+    exit 1
+fi
+source "${VENV}/bin/activate"
+
+if ! command -v mlpstorage &>/dev/null; then
+    echo "ERROR: mlpstorage not found in venv. Run: uv sync" >&2
+    exit 1
+fi
+
+# ── Size estimate ─────────────────────────────────────────────────────────────
+RECORD_BYTES=322957
+TOTAL_MIB=$(( NUM_FILES * RECORD_BYTES / 1024 / 1024 ))
+
+echo ""
+echo "════════════════════════════════════════════════════════"
+echo "  RetinaNet JPEG Dataset Generation"
+echo "════════════════════════════════════════════════════════"
+echo "  Bucket    : s3://${STORAGE_ROOT}/${DATA_FOLDER}/"
+echo "  Endpoint  : ${AWS_ENDPOINT_URL}"
+echo "  Files     : ${NUM_FILES} × ~323 KiB  (~${TOTAL_MIB} MiB total)"
+echo "  NP        : ${NP} MPI datagen workers"
+echo "  Generator : dlio_benchmark JPEG generator (Python, s3dlio upload)"
+echo "  Started   : $(date '+%Y-%m-%d %H:%M:%S')"
+if [[ "${NUM_FILES}" -lt 1170301 ]]; then
+    echo ""
+    echo "  NOTE: Generating ${NUM_FILES} files (test subset)."
+    echo "        Full MLPerf compliance needs 1,170,301 files (~361 GiB)."
+    echo "        Override: NUM_FILES=1170301 NP=4 bash $0"
+fi
+echo "════════════════════════════════════════════════════════"
+echo ""
+
+RUST_LOG=s3dlio=info \
+"${PYTHON}" -c "from mlpstorage_py.main import main; main()" \
+    training datagen \
+    --model retinanet \
+    --num-processes "${NP}" \
+    --skip-validation \
+    --allow-run-as-root \
+    --object s3 \
+    --params \
+        "storage.storage_root=${STORAGE_ROOT}" \
+        "dataset.num_files_train=${NUM_FILES}" \
+        "dataset.data_folder=${DATA_FOLDER}" \
+        storage.storage_options.storage_library=s3dlio
+
+echo ""
+echo "════════════════════════════════════════════════════════"
+echo "  ✅  gen_retinanet_jpeg.sh complete"
+echo "  Dataset : s3://${STORAGE_ROOT}/${DATA_FOLDER}/"
+echo "  Files   : ${NUM_FILES} JPEG files"
+echo "  Finished: $(date '+%Y-%m-%d %H:%M:%S')"
+echo ""
+echo "  To run a quick smoke test:"
+echo "    bash tests/object-store/test_retinanet.sh"
+echo ""
+echo "  To run a full NP scaling sweep:"
+echo "    bash tests/object-store/sweep_retinanet_np.sh 2>&1 | tee sweep_retinanet_\$(date +%Y%m%d_%H%M%S).log"
+echo "════════════════════════════════════════════════════════"
diff --git a/tests/object-store/gen_unet3d_npz.sh b/tests/object-store/gen_unet3d_npz.sh
new file mode 100755
index 00000000..c032d4a1
--- /dev/null
+++ b/tests/object-store/gen_unet3d_npz.sh
@@ -0,0 +1,104 @@
+#!/usr/bin/env bash
+# =============================================================================
+# gen_unet3d_npz.sh — Generate unet3d NPZ dataset on s3-ultra (mlp-unet3d)
+#
+# Generates ~984 GiB of synthetic NPZ files (7,200 files × ~140 MiB avg)
+# for unet3d B200 benchmarking.
+#
+# Data generation uses s3dlio.generate_npz_bytes() via the dlio_benchmark
+# NPZGenerator fast path — pure Rust, hardware CRC32, no GIL, zero Python-
+# side copies of the payload buffer.
+#
+# Destination: s3://mlp-unet3d/data/unet3d/
+#
+# Prerequisites:
+#   - s3-ultra running on localhost:9000  (bash s3-ultra/scripts/start_s3ultra2.sh)
+#   - mlp-unet3d bucket already exists   (s3-cli create-bucket s3://mlp-unet3d)
+#   - mlp-storage .venv with s3dlio installed
+#
+# Usage:
+#   cd /home/eval/Documents/Code/mlp-storage
+#   bash tests/object-store/gen_unet3d_npz.sh
+#
+#   # Use more MPI processes for faster generation (each rank writes its share):
+#   NP=4 bash tests/object-store/gen_unet3d_npz.sh
+# =============================================================================
+set -euo pipefail
+
+REPO="${REPO:-/home/eval/Documents/Code/mlp-storage}"   # override: REPO=/path/to/checkout bash gen_unet3d_npz.sh
+VENV="${REPO}/.venv"
+PYTHON="${VENV}/bin/python3"
+
+# Number of MPI datagen workers.  Higher NP = faster generation.
+# Each rank generates a disjoint subset of the 7,200 files concurrently.
+NP="${NP:-4}"
+
+# Dataset parameters — must match unet3d_b200.yaml / unet3d_datagen.yaml
+NUM_FILES=7200          # ~984 GiB at ~140 MiB avg per file
+DATA_FOLDER="data/unet3d"
+STORAGE_ROOT="${STORAGE_ROOT:-mlp-unet3d}"   # override: STORAGE_ROOT=mlp-flux bash gen_unet3d_npz.sh
+
+cd "${REPO}"
+
+# ── Load s3-ultra credentials from .env.s3-ultra ────────────────────────────
+# NOTE: .env.s3-ultra sets BUCKET=mlp-flux (its default).  allexport exports it,
+# so we unset BUCKET right after sourcing and pass storage.storage_root on the
+# CLI — the correct bucket is used regardless of what the env file contains.
+if [[ ! -f .env.s3-ultra ]]; then
+    echo "ERROR: .env.s3-ultra not found in ${REPO}" >&2
+    exit 1
+fi
+set -o allexport
+source .env.s3-ultra
+set +o allexport
+unset BUCKET   # prevent env BUCKET from leaking into mlpstorage
+
+# ── Activate virtual environment ─────────────────────────────────────────────
+if [[ ! -f "${VENV}/bin/activate" ]]; then
+    echo "ERROR: .venv not found — run: uv sync" >&2
+    exit 1
+fi
+source "${VENV}/bin/activate"
+
+if ! command -v mlpstorage &>/dev/null; then
+    echo "ERROR: mlpstorage not found in venv. Run: uv sync" >&2
+    exit 1
+fi
+
+echo ""
+echo "════════════════════════════════════════════════════════"
+echo "  UNet3D NPZ Dataset Generation"
+echo "════════════════════════════════════════════════════════"
+echo "  Bucket    : s3://${STORAGE_ROOT}/${DATA_FOLDER}/"
+echo "  Endpoint  : ${AWS_ENDPOINT_URL}"
+echo "  Files     : ${NUM_FILES} × ~140 MiB avg  (~984 GiB total)"
+echo "  NP        : ${NP} MPI datagen workers"
+echo "  Generator : s3dlio.generate_npz_bytes() (Rust, hardware CRC32)"
+echo "  Started   : $(date '+%Y-%m-%d %H:%M:%S')"
+echo "════════════════════════════════════════════════════════"
+echo ""
+# Run datagen; --params values are quoted so an overridden STORAGE_ROOT is never word-split or globbed.
+RUST_LOG=s3dlio=info \
+"${PYTHON}" -c "from mlpstorage_py.main import main; main()" \
+    training datagen \
+    --model unet3d \
+    --num-processes "${NP}" \
+    --skip-validation \
+    --allow-run-as-root \
+    --object s3 \
+    --params \
+        "storage.storage_root=${STORAGE_ROOT}" \
+        "dataset.num_files_train=${NUM_FILES}" \
+        "dataset.data_folder=${DATA_FOLDER}" \
+        storage.storage_options.storage_library=s3dlio \
+        storage.storage_options.decode_mode=none
+
+echo ""
+echo "════════════════════════════════════════════════════════"
+echo "  ✅  gen_unet3d_npz.sh complete"
+echo "  Dataset : s3://${STORAGE_ROOT}/${DATA_FOLDER}/"
+echo "  Finished: $(date '+%Y-%m-%d %H:%M:%S')"
+echo ""
+echo "  To run a benchmark:"
+echo "    bash tests/object-store/test_unet3d.sh"
+echo "════════════════════════════════════════════════════════"
diff --git a/tests/object-store/old-archive/Object_Perf_Results.md b/tests/object-store/old-archive/Object_Perf_Results.md
deleted file mode 100644
index a8b9a040..00000000
--- a/tests/object-store/old-archive/Object_Perf_Results.md
+++ /dev/null
@@ -1,498 +0,0 @@
-# S3 Library Write + Read Comparison — Results
-
-**Date:** March 18, 2026  
-**Endpoint:** `http://minio-host:9000` (MinIO-compatible S3)  
-**Test script:** `Test-Backup/test_direct_write_comparison.py`
-
----
-
-## Environment & Credentials
-
-Credentials and endpoint configuration are supplied via a `.env` file at the root of the
-`mlp-storage` project directory (`mlp-storage/.env`).  The script loads this file
-automatically at startup and exports the following variables into the environment before
-any library is initialised:
-
-```
-AWS_ACCESS_KEY_ID
-AWS_SECRET_ACCESS_KEY
-AWS_ENDPOINT_URL
-AWS_REGION
-```
-
-No credentials are hard-coded in the test script.  Any future tester only needs to create
-(or update) the `.env` file with their own endpoint and credentials before running.
-
----
-
-## Library Versions Tested
-
-| Library | Version |
-|---|---|
-| s3dlio | 0.9.84 |
-| minio (Python SDK) | 7.2.20 |
-| s3torchconnector | 1.5.0 |
-
-All three were installed in the project's virtual environment (`.venv`):
-
-```bash
-source .venv/bin/activate
-pip show s3dlio minio s3torchconnector
-```
-
-Each library was given its own dedicated S3 bucket so writes never interfere:
-
-| Library | Bucket |
-|---|---|
-| s3dlio | `bucket-s3dlio` |
-| minio | `bucket-minio` |
-| s3torchconnector | `bucket-s3torch` |
-
----
-
-## Test Description
-
-`test_direct_write_comparison.py` runs three phases per library:
-
-1. **Cleanup** — delete every object under the test prefix so every run starts clean
-2. **Write** — upload N objects in parallel using `ThreadPoolExecutor` and each library's
-   native write API (no common wrapper)
-3. **Read** — download all N objects back in parallel using `ThreadPoolExecutor`
-
-Write APIs used:
-- **s3dlio** — `MultipartUploadWriter.from_uri()` with configurable `part_size` and
-  `max_in_flight` (concurrent parts per object)
-- **minio** — native `_create_multipart_upload` / `_upload_part` / `_complete_multipart_upload`
-  (sequential parts within each object, parallel objects)
-- **s3torchconnector** — `S3Client.put_object()` (buffers internally, uploads at `close()`)
-
----
-
-## How to Run
-
-### Default run (8 write workers, 8 read workers, all three libraries)
-
-```bash
-cd mlp-storage
-source .venv/bin/activate
-python Test-Backup/test_direct_write_comparison.py --num-files 100 --size-mb 128
-```
-
-### Run that produced the results below (12 workers each, all libraries)
-
-```bash
-python Test-Backup/test_direct_write_comparison.py \
-    --num-files 100 \
-    --size-mb 128 \
-    --write-workers 12 \
-    --read-workers 12
-```
-
-### Test a single library
-
-```bash
-python Test-Backup/test_direct_write_comparison.py \
-    --num-files 100 --size-mb 128 \
-    --write-workers 12 --read-workers 12 \
-    --library s3dlio
-```
-
-### Test two libraries
-
-```bash
-python Test-Backup/test_direct_write_comparison.py \
-    --num-files 100 --size-mb 128 \
-    --write-workers 12 --read-workers 12 \
-    --library s3dlio minio
-```
-
-### Full CLI reference
-
-```
-optional arguments:
-  --num-files N         Number of objects to write/read per library (default: 100)
-  --size-mb N           Object size in MB (default: 128)
-  --chunk-mb N          Multipart chunk size in MB (default: 32)
-  --prefix PREFIX       S3 key prefix (default: bench)
-  --write-workers N     Parallel object upload threads (default: 8)
-  --read-workers N      Parallel object download threads (default: 8)
-  --max-in-flight N     s3dlio per-object concurrent multipart parts (default: 8)
-  --library LIB [LIB …] Libraries to test: s3dlio minio s3torchconnector (default: all)
-```
-
----
-
-## Results
-
-Command run:
-
-```bash
-python Test-Backup/test_direct_write_comparison.py \
-    --num-files 100 --size-mb 128 \
-    --write-workers 12 --read-workers 12
-```
-
-```
-========================================================================================
-WRITE + READ COMPARISON — RESULTS
-  100 objects × 128 MB = 12800 MB per library  |  write workers: 12   read workers: 12
-========================================================================================
-  Library                Version       Write GB/s   Read GB/s  Wr s/obj  Rd s/obj
-  ---------------------- ------------ ----------- ----------- --------- ---------
-  s3dlio                 0.9.84            0.525         1.085 ◀R    0.238s    0.115s
-  minio                  7.2.20            0.415         1.051       0.301s    0.119s
-  s3torchconnector       1.5.0             0.561 ◀W      0.541       0.223s    0.231s
-
-  Write GB/s — parallel write throughput (all objects, ThreadPoolExecutor)
-  Read GB/s  — parallel read throughput (all objects, ThreadPoolExecutor)
-  Wr s/obj   — average time to write one object (write + commit)
-  Rd s/obj   — average time to read one object (wall-clock, under parallelism)
-  ◀W = fastest write    ◀R = fastest read
-
-  Notes:
-   • Write workers = parallel object uploads; Read workers = parallel object downloads
-   • s3dlio max_in_flight = additional per-object part concurrency within each writer
-   • minio part uploads are sequential within each object (no per-object parallelism)
-   • s3torchconnector buffers writes internally and uploads at close()
-========================================================================================
-✅ All tests passed.
-```
-
----
-
-## Analysis
-
-### Write throughput
-
-s3torchconnector achieved the highest write throughput (0.561 GB/s), narrowly ahead of
-s3dlio (0.525 GB/s).  Both are consistent with the independent `s3-cli` baseline of
-~0.429 GB/s at 12 jobs — the per-library Python threads reach slightly higher than the CLI
-tool because they issue more concurrent connections.  minio lags (0.415 GB/s) likely
-because its multipart parts are issued sequentially within each object, so each upload is
-limited to one connection at a time regardless of how many objects are in flight in parallel.
-
-### Read throughput
-
-s3dlio and minio deliver essentially the same peak read throughput (~1.05–1.09 GB/s).
-s3torchconnector reads at only 0.541 GB/s — roughly half — because its streaming `read()`
-model serialises data transfer through a single Python call per object rather than issuing
-parallel range-based fetches.
-
-### Overall recommendation
-
-**s3dlio is the most balanced choice**: near-best write throughput and best-in-class read
-throughput.  It is also the only library that supports configurable per-object part
-concurrency (`max_in_flight`), which provides an additional tuning lever beyond the number
-of parallel objects.
-
----
-
----
-
-## DLIO Workload Results
-
-**Test script:** `Test-Backup/test_dlio_multilib_demo.py`  
-**Date:** March 18, 2026  
-**Endpoint:** `http://minio-host:9000` (MinIO-compatible, ~1.2 GB/s link on this machine)
-
-These results measure performance **as seen by DLIO** (via `mlpstorage`) — not direct native
-API calls. The gap versus the direct API numbers above quantifies DLIO overhead.
-
-### Workload 1 — Training
-
-- Dataset: 100 × 128 MiB NPZ objects = 12.5 GiB per library
-- 2 full epochs (25.0 GiB total reads per library)
-- Write = `mlpstorage training datagen` (8 MPI processes)
-- Read = `mlpstorage training run` (8 DataLoader workers, prefetch 4)
-
-```
-  Library                  Write GB/s    Read GB/s    Gen s   Train s  Status
-  ---------------------- ------------ ------------ -------- ---------  ------
-  s3dlio                        0.308        0.178    40.6s    140.1s  ✅
-  s3torchconnector              0.360        0.178    34.7s    140.5s  ✅
-  minio                         (pending)
-```
-
-**Key observations:**
-
-- Read throughput is **identical** (0.178 GB/s) for both libraries despite s3dlio reading at
-  1.085 GB/s natively. The bottleneck is PyTorch DataLoader IPC overhead: each of the 8
-  worker processes fetches a 128 MiB file, deserializes NPZ, then pickles the result back
-  to the main process. For 128 MiB objects this IPC pickle is the sole limiter — the S3
-  library is never the constraint.
-- Write (datagen) overhead vs direct API: s3dlio 0.308 vs 0.525 GB/s (~41% slower through
-  DLIO); s3torchconnector 0.360 vs 0.561 GB/s (~36% slower). DLIO's MPI orchestration adds
-  meaningful overhead.
-
-### Workload 2 — Checkpoint (StreamingCheckpointing)
-
-- Single 100 GB object per library written via streaming producer-consumer pipeline
-- Fixed RAM: 32 MB chunks × 4 buffers = 128 MB peak, regardless of checkpoint size
-- dgen-py generates data concurrently; I/O is always the bottleneck
-- Write API: `StreamingCheckpointing.save(uri, 100 GB)`
-
-```
-  Library                    Size GB    Elapsed    Write GB/s  Status
-  ----------------------- ---------- ---------- -----------    ------
-  s3dlio                       100        99.2s      1.008 ◀   ✅
-  s3torchconnector              75        83.9s      0.912      ❌ CRT error at ~78 GB (run capped at 75 GB)
-  minio                        100       233.6s      0.429      ✅
-```
-
-**s3torchconnector CRT failure:**
-
-s3torchconnector fails consistently at approximately 78 GB into the 100 GB upload with:
-
-```
-Client error: Unknown CRT error: CRT error 14366:
-  aws-c-s3: AWS_ERROR_S3_REQUEST_HAS_COMPLETED,
-  Request has already completed, action cannot be performed.
-Client error: Internal S3 client error: A previous write operation did not complete successfully
-```
-
-This is a bug in the AWS Common Runtime (CRT) multipart upload state machine — the CRT
-marks a request as completed prematurely while the Python streaming layer is still feeding
-data. The failure is **reproducible** and occurs at ~78 GB regardless of retry. s3dlio
-uses its own multipart engine (not the CRT) and completes 100 GB cleanly.
-
-**minio checkpoint result:**
-
-minio achieved **0.429 GB/s** — exactly matching its native direct-API write speed
-(0.415 GB/s in the direct comparison).  The initial implementation uploaded parts
-sequentially (one at a time), capping throughput at ~0.10 GB/s.  After enabling
-8 parallel part uploads via `ThreadPoolExecutor`, throughput improved 4× to 0.429 GB/s.
-Further gains are unlikely from minio alone: even with parallelism its per-connection
-transfer is limited to one outstanding request per part, unlike s3dlio which pipelines
-parts within each connection.
-
-**s3dlio checkpoint result:**
-
-s3dlio achieved **1.008 GB/s** — near the ~1.2 GB/s physical network ceiling on this
-machine. The streaming pipeline keeps the network saturated throughout the full 100 GB
-run with no accumulation of model state in RAM.
-
----
-
-## Reference: write worker count sensitivity
-
-Tested independently using `s3-cli` (s3dlio's CLI), same endpoint & object size:
-
-| Workers (`-j`) | Write throughput |
-|---|---|
-| 8 | 308.64 MiB/s (0.302 GB/s) |
-| 12 | 429.25 MiB/s (0.419 GB/s) |
-
-A ~39 % gain from 8 → 12 workers; worth testing higher values (16, 24) if the network
-and server can sustain it.
-
----
-
-## Checkpoints
-
-**Test script:** `Test-Backup/test_dlio_multilib_demo.py --workload checkpoint`  
-**Date:** March 18, 2026  
-**Checkpoint size:** 16 GB (sanity-check run; production target is 100 GB)  
-**Method:** `StreamingCheckpointing` — streaming producer-consumer pipeline, fixed 128 MB RAM
-
-### Checkpoint Write
-
-```
-================================================================================================
-DLIO MULTI-LIBRARY BENCHMARK — RESULTS
-================================================================================================
-
-WORKLOAD 2: CHECKPOINT  (StreamingCheckpointing — fixed 128 MB RAM)
-  Single object per library via streaming producer-consumer pipeline
-  32 MB chunks × 4 buffers = 128 MB RAM max regardless of checkpoint size
-  Library                  Size GB   Write GB/s    Read GB/s     Status
-  ---------------------- --------- ------------ ------------      -----
-  s3dlio                        16        1.023 ◀W        1.051     ✅ - 1st place 
-  minio                         16        0.430           1.055     ✅ - 3rd place
-  s3torchconnector              16        0.949           1.092 ◀R  ✅ - 2nd place
-
-  Write GB/s = I/O throughput from StreamingCheckpointing.save()
-  Read GB/s  = I/O throughput from StreamingCheckpointing.load() (byte-range GETs, data discarded)
-  ◀W = fastest write   ◀R = fastest read
-  dgen-py generates write data concurrently; bottleneck is always I/O, not generation
-
-================================================================================================
-✅ All tests passed.
-```
-
-### Checkpoint Load
-
-**s3dlio and minio** use explicit offset-based `get_range()` / Range-GET calls.
-`StreamingCheckpointing.load()` issues 8 parallel threads, each reading a contiguous
-block of the object with its own connection, achieving ~1.05 GB/s.
-
-**s3torchconnector** — RAM and throughput fixes, three iterations:
-
-**Iteration 1 — OOM with SequentialS3Reader (before any fix):**
-The default `get_object()` uses `SequentialS3Reader`, which causes the AWS CRT
-(`mountpoint-s3-client`) to buffer the entire object before serving any `read()` calls.
-Peak RAM = object size. Results: 75 GB load killed at ~24 GB; 16 GB caused heavy swap.
-
-**Iteration 2 — `range_based(buffer_size=0)` (fixed OOM, killed throughput):**
-`RangedS3Reader._read_unbuffered()` was used, which calls `_get_stream(start, end)` on
-**every single `read()` call**, opening a brand-new HTTP range-GET each time. With 128 MB
-read chunks, each worker made 16 separate range-GETs to read its 2 GB block. Per-worker
-throughput stalled at 0.07 GB/s regardless of chunk size; total read: **0.583 GB/s**.
-RAM was bounded (8 × 128 MB = 1 GB) but connection overhead dominated.
-
-**Iteration 3 — `_get_object_stream` directly (current implementation):**
-After reading the s3torchconnector source, the root cause was identified: the fix calls
-`S3Client._get_object_stream(bucket, key, start, end)` directly — the same native CRT
-method that `RangedS3Reader` uses internally, but held open for the entire block. Each
-worker issues **one HTTP connection** for its `[block_start, block_end)` range and
-streams through native CRT chunks (~8 MB each) without reopening. This is implemented
-as `stream_block(start, end)` on the reader. Each chunk is counted and immediately
-discarded.
-
-Peak RAM = n_workers × CRT internal buffer per stream ≈ 8 workers × ~32 MB = **~256 MB**,
-constant for any object size (16 GB or 759 GB). The `read_chunk()` serial path also uses
-a persistent stream opened lazily, with a small leftover buffer for CRT chunk boundary
-alignment (~8 MB max). The `S3Client` instance is created once per worker; the CRT
-manages its own connection pool for reuse across calls.
-
-**Confirmed results (16 GB, 8 workers, stream_block path):**
-- Write: **0.949 GB/s** ✅
-- Read:  **1.092 GB/s** ✅  (was 0.583 GB/s with range_based — **87% improvement**)
-- `Chunks: 8` in load output — confirms exactly ONE HTTP connection per worker.
-- Per-worker: ~0.14–0.21 GB/s each × 8 workers = ~1.09 GB/s aggregate.
-- Peak RAM: ~256 MB (8 workers × ~32 MB CRT buffer); independent of object size.
-- Now matches s3dlio and minio at the ~1.0–1.1 GB/s network ceiling.
-
----
-
-# DLIO Training Sweep Results
-
-**Date:** March 18, 2026  
-**Test script:** `Test-Backup/test_training_mpi_sweep.py`  
-**Endpoint:** `http://minio-host:9000` (MinIO-compatible S3)
-
-These results measure performance **as seen by the full DLIO training pipeline** — including
-DLIO's MPI data generation, PyTorch DataLoader worker processes, NPZ deserialization, and
-IPC overhead. Each sweep point is an independent clean cycle: `clean → datagen(N) → train(N) → clean`.
-
-## Setup
-
-| Parameter | Value |
-|---|---|
-| Dataset | 100 × 128 MiB NPZ = 12.50 GiB per library |
-| Training | 2 epochs = 25.00 GiB total reads per cycle |
-| Model | unet3d / a100 accelerator profile |
-| DataLoader | 8 read_threads per MPI process, prefetch 4, batch size 1 |
-| Sweep variable | N MPI processes (applied to both datagen and training) |
-
-Each library uses a dedicated bucket; no cross-library interference.
-
-## Data Generation Write Throughput (GB/s)
-
-| Library | N=1 | N=2 | N=4 |
-|---|---|---|---|
-| s3dlio | 0.080 | 0.156 | 0.249 |
-| minio | 0.085 | 0.158 | 0.250 |
-| s3torchconnector | 0.085 | 0.114 | 0.248 |
-
-## Training Read Throughput (GB/s)
-
-| Library | N=1 | N=2 | N=4 |
-|---|---|---|---|
-| s3dlio | 0.179 | 0.325 | 0.488 |
-| minio | 0.179 | 0.323 | 0.485 |
-| s3torchconnector | 0.179 | 0.321 | 0.490 |
-
-## Read Scaling (relative to N=1 baseline)
-
-| Library | N=1 | N=2 | N=4 |
-|---|---|---|---|
-| s3dlio | 1.00× | 1.81× | 2.72× |
-| minio | 1.00× | 1.81× | 2.71× |
-| s3torchconnector | 1.00× | 1.79× | 2.73× |
-
-## Comparison: DLIO vs Native Library Throughput
-
-| Metric | Native (direct API, 12 workers) | DLIO N=4 | DLIO as % of native |
-|---|---|---|---|
-| Write (s3dlio) | 0.525 GB/s | 0.249 GB/s | **47%** |
-| Write (minio) | 0.415 GB/s | 0.250 GB/s | **60%** |
-| Write (s3torchconnector) | 0.561 GB/s | 0.248 GB/s | **44%** |
-| Read (s3dlio) | 1.085 GB/s | 0.488 GB/s | **45%** |
-| Read (minio) | 1.051 GB/s | 0.485 GB/s | **46%** |
-| Read (s3torchconnector) | 1.092 GB/s | 0.490 GB/s | **45%** |
-
-## Analysis
-
-**The bottleneck is DLIO, not the network and not the storage library.**
-
-All three libraries perform within noise of each other at every process count — write
-differences are ≤ 1% at N=4, read differences ≤ 1%. This means the storage library
-choice is completely irrelevant inside DLIO. The per-library call latency and throughput
-advantages measured in the direct API tests are entirely erased by DLIO overhead.
-
-**The culprit is the serialization chain, not the I/O:**
-
-- **NPZ on write** — `numpy.savez()` on 128 MiB arrays is expensive CPU work done
-  inline before the S3 write even starts. The storage library is waiting on numpy, not
-  the network.
-
-- **NPZ on read + IPC pickle** — each DataLoader worker loads the NPZ, unpacks it, then
-  pickles the 128 MiB tensor back to the main process via `multiprocessing`. At 128 MiB,
-  the pickle + memcpy dominates wall time — the S3 read completes long before the tensor
-  is delivered to the training loop.
-
-- **MPI coordination** — barriers prevent full write pipelining; N=4 yields only ~3.1×
-  the N=1 throughput, not the theoretical 4×. Synchronization points eat the remaining
-  efficiency.
-
-DLIO achieves only ~45–60% of what the native APIs can deliver, pointing to several
-likely bottlenecks within DLIO itself:
-
-1. **NPZ serialization / deserialization** — each 128 MiB object must be packaged as NPZ
-   on write (via numpy.savez) and unpacked on read (via numpy.load). For 128 MiB files
-   this is expensive CPU work done serially within each DataLoader worker before any data
-   reaches the model.
-
-2. **PyTorch DataLoader IPC** — after deserializing NPZ, each of the N read_thread
-   worker processes must pickle the resulting tensor back to the main training process
-   via shared-memory IPC. For 128 MiB tensors this pickle + memcpy dominates wall time.
-
-3. **MPI coordination overhead** — DLIO's MPI-based data generation adds synchronization
-   barriers and metadata tracking overhead that prevent the N processes from fully
-   pipelining their writes. At N=4, write throughput is only ~3.1× N=1 (not 4×).
-
-4. **Read scaling sub-linearity** — training read at N=4 is only ~2.7× N=1 (not 4×),
-   meaning ~32% efficiency loss to DLIO scheduling, DataLoader prefetch coordination,
-   and process-local deserialization bottlenecks.
-
-## Is a DLIO rewrite needed?
-
-The short answer is: **yes, if the goal is to make DLIO competitive with native I/O**.
-
-The current DLIO storage path creates a deep stack between the S3 call and the training
-loop: `MPI process → Python storage backend → S3 lib → network → S3 lib → Python storage
-backend → numpy.load → IPC pickle → DataLoader → training loop`. Every layer adds
-overhead, and the serialization layers (NPZ + pickle) cost CPU time that is comparable
-to or greater than the actual I/O time at this file size.
-
-**Targeted improvements that would not require a full rewrite:**
-
-- **Reduce object size** — smaller objects (e.g. 4–16 MiB) reduce per-file NPZ overhead
-  and make the IPC pickle cheaper, allowing more objects in flight and better pipelining.
-
-- **Switch to a raw binary format** — replacing NPZ with flat binary (or memmap-able
-  formats like safetensors / raw fp32) eliminates the numpy zip overhead entirely and
-  allows zero-copy reads into pinned CUDA memory.
-
-- **Use shared memory for DataLoader IPC** — passing large tensors via `multiprocessing`
-  shared memory (`torch.multiprocessing`) avoids the pickle round-trip for large tensors.
-
-- **Pre-stage to NVMe** — DLIO supports a cache tier; pre-fetching objects to local NVMe
-  and reading from there can decouple the I/O and compute timelines.
-
-**If a deeper rewrite is on the table**, the most impactful change would be to replace
-the per-file DataLoader read model with a streaming prefetch model where S3 range-GETs
-are issued asynchronously by a dedicated I/O thread pool and data is DMA-copied directly
-into pre-allocated pinned buffers. This eliminates the NPZ deserialization bottleneck
-and the IPC pickle entirely — the storage library (s3dlio, etc.) would operate at its
-native throughput.
diff --git a/tests/object-store/bench-results-retinanet-20260425.md b/tests/object-store/old-archive/bench-results-retinanet-20260425.md
similarity index 100%
rename from tests/object-store/bench-results-retinanet-20260425.md
rename to tests/object-store/old-archive/bench-results-retinanet-20260425.md
diff --git a/tests/object-store/bench_npz_build.py b/tests/object-store/old-archive/bench_npz_build.py
similarity index 100%
rename from tests/object-store/bench_npz_build.py
rename to tests/object-store/old-archive/bench_npz_build.py
diff --git a/tests/object-store/old-archive/bench_parquet_rg_flux.py b/tests/object-store/old-archive/bench_parquet_rg_flux.py
new file mode 100644
index 00000000..cf93173f
--- /dev/null
+++ b/tests/object-store/old-archive/bench_parquet_rg_flux.py
@@ -0,0 +1,213 @@
+#!/usr/bin/env python3
+"""
+Flux Parquet Row-Group Benchmark — Mode 1 (s3dlio raw + discard)
+
+Reads Parquet row groups from Flux training files using
+s3dlio.parquet_get_rg(decode="raw") and discards bytes immediately.
+This benchmarks pure storage throughput with zero Python decode overhead.
+
+Flux dataset characteristics (MLPerf Storage):
+  Files:   4296 train files
+  Samples: 288 per file  (~594 MiB each uncompressed, no compression)
+  Columns: t5_encodings (524328×f32), clip_encodings (409×f32),
+           mean (8232×f32), logvar (8232×f32), timestamp (7×f32)
+  Record:  2,164,832 bytes per sample
+  Full dataset: ~2.4 TiB total
+
+Row-group granularity:
+  --rg-per-file controls how many row groups each file is split into.
+  Default is 6 (matching batch_size=48 from flux_b200.yaml: 288/48 = 6).
+  Each row group = 48 samples × 2,164,832 bytes ≈ 99 MiB.
+
+Mode 1 = s3dlio.parquet_get_rg(decode="raw")
+  Returns compressed column-chunk bytes directly from the Parquet file.
+  NOT a standalone .parquet file — no magic bytes or footer.
+  The bytes are discarded immediately; only storage throughput is measured.
+
+Usage:
+    python3 bench_parquet_rg_flux.py [OPTIONS]
+
+File naming matches gen_flux_parquet.py: train_{i:04d}.parquet
+
+Options:
+    --prefix URI_PREFIX      Base URI prefix for flux files
+                             (default: file:///mnt/test/data/flux/train)
+    --files N                Number of files to benchmark per epoch (default: 8)
+    --rg-per-file N          Row groups per file (default: 6 = 288 samples / 48)
+    --np N                   Simulated MPI ranks — multiplies pipeline (default: 1)
+    --pipeline N             Concurrent parquet_get_rg calls per rank (default: 4)
+    --epochs N               Number of epochs to run (default: 2)
+    --footer-cap BYTES        Footer prefetch size in bytes (default: 4194304 = 4 MiB)
+"""
+
+import argparse
+import os
+import sys
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+# ---------------------------------------------------------------------------
+# Load .env credentials / endpoint (walk up from script location)
+# ---------------------------------------------------------------------------
+# Search order: repo root first (three levels up from old-archive/), then each
+# closer directory; the first .env found is loaded and the search stops.
+_here = os.path.dirname(os.path.abspath(__file__))
+for _up in (3, 2, 1, 0):
+    _candidate = os.path.join(_here, *[".."] * _up, ".env")
+    if os.path.exists(_candidate):
+        with open(_candidate) as _f:
+            for _line in _f:
+                _line = _line.strip()
+                # Skip blanks, comments, and lines that are not KEY=VALUE.
+                if _line and not _line.startswith("#") and "=" in _line:
+                    _k, _, _v = _line.partition("=")
+                    os.environ.setdefault(_k.strip(), _v.strip())  # never override real env
+        break
+
+import s3dlio  # noqa: E402  (needs env vars set first)
+
+# ---------------------------------------------------------------------------
+# Dataset defaults
+# ---------------------------------------------------------------------------
+DEFAULT_PREFIX    = "file:///mnt/test/data/flux/train"
+DEFAULT_N_FILES   = 8
+DEFAULT_RG_PER_FILE = 6          # 288 samples / batch_size 48
+DEFAULT_FOOTER_CAP  = 4 * 1024 * 1024   # 4 MiB — covers all RG metadata
+
+# Flux file size for reference reporting (uncompressed, no Snappy)
+FLUX_FILE_MIB = 594.0            # ~594 MiB per file
+
+
+def file_uris(prefix: str, n: int, start: int = 0) -> list[str]:
+    """Build URIs for ``n`` consecutive Flux training files, beginning at index ``start``.
+
+    File names follow gen_flux_parquet.py: train_{i:04d}.parquet under ``prefix``."""
+    base = prefix.rstrip("/")
+    return [f"{base}/train_{idx:04d}.parquet" for idx in range(start, start + n)]
+
+
+# ---------------------------------------------------------------------------
+# Worker: fetch one (file, rg_idx) pair — Mode 1, raw bytes, immediate discard
+# ---------------------------------------------------------------------------
+def fetch_rg(uri: str, rg_idx: int, footer_cap: int) -> tuple[str, int, int, float]:
+    """Read one Parquet row group (raw column-chunk bytes) and discard it.
+
+    Returns (uri, rg_idx, nbytes, elapsed_s): nbytes is the raw chunk size, elapsed_s times only the fetch.
+    """
+    t0 = time.monotonic()
+    data = s3dlio.parquet_get_rg(uri, rg_idx, footer_cap=footer_cap, decode="raw")
+    elapsed = time.monotonic() - t0
+    try:
+        nbytes = memoryview(data).nbytes   # zero-copy size; bytes(data) would memcpy the whole RG
+    except TypeError:                      # result does not expose the buffer protocol
+        nbytes = len(bytes(data))
+    return uri, rg_idx, nbytes, elapsed    # data is dropped on return; nothing is decoded
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+def main() -> None:
+    ap = argparse.ArgumentParser(
+        description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    ap.add_argument("--prefix",       default=DEFAULT_PREFIX,
+                    help=f"Base URI prefix for Flux files (default: {DEFAULT_PREFIX})")
+    ap.add_argument("--files",        type=int, default=DEFAULT_N_FILES,
+                    help=f"Files per epoch (default: {DEFAULT_N_FILES})")
+    ap.add_argument("--rg-per-file",  type=int, default=DEFAULT_RG_PER_FILE,
+                    help=f"Row groups per file (default: {DEFAULT_RG_PER_FILE})")
+    ap.add_argument("--np",           type=int, default=1,
+                    help="Simulated MPI ranks; multiplies pipeline (default: 1)")
+    ap.add_argument("--pipeline",     type=int, default=4,
+                    help="Concurrent parquet_get_rg calls per rank (default: 4)")
+    ap.add_argument("--epochs",       type=int, default=2,
+                    help="Epochs to run (default: 2)")
+    ap.add_argument("--footer-cap",   type=int, default=DEFAULT_FOOTER_CAP,
+                    help=f"Footer prefetch bytes (default: {DEFAULT_FOOTER_CAP})")
+    args = ap.parse_args()
+
+    total_workers = args.np * args.pipeline
+    uris = file_uris(args.prefix, args.files)
+    total_rgs = args.files * args.rg_per_file
+
+    # Set up s3dlio's Tokio threads (no args passed — TODO confirm this accounts for --np)
+    s3dlio.configure_tokio_threads()
+
+    print("Flux Parquet RG benchmark — Mode 1 (s3dlio raw + discard)")
+    print(f"  files={args.files}  rg_per_file={args.rg_per_file}  "
+          f"total_rgs={total_rgs}")
+    print(f"  np={args.np}  pipeline={args.pipeline}  "
+          f"total_workers={total_workers}  epochs={args.epochs}")
+    print(f"  prefix:   {args.prefix}")
+    print(f"  endpoint: {os.environ.get('AWS_ENDPOINT_URL_S3', '(default AWS)')}")
+    print(f"  footer_cap: {args.footer_cap // 1024} KiB")
+    print(f"  est. uncompressed data/epoch: "
+          f"{args.files * FLUX_FILE_MIB / 1024:.1f} GiB "
+          f"({args.files} files × {FLUX_FILE_MIB:.0f} MiB)")
+    print()
+
+    epoch_results: list[tuple[int, float, float]] = []  # (epoch, compressed GiB, MB/s)
+
+    for ep in range(1, args.epochs + 1):
+        print(f"══ Epoch {ep} ═════════════════════════════════════════════════")
+
+        # One task per (file, row group) pair; the list is rebuilt identically each epoch
+        tasks = [
+            (uri, rg_idx)
+            for uri in uris
+            for rg_idx in range(args.rg_per_file)
+        ]
+
+        epoch_bytes = 0
+        rg_count = 0
+
+        t_epoch = time.monotonic()
+        with ThreadPoolExecutor(max_workers=total_workers) as ex:
+            futs = {
+                ex.submit(fetch_rg, uri, rg_idx, args.footer_cap): (uri, rg_idx)
+                for uri, rg_idx in tasks
+            }
+            for fut in as_completed(futs):
+                uri, rg_idx, nbytes, elapsed = fut.result()
+                epoch_bytes += nbytes
+                rg_count += 1
+                if rg_idx == 0:
+                    # Only row group 0 of each file is printed, as a per-file progress line
+                    fname = os.path.basename(uri)
+                    mbps = nbytes / elapsed / 1e6 if elapsed > 0 else 0
+                    print(f"  {fname}  rg=0  {nbytes/1024:.0f} KiB  "
+                          f"{elapsed*1000:.1f} ms  {mbps:.0f} MB/s")
+        t_epoch = time.monotonic() - t_epoch
+
+        epoch_mbps = epoch_bytes / t_epoch / 1e6
+        epoch_gib  = epoch_bytes / 1024**3
+        epoch_results.append((ep, epoch_gib, epoch_mbps))
+
+        print(f"  ── epoch {ep} total: {rg_count} RGs  "
+              f"{epoch_gib:.3f} GiB compressed  "
+              f"{t_epoch:.2f} s  {epoch_mbps:.0f} MB/s")
+        print()
+
+    # Per-epoch summary table (one row per entry in epoch_results)
+    print("══ Summary ═══════════════════════════════════════════════")
+    print(f"  {'Epoch':<8}  {'Compressed GiB':>15}  {'Throughput MB/s':>16}")
+    print(f"  {'-'*8}  {'-'*15}  {'-'*16}")
+    for ep, gib, mbps in epoch_results:
+        print(f"  {ep:<8}  {gib:>15.3f}  {mbps:>16.0f}")
+
+    if len(epoch_results) >= 2:
+        ep2_mbps = epoch_results[1][2]
+        print()
+        print(f"  Epoch 2 reflects warm OS/server cache: {ep2_mbps:.0f} MB/s")
+
+    # Reminder that the byte counts above come from decode="raw" results
+    print()
+    print("  Note: bytes reported are compressed column-chunk bytes")
+    print("  (decode='raw' returns Parquet payload before decompression).")
+    print(f"  Flux files have compression=none so raw ≈ uncompressed payload.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/object-store/old-archive/bench_wholefile_get.py b/tests/object-store/old-archive/bench_wholefile_get.py
new file mode 100644
index 00000000..d94a893a
--- /dev/null
+++ b/tests/object-store/old-archive/bench_wholefile_get.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+"""
+Simulate the proposed batch-iterator architecture for DLRM Parquet files.
+
+Current architecture:  16 byte-range GETs per file, 1,024 GETs/epoch,
+                       64,000,000 read_index() calls/epoch  → Python-bound
+Proposed architecture: 1 whole-object GET per file, 64 GETs/epoch,
+                       ~64 iterator.__next__() calls/epoch  → I/O-bound
+
+This script issues real full-file GETs against the S3 endpoint (no Parquet
+decode) to measure the I/O ceiling of the proposed design.
+
+  --pipeline  concurrent GETs per NP process (default: 2)
+  --np        number of NP processes to simulate (default: 1)
+              total outstanding GETs = np × pipeline
+              e.g. --np 4 --pipeline 2  →  8 concurrent GETs in flight
+
+Usage:
+    python3 bench_wholefile_get.py [--np N] [--pipeline N] [--files N] [--epochs N]
+"""
+
+import argparse
+import os
+import sys
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+# ---------------------------------------------------------------------------
+# Credentials / endpoint from the repo-root .env (three levels above this file)
+# ---------------------------------------------------------------------------
+_ENV = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "..", "..", ".env")
+if os.path.exists(_ENV):
+    with open(_ENV) as _f:
+        for _line in _f:
+            _line = _line.strip()
+            if _line and not _line.startswith("#") and "=" in _line:
+                _k, _, _v = _line.partition("=")
+                os.environ.setdefault(_k.strip(), _v.strip())
+
+import s3dlio  # noqa: E402  (needs env vars before import)
+
+# ---------------------------------------------------------------------------
+# Dataset constants — objects are s3://BUCKET/PREFIX/img_NN_of_64.parquet
+# ---------------------------------------------------------------------------
+BUCKET   = "mlp-flux"
+PREFIX   = "data/dlrm/train/train"
+N_FILES  = 64   # default file count; the "_of_64" name suffix is fixed regardless of n
+
+def file_uris(n: int = N_FILES) -> list[str]:
+    return [f"s3://{BUCKET}/{PREFIX}/img_{i:02d}_of_64.parquet" for i in range(n)]
+
+
+# ---------------------------------------------------------------------------
+# Worker
+# ---------------------------------------------------------------------------
+def fetch_file(uri: str) -> tuple[str, int, float]:
+    """GET the whole object and discard it.  Returns (uri, nbytes, elapsed_s); only the GET is timed."""
+    t0 = time.monotonic()
+    data = s3dlio.get(uri)          # NOTE(review): presumed to release the GIL while fetching — confirm
+    elapsed = time.monotonic() - t0
+    nbytes = len(data)
+    del data                        # drop the payload right away; only its size is kept
+    return uri, nbytes, elapsed
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+def main() -> None:
+    """Parse CLI options, run whole-object GETs for each epoch, and print throughput tables."""
+    ap = argparse.ArgumentParser(description=__doc__,
+                                 formatter_class=argparse.RawDescriptionHelpFormatter)
+    ap.add_argument("--np",       type=int, default=1,
+                    help="Simulated NP (number of processes); multiplies pipeline (default: 1)")
+    ap.add_argument("--pipeline", type=int, default=2,
+                    help="Concurrent GETs per NP process (default: 2)")
+    ap.add_argument("--files",    type=int, default=N_FILES,
+                    help=f"Number of files to fetch per epoch (default: {N_FILES})")
+    ap.add_argument("--epochs",   type=int, default=2,
+                    help="Number of epochs to run (default: 2)")
+    args = ap.parse_args()
+    if args.np < 1 or args.pipeline < 1:
+        ap.error("--np and --pipeline must both be >= 1")
+
+    total_pipeline = args.np * args.pipeline
+    uris = file_uris(args.files)
+
+    print("Proposed batch-iterator benchmark")
+    print(f"  files={args.files}  np={args.np}  pipeline={args.pipeline}  "
+          f"total_outstanding={total_pipeline}  epochs={args.epochs}")
+    print(f"  endpoint: {os.environ.get('AWS_ENDPOINT_URL_S3', '(default)')}")
+    print("  target: ≥400 MB/s")
+    print()
+
+    epoch_results: list[tuple[int, float, float]] = []  # (epoch, GiB, MB/s)
+
+    for ep in range(1, args.epochs + 1):
+        print(f"═══ Epoch {ep} ════════════════════════════════════════════════════")
+        print(f"  {'File':<35} {'MiB':>8}  {'s':>7}  {'MB/s':>9}")
+        print(f"  {'-'*35} {'-'*8}  {'-'*7}  {'-'*9}")
+
+        epoch_bytes = 0
+
+        t_epoch = time.monotonic()
+        with ThreadPoolExecutor(max_workers=total_pipeline) as ex:
+            futs = {ex.submit(fetch_file, u): u for u in uris}
+            for fut in as_completed(futs):
+                uri, nbytes, elapsed = fut.result()
+                # A coarse monotonic clock can report 0 s for a fast GET; avoid dividing by it
+                mbps = nbytes / elapsed / 1e6 if elapsed > 0 else 0.0
+                epoch_bytes += nbytes
+                print(f"  {os.path.basename(uri):<35} {nbytes/1024**2:>8.1f}  "
+                      f"{elapsed:>7.3f}  {mbps:>9.1f}")
+        t_epoch = time.monotonic() - t_epoch
+
+        # Same zero-interval guard for the whole-epoch throughput
+        epoch_mbps = epoch_bytes / t_epoch / 1e6 if t_epoch > 0 else 0.0
+        epoch_results.append((ep, epoch_bytes / 1024**3, epoch_mbps))
+
+        print(f"  {'-'*35} {'-'*8}  {'-'*7}  {'-'*9}")
+        print(f"  {'EPOCH TOTAL':<35} {epoch_bytes/1024**3:>7.2f}G  {t_epoch:>7.3f}  {epoch_mbps:>9.1f}")
+        print()
+
+    # Summary: one row per epoch, compared against the 400 MB/s target
+    print("═══ Summary ════════════════════════════════════════════════════════")
+    print(f"  {'Epoch':<8}  {'Data GiB':>10}  {'Throughput MB/s':>16}  {'vs 400 MB/s':>12}")
+    print(f"  {'-'*8}  {'-'*10}  {'-'*16}  {'-'*12}")
+    for ep, gib, mbps in epoch_results:
+        vs = f"+{mbps-400:.0f}" if mbps >= 400 else f"{mbps-400:.0f}"
+        label = "PASS" if mbps >= 400 else "FAIL"
+        print(f"  {ep:<8}  {gib:>10.2f}  {mbps:>16.1f}  {vs:>8} ({label})")
+
+    if len(epoch_results) >= 2:
+        ep2_mbps = epoch_results[1][2]
+        print()
+        print(f"  Epoch 2 (OS/server cache): {ep2_mbps:.1f} MB/s  "
+              f"{'≥ 400 MB/s ✓' if ep2_mbps >= 400 else '< 400 MB/s ✗'}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/object-store/old-archive/demo_streaming_checkpoint.sh b/tests/object-store/old-archive/demo_streaming_checkpoint.sh
deleted file mode 100755
index 2953b8c2..00000000
--- a/tests/object-store/old-archive/demo_streaming_checkpoint.sh
+++ /dev/null
@@ -1,291 +0,0 @@
-#!/bin/bash
-# Demo: dgen-py Integration + StreamingCheckpointing
-#
-# Demonstrates two major mlpstorage optimizations:
-#   1. dgen-py integration (155x faster data generation, Rust-based)
-#   2. StreamingCheckpointing (192x memory reduction, producer-consumer pipeline)
-#
-# Shows file storage (if TEST_CHECKPOINT_DIR is set) and object storage tests
-# for each configured library.
-#
-# Configuration — all via environment variables or .env file:
-#
-#   Required for object storage:
-#     AWS_ACCESS_KEY_ID       S3 access key
-#     AWS_SECRET_ACCESS_KEY   S3 secret key
-#     AWS_ENDPOINT_URL        S3-compatible endpoint (e.g. http://host:9000)
-#     AWS_REGION              Region (default: us-east-1)
-#
-#   Optional:
-#     TEST_SIZE_GB            Checkpoint size in GB (default: 1)
-#     TEST_CHECKPOINT_DIR     Local directory for file-based tests (skipped if unset)
-#     S3_BUCKET               Bucket for object storage tests (default: mlp-demo-ckpt)
-#     S3_PREFIX               Key prefix inside the bucket (default: demo)
-#     S3_LIBRARIES            Libraries to test: s3dlio,minio,s3torchconnector or "all"
-#                             (default: all three)
-#
-# Usage:
-#   cd mlp-storage
-#   bash tests/object-store/demo_streaming_checkpoint.sh
-#
-#   # With a file-storage test:
-#   TEST_CHECKPOINT_DIR=/tmp/ckpt-demo bash tests/object-store/demo_streaming_checkpoint.sh
-#
-#   # Larger checkpoint, single library:
-#   TEST_SIZE_GB=16 S3_LIBRARIES=s3dlio bash tests/object-store/demo_streaming_checkpoint.sh
-
-set -e
-
-#============================================================================
-# Navigate to repo root regardless of where the script was invoked from
-#============================================================================
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
-cd "$REPO_ROOT"
-
-#============================================================================
-# Load .env — env vars already set in the shell always take precedence
-#============================================================================
-if [ -f ".env" ]; then
-    while IFS='=' read -r key value; do
-        [[ "$key" =~ ^[[:space:]]*# ]] && continue
-        [[ -z "${key// /}" ]] && continue
-        key="${key// /}"
-        [[ -v "$key" ]] && continue   # skip if already set in environment
-        export "$key"="$value"
-    done < .env
-fi
-
-#============================================================================
-# Configuration (all overridable via environment)
-#============================================================================
-
-# Checkpoint size — 1 GB is quick; use 16+ for realistic numbers
-TEST_SIZE_GB="${TEST_SIZE_GB:-1}"
-
-# Local directory for file-based tests; skipped when unset
-TEST_CHECKPOINT_DIR="${TEST_CHECKPOINT_DIR:-}"
-
-# Object storage configuration
-S3_BUCKET="${S3_BUCKET:-mlp-demo-ckpt}"
-S3_PREFIX="${S3_PREFIX:-demo}"
-S3_LIBRARIES="${S3_LIBRARIES:-all}"
-
-#============================================================================
-# Banner
-#============================================================================
-
-echo "╔══════════════════════════════════════════════════════════════════════════════╗"
-echo "║            DEMO: dgen-py + StreamingCheckpointing                            ║"
-echo "╚══════════════════════════════════════════════════════════════════════════════╝"
-echo ""
-echo "Two mlpstorage optimizations demonstrated here:"
-echo ""
-echo "  🚀 dgen-py Integration"
-echo "     • 155x faster random tensor generation (Rust-based)"
-echo "     • Drop-in replacement for torch.rand() and np.random()"
-echo "     • 1.54 GB/s → 239 GB/s generation speed"
-echo ""
-echo "  💾 StreamingCheckpointing"
-echo "     • Producer-consumer pattern for low-memory checkpoints"
-echo "     • 192x memory reduction (24 GB → 128 MB for large checkpoints)"
-echo "     • Overlaps generation and I/O for sustained throughput"
-echo ""
-echo "════════════════════════════════════════════════════════════════════════════════"
-echo ""
-
-#============================================================================
-# Environment Setup
-#============================================================================
-
-# Activate virtual environment
-if [ ! -d ".venv" ]; then
-    echo "❌ ERROR: Virtual environment not found at $REPO_ROOT/.venv"
-    echo "   Please create it first: uv venv && uv uv sync
-    exit 1
-fi
-
-source .venv/bin/activate  # .venv managed by uv (run "uv sync" to set up)
-echo "✅ Virtual environment activated"
-
-# Verify dgen-py is installed
-if ! python -c "import dgen_py" 2>/dev/null; then
-    echo "❌ ERROR: dgen-py not installed"
-    echo "   Install with: uv sync"
-    exit 1
-fi
-
-DGEN_VERSION=$(python -c 'import dgen_py; print(dgen_py.__version__)' 2>/dev/null)
-echo "✅ dgen-py ${DGEN_VERSION} available"
-echo ""
-
-#============================================================================
-# Configuration Summary
-#============================================================================
-
-echo "📋 Demo Configuration:"
-echo "   Test size:          ${TEST_SIZE_GB} GB"
-echo "   S3 bucket:          ${S3_BUCKET}"
-echo "   S3 prefix:          ${S3_PREFIX}"
-echo "   Libraries to test:  ${S3_LIBRARIES}"
-
-SKIP_FILE_TESTS=1
-if [ -n "$TEST_CHECKPOINT_DIR" ]; then
-    mkdir -p "$TEST_CHECKPOINT_DIR"
-    echo "   Checkpoint dir:     $TEST_CHECKPOINT_DIR"
-    SKIP_FILE_TESTS=0
-else
-    echo "   Checkpoint dir:     (not set — file tests will be skipped)"
-    echo "   To enable file tests: export TEST_CHECKPOINT_DIR=/path/to/dir"
-fi
-
-echo ""
-echo "════════════════════════════════════════════════════════════════════════════════"
-echo ""
-
-#============================================================================
-# PART 1: File Storage Checkpoint (StreamingCheckpointing)
-#============================================================================
-
-if [ "$SKIP_FILE_TESTS" -eq 0 ]; then
-    echo "📊 PART 1: File Storage Checkpoint"
-    echo "════════════════════════════════════════════════════════════════════════════════"
-    echo ""
-    echo "Writing a ${TEST_SIZE_GB} GB StreamingCheckpointing to: $TEST_CHECKPOINT_DIR"
-    echo "  • 128 MB RAM regardless of checkpoint size"
-    echo "  • Producer-consumer pipeline: dgen-py generates while I/O writes"
-    echo ""
-
-    CHECKPOINT_URI="${TEST_CHECKPOINT_DIR}/demo_checkpoint_${TEST_SIZE_GB}gb.dat"
-
-    python - <<PYEOF
-import sys
-sys.path.insert(0, '$REPO_ROOT')
-from mlpstorage.checkpointing.streaming_checkpoint import StreamingCheckpointing
-
-sc = StreamingCheckpointing(chunk_size_mb=32, num_buffers=4)
-uri = '$CHECKPOINT_URI'
-size_gb = $TEST_SIZE_GB
-print(f"Writing {size_gb} GB to {uri} ...")
-result = sc.save(uri, size_gb * 1024**3)
-print(f"Write: {result['write_gb_s']:.3f} GB/s  ({result['elapsed_s']:.1f}s)")
-print(f"Reading back ...")
-result = sc.load(uri)
-print(f"Read:  {result['read_gb_s']:.3f} GB/s  ({result['elapsed_s']:.1f}s)")
-PYEOF
-
-    echo ""
-    echo "✅ File storage checkpoint complete"
-    echo "   Result: ${TEST_SIZE_GB} GB written and read back with ~128 MB RAM"
-    echo ""
-else
-    echo "⏭️  PART 1: File Storage Tests SKIPPED (TEST_CHECKPOINT_DIR not set)"
-    echo ""
-fi
-
-echo "════════════════════════════════════════════════════════════════════════════════"
-echo ""
-
-#============================================================================
-# PART 2: Object Storage Checkpoint (per-library)
-#============================================================================
-
-echo "📦 PART 2: Object Storage Checkpoint"
-echo "════════════════════════════════════════════════════════════════════════════════"
-echo ""
-echo "Testing StreamingCheckpointing via object storage:"
-echo "  • s3dlio (Rust-based, multi-protocol)"
-echo "  • minio (Python SDK)"
-echo "  • s3torchconnector (AWS recommended for PyTorch)"
-echo ""
-
-# Credentials were already loaded from .env at the top of the script.
-# Check that the required variables are present.
-SKIP_S3_TESTS=0
-if [[ -z "$AWS_ACCESS_KEY_ID" || -z "$AWS_SECRET_ACCESS_KEY" || -z "$AWS_ENDPOINT_URL" ]]; then
-    echo "⚠️  S3 credentials not found — skipping object storage tests."
-    echo "   Create $REPO_ROOT/.env with:"
-    echo "     AWS_ACCESS_KEY_ID=<your-access-key>"
-    echo "     AWS_SECRET_ACCESS_KEY=<your-secret-key>"
-    echo "     AWS_ENDPOINT_URL=http://<host>:<port>"
-    echo "     AWS_REGION=us-east-1"
-    SKIP_S3_TESTS=1
-fi
-
-# Determine which libraries to run
-if [[ "$SKIP_S3_TESTS" -eq 0 ]]; then
-    if [[ "$S3_LIBRARIES" == "all" ]]; then
-        LIBRARIES_TO_RUN="s3dlio minio s3torchconnector"
-    else
-        LIBRARIES_TO_RUN="${S3_LIBRARIES//,/ }"
-    fi
-
-    echo "Endpoint:  $AWS_ENDPOINT_URL"
-    echo "Bucket:    $S3_BUCKET"
-    echo "Prefix:    $S3_PREFIX"
-    echo "Libraries: $LIBRARIES_TO_RUN"
-    echo ""
-
-    S3_PASS=0
-    S3_FAIL=0
-
-    for LIB in $LIBRARIES_TO_RUN; do
-        echo "  --- $LIB ---"
-        SCRIPT="$SCRIPT_DIR/test_${LIB}_checkpoint.py"
-
-        if [ ! -f "$SCRIPT" ]; then
-            # s3torchconnector → test_s3torch_checkpoint.py
-            SCRIPT="$SCRIPT_DIR/test_s3torch_checkpoint.py"
-        fi
-
-        if [ ! -f "$SCRIPT" ]; then
-            echo "  ⚠️  No test script found for $LIB — skipping"
-            continue
-        fi
-
-        OBJECT_URI="s3://${S3_BUCKET}/${S3_PREFIX}/${LIB}/demo_${TEST_SIZE_GB}gb.dat"
-        if python "$SCRIPT" \
-                --size-gb "$TEST_SIZE_GB" \
-                --uri "$OBJECT_URI" 2>&1; then
-            S3_PASS=$((S3_PASS + 1))
-        else
-            echo "  ❌ $LIB test failed"
-            S3_FAIL=$((S3_FAIL + 1))
-        fi
-        echo ""
-    done
-
-    echo "✅ Object storage tests complete  ($S3_PASS passed, $S3_FAIL failed)"
-    echo ""
-fi
-
-echo "════════════════════════════════════════════════════════════════════════════════"
-echo "DEMO COMPLETE"
-echo "════════════════════════════════════════════════════════════════════════════════"
-echo ""
-
-if [ "$SKIP_FILE_TESTS" -eq 0 ]; then
-    echo "  ✅ Part 1: File storage checkpoint (${TEST_SIZE_GB} GB, ~128 MB RAM)"
-else
-    echo "  ⏭️  Part 1: File storage SKIPPED (set TEST_CHECKPOINT_DIR to enable)"
-fi
-
-if [ "$SKIP_S3_TESTS" -eq 0 ]; then
-    echo "  ✅ Part 2: Object storage — $LIBRARIES_TO_RUN"
-else
-    echo "  ⏭️  Part 2: Object storage SKIPPED (set credentials in .env to enable)"
-fi
-
-echo ""
-echo "For benchmark results see: tests/object-store/Object_Perf_Results.md"
-echo ""
-echo "Configuration reference:"
-echo "   TEST_SIZE_GB            Checkpoint size in GB           (current: $TEST_SIZE_GB)"
-echo "   TEST_CHECKPOINT_DIR     Local path for file tests       (current: ${TEST_CHECKPOINT_DIR:-(not set)})"
-echo "   S3_BUCKET               Object storage bucket           (current: $S3_BUCKET)"
-echo "   S3_PREFIX               Key prefix inside bucket        (current: $S3_PREFIX)"
-echo "   S3_LIBRARIES            Libraries: all or comma-list    (current: $S3_LIBRARIES)"
-echo "   AWS_ENDPOINT_URL        S3-compatible endpoint URL"
-echo "   AWS_ACCESS_KEY_ID       S3 access key"
-echo "   AWS_SECRET_ACCESS_KEY   S3 secret key"
-echo "   AWS_REGION              Region (default: us-east-1)"
diff --git a/tests/object-store/old-archive/dlio_minio_checkpoint.sh b/tests/object-store/old-archive/dlio_minio_checkpoint.sh
deleted file mode 100755
index 0383cd94..00000000
--- a/tests/object-store/old-archive/dlio_minio_checkpoint.sh
+++ /dev/null
@@ -1,112 +0,0 @@
-#!/usr/bin/env bash
-# dlio_minio_checkpoint.sh
-#
-# Run DLIO checkpointing directly via dlio_benchmark — NO mlpstorage wrapper.
-# Writes and reads llama3-8b checkpoints to/from MinIO using the minio Python SDK.
-#
-# Config  : configs/dlio/workload/llama3_8b_checkpoint_minio.yaml
-# Workload: LLaMA 3 8B — ZeRO-3, 8 ranks, ~13.1 GB per rank per checkpoint
-# Storage : minio SDK → MinIO  (endpoint from AWS_ENDPOINT_URL)  bucket: chckpt-test1
-# Objects : s3://chckpt-test1/minio/llama3-8b/<checkpoint_id>/<rank_file>.pt
-#
-# MPI ranks:
-#   llama3-8b with ZeRO-3 requires exactly 8 MPI ranks (the closed reference value).
-#   Each rank writes its shard of the model+optimizer state (~13.1 GB).
-#   Run with NP=8 for full workload; NP=1 for a single-rank sanity check.
-#
-# Environment overrides:
-#   NP=1 bash dlio_minio_checkpoint.sh       → 1 rank, ~13.1 GB per checkpoint
-#   NP=8 bash dlio_minio_checkpoint.sh       → 8 ranks, ~105 GB per checkpoint
-#   CHECKPOINTS=1 bash dlio_minio_checkpoint.sh  → write+read 1 checkpoint only
-#
-# Usage:
-#   cd /path/to/mlp-storage
-#   bash tests/object-store/dlio_minio_checkpoint.sh
-
-set -euo pipefail
-
-REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
-cd "$REPO_ROOT"
-
-# ── Credentials ────────────────────────────────────────────────────────────────
-if [[ -f .env ]]; then
-    echo "[env] Loading credentials from .env"
-    set -o allexport
-    source .env  # shellcheck disable=SC1091
-    set +o allexport
-fi
-: "${AWS_ACCESS_KEY_ID:?ERROR: AWS_ACCESS_KEY_ID not set — add it to .env}"
-: "${AWS_SECRET_ACCESS_KEY:?ERROR: AWS_SECRET_ACCESS_KEY not set — add it to .env}"
-: "${AWS_ENDPOINT_URL:?ERROR: AWS_ENDPOINT_URL not set — add it to .env (e.g. http://your-s3-host:9000)}"
-: "${AWS_REGION:=us-east-1}"
-
-# ── Virtual environment ────────────────────────────────────────────────────────
-if [[ ! -f .venv/bin/activate ]]; then
-    echo "ERROR: .venv not found" >&2; exit 1
-fi
-source .venv/bin/activate  # .venv managed by uv (run "uv sync" to set up)
-
-DLIO_BIN=".venv/bin/dlio_benchmark"
-if [[ ! -x "$DLIO_BIN" ]]; then
-    echo "ERROR: $DLIO_BIN not found in venv" >&2; exit 1
-fi
-
-# ── Check minio is installed ──────────────────────────────────────────────────
-if ! python3 -c "from minio import Minio" 2>/dev/null; then
-    echo "ERROR: minio is not installed." >&2
-    echo "  Install with: uv sync" >&2
-    exit 1
-fi
-
-# ── Tunables (override via env) ────────────────────────────────────────────────
-NP=${NP:-1}
-CHECKPOINTS=${CHECKPOINTS:-2}
-
-BUCKET="chckpt-test1"
-S3_PREFIX="minio/llama3-8b"
-
-RUN_DIR="/tmp/dlio-minio-checkpoint-$(date +%Y%m%d_%H%M%S)"
-mkdir -p "$RUN_DIR"
-
-echo ""
-echo "════════════════════════════════════════════════════════"
-echo "  DLIO Checkpoint — minio SDK + MinIO  (llama3-8b)"
-echo "════════════════════════════════════════════════════════"
-echo "  Bucket      : $BUCKET"
-echo "  Objects at  : s3://$BUCKET/$S3_PREFIX/"
-echo "  Endpoint    : $AWS_ENDPOINT_URL"
-echo "  MPI ranks   : $NP   (default=1; full run: NP=8 bash $0)"
-echo "  Checkpoints : $CHECKPOINTS write + $CHECKPOINTS read"
-echo "  Per-rank    : ~13.1 GB per checkpoint  (ZeRO-3, 8 ranks)"
-echo "  Run dir     : $RUN_DIR"
-echo "════════════════════════════════════════════════════════"
-echo ""
-
-# ── Pre-flight: verify bucket is reachable using s3dlio (minio has no CLI) ───
-echo "Checking bucket reachability: s3://$BUCKET/ ..."
-python3 - <<PYEOF
-import os, sys
-os.environ.setdefault("AWS_REGION", "us-east-1")
-import s3dlio
-try:
-    files = s3dlio.list("s3://${BUCKET}/", recursive=False)
-    print(f"  Bucket accessible — {len(files)} top-level entries")
-except Exception as e:
-    print(f"  ERROR: Cannot access bucket s3://${BUCKET}/: {e}", file=sys.stderr)
-    sys.exit(1)
-PYEOF
-echo ""
-
-DLIO_S3_IMPLEMENTATION=mlp \
-mpirun -np "$NP" --allow-run-as-root \
-    --mca btl ^vader \
-    "$DLIO_BIN" \
-    workload=llama3_8b_checkpoint_minio \
-    "++hydra.run.dir=$RUN_DIR" \
-    ++hydra.output_subdir=null \
-    "++workload.checkpoint.num_checkpoints_write=$CHECKPOINTS" \
-    "++workload.checkpoint.num_checkpoints_read=$CHECKPOINTS" \
-    --config-dir="$REPO_ROOT/configs/dlio"
-
-echo ""
-echo "✅  Checkpoint test complete — results in $RUN_DIR"
diff --git a/tests/object-store/old-archive/dlio_minio_cleanup.sh b/tests/object-store/old-archive/dlio_minio_cleanup.sh
deleted file mode 100755
index 51655c38..00000000
--- a/tests/object-store/old-archive/dlio_minio_cleanup.sh
+++ /dev/null
@@ -1,126 +0,0 @@
-#!/usr/bin/env bash
-# dlio_minio_cleanup.sh
-#
-# Delete all test objects from the MinIO bucket (mlp-minio).
-# Use this to reset between datagen runs without running the full cycle.
-#
-# Storage : S3-compatible object storage (endpoint from AWS_ENDPOINT_URL)  bucket: mlp-minio
-# Removes : s3://mlp-minio/test-run/unet3d/train/*
-#
-# Safety  : Lists files first, shows count, prompts for confirmation.
-#           To skip the prompt: FORCE=1 bash dlio_minio_cleanup.sh
-#
-# Usage:
-#   cd /path/to/mlp-storage
-#   bash tests/object-store/dlio_minio_cleanup.sh
-#   FORCE=1 bash tests/object-store/dlio_minio_cleanup.sh
-
-set -euo pipefail
-
-REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
-cd "$REPO_ROOT"
-
-# ── Credentials ───────────────────────────────────────────────────────────────
-if [[ -f .env ]]; then
-    echo "[env] Loading credentials from .env"
-    set -o allexport
-    source .env  # shellcheck disable=SC1091
-    set +o allexport
-fi
-: "${AWS_ACCESS_KEY_ID:?ERROR: AWS_ACCESS_KEY_ID not set — add it to .env}"
-: "${AWS_SECRET_ACCESS_KEY:?ERROR: AWS_SECRET_ACCESS_KEY not set — add it to .env}"
-: "${AWS_ENDPOINT_URL:?ERROR: AWS_ENDPOINT_URL not set — add it to .env (e.g. http://your-s3-host:9000)}"
-: "${AWS_REGION:=us-east-1}"
-
-# ── Virtual environment ───────────────────────────────────────────────────────
-if [[ ! -f .venv/bin/activate ]]; then
-    echo "ERROR: .venv not found" >&2; exit 1
-fi
-source .venv/bin/activate  # .venv managed by uv (run "uv sync" to set up)
-
-# ── Config ────────────────────────────────────────────────────────────────────
-FORCE=${FORCE:-0}
-
-BUCKET="${BUCKET:-mlp-minio}"
-S3_PREFIX="test-run/unet3d/train"
-
-echo ""
-echo "════════════════════════════════════════════════════════"
-echo "  DLIO Cleanup — minio SDK + MinIO"
-echo "════════════════════════════════════════════════════════"
-echo "  Bucket   : $BUCKET"
-echo "  Prefix   : $S3_PREFIX"
-echo "  Endpoint : $AWS_ENDPOINT_URL"
-echo "════════════════════════════════════════════════════════"
-echo ""
-
-# ── List what will be deleted ─────────────────────────────────────────────────
-echo "Listing objects to delete: s3://$BUCKET/$S3_PREFIX/ ..."
-FILE_COUNT=$(python3 - <<PYEOF
-import os
-from urllib.parse import urlparse
-from minio import Minio
-
-endpoint = os.environ["AWS_ENDPOINT_URL"]
-parsed = urlparse(endpoint if "://" in endpoint else f"http://{endpoint}")
-host = parsed.netloc or endpoint
-secure = parsed.scheme == "https"
-
-client = Minio(
-    host,
-    access_key=os.environ["AWS_ACCESS_KEY_ID"],
-    secret_key=os.environ["AWS_SECRET_ACCESS_KEY"],
-    secure=secure,
-)
-objects = list(client.list_objects("${BUCKET}", prefix="${S3_PREFIX}/", recursive=True))
-print(len(objects))
-PYEOF
-)
-
-if [[ "$FILE_COUNT" -eq 0 ]]; then
-    echo "✅  Bucket is already empty — nothing to delete."
-    exit 0
-fi
-
-echo "Found $FILE_COUNT objects to delete."
-
-# ── Confirm before deleting ───────────────────────────────────────────────────
-if [[ "$FORCE" -eq 0 ]]; then
-    echo ""
-    echo "⚠️   This will permanently delete $FILE_COUNT objects from s3://$BUCKET/$S3_PREFIX/"
-    echo "    To skip this prompt: FORCE=1 bash $0"
-    read -r -p "Delete all $FILE_COUNT objects? [y/N] " REPLY
-    if [[ ! "$REPLY" =~ ^[Yy]$ ]]; then
-        echo "Aborted — no objects deleted."
-        exit 0
-    fi
-fi
-
-# ── Delete ────────────────────────────────────────────────────────────────────
-echo ""
-echo "Deleting $FILE_COUNT objects ..."
-DELETED=$(python3 - <<PYEOF
-import os
-from urllib.parse import urlparse
-from minio import Minio
-
-endpoint = os.environ["AWS_ENDPOINT_URL"]
-parsed = urlparse(endpoint if "://" in endpoint else f"http://{endpoint}")
-host = parsed.netloc or endpoint
-secure = parsed.scheme == "https"
-
-client = Minio(
-    host,
-    access_key=os.environ["AWS_ACCESS_KEY_ID"],
-    secret_key=os.environ["AWS_SECRET_ACCESS_KEY"],
-    secure=secure,
-)
-objects = list(client.list_objects("${BUCKET}", prefix="${S3_PREFIX}/", recursive=True))
-for obj in objects:
-    client.remove_object("${BUCKET}", obj.object_name)
-print(len(objects))
-PYEOF
-)
-
-echo ""
-echo "✅  Cleanup complete — deleted $DELETED objects from s3://$BUCKET/$S3_PREFIX/"
diff --git a/tests/object-store/old-archive/dlio_minio_cycle.sh b/tests/object-store/old-archive/dlio_minio_cycle.sh
deleted file mode 100755
index 9ed4a897..00000000
--- a/tests/object-store/old-archive/dlio_minio_cycle.sh
+++ /dev/null
@@ -1,223 +0,0 @@
-#!/usr/bin/env bash
-# dlio_minio_cycle.sh
-#
-# Full DLIO direct cycle test — NO mlpstorage CLI wrapper.
-#
-# Calls dlio_benchmark directly for every phase:
-#   1. Datagen  — generate 168 × ~140 MB NPZ files → MinIO (mlp-minio bucket)
-#   2. Verify   — use minio Python SDK to list and count the files
-#   3. Train    — run training reading from MinIO via minio SDK
-#   4. Cleanup  — delete all test objects from the bucket
-#
-# Config : unet3d_h100_minio_datagen.yaml + unet3d_h100_minio.yaml
-#          (real h100 workload — 168 files × ~140 MB NPZ)
-# Storage: S3-compatible object storage (endpoint from AWS_ENDPOINT_URL)  bucket: mlp-minio
-# Data   : mlp-minio/test-run/unet3d/train/
-#
-# Requirements:
-#   - .env file in repo root with AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY,
-#     AWS_ENDPOINT_URL, AWS_REGION  (no credentials in this script)
-#   - Python venv at .venv/  with dlio_benchmark and minio installed
-#
-# Usage:
-#   cd /path/to/mlp-storage
-#   bash tests/object-store/dlio_minio_cycle.sh
-
-set -euo pipefail
-
-# ── Locate repo root ──────────────────────────────────────────────────────────
-REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
-cd "$REPO_ROOT"
-
-# ── Credentials ───────────────────────────────────────────────────────────────
-# allexport ensures every variable sourced from .env is exported to child
-# processes (mpirun, python, dlio_benchmark, etc.).
-if [[ -f .env ]]; then
-    echo "[env] Loading credentials from .env"
-    set -o allexport
-    # shellcheck disable=SC1091
-    source .env
-    set +o allexport
-fi
-
-# Fail fast if credentials are missing — don't let dlio start and then error.
-: "${AWS_ACCESS_KEY_ID:?ERROR: AWS_ACCESS_KEY_ID not set — add it to .env}"
-: "${AWS_SECRET_ACCESS_KEY:?ERROR: AWS_SECRET_ACCESS_KEY not set — add it to .env}"
-: "${AWS_ENDPOINT_URL:?ERROR: AWS_ENDPOINT_URL not set — add it to .env (e.g. http://your-s3-host:9000)}"
-: "${AWS_REGION:=us-east-1}"
-
-# ── Virtual environment ───────────────────────────────────────────────────────
-if [[ ! -f .venv/bin/activate ]]; then
-    echo "ERROR: .venv not found — run: python -m venv .venv && uv sync >&2
-    exit 1
-fi
-# shellcheck disable=SC1091
-source .venv/bin/activate  # .venv managed by uv (run "uv sync" to set up)
-
-DLIO_BIN=".venv/bin/dlio_benchmark"
-if [[ ! -x "$DLIO_BIN" ]]; then
-    echo "ERROR: $DLIO_BIN not found — is dlio_benchmark installed in the venv?" >&2
-    exit 1
-fi
-
-# ── Config ────────────────────────────────────────────────────────────────────
-BUCKET="${BUCKET:-mlp-minio}"
-S3_PREFIX="test-run/unet3d/train"       # matches data_folder=test-run/unet3d + DLIO appends /train/
-EXPECTED_FILES=168
-CONFIG_DIR="$REPO_ROOT/configs/dlio"
-
-# MPI ranks for datagen — more ranks = faster generation of 168 × 140 MB files
-DATAGEN_NP=${DATAGEN_NP:-8}
-TRAIN_NP=${TRAIN_NP:-1}
-
-# Unique run dir keeps DLIO output logs for this cycle
-RUN_DIR="/tmp/dlio-minio-cycle-$(date +%Y%m%d_%H%M%S)"
-mkdir -p "$RUN_DIR"
-
-# ── Helpers ───────────────────────────────────────────────────────────────────
-banner() { echo ""; echo "════════════════════════════════════════════════════════"; echo "  $*"; echo "════════════════════════════════════════════════════════"; echo ""; }
-step()   { echo ""; echo "──── $* ────"; echo ""; }
-ok()     { echo "✅  $*"; }
-fail()   { echo "❌  $*" >&2; exit 1; }
-
-banner "DLIO Direct Cycle — minio SDK + MinIO"
-echo "  Bucket       : $BUCKET"
-echo "  Prefix       : $S3_PREFIX"
-echo "  Endpoint     : $AWS_ENDPOINT_URL"
-echo "  Files        : $EXPECTED_FILES × ~140 MB NPZ  (real h100 workload)"
-echo "  Datagen MPI  : $DATAGEN_NP ranks"
-echo "  Train MPI    : $TRAIN_NP rank(s)"
-echo "  Run dir      : $RUN_DIR"
-
-# ── Inline minio list helper (defined here but not called below; phases 2 and 4 inline the same listing) ──
-# Usage: minio_count <bucket> <prefix>   → prints the number of objects under <prefix>/ to stdout
-minio_count() {
-    python3 - <<PYEOF
-import os
-from urllib.parse import urlparse
-from minio import Minio
-
-endpoint = os.environ["AWS_ENDPOINT_URL"]
-parsed = urlparse(endpoint if "://" in endpoint else f"http://{endpoint}")
-host = parsed.netloc or endpoint
-secure = parsed.scheme == "https"
-
-client = Minio(
-    host,
-    access_key=os.environ["AWS_ACCESS_KEY_ID"],
-    secret_key=os.environ["AWS_SECRET_ACCESS_KEY"],
-    secure=secure,
-)
-objects = list(client.list_objects("$1", prefix="$2/", recursive=True))
-print(len(objects))
-PYEOF
-}
-
-# ══════════════════════════════════════════════════════════════════════════════
-# PHASE 1 — DATAGEN
-# ══════════════════════════════════════════════════════════════════════════════
-banner "Phase 1 — Datagen (writing ${EXPECTED_FILES} × ~140 MB files to MinIO)"
-
-DLIO_S3_IMPLEMENTATION=mlp \
-mpirun -np "$DATAGEN_NP" --allow-run-as-root \
-    --mca btl ^vader \
-    "$DLIO_BIN" \
-    workload=unet3d_h100_minio_datagen \
-    "++hydra.run.dir=$RUN_DIR/datagen" \
-    ++hydra.output_subdir=null \
-    --config-dir="$CONFIG_DIR"
-
-ok "Datagen complete"
-
-# ══════════════════════════════════════════════════════════════════════════════
-# PHASE 2 — VERIFY
-# ══════════════════════════════════════════════════════════════════════════════
-banner "Phase 2 — Verify (listing s3://$BUCKET/$S3_PREFIX/)"
-
-FOUND=$(python3 - <<PYEOF
-import os, sys
-from urllib.parse import urlparse
-from minio import Minio
-
-endpoint = os.environ["AWS_ENDPOINT_URL"]
-parsed = urlparse(endpoint if "://" in endpoint else f"http://{endpoint}")
-host = parsed.netloc or endpoint
-secure = parsed.scheme == "https"
-
-client = Minio(
-    host,
-    access_key=os.environ["AWS_ACCESS_KEY_ID"],
-    secret_key=os.environ["AWS_SECRET_ACCESS_KEY"],
-    secure=secure,
-)
-objects = list(client.list_objects("${BUCKET}", prefix="${S3_PREFIX}/", recursive=True))
-print(len(objects))
-for obj in objects[:5]:
-    print("  ", obj.object_name, file=sys.stderr)
-if len(objects) > 5:
-    print(f"  ... and {len(objects)-5} more", file=sys.stderr)
-PYEOF
-)
-
-echo "Files found in MinIO: $FOUND (expected: $EXPECTED_FILES)"
-if [[ "$FOUND" -ne "$EXPECTED_FILES" ]]; then
-    fail "File count mismatch: got $FOUND, expected $EXPECTED_FILES — datagen may have failed"
-fi
-ok "Verify passed — $FOUND files confirmed in bucket"
-
-# ══════════════════════════════════════════════════════════════════════════════
-# PHASE 3 — TRAIN
-# ══════════════════════════════════════════════════════════════════════════════
-banner "Phase 3 — Training (5 epochs, reading from MinIO via minio SDK)"
-
-DLIO_S3_IMPLEMENTATION=mlp \
-mpirun -np "$TRAIN_NP" --allow-run-as-root \
-    --mca btl ^vader \
-    "$DLIO_BIN" \
-    workload=unet3d_h100_minio \
-    "++hydra.run.dir=$RUN_DIR/train" \
-    ++hydra.output_subdir=null \
-    --config-dir="$CONFIG_DIR"
-
-ok "Training complete"
-
-# ══════════════════════════════════════════════════════════════════════════════
-# PHASE 4 — CLEANUP
-# ══════════════════════════════════════════════════════════════════════════════
-banner "Phase 4 — Cleanup (deleting all test objects)"
-
-DELETED=$(python3 - <<PYEOF
-import os
-from urllib.parse import urlparse
-from minio import Minio
-
-endpoint = os.environ["AWS_ENDPOINT_URL"]
-parsed = urlparse(endpoint if "://" in endpoint else f"http://{endpoint}")
-host = parsed.netloc or endpoint
-secure = parsed.scheme == "https"
-
-client = Minio(
-    host,
-    access_key=os.environ["AWS_ACCESS_KEY_ID"],
-    secret_key=os.environ["AWS_SECRET_ACCESS_KEY"],
-    secure=secure,
-)
-objects = list(client.list_objects("${BUCKET}", prefix="${S3_PREFIX}/", recursive=True))
-for obj in objects:
-    client.remove_object("${BUCKET}", obj.object_name)
-print(len(objects))
-PYEOF
-)
-
-ok "Cleanup complete — deleted $DELETED objects from s3://$BUCKET/$S3_PREFIX/"
-
-# ══════════════════════════════════════════════════════════════════════════════
-# DONE
-# ══════════════════════════════════════════════════════════════════════════════
-banner "ALL PHASES PASSED"
-echo "  Datagen  ✅  generated $EXPECTED_FILES × ~140 MB NPZ files"
-echo "  Verify   ✅  $FOUND files confirmed in MinIO"
-echo "  Training ✅  5 epochs completed"
-echo "  Cleanup  ✅  $DELETED objects deleted"
-echo ""
-echo "  DLIO logs: $RUN_DIR"
diff --git a/tests/object-store/old-archive/dlio_minio_datagen.sh b/tests/object-store/old-archive/dlio_minio_datagen.sh
deleted file mode 100755
index 9f5b9adc..00000000
--- a/tests/object-store/old-archive/dlio_minio_datagen.sh
+++ /dev/null
@@ -1,156 +0,0 @@
-#!/usr/bin/env bash
-# dlio_minio_datagen.sh
-#
-# Run DLIO datagen directly via dlio_benchmark — NO mlpstorage wrapper.
-# Generates 168 × ~140 MB NPZ files into MinIO (mlp-minio bucket).
-#
-# Config  : configs/dlio/workload/unet3d_h100_minio_datagen.yaml
-# Workload: UNet3D h100 — 168 × ~140 MB NPZ
-# Storage : S3-compatible object storage (endpoint from AWS_ENDPOINT_URL)  bucket: mlp-minio
-# Data    : s3://mlp-minio/test-run/unet3d/train/
-#
-# Environment overrides:
-#   NP=4 bash dlio_minio_datagen.sh      → 4 MPI ranks writing in parallel
-#   FORCE=1 bash dlio_minio_datagen.sh   → overwrite even if files already exist
-#
-# Usage:
-#   cd /path/to/mlp-storage
-#   bash tests/object-store/old-archive/dlio_minio_datagen.sh
-
-set -euo pipefail
-
-REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)"  # three levels up: old-archive → object-store → tests → repo
-cd "$REPO_ROOT"
-
-# ── Credentials ───────────────────────────────────────────────────────────────
-if [[ -f .env ]]; then
-    echo "[env] Loading credentials from .env"
-    set -o allexport
-    source .env  # shellcheck disable=SC1091
-    set +o allexport
-fi
-: "${AWS_ACCESS_KEY_ID:?ERROR: AWS_ACCESS_KEY_ID not set — add it to .env}"
-: "${AWS_SECRET_ACCESS_KEY:?ERROR: AWS_SECRET_ACCESS_KEY not set — add it to .env}"
-: "${AWS_ENDPOINT_URL:?ERROR: AWS_ENDPOINT_URL not set — add it to .env (e.g. http://your-s3-host:9000)}"
-: "${AWS_REGION:=us-east-1}"
-
-# ── Virtual environment ───────────────────────────────────────────────────────
-if [[ ! -f .venv/bin/activate ]]; then
-    echo "ERROR: .venv not found" >&2; exit 1
-fi
-source .venv/bin/activate  # .venv managed by uv (run "uv sync" to set up)
-
-DLIO_BIN=".venv/bin/dlio_benchmark"
-if [[ ! -x "$DLIO_BIN" ]]; then
-    echo "ERROR: $DLIO_BIN not found in venv" >&2; exit 1
-fi
-
-# ── Tunables (override via env) ───────────────────────────────────────────────
-# NP    = MPI ranks — more ranks write more files in parallel
-# FORCE = set to 1 to skip the pre-flight "files already exist" warning
-NP=${NP:-8}
-FORCE=${FORCE:-0}
-
-BUCKET="${BUCKET:-mlp-minio}"
-S3_PREFIX="test-run/unet3d/train"
-EXPECTED_FILES=168
-
-RUN_DIR="/tmp/dlio-minio-datagen-$(date +%Y%m%d_%H%M%S)"
-mkdir -p "$RUN_DIR"
-
-echo ""
-echo "════════════════════════════════════════════════════════"
-echo "  DLIO Datagen — minio SDK + MinIO  (unet3d h100)"
-echo "════════════════════════════════════════════════════════"
-echo "  Bucket   : $BUCKET"
-echo "  Prefix   : $S3_PREFIX"
-echo "  Endpoint : $AWS_ENDPOINT_URL"
-echo "  Files    : $EXPECTED_FILES × ~140 MB NPZ"
-echo "  MPI ranks: $NP   (override: NP=4 bash $0)"
-echo "  Run dir  : $RUN_DIR"
-echo "════════════════════════════════════════════════════════"
-echo ""
-
-# ── Pre-flight: warn if files already exist ───────────────────────────────────
-echo "Checking for existing data: s3://$BUCKET/$S3_PREFIX/ ..."
-FILE_COUNT=$(python3 - <<PYEOF
-import os
-from urllib.parse import urlparse
-from minio import Minio
-
-endpoint = os.environ["AWS_ENDPOINT_URL"]
-parsed = urlparse(endpoint if "://" in endpoint else f"http://{endpoint}")
-host = parsed.netloc or endpoint
-secure = parsed.scheme == "https"
-
-client = Minio(
-    host,
-    access_key=os.environ["AWS_ACCESS_KEY_ID"],
-    secret_key=os.environ["AWS_SECRET_ACCESS_KEY"],
-    secure=secure,
-)
-objects = list(client.list_objects("${BUCKET}", prefix="${S3_PREFIX}/", recursive=True))
-print(len(objects))
-PYEOF
-)
-
-if [[ "$FILE_COUNT" -gt 0 && "$FORCE" -eq 0 ]]; then
-    echo ""
-    echo "⚠️   WARNING: $FILE_COUNT files already exist in s3://$BUCKET/$S3_PREFIX/"
-    echo "    Datagen will overwrite them."
-    echo "    To skip this warning: FORCE=1 bash $0"
-    echo "    To clean up first:    bash tests/object-store/dlio_minio_cleanup.sh"
-    echo ""
-    read -r -p "Continue anyway? [y/N] " REPLY
-    if [[ ! "$REPLY" =~ ^[Yy]$ ]]; then
-        echo "Aborted."
-        exit 0
-    fi
-elif [[ "$FILE_COUNT" -gt 0 ]]; then
-    echo "⚠️   $FILE_COUNT files already exist — FORCE=1 set, overwriting"
-else
-    echo "✅  Bucket is empty — proceeding with datagen"
-fi
-echo ""
-
-# ── Run datagen ───────────────────────────────────────────────────────────────
-DLIO_S3_IMPLEMENTATION=mlp \
-mpirun -np "$NP" --allow-run-as-root \
-    --mca btl ^vader \
-    "$DLIO_BIN" \
-    workload=unet3d_h100_minio_datagen \
-    "++hydra.run.dir=$RUN_DIR" \
-    ++hydra.output_subdir=null \
-    --config-dir="$REPO_ROOT/configs/dlio"
-
-echo ""
-
-# ── Post-flight: verify file count ───────────────────────────────────────────
-echo "Verifying generated files ..."
-FOUND=$(python3 - <<PYEOF
-import os
-from urllib.parse import urlparse
-from minio import Minio
-
-endpoint = os.environ["AWS_ENDPOINT_URL"]
-parsed = urlparse(endpoint if "://" in endpoint else f"http://{endpoint}")
-host = parsed.netloc or endpoint
-secure = parsed.scheme == "https"
-
-client = Minio(
-    host,
-    access_key=os.environ["AWS_ACCESS_KEY_ID"],
-    secret_key=os.environ["AWS_SECRET_ACCESS_KEY"],
-    secure=secure,
-)
-objects = list(client.list_objects("${BUCKET}", prefix="${S3_PREFIX}/", recursive=True))
-print(len(objects))
-PYEOF
-)
-
-if [[ "$FOUND" -ne "$EXPECTED_FILES" ]]; then
-    echo "⚠️   File count: $FOUND (expected $EXPECTED_FILES) — some files may have been skipped or failed"
-else
-    echo "✅  Datagen complete — $FOUND / $EXPECTED_FILES files confirmed in s3://$BUCKET/$S3_PREFIX/"
-fi
-echo "    DLIO logs: $RUN_DIR"
diff --git a/tests/object-store/old-archive/dlio_minio_train.sh b/tests/object-store/old-archive/dlio_minio_train.sh
deleted file mode 100755
index 44e939f9..00000000
--- a/tests/object-store/old-archive/dlio_minio_train.sh
+++ /dev/null
@@ -1,129 +0,0 @@
-#!/usr/bin/env bash
-# dlio_minio_train.sh
-#
-# Run DLIO training directly via dlio_benchmark — NO mlpstorage wrapper.
-# Assumes data is already in the bucket (run dlio_minio_datagen.sh first
-# if needed, or dlio_minio_cycle.sh if starting from scratch).
-#
-# Config  : configs/dlio/workload/unet3d_h100_minio.yaml
-# Workload: UNet3D h100 — 168 × ~140 MB NPZ, 5 epochs, batch_size=7
-# Storage : S3-compatible object storage (endpoint from AWS_ENDPOINT_URL)  bucket: mlp-minio
-# Data    : s3://mlp-minio/test-run/unet3d/train/
-#
-# MPI vs PyTorch workers — these are different:
-#   NP (--np)         = MPI ranks  = simulated distributed training nodes
-#   read_threads (YAML) = PyTorch DataLoader workers per MPI rank
-#   Total I/O processes = NP × read_threads
-#
-# Environment overrides:
-#   NP=4 bash dlio_minio_train.sh        → 4 MPI ranks × 4 threads = 16 readers
-#   NOTE: READ_THREADS is not read by this script — change reader.read_threads in the YAML instead
-#
-# Usage:
-#   cd /path/to/mlp-storage
-#   bash tests/object-store/old-archive/dlio_minio_train.sh
-
-set -euo pipefail
-
-REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)"  # three levels up: old-archive → object-store → tests → repo
-cd "$REPO_ROOT"
-
-# ── Credentials ───────────────────────────────────────────────────────────────
-if [[ -f .env ]]; then
-    echo "[env] Loading credentials from .env"
-    set -o allexport
-    source .env  # shellcheck disable=SC1091
-    set +o allexport
-fi
-: "${AWS_ACCESS_KEY_ID:?ERROR: AWS_ACCESS_KEY_ID not set — add it to .env}"
-: "${AWS_SECRET_ACCESS_KEY:?ERROR: AWS_SECRET_ACCESS_KEY not set — add it to .env}"
-: "${AWS_ENDPOINT_URL:?ERROR: AWS_ENDPOINT_URL not set — add it to .env (e.g. http://your-s3-host:9000)}"
-: "${AWS_REGION:=us-east-1}"
-
-# ── Virtual environment ───────────────────────────────────────────────────────
-if [[ ! -f .venv/bin/activate ]]; then
-    echo "ERROR: .venv not found" >&2; exit 1
-fi
-source .venv/bin/activate  # .venv managed by uv (run "uv sync" to set up)
-
-DLIO_BIN=".venv/bin/dlio_benchmark"
-if [[ ! -x "$DLIO_BIN" ]]; then
-    echo "ERROR: $DLIO_BIN not found in venv" >&2; exit 1
-fi
-
-# ── Tunables (override via env) ───────────────────────────────────────────────
-# NP = MPI ranks (1 = single process, 4 = 4 simulated nodes, etc.)
-NP=${NP:-1}
-
-BUCKET="${BUCKET:-mlp-minio}"
-S3_PREFIX="test-run/unet3d/train"
-
-RUN_DIR="/tmp/dlio-minio-train-$(date +%Y%m%d_%H%M%S)"
-mkdir -p "$RUN_DIR"
-
-echo ""
-echo "════════════════════════════════════════════════════════"
-echo "  DLIO Training — minio SDK + MinIO  (unet3d h100)"
-echo "════════════════════════════════════════════════════════"
-echo "  Bucket   : $BUCKET"
-echo "  Data     : $S3_PREFIX  (168 × ~140 MB NPZ)"
-echo "  Endpoint : $AWS_ENDPOINT_URL"
-echo "  MPI ranks: $NP   (override: NP=4 bash $0)"
-echo "  Workers  : 4 per rank  (reader.read_threads in YAML)"
-echo "  Epochs   : 5"
-echo "  Batch    : 7"
-echo "  Run dir  : $RUN_DIR"
-echo "════════════════════════════════════════════════════════"
-echo ""
-
-# ── Pre-flight: verify training data exists ───────────────────────────────────
-echo "Checking training data: s3://$BUCKET/$S3_PREFIX/ ..."
-FILE_COUNT=$(python3 - <<PYEOF
-import os
-from urllib.parse import urlparse
-from minio import Minio
-
-endpoint = os.environ["AWS_ENDPOINT_URL"]
-parsed = urlparse(endpoint if "://" in endpoint else f"http://{endpoint}")
-host = parsed.netloc or endpoint
-secure = parsed.scheme == "https"
-
-client = Minio(
-    host,
-    access_key=os.environ["AWS_ACCESS_KEY_ID"],
-    secret_key=os.environ["AWS_SECRET_ACCESS_KEY"],
-    secure=secure,
-)
-objects = list(client.list_objects("${BUCKET}", prefix="${S3_PREFIX}/", recursive=True))
-print(len(objects))
-PYEOF
-)
-
-if [[ "$FILE_COUNT" -eq 0 ]]; then
-    echo ""
-    echo "❌  ERROR: No training files found in s3://$BUCKET/$S3_PREFIX/"
-    echo "    Run datagen first to populate the bucket:"
-    echo "      bash tests/object-store/dlio_minio_datagen.sh"
-    echo "    Or run the full cycle (datagen + train + cleanup):"
-    echo "      bash tests/object-store/dlio_minio_cycle.sh"
-    exit 1
-fi
-
-echo "✅  Found $FILE_COUNT training files — proceeding"
-echo ""
-
-# ── Note on the expected 'valid' listing ──────────────────────────────────────
-# DLIO always tries to list a valid/ path. It will find 0 files and skip it.
-# That is normal — we have train data only. Not an error.
-
-DLIO_S3_IMPLEMENTATION=mlp \
-mpirun -np "$NP" --allow-run-as-root \
-    --mca btl ^vader \
-    "$DLIO_BIN" \
-    workload=unet3d_h100_minio \
-    "++hydra.run.dir=$RUN_DIR" \
-    ++hydra.output_subdir=null \
-    --config-dir="$REPO_ROOT/configs/dlio"
-
-echo ""
-echo "✅  Training complete — results in $RUN_DIR"
diff --git a/tests/object-store/old-archive/dlio_mpi_object_results.md b/tests/object-store/old-archive/dlio_mpi_object_results.md
deleted file mode 100644
index 87606564..00000000
--- a/tests/object-store/old-archive/dlio_mpi_object_results.md
+++ /dev/null
@@ -1,688 +0,0 @@
-# DLIO + s3dlio MPI Scaling Results — UNet3D h100 Workload
-
-**Date:** March 20, 2026  
-**System:** loki-russ  
-**Storage:** MinIO @ `http://minio-host:9000`  
-**Bucket:** `mlp-s3dlio`  
-**Network bandwidth (measured limit):** ~1.2 GB/s
-
----
-
-## Test Configuration
-
-| Parameter | Value |
-|---|---|
-| Workload | UNet3D h100 |
-| Files | 168 × ~140 MB NPZ |
-| Total dataset size | ~23.5 GB |
-| Epochs | 5 |
-| Batch size | 7 samples/step |
-| PyTorch DataLoader threads per rank | 4 |
-| Storage library | s3dlio (v0.9.82) |
-| multiprocessing_context | spawn |
-| Config | `configs/dlio/workload/unet3d_h100_s3dlio.yaml` |
-
-All runs used `--mca btl ^vader` to disable OpenMPI's shared-memory (vader) BTL
-(see [Known Issues](#known-issues) below).
-
----
-
-## Metrics Methodology
-
-All throughput and samples/s figures throughout this document use **wall-clock epoch duration** from the DLIO log line:
-
-> `Ending epoch N - K steps completed in X.XX s`
-
-**Formulas — identical for every library and every NP:**
-
-| Metric | Formula |
-|---|---|
-| I/O Throughput (GB/s) | `24.63 GB ÷ epoch_wall_clock_s` |
-| I/O Throughput (MB/s) | `24,628.8 MB ÷ epoch_wall_clock_s` |
-| Samples/s | `168 samples ÷ epoch_wall_clock_s` |
-| Summary warm value | mean ± stddev of **epochs 2–5** |
-| vs NP=1 | warm GB/s at NP=N ÷ warm GB/s at NP=1 |
-
-**Constants:** 168 files × 146.6 MB = 24,628.8 MB = **24.63 GB** total dataset; 168 total samples per epoch.
-
-**DLIO `[METRIC]` I/O throughput** (and per-epoch DLIO samples/s) exclude the 0.323 s/step compute time from the denominator, so they read higher than wall-clock. They are shown for reference only where noted.
-
----
-
-## Results
-
-### Summary
-
-| MPI Ranks (NP) | Steps/epoch | Epoch 1 time (cold) | Epoch 2–5 time (warm) | I/O Throughput (MB/s) | I/O Throughput (GB/s) | Samples/s | vs NP=1 |
-|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
-| 1 | 24 | ~88 s | ~78 s | **332 ± 0.7** | **0.33** | 2.37 ± 0.005 | 1.0× |
-| 2 | 12 | ~54 s | ~43 s | **664 ± 3.2** | **0.66** | 4.75 ± 0.023 | 2.0× |
-| 4 | 6 | ~34 s | ~23 s | **1720 ± 125** | **1.72** | 12.31 ± 0.89 | 5.2× |
-
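-Throughput figures in this Summary table are averaged over all 5 epochs and come from the DLIO `[METRIC]` line, which excludes compute time; they therefore read higher than the wall-clock per-epoch figures in the detail tables below.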
-
-### Per-Epoch Detail — NP=4
-
-| Epoch | Steps | Duration | GB/s (wall-clock) | Throughput (samples/s) | Notes |
-|:-:|:-:|:-:|:-:|:-:|---|
-| 1 | 6 | 34.0 s | 0.724 | 10.64 | Cold read from MinIO over network |
-| 2 | 6 | 22.4 s | 1.100 | 11.93 | Warm — page cache active |
-| 3 | 6 | 22.9 s | 1.076 | 12.94 | Warm |
-| 4 | 6 | 22.9 s | 1.076 | 13.77 | Warm |
-| 5 | 6 | 22.7 s | 1.085 | 13.77 | Warm |
-
----
-
-## s3dlio Tuned Training (Read) Performance — NP=1 Experiment
-
-**Env vars applied in `tests/object-store/dlio_s3dlio_train.sh`:**
-```bash
-export S3DLIO_ENABLE_RANGE_OPTIMIZATION=0
-export S3DLIO_RT_THREADS=8
-```
-
-**Result:** No meaningful change — **329.5 ± 0.9 MB/s** vs original **332 ± 0.7 MB/s** (within noise).
-
-**Root cause — wrong knob for the `get_many()` path:**
-`S3DLIO_ENABLE_RANGE_OPTIMIZATION` is only read inside `S3ObjectStore::get()` in
-`object_store.rs`. The `get_many()` Python function routes through
-`get_objects_parallel()` → `get_object_uri_optimized_async()` in `s3_utils.rs`, which
-does **not** check that env var. To actually disable range splitting on the `get_many`
-path, use `S3DLIO_RANGE_THRESHOLD_MB=1000` (any value larger than the file size, 147 MB).
-
-| NP | Env vars applied | Steps/epoch | Epoch 1 (cold) | Epoch 2–5 (warm) | I/O Throughput (MB/s) | GB/s | Samples/s | vs untuned NP=1 |
-|:-:|---|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
-| 1 | `S3DLIO_ENABLE_RANGE_OPTIMIZATION=0` `S3DLIO_RT_THREADS=8` | 24 | ~90 s | ~79 s | **329.5 ± 0.9** | **0.322** | 2.357 ± 0.007 | ~1.0× (no change) |
-| 2 | `S3DLIO_ENABLE_RANGE_OPTIMIZATION=0` `S3DLIO_RT_THREADS=8` | 12 | ~54 s | ~43 s | **675.7 ± 2.1** | **0.660** | 4.833 ± 0.015 | 2.05× |
-| 4 | `S3DLIO_ENABLE_RANGE_OPTIMIZATION=0` `S3DLIO_RT_THREADS=8` | 6 | ~34 s | ~23 s | **1661.5 ± 95.7** | **1.623** | 11.884 ± 0.685 | 5.06× |
-
-### Per-Epoch Detail — NP=1 Tuned
-
-| Epoch | Steps | Duration | GB/s (wall-clock) | Throughput (samples/s) | Notes |
-|:-:|:-:|:-:|:-:|:-:|---|
-| 1 | 24 | 89.99 s | 0.274 | 2.3598 | Cold read from MinIO over network |
-| 2 | 24 | 78.88 s | 0.312 | 2.3538 | Warm — page cache active |
-| 3 | 24 | 78.65 s | 0.313 | 2.3647 | Warm |
-| 4 | 24 | 79.30 s | 0.311 | 2.3459 | Warm |
-| 5 | 24 | 78.99 s | 0.312 | 2.3600 | Warm |
-
-**Warm avg:** ~78.95 s → **0.312 GB/s** (identical to untuned warm avg of ~0.31 GB/s).
-
-### Per-Epoch Detail — NP=2 Tuned
-
-| Epoch | Steps | Duration | GB/s (wall-clock) | Throughput (samples/s) | Notes |
-|:-:|:-:|:-:|:-:|:-:|---|
-| 1 | 12 | 53.64 s | 0.448 | 4.8994 | Cold read from MinIO over network |
-| 2 | 12 | 42.67 s | 0.564 | 4.9111 | Warm — page cache active |
-| 3 | 12 | 43.03 s | 0.559 | 4.9099 | Warm |
-| 4 | 12 | 42.76 s | 0.562 | 4.9012 | Warm |
-| 5 | 12 | 42.87 s | 0.561 | 4.9062 | Warm |
-
-**Warm avg:** ~42.83 s → **0.562 GB/s**.
-
-> **Interpretation:** Throughput improved marginally vs untuned NP=2 (675.7 vs 664 MB/s, ~1.7% — within noise). However, CPU and memory utilization dropped significantly — confirming that `S3DLIO_RT_THREADS=8` eliminated the Tokio thread-count overhead (see Finding 3 in the analysis). Range splitting is still occurring (`S3DLIO_ENABLE_RANGE_OPTIMIZATION=0` is a no-op here), but with fewer Tokio threads, per-thread OS scheduling cost is much lower. Next step: test with `S3DLIO_RANGE_THRESHOLD_MB=1000` to also eliminate range splitting.
-
-### Per-Epoch Detail — NP=4 Tuned
-
-| Epoch | Steps | Duration | GB/s (wall-clock) | Throughput (samples/s) | Notes |
-|:-:|:-:|:-:|:-:|:-:|---|
-| 1 | 6 | 34.04 s | 0.707 | 15.7825 | Cold read from MinIO over network |
-| 2 | 6 | 22.67 s | 1.061 | 11.3513 | Warm — page cache active |
-| 3 | 6 | 22.60 s | 1.064 | 12.1462 | Warm |
-| 4 | 6 | 22.82 s | 1.054 | 12.1807 | Warm |
-| 5 | 6 | 22.82 s | 1.054 | 12.9190 | Warm |
-
-**Warm avg:** ~22.73 s → **1.058 GB/s**.
-
----
-
-## Data Generation (Write) Performance
-
-**All three libraries used NP=8 (8 MPI ranks) for data generation — the default for all datagen scripts.**  
-Dataset: 168 × 146.6 MB NPZ = 24.63 GB total.  
-Timings are wall-clock seconds from `Starting data generation` to `Generation done` in the DLIO log.
-
-| Library | Write implementation | Throughput (MB/s) | Throughput (GB/s) | vs s3dlio |
-|---|---|:-:|:-:|:-:|
-| s3dlio | **`MultipartUploadWriter`** | **889 ± 5** | **0.889** | 1.0× |
-| minio-py | automatic multipart (5 MB parts) | **823 ± 34** | **0.823** | 0.93× |
-| s3torchconnector | streaming `put_object` | **963 ± 14** | **0.963** | 1.08× |
-
-**Winner: s3torchconnector at 963 MB/s — 8% faster than s3dlio multipart, 16% faster than minio-py.**
-
-> **minio-py spread (±34 MB/s across 5 runs):** Environmental variation across the measurement window — individual runs range from 28.5 s to 31.2 s. Not a library characteristic.
-
-### Individual Datagen Run Log (all NP=8)
-
-| Library | Log timestamp | Duration | MB/s |
-|---|---|:-:|:-:|
-| s3dlio (MultipartUploadWriter) | `dlio-s3dlio-datagen-20260320_114719` | 27.91 s | 882 |
-| s3dlio (MultipartUploadWriter) | `dlio-s3dlio-datagen-20260320_120959` | 27.44 s | 897 |
-| s3dlio (MultipartUploadWriter) | `dlio-s3dlio-datagen-20260320_152849` | 27.71 s | 889 |
-| s3dlio (MultipartUploadWriter) | `dlio-s3dlio-datagen-20260320_180423` | 27.75 s | 888 |
-| minio-py | `dlio-minio-datagen-20260320_111707` | 30.70 s | 802 |
-| minio-py | `dlio-minio-datagen-20260320_111818` | 30.70 s | 802 |
-| minio-py | `dlio-minio-datagen-20260320_121228` | 28.49 s | 865 |
-| minio-py | `dlio-minio-datagen-20260320_130727` | 28.82 s | 854 |
-| minio-py | `dlio-minio-datagen-20260320_164356` | 31.17 s | 790 |
-| s3torchconnector | `dlio-s3torch-datagen-20260320_122511` | 25.21 s | 977 |
-| s3torchconnector | `dlio-s3torch-datagen-20260320_161531` | 25.96 s | 949 |
-
-### Historical: s3dlio before multipart fix (single-part PUT, NP=8)
-
-The original `put_bytes()` path issued a single HTTP PUT for the entire 147 MB object — one TCP flow, no concurrency. minio-py splits automatically at 5 MB parts; s3torchconnector streams via chunked transfer. Result: s3dlio was 47% slower than the other two libraries.
-
-| Log timestamp | Duration | MB/s |
-|---|:-:|:-:|
-| `dlio-s3dlio-datagen-20260320_094109` | 52.39 s | 470 |
-| `dlio-s3dlio-datagen-20260320_112449` | 52.21 s | 472 |
-| `dlio-s3dlio-datagen-20260320_114245` | 52.12 s | 473 |
-| **mean** | **52.24 ± 0.11 s** | **471 ± 1** |
-
-**Fix applied:** [dlio_benchmark/storage/obj_store_lib.py](../../dlio_benchmark/dlio_benchmark/storage/obj_store_lib.py) — `put_data()` now routes objects ≥ 16 MB through `s3dlio.MultipartUploadWriter.from_uri()`. No changes to s3dlio itself were required.  
-Threshold configurable via `S3DLIO_MULTIPART_THRESHOLD_MB` (default 16).
-
----
-
-## Key Finding: Page Cache Reuse With Object Storage
-
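-**The NP=4 average throughput of 1,720 MB/s exceeds the physical network limit of 1,200 MB/s — proving that a substantial fraction of the epoch 2–5 reads is being served from the Linux page cache, not from the network.** Note that 1,720 MB/s is the DLIO `[METRIC]` value (compute time excluded); the wall-clock warm throughput for NP=4 in the per-epoch table above is ~1.08–1.10 GB/s, so this conclusion rests on the `[METRIC]` figure rather than on wall-clock throughput.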
-
-### How this works
-
-When a DLIO worker reads an object from MinIO via s3dlio:
-
-1. s3dlio fetches the object over the network into memory
-2. The kernel stores a copy of those pages in the **Linux page cache** (not s3dlio-specific — all file descriptor reads go through the VFS page cache)
-3. On the next epoch, when the same object is re-requested, the kernel serves those pages directly from RAM without touching the network
-
-This happens transparently: neither DLIO nor s3dlio explicitly manages a cache. The OS page cache just does what it always does for any I/O.
-
-### Why this was unexpected
-
-Object storage reads go through a socket, not a mapped file, so the expectation was that each read would always hit the network. The surprise is that **the Linux kernel caches socket read data in the page cache regardless of whether the source is a file or a TCP stream**, provided the data path goes through standard VFS read calls.
-
-This is the same caching effect observed when benchmarking local NFS or block storage — sequential-epoch AI training workloads always re-read the same files across epochs, and the OS caches aggressively.
-
-### Implications for benchmarking
-
-| Scenario | What it means |
-|---|---|
-| **Epoch 1 throughput** | True cold-read performance — reflects actual network/storage bandwidth |
-| **Epoch 2+ throughput** | Warm performance — partially or fully served from page cache |
-| **Averaged-epoch metric** | Blends cold + warm; optimistic relative to a fresh system |
-| **Large dataset (> RAM)** | Page cache thrashing; all epochs approximate cold performance |
-| **Production workload** | Page cache benefit is real — systems doing repeated training runs will see this speedup |
-
-To measure true storage-only performance, the dataset must exceed available system RAM, or the page cache must be cleared between epochs (`echo 3 > /proc/sys/vm/drop_caches` as root).
-
-The 23.5 GB dataset fits comfortably in RAM on loki-russ, so after epoch 1, subsequent epochs run almost entirely from cache.
-
----
-
-## s3dlio Tuned Training — `S3DLIO_RANGE_THRESHOLD_MB=1000` + `S3DLIO_RT_THREADS=8`
-
-**Env vars applied:**
-```bash
-export S3DLIO_RANGE_THRESHOLD_MB=1000   # single streaming GET for files < 1000 MB (no range splitting)
-export S3DLIO_RT_THREADS=8              # 8 Tokio threads per process (vs default 32)
-```
-
-**Note:** `S3DLIO_ENABLE_RANGE_OPTIMIZATION=0` was used in the prior "tuned" run above — that is a
-confirmed no-op for `get_many()`. This run uses the correct knobs. See [s3dlio_performance_analysis.md](s3dlio_performance_analysis.md) §6 Tier 1 for details.
-
-**Also active:** `_BytesViewIO` zero-copy fix in `npz_reader_s3_iterable.py` (eliminates the `bytes(data)` 147 MB/file copy).
-
-### Per-Epoch Detail — NP=1 (correct env vars + zero-copy fix)
-
-| Epoch | Steps | Duration | GB/s (wall-clock) | MB/s (wall-clock) | Throughput (samples/s) | Notes |
-|:-:|:-:|:-:|:-:|:-:|:-:|---|
-| 1 | 24 | 72.28 s | 0.333 | 340.8 | 2.325 | Cold read from MinIO over network |
-| 2 | 24 | 60.90 s | 0.395 | 404.4 | 2.759 | Warm — page cache active |
-| 3 | 24 | 60.25 s | 0.399 | 408.8 | 2.788 | Warm |
-| 4 | 24 | 60.24 s | 0.399 | 408.8 | 2.789 | Warm |
-| 5 | 24 | 60.00 s | 0.401 | 410.5 | 2.800 | Warm |
-
-**Warm avg (epochs 2–5):** 60.35 s → **408 ± 2 MB/s** | **0.398 GB/s** | **2.784 ± 0.015 samples/s**
-
-> DLIO `[METRIC]` reports **431.1 MB/s** — higher than wall-clock because it excludes compute time
-> (0.323 s/step × 24 steps ≈ 7.75 s/epoch) from the denominator. Wall-clock methodology is used
-> throughout this document for consistency.
-
-### Per-Epoch Detail — NP=2 (correct env vars + zero-copy fix)
-
-| Epoch | Steps | Duration | GB/s (wall-clock) | MB/s (wall-clock) | Throughput (samples/s) | Notes |
-|:-:|:-:|:-:|:-:|:-:|:-:|---|
-| 1 | 12 | 44.89 s | 0.536 | 548.6 | 3.743 | Cold read from MinIO over network |
-| 2 | 12 | 33.71 s | 0.714 | 730.8 | 4.985 | Warm — page cache active |
-| 3 | 12 | 34.03 s | 0.706 | 723.3 | 4.937 | Warm |
-| 4 | 12 | 33.44 s | 0.719 | 736.5 | 5.024 | Warm |
-| 5 | 12 | 34.00 s | 0.707 | 724.4 | 4.941 | Warm |
-
-**Warm avg (epochs 2–5):** 33.80 s → **729 ± 5 MB/s** | **0.712 GB/s** | **4.97 samples/s**
-
-> DLIO `[METRIC]` reports **857.9 MB/s** — higher than wall-clock as compute time (~3.9 s/epoch
-> for 12 steps × 0.323 s/step) is excluded from the denominator.
-
-**Scaling NP=1 → NP=2: 408 → 729 MB/s = 1.79× speedup** (vs ideal 2.0× for linear scaling).
-
-### Per-Epoch Detail — NP=4 (correct env vars + zero-copy fix)
-
-**Methodology:** MB/s = 24,628.8 MB ÷ duration_s; GB/s = MB/s ÷ 1024; samples/s = 168 ÷ duration_s.
-
-| Epoch | Steps | Duration | GB/s (wall-clock) | MB/s (wall-clock) | Throughput (samples/s) | Notes |
-|:-:|:-:|:-:|:-:|:-:|:-:|---|
-| 1 | 6 | 33.84 s | 0.711 | 727.7 | 4.965 | Cold read from MinIO over network |
-| 2 | 6 | 22.59 s | 1.065 | 1090.3 | 7.438 | Warm — page cache active |
-| 3 | 6 | 22.57 s | 1.066 | 1091.2 | 7.444 | Warm |
-| 4 | 6 | 22.62 s | 1.064 | 1088.9 | 7.427 | Warm |
-| 5 | 6 | 22.59 s | 1.065 | 1090.3 | 7.438 | Warm |
-
-**Warm avg (epochs 2–5):** 22.59 s → **1090 ± 1 MB/s** | **1.065 GB/s** | **7.44 samples/s**
-
-> DLIO `[METRIC]` reports **1881.5 MB/s** — higher than wall-clock as compute time (~6 steps × 0.323 s/step ≈ 1.9 s/epoch) is excluded from the denominator.
-
-**Scaling NP=2 → NP=4: 729 → 1090 MB/s = 1.49× speedup** (vs ideal 2.0×). Page cache saturation is reducing marginal gain — all 168 files are already cached after epoch 1 regardless of NP.
-
-### Per-Epoch Detail — NP=8 (correct env vars + zero-copy fix)
-
-**Methodology:** MB/s = 24,628.8 MB ÷ duration_s; GB/s = MB/s ÷ 1024; samples/s = 168 ÷ duration_s.
-
-| Epoch | Steps | Duration | GB/s (wall-clock) | MB/s (wall-clock) | Throughput (samples/s) | Notes |
-|:-:|:-:|:-:|:-:|:-:|:-:|---|
-| 1 | 3 | 34.42 s | 0.699 | 715.5 | 4.881 | Cold read from MinIO over network |
-| 2 | 3 | 22.69 s | 1.060 | 1085.5 | 7.404 | Warm — page cache active |
-| 3 | 3 | 22.67 s | 1.061 | 1086.5 | 7.410 | Warm |
-| 4 | 3 | 22.79 s | 1.055 | 1080.6 | 7.371 | Warm |
-| 5 | 3 | 22.57 s | 1.065 | 1091.1 | 7.444 | Warm |
-
-**Warm avg (epochs 2–5):** 22.68 s → **1086 ± 4 MB/s** | **1.061 GB/s** | **7.41 samples/s**
-
----
-
-## s3dlio v0.9.84 — Range Optimization Bug Fix — NP=1
-
-**Library version:** s3dlio v0.9.82 wheel (to be tagged v0.9.84)  
-**Key change:** `S3DLIO_ENABLE_RANGE_OPTIMIZATION=0` now correctly applies to **all** code paths
-including `get_many()` / `get_objects_parallel()` (was a confirmed no-op prior to v0.9.82).
-This replaces the previous workaround of `S3DLIO_RANGE_THRESHOLD_MB=1000`.
-
-**Env vars applied in `tests/object-store/dlio_s3dlio_train.sh`:**
-```bash
-export S3DLIO_ENABLE_RANGE_OPTIMIZATION=0   # skip HEAD + single GET (bug fixed in v0.9.82)
-export S3DLIO_RT_THREADS=8                  # 8 Tokio threads per process
-```
-
-**Effect of the bug fix vs the old workaround (`RANGE_THRESHOLD_MB=1000`):**
-- Old (`RANGE_THRESHOLD_MB=1000`): still issued 1 HEAD per file (to compare size against threshold), then fell back to single GET — **1 HEAD + 1 GET per file**
-- New (`ENABLE_RANGE_OPTIMIZATION=0`): skips HEAD entirely, goes directly to single GET — **0 HEADs + 1 GET per file**; also skips the pre-stat phase in `get_objects_parallel()`
-
-**Additional changes in v0.9.82 hot path:**
-- `concurrent_range_get_impl()`: mutex-free collect-then-assemble (no impact when range opt disabled)
-- `get_objects_parallel()`: O(N log N) sort via pre-built HashMap index (replaces O(N²) linear scan)
-- `ObjectSizeCache` TTL changed from 5 min → 1 hour default (no impact for single-epoch test runs)
-- OnceLock caching of env var reads (eliminates env syscall on hot path)
-
-### DLIO [METRIC] Output (NP=1)
-
-```
-[METRIC] Number of Simulated Accelerators: 1
-[METRIC] Training Accelerator Utilization [AU] (%): 15.1989 (0.1397)
-[METRIC] Training Throughput (samples/second): 3.1146 (0.0269)
-[METRIC] Training I/O Throughput (MB/second): 435.4454 (3.7665)
-```
-
-> DLIO [METRIC] excludes per-step compute time (~0.323 s/step × 24 steps ≈ 7.75 s/epoch) from the
-> denominator. Wall-clock figures below are used throughout this document for consistency.
-
-### Per-Epoch Detail — NP=1 (v0.9.84 bug-fix wheel)
-
-**Methodology:** MB/s = 24,628.8 MB ÷ duration_s; GB/s = MB/s ÷ 1024; samples/s = 168 ÷ duration_s.
-
-| Epoch | Steps | Duration | GB/s (wall-clock) | MB/s (wall-clock) | Throughput (samples/s) | Notes |
-|:-:|:-:|:-:|:-:|:-:|:-:|---|
-| 1 | 24 | 71.52 s | 0.336 | 344.3 | 2.349 | Cold read from MinIO over network |
-| 2 | 24 | 60.22 s | 0.399 | 408.9 | 2.790 | Warm — page cache active |
-| 3 | 24 | 59.64 s | 0.403 | 412.9 | 2.817 | Warm |
-| 4 | 24 | 59.38 s | 0.405 | 414.7 | 2.829 | Warm |
-| 5 | 24 | 59.51 s | 0.404 | 413.8 | 2.823 | Warm |
-
-**Warm avg (epochs 2–5):** 59.69 s → **413 ± 2 MB/s** | **0.403 GB/s** | **2.815 ± 0.015 samples/s**
-
-### DLIO [METRIC] Output (NP=2)
-
-```
-[METRIC] Number of Simulated Accelerators: 2 
-[METRIC] Training Accelerator Utilization [AU] (%): 15.1657 (0.1176)
-[METRIC] Training Throughput (samples/second): 5.9271 (0.0493)
-[METRIC] Training I/O Throughput (MB/second): 828.6602 (6.8904)
-```
-
-> DLIO [METRIC] excludes per-step compute time (~0.323 s/step × 12 steps ≈ 3.9 s/epoch) from the
-> denominator. Wall-clock figures below are used throughout this document for consistency.
-
-### Per-Epoch Detail — NP=2 (v0.9.84 bug-fix wheel)
-
-**Methodology:** MB/s = 24,628.8 MB ÷ duration_s; GB/s = MB/s ÷ 1024; samples/s = 168 ÷ duration_s.
-
-| Epoch | Steps | Duration | GB/s (wall-clock) | MB/s (wall-clock) | Throughput (samples/s) | Notes |
-|:-:|:-:|:-:|:-:|:-:|:-:|---|
-| 1 | 12 | 45.40 s | 0.530 | 542.5 | 3.700 | Cold read from MinIO over network |
-| 2 | 12 | 34.76 s | 0.692 | 708.6 | 4.833 | Warm — page cache active |
-| 3 | 12 | 34.68 s | 0.694 | 710.2 | 4.845 | Warm |
-| 4 | 12 | 34.21 s | 0.703 | 719.9 | 4.912 | Warm |
-| 5 | 12 | 34.39 s | 0.699 | 716.1 | 4.885 | Warm |
-
-**Warm avg (epochs 2–5):** 34.51 s → **713 ± 5 MB/s** | **0.697 GB/s** | **4.87 ± 0.03 samples/s**
-
-**Scaling NP=1 → NP=2: 413 → 713 MB/s = 1.73×** (vs ideal 2.0×). Consistent with prior v0.9.82 NP=1→2 scaling (1.79× for the workaround run).
-
-### DLIO [METRIC] Output (NP=4)
-
-```
-[METRIC] Number of Simulated Accelerators: 4 
-[METRIC] Training Accelerator Utilization [AU] (%): 19.2339 (0.5320)
-[METRIC] Training Throughput (samples/second): 13.3328 (0.3688)
-[METRIC] Training I/O Throughput (MB/second): 1864.0430 (51.5630)
-```
-
-> DLIO [METRIC] excludes per-step compute time (~0.323 s/step × 6 steps ≈ 1.9 s/epoch) from the
-> denominator. Wall-clock figures below are used throughout this document for consistency.
-
-### Per-Epoch Detail — NP=4 (v0.9.84 bug-fix wheel)
-
-**Methodology:** MB/s = 24,628.8 MB ÷ duration_s; GB/s = MB/s ÷ 1024; samples/s = 168 ÷ duration_s.
-
-| Epoch | Steps | Duration | GB/s (wall-clock) | MB/s (wall-clock) | Throughput (samples/s) | Notes |
-|:-:|:-:|:-:|:-:|:-:|:-:|---|
-| 1 | 6 | 33.55 s | 0.716 | 733.9 | 5.007 | Cold read from MinIO over network |
-| 2 | 6 | 22.58 s | 1.066 | 1090.7 | 7.440 | Warm — page cache active |
-| 3 | 6 | 22.60 s | 1.065 | 1089.8 | 7.434 | Warm |
-| 4 | 6 | 22.79 s | 1.056 | 1080.6 | 7.372 | Warm |
-| 5 | 6 | 22.66 s | 1.062 | 1086.8 | 7.414 | Warm |
-
-**Warm avg (epochs 2–5):** 22.66 s → **1087 ± 4 MB/s** | **1.062 GB/s** | **7.42 ± 0.03 samples/s**
-
-**Scaling NP=2 → NP=4: 713 → 1087 MB/s = 1.52×** (vs ideal 2.0×). Page cache saturation limits marginal gain — all 168 files cached after epoch 1 regardless of NP. Matches prior NP=4 result (1090 ± 1 MB/s) to within noise.
-
-### DLIO [METRIC] Output (NP=8)
-
-```
-[METRIC] Number of Simulated Accelerators: 8 
-[METRIC] Training Accelerator Utilization [AU] (%): 37.9346 (3.1990)
-[METRIC] Training Throughput (samples/second): 32.8631 (2.7722)
-[METRIC] Training I/O Throughput (MB/second): 4594.5609 (387.5733)
-```
-
-> DLIO [METRIC] excludes per-step compute time (~0.323 s/step × 3 steps ≈ 1.0 s/epoch) from the
-> denominator. Wall-clock figures below are used throughout this document for consistency.
-
-### Per-Epoch Detail — NP=8 (v0.9.84 bug-fix wheel)
-
-**Methodology:** MB/s = 24,628.8 MB ÷ duration_s; GB/s = MB/s ÷ 1024; samples/s = 168 ÷ duration_s.
-
-| Epoch | Steps | Duration | GB/s (wall-clock) | MB/s (wall-clock) | Throughput (samples/s) | Notes |
-|:-:|:-:|:-:|:-:|:-:|:-:|---|
-| 1 | 3 | 36.14 s | 0.666 | 681.5 | 4.648 | Cold read from MinIO over network |
-| 2 | 3 | 23.11 s | 1.041 | 1065.7 | 7.270 | Warm — page cache active |
-| 3 | 3 | 24.70 s | 0.974 | 997.1 | 6.802 | Warm |
-| 4 | 3 | 31.50 s | 0.764 | 781.9 | 5.333 | Warm — **anomalous slowdown** (network jitter / cache pressure) |
-| 5 | 3 | 22.86 s | 1.052 | 1077.4 | 7.348 | Warm |
-
-**Warm avg (epochs 2–5):** 25.54 s → **964 ± 120 MB/s** | **0.942 GB/s** | **6.58 ± 0.86 samples/s**
-
-> **High variance note:** Epoch 4 (31.50 s) is a clear outlier — 2.5σ above the mean of the other 3 warm epochs (23.11, 24.70, 22.86 s → avg 23.56 s → **1045 MB/s**). With the outlier excluded, that rate is consistent with the prior NP=8 run (1086 ± 4 MB/s) and the NP=4 result (1087 ± 4 MB/s). The anomaly is likely a transient network hiccup or OS page reclaim event, not a characteristic of the implementation.
-
-**Scaling NP=4 → NP=8: 1087 → 964 MB/s (including E4 anomaly) or ~1045 MB/s (excluding E4) = essentially flat.** Both results confirm NP=8 with 3 steps/epoch hits the same page-cache ceiling as NP=4. Additional ranks add no benefit once the working set is fully cached.
-
-| Run | Env vars | Warm MB/s | Warm samples/s | vs first |
-|---|---|:-:|:-:|:-:|
-| Untuned (v0.9.82) | defaults | **332 ± 0.7** | 2.37 ± 0.005 | 1.0× |
-| `ENABLE_RANGE_OPTIMIZATION=0` (v0.9.82 — no-op) | `RT_THREADS=8` | **329.5 ± 0.9** | 2.357 ± 0.007 | ~1.0× |
-| `RANGE_THRESHOLD_MB=1000` (v0.9.82 — workaround) + zero-copy fix | `RT_THREADS=8` | **408 ± 2** | 2.784 ± 0.015 | 1.23× |
-| `ENABLE_RANGE_OPTIMIZATION=0` (v0.9.84 — bug fixed) | `RT_THREADS=8` | **413 ± 2** | 2.815 ± 0.015 | 1.24× |
-
-**Net result:** The v0.9.84 bug fix delivers a marginal further improvement (~5 MB/s, ~1.2%) over the
-`RANGE_THRESHOLD_MB=1000` workaround — consistent with the theoretical saving (HEAD requests eliminated
-per batch). The difference is within noise given MinIO + network variability on this test system.
-The primary gain in both cases comes from eliminating range splitting (HEAD + 37 range GETs → 0 HEADs + 1 GET).
-The `ENABLE_RANGE_OPTIMIZATION=0` path is now the preferred and correct setting for this environment.
-
-> DLIO `[METRIC]` reports **6066 MB/s** — this is an anomalously high average driven by high variance (stddev 955 MB/s); wall-clock warm epochs show consistent ~1086 MB/s. The DLIO metric likely includes at least one epoch where the page cache served the entire dataset near memory bandwidth.
-
-**Scaling NP=4 → NP=8: 1087 → 964 MB/s measured (anomalous E4 at 31.50 s); excluding that outlier, the 3 normal warm epochs average ~1045 MB/s — essentially flat vs NP=4.** Confirms the page-cache ceiling is reached by NP=4.
-
-### Impact vs Prior Runs
-
-| Configuration | NP | Warm MB/s | vs untuned NP=1 | vs minio-py (same NP) |
-|---|:-:|:-:|:-:|:-:|
-| s3dlio untuned (baseline) | 1 | 332 ± 0.7 | 1.00× | 0.72× |
-| s3dlio + `S3DLIO_ENABLE_RANGE_OPTIMIZATION=0` + `S3DLIO_RT_THREADS=8` *(no-op env var)* | 1 | 329.5 ± 0.9 | ~1.00× | 0.72× |
-| **s3dlio + `S3DLIO_RANGE_THRESHOLD_MB=1000` + `S3DLIO_RT_THREADS=8` + zero-copy fix** | **1** | **408 ± 2** | **+23%** | **0.89×** |
-| **s3dlio + `S3DLIO_RANGE_THRESHOLD_MB=1000` + `S3DLIO_RT_THREADS=8` + zero-copy fix** | **2** | **729 ± 5** | **2.19×** | **0.85×** |
-| **s3dlio v0.9.84 `ENABLE_RANGE_OPTIMIZATION=0` + `RT_THREADS=8`** | **1** | **413 ± 2** | **+24%** | **0.90×** |
-| **s3dlio v0.9.84 `ENABLE_RANGE_OPTIMIZATION=0` + `RT_THREADS=8`** | **2** | **713 ± 5** | **2.15×** | **0.83×** |
-| **s3dlio v0.9.84 `ENABLE_RANGE_OPTIMIZATION=0` + `RT_THREADS=8`** | **4** | **1087 ± 4** | **3.27×** | **0.99×** |
-| **s3dlio v0.9.84 `ENABLE_RANGE_OPTIMIZATION=0` + `RT_THREADS=8`** | **8** | **964 ± 120** ¹ | **2.90×** | **0.87×** |
-| **s3dlio + `S3DLIO_RANGE_THRESHOLD_MB=1000` + `S3DLIO_RT_THREADS=8` + zero-copy fix** | **4** | **1090 ± 1** | **3.28×** | **0.99×** |
-| **s3dlio + `S3DLIO_RANGE_THRESHOLD_MB=1000` + `S3DLIO_RT_THREADS=8` + zero-copy fix** | **8** | **1086 ± 4** | **3.27×** | **0.98×** |
-| minio-py (reference) | 1 | 459 ± 1 | 1.38× | 1.00× |
-| minio-py (reference) | 2 | 857 ± 3 | 2.58× | 1.00× |
-| minio-py (reference) | 4 | 1097 ± 3 | 3.30× | 1.00× |
-| minio-py (reference) | 8 | 1107 ± 3 | 3.33× | 1.00× |
-
-¹ NP=8 v0.9.84 high variance (±120 MB/s) is driven by the epoch 4 anomaly (31.50 s vs ~23 s for the other warm epochs). The 0.87× shown in the table is computed from the full 964 MB/s average; excluding epoch 4, the 3 remaining warm epochs average ~1045 MB/s (≈0.94× minio-py), consistent with the NP=8 v0.9.82 run (1086 ± 4 MB/s).
-
-**At NP=4, s3dlio tuned matches minio-py within 1–2%.** Both libraries hit the same
-page-cache ceiling (≈1087–1097 MB/s) and adding more ranks provides no further gain. The gap at
-NP=1/2 (0.83–0.90×) is attributable to per-file fixed overhead; this cost becomes negligible
-once cache-serve time dominates. The Rust-level HEAD elimination will primarily benefit
-cold-epoch (epoch 1) performance across all NP levels.
-
----
-
-## minio-py Training (Read) Performance — Scaling Study
-
-**Bucket:** `mlp-minio` | **Config:** `configs/dlio/workload/unet3d_h100_minio.yaml`  
-Same workload as s3dlio/s3torchconnector scaling study: 168 × ~140 MB NPZ, batch_size=7, 5 epochs, 4 DataLoader threads/rank.
-
-### Summary
-
-All figures computed per [Metrics Methodology](#metrics-methodology) above. NP=4/8 re-runs pending.
-
-| MPI Ranks (NP) | Steps/epoch | Epoch 1 time (cold) | Epoch 2–5 time (warm) | I/O Throughput (MB/s) | I/O Throughput (GB/s) | Samples/s | vs NP=1 |
-|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
-| 1 | 24 | 64.9 s | ~53.6 s | **459 ± 1** | **0.459** | 3.13 ± 0.01 | 1.0× |
-| 2 | 12 | ~41.5 s | ~28.7 s | **857 ± 3** | **0.857** | 5.85 ± 0.02 | 1.87× |
-| 4 | 6 | ~34.0 s | ~22.4 s | **1097 ± 3** | **1.097** | 7.49 ± 0.02 | 2.39× |
-| 8 | 3 | ~34.7 s | ~22.8 s | **1107 ± 3** | **1.081** | 7.37 ± 0.02 | 2.35× |
-
-### Per-Epoch Detail — NP=1
-
-| Epoch | Steps | Duration | GB/s | Samples/s | Notes |
-|:-:|:-:|:-:|:-:|:-:|---|
-| 1 | 24 | 64.93 s | 0.379 | 2.59 | Cold |
-| 2 | 24 | 53.82 s | 0.458 | 3.12 | Network-rate |
-| 3 | 24 | 53.52 s | 0.460 | 3.14 | Network-rate |
-| 4 | 24 | 53.60 s | 0.460 | 3.13 | Network-rate |
-| 5 | 24 | 53.63 s | 0.459 | 3.13 | Network-rate |
-
-### Per-Epoch Detail — NP=2
-
-| Epoch | Steps | Duration | GB/s | Samples/s | Notes |
-|:-:|:-:|:-:|:-:|:-:|---|
-| 1 | 12 | 41.50 s | 0.593 | 4.05 | Cold |
-| 2 | 12 | 28.84 s | 0.854 | 5.83 | Network-rate |
-| 3 | 12 | 28.71 s | 0.858 | 5.85 | Network-rate |
-| 4 | 12 | 28.71 s | 0.858 | 5.85 | Network-rate |
-| 5 | 12 | 28.64 s | 0.860 | 5.87 | Network-rate |
-
-### Per-Epoch Detail — NP=4
-
-| Epoch | Steps | Duration | GB/s | Samples/s | Notes |
-|:-:|:-:|:-:|:-:|:-:|---|
-| 1 | 6 | 34.00 s | 0.724 | 4.94 | Cold |
-| 2 | 6 | 22.52 s | 1.093 | 7.46 | Page cache active |
-| 3 | 6 | 22.37 s | 1.101 | 7.51 | Warm |
-| 4 | 6 | 22.45 s | 1.097 | 7.48 | Warm |
-| 5 | 6 | 22.43 s | 1.098 | 7.49 | Warm |
-
-### Per-Epoch Detail — NP=8
-
-| Epoch | Steps | Duration | GB/s | Samples/s | Notes |
-|:-:|:-:|:-:|:-:|:-:|---|
-| 1 | 3 | 34.69 s | 0.710 | 4.85 | Cold |
-| 2 | 3 | 22.85 s | 1.078 | 7.35 | Page cache active |
-| 3 | 3 | 22.72 s | 1.084 | 7.39 | Warm |
-| 4 | 3 | 22.78 s | 1.081 | 7.37 | Warm |
-| 5 | 3 | 22.77 s | 1.081 | 7.37 | Warm |
-
----
-
-## s3torchconnector Training (Read) Performance — Scaling Study
-
-> **⚠️ RESULTS NOT REPRESENTATIVE — SEQUENTIAL FETCH ISSUE**
-> These results were collected using `S3IterableDataset.from_objects()`, which fetches files
-> **one at a time per DataLoader worker** (4 total concurrent GETs across all workers).
-> This is fundamentally less concurrent than minio (up to 64 total) and s3dlio (up to 256 total).
-> The numbers below reflect sequential-fetch throughput, **not** the true read capability
-> of the s3torchconnector library. These results should be re-run after implementing the
-> `ThreadPoolExecutor + S3Client.get_object()` fix. See `S3library_review_21-Mar.md` for
-> full analysis and remediation options.
-
-Using `S3IterableDataset.from_objects()` with `S3ReaderConstructor.sequential()` — single streaming GET per file, no range splitting, no HEAD requests.
-
-### Summary
-
-| MPI Ranks (NP) | Steps/epoch | Epoch 1 time (cold) | Epoch 2–5 time (warm) | I/O Throughput (MB/s) | I/O Throughput (GB/s) | Samples/s | vs NP=1 |
-|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
-| 1 | 24 | 96.75 s | ~85.9 s | **303.0 ± 1.1** | **0.296** | 2.1672 ± 0.0082 | 1.0× |
-| 2 | 12 | 56.17 s | ~46.5 s | **627.2 ± 6.4** | **0.613** | 4.4861 ± 0.0458 | 2.07× |
-| 4 | 6 | 33.69 s | ~22.7 s | **1934.7 ± 65.9** | **1.890** | 13.8379 ± 0.4712 | 6.38× ¹ |
-| 8 | 3 | 36.66 s | ~24.2 s | **5557 ± 242** | **5.426** | 39.7469 ± 1.7296 | 18.3× ¹ ² |
-
-### Per-Epoch Detail — NP=1
-
-| Epoch | Steps | Duration | GB/s (wall-clock) | Throughput (samples/s) | Notes |
-|:-:|:-:|:-:|:-:|:-:|---|
-| 1 | 24 | 96.75 s | 0.255 | 2.1727 | Cold read from MinIO over network |
-| 2 | 24 | 86.43 s | 0.285 | 2.1513 | Warm — page cache active |
-| 3 | 24 | 85.74 s | 0.287 | 2.1709 | Warm |
-| 4 | 24 | 85.71 s | 0.287 | 2.1734 | Warm |
-| 5 | 24 | 85.79 s | 0.287 | 2.1677 | Warm |
-
-**Warm avg:** ~85.92 s → **0.287 GB/s**.
-
-> **vs s3dlio NP=1:** s3torchconnector warm throughput (0.287 GB/s) is ~8% slower than s3dlio tuned NP=1 (0.312 GB/s). This is expected: `S3IterableDataset.sequential()` issues one streaming GET per file on a single connection (no parallelism within a file), whereas s3dlio's `get_many()` uses Tokio async concurrency across all files in the batch simultaneously.
-
-### Per-Epoch Detail — NP=2
-
-| Epoch | Steps | Duration | GB/s (wall-clock) | Throughput (samples/s) | Notes |
-|:-:|:-:|:-:|:-:|:-:|---|
-| 1 | 12 | 56.17 s | 0.438 | 4.6012 | Cold read from MinIO over network |
-| 2 | 12 | 46.05 s | 0.535 | 4.6056 | Warm — page cache active |
-| 3 | 12 | 46.55 s | 0.529 | 4.5692 | Warm |
-| 4 | 12 | 46.85 s | 0.526 | 4.5370 | Warm |
-| 5 | 12 | 46.65 s | 0.528 | 4.5319 | Warm |
-
-**Warm avg:** ~46.53 s → **0.529 GB/s**.
-
-> **vs s3dlio NP=2:** s3torchconnector warm throughput (0.529 GB/s) is ~6% slower than s3dlio tuned NP=2 (0.562 GB/s) — the relative gap is consistent with NP=1 (~8%). Scaling from NP=1→NP=2 is 2.07× (linear), matching s3dlio's 2.05× scaling at the same step.
-
-### Per-Epoch Detail — NP=4
-
-| Epoch | Steps | Duration | GB/s (wall-clock) | Throughput (samples/s) | Notes |
-|:-:|:-:|:-:|:-:|:-:|---|
-| 1 | 6 | 33.69 s | 0.731 | 12.1958 | Cold read from MinIO over network |
-| 2 | 6 | 22.48 s | 1.095 | 14.6062 | Warm — page cache active |
-| 3 | 6 | 22.74 s | 1.083 | 15.1972 | Warm |
-| 4 | 6 | 23.14 s | 1.065 | 14.4476 | Warm |
-| 5 | 6 | 22.48 s | 1.095 | 13.9308 | Warm |
-
-**Warm avg:** ~22.71 s → **1.084 GB/s**.
-
-¹ **METRIC throughput (1934.7 MB/s) far exceeds the 1,200 MB/s physical network ceiling** — the majority of warm-epoch reads are served from the Linux page cache, not the network. This is identical behaviour to s3dlio NP=4 (warm avg ~22.73 s, 1.058 GB/s). The wall-clock warm GB/s (1.084) is the reliable signal; the METRIC value is inflated by cache hits.
-
-> **vs s3dlio NP=4:** warm epoch durations are nearly identical (22.71 s vs 22.73 s) — at NP=4 both libraries are overwhelmingly page-cache-bound and the library difference disappears entirely.
-
-### Per-Epoch Detail — NP=8
-
-| Epoch | Steps | Duration | GB/s (wall-clock) | Throughput (samples/s) | Notes |
-|:-:|:-:|:-:|:-:|:-:|---|
-| 1 | 3 | 36.66 s | 0.672 | 51.53 | Cold read from MinIO over network |
-| 2 | 3 | 24.34 s | 1.012 | 57.66 | Warm — page cache active |
-| 3 | 3 | 24.26 s | 1.015 | 47.32 | Warm |
-| 4 | 3 | 24.18 s | 1.018 | 30.64 | Warm |
-| 5 | 3 | 23.85 s | 1.033 | 12.18 | Warm |
-
-**Warm avg:** ~24.16 s → **1.019 GB/s**.
-
-¹ ² **METRIC throughput and samples/s at NP=8 are unreliable** — with only 3 steps/epoch, sub-second timing noise in any single step dominates the per-epoch average. The wall-clock epoch duration (23.85–24.34 s warm, CV <1%) is the reliable signal. METRIC MB/s (5557) is ~4.6× above the physical network ceiling (1,200 MB/s), confirming the workload is overwhelmingly page-cache-served at NP=8.
-
-> **vs minio-py NP=8:** s3torchconnector warm avg 24.16 s vs minio-py warm avg ~22.7–22.9 s (from the minio-py NP=8 per-epoch table). s3torchconnector is within ~7% of minio-py at NP=8 — both are cache-dominated and the library differences are negligible.
-
----
-
-## How to Reproduce
-
-```bash
-cd /path/to/mlp-storage
-
-# Populate bucket (skip if data already present)
-bash tests/object-store/dlio_s3dlio_datagen.sh
-
-# Run training at different MPI ranks
-NP=1 bash tests/object-store/dlio_s3dlio_train.sh
-NP=2 bash tests/object-store/dlio_s3dlio_train.sh
-NP=4 bash tests/object-store/dlio_s3dlio_train.sh
-
-# Results are in the most recent /tmp/dlio-s3dlio-train-* directory
-grep -E "Simulated Acc|Throughput|I/O" /tmp/dlio-s3dlio-train-*/dlio.log
-```
-
-To measure cold-read performance only, clear the page cache between runs (requires root):
-
-```bash
-sync && sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'
-NP=4 bash tests/object-store/dlio_s3dlio_train.sh
-# Only epoch 1 duration is meaningful in this case
-```
-
----
-
-## Known Issues
-
-### OpenMPI vader BTL crash (NP ≥ 4 without the fix)
-
-**Symptom:** `mpirun` exits with signal 11 (Segmentation fault) immediately after
-`Starting block 1`, before any step completes. NP=1 and NP=2 work fine.
-
-**Root cause:** OpenMPI automatically selects the `vader` BTL (shared-memory
-transport) when all ranks run on the same physical node. At NP≥4, a race
-condition in vader's shared-memory ring-buffer causes one rank to dereference
-a fragment pointer already freed by another rank during `MPI_Barrier`.
-
-The full crash stack was:
-```
-mca_btl_vader_poll_handle_frag → opal_progress → ompi_sync_wait_mt
-  → mca_pml_ob1_recv → ompi_coll_base_barrier_intra_basic_linear
-  → MPI_Barrier  ← SEGV_MAPERR
-```
-
-**Fix:** Add `--mca btl ^vader` to the `mpirun` invocation. This disables vader
-and forces OpenMPI to use TCP loopback for intra-node communication instead.
-All scripts in `tests/object-store/` already include this flag.
-
----
-
-## Environment
-
-```
-Python:         3.13 (linuxbrew)
-s3dlio:         0.9.84
-dlio_benchmark: fork (mlp-storage/dlio_benchmark)
-mpi4py:         bundled with openmpi3
-OpenMPI:        system (/usr/lib/x86_64-linux-gnu/openmpi)
-DLIO_S3_IMPLEMENTATION=mlp
-multiprocessing_context=spawn   (required — fork kills Tokio runtime in workers)
-```
diff --git a/tests/object-store/old-archive/dlio_s3dlio_checkpoint.sh b/tests/object-store/old-archive/dlio_s3dlio_checkpoint.sh
deleted file mode 100755
index 2dff7733..00000000
--- a/tests/object-store/old-archive/dlio_s3dlio_checkpoint.sh
+++ /dev/null
@@ -1,122 +0,0 @@
-#!/usr/bin/env bash
-# dlio_s3dlio_checkpoint.sh
-#
-# Run DLIO checkpointing directly via dlio_benchmark — NO mlpstorage wrapper.
-# Writes and reads llama3-8b checkpoints to/from MinIO using s3dlio.
-#
-# Config  : configs/dlio/workload/llama3_8b_checkpoint_s3dlio.yaml
-# Workload: LLaMA 3 8B — ZeRO-3, 8 ranks, ~13.1 GB per rank per checkpoint
-# Storage : s3dlio → MinIO  (endpoint from AWS_ENDPOINT_URL)  bucket: chckpt-test1
-# Objects : s3://chckpt-test1/s3dlio/llama3-8b/<checkpoint_id>/<rank_file>.pt
-#
-# MPI ranks:
-#   llama3-8b with ZeRO-3 requires exactly 8 MPI ranks (the closed reference value).
-#   Each rank writes its shard of the model+optimizer state (~13.1 GB).
-#   Run with NP=8 for full workload; NP=1 for a single-rank sanity check.
-#
-# Environment overrides:
-#   NP=1 bash dlio_s3dlio_checkpoint.sh       → 1 rank, ~13.1 GB per checkpoint
-#   NP=8 bash dlio_s3dlio_checkpoint.sh       → 8 ranks, ~105 GB per checkpoint
-#   CHECKPOINTS=1 bash dlio_s3dlio_checkpoint.sh  → write+read 1 checkpoint only
-#
-# Usage:
-#   cd /path/to/mlp-storage
-#   bash tests/object-store/old-archive/dlio_s3dlio_checkpoint.sh
-
-# Performance tuning:
-#
-# S3DLIO_ENABLE_RANGE_OPTIMIZATION=0:
-#   Disables range splitting for write path (checkpoint objects are written as
-#   a single streaming PUT, not split into range sub-requests).
-export S3DLIO_ENABLE_RANGE_OPTIMIZATION=0
-export S3DLIO_RT_THREADS=8              # 8 Tokio threads per process (vs default 32)
-
-set -euo pipefail
-# This script lives in tests/object-store/old-archive/, so the repo root is three levels up.
-REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)"
-cd "$REPO_ROOT"
-
-# ── Credentials ────────────────────────────────────────────────────────────────
-if [[ -f .env ]]; then
-    echo "[env] Loading credentials from .env"
-    set -o allexport
-    source .env  # shellcheck disable=SC1091
-    set +o allexport
-fi
-: "${AWS_ACCESS_KEY_ID:?ERROR: AWS_ACCESS_KEY_ID not set — add it to .env}"
-: "${AWS_SECRET_ACCESS_KEY:?ERROR: AWS_SECRET_ACCESS_KEY not set — add it to .env}"
-: "${AWS_ENDPOINT_URL:?ERROR: AWS_ENDPOINT_URL not set — add it to .env (e.g. http://your-s3-host:9000)}"
-: "${AWS_REGION:=us-east-1}"
-
-# ── Virtual environment ────────────────────────────────────────────────────────
-if [[ ! -f .venv/bin/activate ]]; then
-    echo "ERROR: .venv not found" >&2; exit 1
-fi
-source .venv/bin/activate  # .venv managed by uv (run "uv sync" to set up)
-
-DLIO_BIN=".venv/bin/dlio_benchmark"
-if [[ ! -x "$DLIO_BIN" ]]; then
-    echo "ERROR: $DLIO_BIN not found in venv" >&2; exit 1
-fi
-
-# ── Check s3dlio is installed ─────────────────────────────────────────────────
-if ! python3 -c "import s3dlio" 2>/dev/null; then
-    echo "ERROR: s3dlio is not installed." >&2
-    echo "  Install with: uv sync" >&2
-    exit 1
-fi
-
-# ── Tunables (override via env) ────────────────────────────────────────────────
-# NP          = MPI ranks (8 = full llama3-8b ZeRO-3; 1 = single-rank sanity)
-# CHECKPOINTS = number of checkpoints to write AND read
-NP=${NP:-1}
-CHECKPOINTS=${CHECKPOINTS:-2}
-
-BUCKET="chckpt-test1"
-S3_PREFIX="s3dlio/llama3-8b"
-
-RUN_DIR="/tmp/dlio-s3dlio-checkpoint-$(date +%Y%m%d_%H%M%S)"
-mkdir -p "$RUN_DIR"
-
-echo ""
-echo "════════════════════════════════════════════════════════"
-echo "  DLIO Checkpoint — s3dlio + MinIO  (llama3-8b)"
-echo "════════════════════════════════════════════════════════"
-echo "  Bucket      : $BUCKET"
-echo "  Objects at  : s3://$BUCKET/$S3_PREFIX/"
-echo "  Endpoint    : $AWS_ENDPOINT_URL"
-echo "  MPI ranks   : $NP   (default=1; full run: NP=8 bash $0)"
-echo "  Checkpoints : $CHECKPOINTS write + $CHECKPOINTS read"
-echo "  Per-rank    : ~13.1 GB per checkpoint  (ZeRO-3, 8 ranks)"
-echo "  Run dir     : $RUN_DIR"
-echo "════════════════════════════════════════════════════════"
-echo ""
-
-# ── Pre-flight: verify bucket is reachable ────────────────────────────────────
-echo "Checking bucket reachability: s3://$BUCKET/ ..."
-python3 - <<PYEOF
-import os, sys
-os.environ.setdefault("AWS_REGION", "us-east-1")
-import s3dlio
-try:
-    files = s3dlio.list("s3://${BUCKET}/", recursive=False)
-    print(f"  Bucket accessible — {len(files)} top-level entries")
-except Exception as e:
-    print(f"  ERROR: Cannot access bucket s3://${BUCKET}/: {e}", file=sys.stderr)
-    sys.exit(1)
-PYEOF
-echo ""
-
-DLIO_S3_IMPLEMENTATION=mlp \
-mpirun -np "$NP" --allow-run-as-root \
-    --mca btl ^vader \
-    "$DLIO_BIN" \
-    workload=llama3_8b_checkpoint_s3dlio \
-    "++hydra.run.dir=$RUN_DIR" \
-    ++hydra.output_subdir=null \
-    "++workload.checkpoint.num_checkpoints_write=$CHECKPOINTS" \
-    "++workload.checkpoint.num_checkpoints_read=$CHECKPOINTS" \
-    --config-dir="$REPO_ROOT/configs/dlio"
-
-echo ""
-echo "✅  Checkpoint test complete — results in $RUN_DIR"
diff --git a/tests/object-store/old-archive/dlio_s3dlio_cleanup.sh b/tests/object-store/old-archive/dlio_s3dlio_cleanup.sh
deleted file mode 100755
index 63ba65f0..00000000
--- a/tests/object-store/old-archive/dlio_s3dlio_cleanup.sh
+++ /dev/null
@@ -1,103 +0,0 @@
-#!/usr/bin/env bash
-# dlio_s3dlio_cleanup.sh
-#
-# Delete all test objects from the MinIO bucket (mlp-s3dlio).
-# Use this to reset between datagen runs without running the full cycle.
-#
-# Storage : S3-compatible object storage (endpoint from AWS_ENDPOINT_URL)  bucket: mlp-s3dlio
-# Removes : s3://mlp-s3dlio/test-run/unet3d/train/*
-#
-# Safety  : Lists files first, shows count, prompts for confirmation.
-#           To skip the prompt: FORCE=1 bash dlio_s3dlio_cleanup.sh
-#
-# Usage:
-#   cd /path/to/mlp-storage
-#   bash tests/object-store/old-archive/dlio_s3dlio_cleanup.sh
-#   FORCE=1 bash tests/object-store/old-archive/dlio_s3dlio_cleanup.sh
-
-set -euo pipefail
-# This script lives in tests/object-store/old-archive/, so the repo root is three levels up.
-REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)"
-cd "$REPO_ROOT"
-
-# ── Credentials ───────────────────────────────────────────────────────────────
-if [[ -f .env ]]; then
-    echo "[env] Loading credentials from .env"
-    set -o allexport
-    source .env  # shellcheck disable=SC1091
-    set +o allexport
-fi
-: "${AWS_ACCESS_KEY_ID:?ERROR: AWS_ACCESS_KEY_ID not set — add it to .env}"
-: "${AWS_SECRET_ACCESS_KEY:?ERROR: AWS_SECRET_ACCESS_KEY not set — add it to .env}"
-: "${AWS_ENDPOINT_URL:?ERROR: AWS_ENDPOINT_URL not set — add it to .env (e.g. http://your-s3-host:9000)}"
-: "${AWS_REGION:=us-east-1}"
-
-# ── Virtual environment ────────────────────────────────────────────────────────
-if [[ ! -f .venv/bin/activate ]]; then
-    echo "ERROR: .venv not found" >&2; exit 1
-fi
-source .venv/bin/activate  # .venv managed by uv (run "uv sync" to set up)
-
-# ── Config ────────────────────────────────────────────────────────────────────
-FORCE=${FORCE:-0}
-
-BUCKET="${BUCKET:-mlp-s3dlio}"
-S3_PREFIX="test-run/unet3d/train"
-LIST_URI="s3://${BUCKET}/${S3_PREFIX}/"
-
-echo ""
-echo "════════════════════════════════════════════════════════"
-echo "  DLIO Cleanup — s3dlio + MinIO"
-echo "════════════════════════════════════════════════════════"
-echo "  Bucket   : $BUCKET"
-echo "  Prefix   : $S3_PREFIX"
-echo "  Endpoint : $AWS_ENDPOINT_URL"
-echo "════════════════════════════════════════════════════════"
-echo ""
-
-# ── List what will be deleted ─────────────────────────────────────────────────
-echo "Listing objects to delete: $LIST_URI ..."
-FILE_COUNT=$(python3 - <<PYEOF
-import os, sys
-os.environ.setdefault("AWS_REGION", "us-east-1")
-import s3dlio
-files = s3dlio.list("s3://${BUCKET}/${S3_PREFIX}/", recursive=True)
-print(len(files))
-PYEOF
-)
-
-if [[ "$FILE_COUNT" -eq 0 ]]; then
-    echo "✅  Bucket is already empty — nothing to delete."
-    exit 0
-fi
-
-echo "Found $FILE_COUNT objects to delete."
-
-# ── Confirm before deleting ────────────────────────────────────────────────────
-if [[ "$FORCE" -eq 0 ]]; then
-    echo ""
-    echo "⚠️   This will permanently delete $FILE_COUNT objects from $LIST_URI"
-    echo "    To skip this prompt: FORCE=1 bash $0"
-    read -r -p "Delete all $FILE_COUNT objects? [y/N] " REPLY
-    if [[ ! "$REPLY" =~ ^[Yy]$ ]]; then
-        echo "Aborted — no objects deleted."
-        exit 0
-    fi
-fi
-
-# ── Delete ────────────────────────────────────────────────────────────────────
-echo ""
-echo "Deleting $FILE_COUNT objects ..."
-DELETED=$(python3 - <<PYEOF
-import os, sys
-os.environ.setdefault("AWS_REGION", "us-east-1")
-import s3dlio
-files = s3dlio.list("s3://${BUCKET}/${S3_PREFIX}/", recursive=True)
-for f in files:
-    s3dlio.delete(f)
-print(len(files))
-PYEOF
-)
-
-echo ""
-echo "✅  Cleanup complete — deleted $DELETED objects from $LIST_URI"
diff --git a/tests/object-store/old-archive/dlio_s3dlio_cycle.sh b/tests/object-store/old-archive/dlio_s3dlio_cycle.sh
deleted file mode 100755
index cf827492..00000000
--- a/tests/object-store/old-archive/dlio_s3dlio_cycle.sh
+++ /dev/null
@@ -1,178 +0,0 @@
-#!/usr/bin/env bash
-# dlio_s3dlio_cycle.sh
-#
-# Full DLIO direct cycle test — NO mlpstorage CLI wrapper.
-#
-# Calls dlio_benchmark directly for every phase:
-#   1. Datagen  — generate 168 × ~140 MB NPZ files → MinIO (mlp-s3dlio bucket)
-#   2. Verify   — use s3dlio Python API to list and count the files
-#   3. Train    — run 1 epoch of training reading from MinIO via s3dlio
-#   4. Cleanup  — delete all test objects from the bucket
-#
-# Config : unet3d_h100_s3dlio_datagen.yaml + unet3d_h100_s3dlio.yaml
-#          (real h100 workload — 168 files × ~140 MB NPZ)
-# Storage: S3-compatible object storage (endpoint from AWS_ENDPOINT_URL)  bucket: mlp-s3dlio
-# Data   : mlp-s3dlio/test-run/unet3d/train/
-#
-# Requirements:
-#   - .env file in repo root with AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY,
-#     AWS_ENDPOINT_URL, AWS_REGION  (no credentials in this script)
-#   - Python venv at .venv/  with dlio_benchmark and s3dlio installed
-#
-# Usage:
-#   cd /path/to/mlp-storage
-#   bash tests/object-store/dlio_s3dlio_cycle.sh
-
-set -euo pipefail
-
-# ── Locate repo root ───────────────────────────────────────────────────────────
-REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
-cd "$REPO_ROOT"
-
-# ── Credentials ────────────────────────────────────────────────────────────────
-# allexport ensures every variable sourced from .env is exported to child
-# processes (mpirun, python, dlio_benchmark, etc.).
-if [[ -f .env ]]; then
-    echo "[env] Loading credentials from .env"
-    set -o allexport
-    # shellcheck disable=SC1091
-    source .env
-    set +o allexport
-fi
-
-# Fail fast if credentials are missing — don't let dlio start and then error.
-: "${AWS_ACCESS_KEY_ID:?ERROR: AWS_ACCESS_KEY_ID not set — add it to .env}"
-: "${AWS_SECRET_ACCESS_KEY:?ERROR: AWS_SECRET_ACCESS_KEY not set — add it to .env}"
-: "${AWS_ENDPOINT_URL:?ERROR: AWS_ENDPOINT_URL not set — add it to .env (e.g. http://your-s3-host:9000)}"
-: "${AWS_REGION:=us-east-1}"
-
-# ── Virtual environment ────────────────────────────────────────────────────────
-if [[ ! -f .venv/bin/activate ]]; then
-    echo "ERROR: .venv not found — run: python -m venv .venv && uv sync >&2
-    exit 1
-fi
-# shellcheck disable=SC1091
-source .venv/bin/activate  # .venv managed by uv (run "uv sync" to set up)
-
-DLIO_BIN=".venv/bin/dlio_benchmark"
-if [[ ! -x "$DLIO_BIN" ]]; then
-    echo "ERROR: $DLIO_BIN not found — is dlio_benchmark installed in the venv?" >&2
-    exit 1
-fi
-
-# ── Config ────────────────────────────────────────────────────────────────────
-BUCKET="${BUCKET:-mlp-s3dlio}"
-S3_PREFIX="test-run/unet3d/train"           # matches data_folder=test-run/unet3d + DLIO appends /train/
-LIST_URI="s3://${BUCKET}/${S3_PREFIX}/"
-EXPECTED_FILES=168
-CONFIG_DIR="$REPO_ROOT/configs/dlio"
-
-# MPI ranks for datagen — more ranks = faster generation of 168 × 140 MB files
-DATAGEN_NP=${DATAGEN_NP:-8}
-TRAIN_NP=${TRAIN_NP:-1}
-
-# Unique run dir keeps DLIO output logs for this cycle
-RUN_DIR="/tmp/dlio-s3dlio-cycle-$(date +%Y%m%d_%H%M%S)"
-mkdir -p "$RUN_DIR"
-
-# ── Helper ────────────────────────────────────────────────────────────────────
-banner() { echo ""; echo "════════════════════════════════════════════════════════"; echo "  $*"; echo "════════════════════════════════════════════════════════"; echo ""; }
-step()   { echo ""; echo "──── $* ────"; echo ""; }
-ok()     { echo "✅  $*"; }
-fail()   { echo "❌  $*" >&2; exit 1; }
-
-banner "DLIO Direct Cycle — s3dlio + MinIO"
-echo "  Bucket       : $BUCKET"
-echo "  Prefix       : $S3_PREFIX"
-echo "  Endpoint     : $AWS_ENDPOINT_URL"
-echo "  Files        : $EXPECTED_FILES × ~140 MB NPZ  (real h100 workload)"
-echo "  Datagen MPI  : $DATAGEN_NP ranks"
-echo "  Train MPI    : $TRAIN_NP rank(s)"
-echo "  Run dir      : $RUN_DIR"
-
-# ══════════════════════════════════════════════════════════════════════════════
-# PHASE 1 — DATAGEN
-# ══════════════════════════════════════════════════════════════════════════════
-banner "Phase 1 — Datagen (writing ${EXPECTED_FILES} × ~140 MB files to S3)"
-
-DLIO_S3_IMPLEMENTATION=mlp \
-mpirun -np "$DATAGEN_NP" --allow-run-as-root \
-    --mca btl ^vader \
-    "$DLIO_BIN" \
-    workload=unet3d_h100_s3dlio_datagen \
-    "++hydra.run.dir=$RUN_DIR/datagen" \
-    ++hydra.output_subdir=null \
-    --config-dir="$CONFIG_DIR"
-
-ok "Datagen complete"
-
-# ══════════════════════════════════════════════════════════════════════════════
-# PHASE 2 — VERIFY
-# ══════════════════════════════════════════════════════════════════════════════
-banner "Phase 2 — Verify (listing $LIST_URI)"
-
-FOUND=$(python3 - <<PYEOF
-import os, sys
-os.environ.setdefault("AWS_ENDPOINT_URL", "${AWS_ENDPOINT_URL}")
-os.environ.setdefault("AWS_REGION",       "${AWS_REGION}")
-import s3dlio
-files = s3dlio.list("${LIST_URI}", recursive=True)
-print(len(files))
-for f in files[:5]:
-    print("  ", f, file=sys.stderr)
-if len(files) > 5:
-    print(f"  ... and {len(files)-5} more", file=sys.stderr)
-PYEOF
-)
-
-echo "Files found in S3: $FOUND (expected: $EXPECTED_FILES)"
-if [[ "$FOUND" -ne "$EXPECTED_FILES" ]]; then
-    fail "File count mismatch: got $FOUND, expected $EXPECTED_FILES — datagen may have failed"
-fi
-ok "Verify passed — $FOUND files confirmed in bucket"
-
-# ══════════════════════════════════════════════════════════════════════════════
-# PHASE 3 — TRAIN
-# ══════════════════════════════════════════════════════════════════════════════
-banner "Phase 3 — Training (1 epoch, reading from S3 via s3dlio)"
-
-DLIO_S3_IMPLEMENTATION=mlp \
-mpirun -np "$TRAIN_NP" --allow-run-as-root \
-    --mca btl ^vader \
-    "$DLIO_BIN" \
-    workload=unet3d_h100_s3dlio \
-    "++hydra.run.dir=$RUN_DIR/train" \
-    ++hydra.output_subdir=null \
-    --config-dir="$CONFIG_DIR"
-
-ok "Training complete"
-
-# ══════════════════════════════════════════════════════════════════════════════
-# PHASE 4 — CLEANUP
-# ══════════════════════════════════════════════════════════════════════════════
-banner "Phase 4 — Cleanup (deleting all test objects)"
-
-DELETED=$(python3 - <<PYEOF
-import os, sys
-os.environ.setdefault("AWS_ENDPOINT_URL", "${AWS_ENDPOINT_URL}")
-os.environ.setdefault("AWS_REGION",       "${AWS_REGION}")
-import s3dlio
-files = s3dlio.list("${LIST_URI}", recursive=True)
-for f in files:
-    s3dlio.delete(f)
-print(len(files))
-PYEOF
-)
-
-ok "Cleanup complete — deleted $DELETED objects from s3://$BUCKET/$S3_PREFIX/"
-
-# ══════════════════════════════════════════════════════════════════════════════
-# DONE
-# ══════════════════════════════════════════════════════════════════════════════
-banner "ALL PHASES PASSED"
-echo "  Datagen  ✅  generated $EXPECTED_FILES × ~140 MB NPZ files"
-echo "  Verify   ✅  $FOUND files confirmed in S3"
-echo "  Training ✅  1 epoch completed"
-echo "  Cleanup  ✅  $DELETED objects deleted"
-echo ""
-echo "  DLIO logs: $RUN_DIR"
diff --git a/tests/object-store/old-archive/dlio_s3dlio_datagen.sh b/tests/object-store/old-archive/dlio_s3dlio_datagen.sh
deleted file mode 100755
index bc8fa6d4..00000000
--- a/tests/object-store/old-archive/dlio_s3dlio_datagen.sh
+++ /dev/null
@@ -1,173 +0,0 @@
-#!/usr/bin/env bash
-# dlio_s3dlio_datagen.sh
-#
-# Run DLIO datagen directly via dlio_benchmark — NO mlpstorage wrapper.
-# Generates 168 × ~140 MB NPZ files into MinIO (mlp-s3dlio bucket).
-#
-# Config  : configs/dlio/workload/unet3d_h100_s3dlio_datagen.yaml
-# Workload: UNet3D h100 — 168 × ~140 MB NPZ
-# Storage : S3-compatible object storage (endpoint from AWS_ENDPOINT_URL)  bucket: mlp-s3dlio
-# Data    : s3://mlp-s3dlio/test-run/unet3d/train/
-#
-# Environment overrides:
-#   NP=4 bash dlio_s3dlio_datagen.sh      → 4 MPI ranks writing in parallel
-#   FORCE=1 bash dlio_s3dlio_datagen.sh   → overwrite even if files already exist
-#
-# Usage:
-#   cd /path/to/mlp-storage
-#   bash tests/object-store/dlio_s3dlio_datagen.sh
-
-set -euo pipefail
-
-REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
-cd "$REPO_ROOT"
-
-# ── Credentials ───────────────────────────────────────────────────────────────
-if [[ -f .env ]]; then
-    echo "[env] Loading credentials from .env"
-    set -o allexport
-    source .env  # shellcheck disable=SC1091
-    set +o allexport
-fi
-: "${AWS_ACCESS_KEY_ID:?ERROR: AWS_ACCESS_KEY_ID not set — add it to .env}"
-: "${AWS_SECRET_ACCESS_KEY:?ERROR: AWS_SECRET_ACCESS_KEY not set — add it to .env}"
-: "${AWS_ENDPOINT_URL:?ERROR: AWS_ENDPOINT_URL not set — add it to .env (e.g. http://your-s3-host:9000)}"
-: "${AWS_REGION:=us-east-1}"
-
-# ── Virtual environment ────────────────────────────────────────────────────────
-if [[ ! -f .venv/bin/activate ]]; then
-    echo "ERROR: .venv not found" >&2; exit 1
-fi
-source .venv/bin/activate  # .venv managed by uv (run "uv sync" to set up)
-
-DLIO_BIN=".venv/bin/dlio_benchmark"
-if [[ ! -x "$DLIO_BIN" ]]; then
-    echo "ERROR: $DLIO_BIN not found in venv" >&2; exit 1
-fi
-
-# ── Tunables (override via env) ────────────────────────────────────────────────
-# NP    = MPI ranks — more ranks write more files in parallel
-# FORCE = set to 1 to skip the pre-flight "files already exist" warning
-NP=${NP:-8}
-FORCE=${FORCE:-0}
-
-BUCKET="${BUCKET:-mlp-s3dlio}"
-S3_PREFIX="test-run/unet3d/train"
-LIST_URI="s3://${BUCKET}/${S3_PREFIX}/"
-EXPECTED_FILES=168
-
-RUN_DIR="/tmp/dlio-s3dlio-datagen-$(date +%Y%m%d_%H%M%S)"
-mkdir -p "$RUN_DIR"
-
-echo ""
-echo "════════════════════════════════════════════════════════"
-echo "  DLIO Datagen — s3dlio + MinIO  (unet3d h100)"
-echo "════════════════════════════════════════════════════════"
-echo "  Bucket   : $BUCKET"
-echo "  Prefix   : $S3_PREFIX"
-echo "  Endpoint : $AWS_ENDPOINT_URL"
-echo "  Files    : $EXPECTED_FILES × ~140 MB NPZ"
-echo "  MPI ranks: $NP   (override: NP=4 bash $0)"
-echo "  Run dir  : $RUN_DIR"
-echo "════════════════════════════════════════════════════════"
-echo ""
-
-# ── Pre-flight: warn if files already exist ────────────────────────────────────
-echo "Checking for existing data: $LIST_URI ..."
-FILE_COUNT=$(python3 - <<PYEOF
-import os, sys
-os.environ.setdefault("AWS_REGION", "us-east-1")
-import s3dlio
-files = s3dlio.list("s3://${BUCKET}/${S3_PREFIX}/", recursive=True)
-print(len(files))
-PYEOF
-)
-
-if [[ "$FILE_COUNT" -gt 0 && "$FORCE" -eq 0 ]]; then
-    echo ""
-    echo "⚠️   WARNING: $FILE_COUNT files already exist in $LIST_URI"
-    echo "    Datagen will overwrite them."
-    echo "    To skip this warning: FORCE=1 bash $0"
-    echo "    To clean up first:    bash tests/object-store/dlio_s3dlio_cleanup.sh"
-    echo ""
-    read -r -p "Continue anyway? [y/N] " REPLY
-    if [[ ! "$REPLY" =~ ^[Yy]$ ]]; then
-        echo "Aborted."
-        exit 0
-    fi
-elif [[ "$FILE_COUNT" -gt 0 ]]; then
-    echo "⚠️   $FILE_COUNT files already exist — FORCE=1 set, overwriting"
-else
-    echo "✅  Bucket is empty — proceeding with datagen"
-fi
-echo ""
-
-# ── Data generation method ────────────────────────────────────────────────────
-# ALWAYS force dgen-py. We hard-assign here (not :=) so we override any
-# DLIO_DATA_GEN=numpy that might be set in the caller's shell environment.
-# dgen-py is 155x faster than NumPy and is the ONLY supported default.
-# If dgen-py is not installed this will fail fast with a clear error message.
-DLIO_DATA_GEN=dgen
-export DLIO_DATA_GEN
-
-# ── s3dlio tuning env vars ────────────────────────────────────────────────────
-# Override any of these at invocation, e.g.:
-#   S3DLIO_MAX_HTTP_CONNECTIONS=400 bash dlio_s3dlio_datagen.sh
-: "${S3DLIO_USE_OPTIMIZED_HTTP:=1}"          # enable connection pooling (default on)
-: "${S3DLIO_MAX_HTTP_CONNECTIONS:=200}"       # idle connections per host
-: "${S3DLIO_HTTP_IDLE_TIMEOUT_MS:=5000}"     # keep-alive idle timeout
-: "${S3DLIO_RT_THREADS:=16}"                 # tokio async worker threads
-: "${S3DLIO_OPERATION_TIMEOUT_SECS:=300}"    # per-op timeout (140 MB PUTs need headroom)
-: "${RUST_LOG:=info}"                        # s3dlio logging level (info / debug)
-
-export S3DLIO_USE_OPTIMIZED_HTTP S3DLIO_MAX_HTTP_CONNECTIONS S3DLIO_HTTP_IDLE_TIMEOUT_MS
-export S3DLIO_RT_THREADS S3DLIO_OPERATION_TIMEOUT_SECS RUST_LOG
-
-echo "── data generation ────────────────────────────────────────"
-echo "  DLIO_DATA_GEN              = $DLIO_DATA_GEN  (forced — dgen-py only)"
-echo "── s3dlio tuning ──────────────────────────────────────────"
-echo "  S3DLIO_USE_OPTIMIZED_HTTP  = $S3DLIO_USE_OPTIMIZED_HTTP"
-echo "  S3DLIO_MAX_HTTP_CONNECTIONS= $S3DLIO_MAX_HTTP_CONNECTIONS"
-echo "  S3DLIO_HTTP_IDLE_TIMEOUT_MS= $S3DLIO_HTTP_IDLE_TIMEOUT_MS"
-echo "  S3DLIO_RT_THREADS          = $S3DLIO_RT_THREADS"
-echo "  S3DLIO_OPERATION_TIMEOUT_SECS=$S3DLIO_OPERATION_TIMEOUT_SECS"
-echo "  RUST_LOG                   = $RUST_LOG"
-echo "───────────────────────────────────────────────────────────"
-echo ""
-
-# ── Run datagen ────────────────────────────────────────────────────────────────
-DLIO_S3_IMPLEMENTATION=mlp \
-mpirun -np "$NP" --allow-run-as-root \
-    --mca btl ^vader \
-    -x DLIO_DATA_GEN \
-    -x S3DLIO_USE_OPTIMIZED_HTTP \
-    -x S3DLIO_MAX_HTTP_CONNECTIONS \
-    -x S3DLIO_HTTP_IDLE_TIMEOUT_MS \
-    -x S3DLIO_RT_THREADS \
-    -x S3DLIO_OPERATION_TIMEOUT_SECS \
-    -x RUST_LOG \
-    "$DLIO_BIN" \
-    workload=unet3d_h100_s3dlio_datagen \
-    "++hydra.run.dir=$RUN_DIR" \
-    ++hydra.output_subdir=null \
-    --config-dir="$REPO_ROOT/configs/dlio"
-
-echo ""
-
-# ── Post-flight: verify file count ────────────────────────────────────────────
-echo "Verifying generated files ..."
-FOUND=$(python3 - <<PYEOF
-import os, sys
-os.environ.setdefault("AWS_REGION", "us-east-1")
-import s3dlio
-files = s3dlio.list("s3://${BUCKET}/${S3_PREFIX}/", recursive=True)
-print(len(files))
-PYEOF
-)
-
-if [[ "$FOUND" -ne "$EXPECTED_FILES" ]]; then
-    echo "⚠️   File count: $FOUND (expected $EXPECTED_FILES) — some files may have been skipped or failed"
-else
-    echo "✅  Datagen complete — $FOUND / $EXPECTED_FILES files confirmed in $LIST_URI"
-fi
-echo "    DLIO logs: $RUN_DIR"
diff --git a/tests/object-store/old-archive/dlio_s3dlio_train.sh b/tests/object-store/old-archive/dlio_s3dlio_train.sh
deleted file mode 100755
index ed6d544e..00000000
--- a/tests/object-store/old-archive/dlio_s3dlio_train.sh
+++ /dev/null
@@ -1,136 +0,0 @@
-#!/usr/bin/env bash
-# dlio_s3dlio_train.sh
-#
-# Run DLIO training directly via dlio_benchmark — NO mlpstorage wrapper.
-# Assumes data is already in the bucket (run dlio_s3dlio_cycle.sh datagen first
-# if needed, or the cycle script if starting from scratch).
-#
-# Config  : configs/dlio/workload/unet3d_h100_s3dlio.yaml
-# Workload: UNet3D h100 — 168 × ~140 MB NPZ, 5 epochs, batch_size=7
-# Storage : S3-compatible object storage (endpoint from AWS_ENDPOINT_URL)  bucket: mlp-s3dlio
-# Data    : s3://mlp-s3dlio/test-run/unet3d/train/
-#
-# MPI vs PyTorch workers — these are different:
-#   NP (--np)         = MPI ranks  = simulated distributed training nodes
-#   read_threads (YAML) = PyTorch DataLoader workers per MPI rank
-#   Total I/O processes = NP × read_threads
-#
-# Environment overrides:
-#   NP=4 bash dlio_s3dlio_train.sh        → 4 MPI ranks × 4 threads = 16 readers
-#   NP=1 READ_THREADS=8 bash ...          → 1 rank × 8 threads = 8 readers
-#
-# Usage:
-#   cd /path/to/mlp-storage
-#   bash tests/object-store/dlio_s3dlio_train.sh
-
-# Performance tuning — applied before mpirun:
-#
-# S3DLIO_ENABLE_RANGE_OPTIMIZATION=0:
-#   Disables range splitting entirely on ALL code paths (fixed in v0.9.82).
-#   For this test environment (1 Gbps NIC, 147 MB files, slow MinIO):
-#     - Files are too small for range splitting to help vs a single streaming GET
-#     - Range splitting would open 37 sub-requests per file, adding TCP overhead
-#     - Disabling also skips the pre-stat HEAD phase in get_objects_parallel(),
-#       eliminating N HEAD requests per batch (N = files per step)
-#   For production (100+ Gbps, fast object storage): set =1 (the default)
-#   and tune S3DLIO_RANGE_THRESHOLD_MB for your file size instead.
-#
-# S3DLIO_RT_THREADS=8:
-#   Tokio async runtime threads per MPI rank. Default is 32.
-#   This test machine has ~16 cores; with NP=1 and 4 DataLoader workers,
-#   8 Tokio threads prevents over-subscription. Scale with: total_cores / NP.
-export S3DLIO_ENABLE_RANGE_OPTIMIZATION=0   # skip HEAD + single GET (best for this env)
-export S3DLIO_RT_THREADS=8                  # 8 Tokio threads per process (vs default 32)
-
-set -euo pipefail
-
-REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
-cd "$REPO_ROOT"
-
-# ── Credentials ────────────────────────────────────────────────────────────────
-if [[ -f .env ]]; then
-    echo "[env] Loading credentials from .env"
-    set -o allexport
-    source .env  # shellcheck disable=SC1091
-    set +o allexport
-fi
-: "${AWS_ACCESS_KEY_ID:?ERROR: AWS_ACCESS_KEY_ID not set — add it to .env}"
-: "${AWS_SECRET_ACCESS_KEY:?ERROR: AWS_SECRET_ACCESS_KEY not set — add it to .env}"
-: "${AWS_ENDPOINT_URL:?ERROR: AWS_ENDPOINT_URL not set — add it to .env (e.g. http://your-s3-host:9000)}"
-: "${AWS_REGION:=us-east-1}"
-
-# ── Virtual environment ────────────────────────────────────────────────────────
-if [[ ! -f .venv/bin/activate ]]; then
-    echo "ERROR: .venv not found" >&2; exit 1
-fi
-source .venv/bin/activate  # .venv managed by uv (run "uv sync" to set up)
-
-DLIO_BIN=".venv/bin/dlio_benchmark"
-if [[ ! -x "$DLIO_BIN" ]]; then
-    echo "ERROR: $DLIO_BIN not found in venv" >&2; exit 1
-fi
-
-# ── Tunables (override via env) ────────────────────────────────────────────────
-# NP          = MPI ranks (1 = single process, 4 = 4 simulated nodes, etc.)
-# READ_THREADS = PyTorch DataLoader workers per rank (set in YAML, overridable here)
-NP=${NP:-1}
-BUCKET="${BUCKET:-mlp-s3dlio}"
-S3_PREFIX="${S3_PREFIX:-test-run/unet3d/train}"
-
-RUN_DIR="/tmp/dlio-s3dlio-train-$(date +%Y%m%d_%H%M%S)"
-mkdir -p "$RUN_DIR"
-
-echo ""
-echo "════════════════════════════════════════════════════════"
-echo "  DLIO Training — s3dlio + MinIO  (unet3d h100)"
-echo "════════════════════════════════════════════════════════"
-echo "  Bucket   : $BUCKET"
-  echo "  Data     : $S3_PREFIX/ (168 × ~140 MB NPZ)"
-echo "  Endpoint : $AWS_ENDPOINT_URL"
-echo "  MPI ranks: $NP   (override: NP=4 bash $0)"
-echo "  Workers  : 4 per rank  (reader.read_threads in YAML)"
-echo "  Epochs   : 5"
-echo "  Batch    : 7"
-echo "  Run dir  : $RUN_DIR"
-echo "════════════════════════════════════════════════════════"
-echo ""
-
-# ── Pre-flight: verify training data exists ────────────────────────────────────
-echo "Checking training data: s3://$BUCKET/$S3_PREFIX/ ..."
-FILE_COUNT=$(python3 - <<PYEOF
-import os, sys
-os.environ.setdefault("AWS_REGION", "us-east-1")
-import s3dlio
-files = s3dlio.list("s3://${BUCKET}/${S3_PREFIX}/", recursive=True)
-print(len(files))
-PYEOF
-)
-
-if [[ "$FILE_COUNT" -eq 0 ]]; then
-    echo ""
-    echo "❌  ERROR: No training files found in s3://$BUCKET/$S3_PREFIX/"
-    echo "    Run datagen first to populate the bucket:"
-    echo "      bash tests/object-store/dlio_s3dlio_datagen.sh"
-    echo "    Or run the full cycle (datagen + train + cleanup):"
-    echo "      bash tests/object-store/dlio_s3dlio_cycle.sh"
-    exit 1
-fi
-
-echo "✅  Found $FILE_COUNT training files — proceeding"
-echo ""
-
-# ── Note on the expected 'valid' listing ──────────────────────────────────────
-# DLIO always tries to list a valid/ path. It will find 0 files and skip it.
-# That is normal — we have train data only. Not an error.
-
-DLIO_S3_IMPLEMENTATION=mlp \
-mpirun -np "$NP" --allow-run-as-root \
-    --mca btl ^vader \
-    "$DLIO_BIN" \
-    workload=unet3d_h100_s3dlio \
-    "++hydra.run.dir=$RUN_DIR" \
-    ++hydra.output_subdir=null \
-    --config-dir="$REPO_ROOT/configs/dlio"
-
-echo ""
-echo "✅  Training complete — results in $RUN_DIR"
diff --git a/tests/object-store/old-archive/dlio_s3torch_checkpoint.sh b/tests/object-store/old-archive/dlio_s3torch_checkpoint.sh
deleted file mode 100755
index e4e7dcb5..00000000
--- a/tests/object-store/old-archive/dlio_s3torch_checkpoint.sh
+++ /dev/null
@@ -1,118 +0,0 @@
-#!/usr/bin/env bash
-# dlio_s3torch_checkpoint.sh
-#
-# Run DLIO checkpointing directly via dlio_benchmark — NO mlpstorage wrapper.
-# Writes and reads llama3-8b checkpoints to/from MinIO using s3torchconnector.
-#
-# Config  : configs/dlio/workload/llama3_8b_checkpoint_s3torch.yaml
-# Workload: LLaMA 3 8B — ZeRO-3, 8 ranks, ~13.1 GB per rank per checkpoint
-# Storage : s3torchconnector → MinIO  (endpoint from AWS_ENDPOINT_URL)  bucket: chckpt-test1
-# Objects : s3://chckpt-test1/s3torch/llama3-8b/<checkpoint_id>/<rank_file>.pt
-#
-# MPI ranks:
-#   llama3-8b with ZeRO-3 requires exactly 8 MPI ranks (the closed reference value).
-#   Each rank writes its shard of the model+optimizer state (~13.1 GB).
-#   Run with NP=8 for full workload; NP=1 for a single-rank sanity check.
-#
-# Environment overrides:
-#   NP=1 bash dlio_s3torch_checkpoint.sh       → 1 rank, ~13.1 GB per checkpoint
-#   NP=8 bash dlio_s3torch_checkpoint.sh       → 8 ranks, ~105 GB per checkpoint
-#   CHECKPOINTS=1 bash dlio_s3torch_checkpoint.sh  → write+read 1 checkpoint only
-#
-# Prerequisites:
-#   uv sync (s3torchconnector must be added to pyproject.toml dependencies)
-#   (s3dlio is used for pre-flight bucket check — it must also be installed)
-#
-# Usage:
-#   cd /path/to/mlp-storage
-#   bash tests/object-store/dlio_s3torch_checkpoint.sh
-
-set -euo pipefail
-
-REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
-cd "$REPO_ROOT"
-
-# ── Credentials ────────────────────────────────────────────────────────────────
-if [[ -f .env ]]; then
-    echo "[env] Loading credentials from .env"
-    set -o allexport
-    source .env  # shellcheck disable=SC1091
-    set +o allexport
-fi
-: "${AWS_ACCESS_KEY_ID:?ERROR: AWS_ACCESS_KEY_ID not set — add it to .env}"
-: "${AWS_SECRET_ACCESS_KEY:?ERROR: AWS_SECRET_ACCESS_KEY not set — add it to .env}"
-: "${AWS_ENDPOINT_URL:?ERROR: AWS_ENDPOINT_URL not set — add it to .env (e.g. http://your-s3-host:9000)}"
-: "${AWS_REGION:=us-east-1}"
-
-# ── Virtual environment ────────────────────────────────────────────────────────
-if [[ ! -f .venv/bin/activate ]]; then
-    echo "ERROR: .venv not found" >&2; exit 1
-fi
-source .venv/bin/activate  # .venv managed by uv (run "uv sync" to set up)
-
-DLIO_BIN=".venv/bin/dlio_benchmark"
-if [[ ! -x "$DLIO_BIN" ]]; then
-    echo "ERROR: $DLIO_BIN not found in venv" >&2; exit 1
-fi
-
-# ── Check s3torchconnector is installed ───────────────────────────────────────
-if ! python3 -c "import s3torchconnector" 2>/dev/null; then
-    echo "ERROR: s3torchconnector is not installed." >&2
-    echo "  Install with: uv sync (s3torchconnector must be added to pyproject.toml dependencies)" >&2
-    echo "  Or: uv sync" >&2
-    exit 1
-fi
-
-# ── Tunables (override via env) ────────────────────────────────────────────────
-NP=${NP:-1}
-CHECKPOINTS=${CHECKPOINTS:-2}
-
-BUCKET="chckpt-test1"
-S3_PREFIX="s3torch/llama3-8b"
-
-RUN_DIR="/tmp/dlio-s3torch-checkpoint-$(date +%Y%m%d_%H%M%S)"
-mkdir -p "$RUN_DIR"
-
-echo ""
-echo "════════════════════════════════════════════════════════"
-echo "  DLIO Checkpoint — s3torchconnector + MinIO  (llama3-8b)"
-echo "════════════════════════════════════════════════════════"
-echo "  Bucket      : $BUCKET"
-echo "  Objects at  : s3://$BUCKET/$S3_PREFIX/"
-echo "  Endpoint    : $AWS_ENDPOINT_URL"
-echo "  MPI ranks   : $NP   (default=1; full run: NP=8 bash $0)"
-echo "  Checkpoints : $CHECKPOINTS write + $CHECKPOINTS read"
-echo "  Per-rank    : ~13.1 GB per checkpoint  (ZeRO-3, 8 ranks)"
-echo "  Run dir     : $RUN_DIR"
-echo "════════════════════════════════════════════════════════"
-echo ""
-
-# ── Pre-flight: verify bucket is reachable ────────────────────────────────────
-# s3torchconnector has no standalone listing API — use s3dlio for bucket checks.
-echo "Checking bucket reachability: s3://$BUCKET/ ..."
-python3 - <<PYEOF
-import os, sys
-os.environ.setdefault("AWS_REGION", "us-east-1")
-import s3dlio
-try:
-    files = s3dlio.list("s3://${BUCKET}/", recursive=False)
-    print(f"  Bucket accessible — {len(files)} top-level entries")
-except Exception as e:
-    print(f"  ERROR: Cannot access bucket s3://${BUCKET}/: {e}", file=sys.stderr)
-    sys.exit(1)
-PYEOF
-echo ""
-
-DLIO_S3_IMPLEMENTATION=mlp \
-mpirun -np "$NP" --allow-run-as-root \
-    --mca btl ^vader \
-    "$DLIO_BIN" \
-    workload=llama3_8b_checkpoint_s3torch \
-    "++hydra.run.dir=$RUN_DIR" \
-    ++hydra.output_subdir=null \
-    "++workload.checkpoint.num_checkpoints_write=$CHECKPOINTS" \
-    "++workload.checkpoint.num_checkpoints_read=$CHECKPOINTS" \
-    --config-dir="$REPO_ROOT/configs/dlio"
-
-echo ""
-echo "✅  Checkpoint test complete — results in $RUN_DIR"
diff --git a/tests/object-store/old-archive/dlio_s3torch_cleanup.sh b/tests/object-store/old-archive/dlio_s3torch_cleanup.sh
deleted file mode 100755
index 30e45451..00000000
--- a/tests/object-store/old-archive/dlio_s3torch_cleanup.sh
+++ /dev/null
@@ -1,107 +0,0 @@
-#!/usr/bin/env bash
-# dlio_s3torch_cleanup.sh
-#
-# Delete all test objects from the MinIO bucket (mlp-s3torch).
-# Use this to reset between datagen runs without running the full cycle.
-#
-# Storage : S3-compatible object storage (endpoint from AWS_ENDPOINT_URL)  bucket: mlp-s3torch
-# Removes : s3://mlp-s3torch/test-run/unet3d/train/*
-#
-# Safety  : Lists files first, shows count, prompts for confirmation.
-#           To skip the prompt: FORCE=1 bash dlio_s3torch_cleanup.sh
-#
-# Note    : s3torchconnector has no standalone listing/deletion API.
-#           This script uses s3dlio for all bucket operations.
-#
-# Usage:
-#   cd /path/to/mlp-storage
-#   bash tests/object-store/dlio_s3torch_cleanup.sh
-#   FORCE=1 bash tests/object-store/dlio_s3torch_cleanup.sh
-
-set -euo pipefail
-
-REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
-cd "$REPO_ROOT"
-
-# ── Credentials ───────────────────────────────────────────────────────────────
-if [[ -f .env ]]; then
-    echo "[env] Loading credentials from .env"
-    set -o allexport
-    source .env  # shellcheck disable=SC1091
-    set +o allexport
-fi
-: "${AWS_ACCESS_KEY_ID:?ERROR: AWS_ACCESS_KEY_ID not set — add it to .env}"
-: "${AWS_SECRET_ACCESS_KEY:?ERROR: AWS_SECRET_ACCESS_KEY not set — add it to .env}"
-: "${AWS_ENDPOINT_URL:?ERROR: AWS_ENDPOINT_URL not set — add it to .env (e.g. http://your-s3-host:9000)}"
-: "${AWS_REGION:=us-east-1}"
-
-# ── Virtual environment ───────────────────────────────────────────────────────
-if [[ ! -f .venv/bin/activate ]]; then
-    echo "ERROR: .venv not found" >&2; exit 1
-fi
-source .venv/bin/activate  # .venv managed by uv (run "uv sync" to set up)
-
-# ── Config ────────────────────────────────────────────────────────────────────
-FORCE=${FORCE:-0}
-
-BUCKET="${BUCKET:-mlp-s3torch}"
-S3_PREFIX="test-run/unet3d/train"
-LIST_URI="s3://${BUCKET}/${S3_PREFIX}/"
-
-echo ""
-echo "════════════════════════════════════════════════════════"
-echo "  DLIO Cleanup — s3torchconnector + MinIO"
-echo "════════════════════════════════════════════════════════"
-echo "  Bucket   : $BUCKET"
-echo "  Prefix   : $S3_PREFIX"
-echo "  Endpoint : $AWS_ENDPOINT_URL"
-echo "════════════════════════════════════════════════════════"
-echo ""
-
-# ── List what will be deleted ─────────────────────────────────────────────────
-# s3torchconnector has no standalone listing API — use s3dlio for bucket operations.
-echo "Listing objects to delete: $LIST_URI ..."
-FILE_COUNT=$(python3 - <<PYEOF
-import os, sys
-os.environ.setdefault("AWS_REGION", "us-east-1")
-import s3dlio
-files = s3dlio.list("s3://${BUCKET}/${S3_PREFIX}/", recursive=True)
-print(len(files))
-PYEOF
-)
-
-if [[ "$FILE_COUNT" -eq 0 ]]; then
-    echo "✅  Bucket is already empty — nothing to delete."
-    exit 0
-fi
-
-echo "Found $FILE_COUNT objects to delete."
-
-# ── Confirm before deleting ───────────────────────────────────────────────────
-if [[ "$FORCE" -eq 0 ]]; then
-    echo ""
-    echo "⚠️   This will permanently delete $FILE_COUNT objects from $LIST_URI"
-    echo "    To skip this prompt: FORCE=1 bash $0"
-    read -r -p "Delete all $FILE_COUNT objects? [y/N] " REPLY
-    if [[ ! "$REPLY" =~ ^[Yy]$ ]]; then
-        echo "Aborted — no objects deleted."
-        exit 0
-    fi
-fi
-
-# ── Delete ────────────────────────────────────────────────────────────────────
-echo ""
-echo "Deleting $FILE_COUNT objects ..."
-DELETED=$(python3 - <<PYEOF
-import os, sys
-os.environ.setdefault("AWS_REGION", "us-east-1")
-import s3dlio
-files = s3dlio.list("s3://${BUCKET}/${S3_PREFIX}/", recursive=True)
-for f in files:
-    s3dlio.delete(f)
-print(len(files))
-PYEOF
-)
-
-echo ""
-echo "✅  Cleanup complete — deleted $DELETED objects from $LIST_URI"
diff --git a/tests/object-store/old-archive/dlio_s3torch_datagen.sh b/tests/object-store/old-archive/dlio_s3torch_datagen.sh
deleted file mode 100755
index d213d273..00000000
--- a/tests/object-store/old-archive/dlio_s3torch_datagen.sh
+++ /dev/null
@@ -1,160 +0,0 @@
-#!/usr/bin/env bash
-# dlio_s3torch_datagen.sh
-#
-# Run DLIO datagen directly via dlio_benchmark — NO mlpstorage wrapper.
-# Generates 168 × ~140 MB NPZ files into MinIO (mlp-s3torch bucket).
-#
-# Config  : configs/dlio/workload/unet3d_h100_s3torch_datagen.yaml
-# Workload: UNet3D h100 — 168 × ~140 MB NPZ
-# Storage : s3torchconnector → S3-compatible object storage (endpoint from AWS_ENDPOINT_URL)  bucket: mlp-s3torch
-# Data    : s3://mlp-s3torch/test-run/unet3d/train/
-#
-# Prerequisites:
-#   uv sync (s3torchconnector must be added to pyproject.toml dependencies)
-#   (s3dlio is used for pre/post-flight listing — it must also be installed)
-#
-# Environment overrides:
-#   NP=4 bash dlio_s3torch_datagen.sh      → 4 MPI ranks writing in parallel
-#   FORCE=1 bash dlio_s3torch_datagen.sh   → overwrite even if files already exist
-#
-# Usage:
-#   cd /path/to/mlp-storage
-#   bash tests/object-store/dlio_s3torch_datagen.sh
-
-set -euo pipefail
-
-REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
-cd "$REPO_ROOT"
-
-# ── Credentials ───────────────────────────────────────────────────────────────
-if [[ -f .env ]]; then
-    echo "[env] Loading credentials from .env"
-    set -o allexport
-    source .env  # shellcheck disable=SC1091
-    set +o allexport
-fi
-: "${AWS_ACCESS_KEY_ID:?ERROR: AWS_ACCESS_KEY_ID not set — add it to .env}"
-: "${AWS_SECRET_ACCESS_KEY:?ERROR: AWS_SECRET_ACCESS_KEY not set — add it to .env}"
-: "${AWS_ENDPOINT_URL:?ERROR: AWS_ENDPOINT_URL not set — add it to .env (e.g. http://your-s3-host:9000)}"
-: "${AWS_REGION:=us-east-1}"
-
-# ── Virtual environment ───────────────────────────────────────────────────────
-if [[ ! -f .venv/bin/activate ]]; then
-    echo "ERROR: .venv not found" >&2; exit 1
-fi
-source .venv/bin/activate  # .venv managed by uv (run "uv sync" to set up)
-
-DLIO_BIN=".venv/bin/dlio_benchmark"
-if [[ ! -x "$DLIO_BIN" ]]; then
-    echo "ERROR: $DLIO_BIN not found in venv" >&2; exit 1
-fi
-
-# ── Check s3torchconnector is installed ───────────────────────────────────────
-if ! python3 -c "import s3torchconnector" 2>/dev/null; then
-    echo "ERROR: s3torchconnector is not installed." >&2
-    echo "  Install with: uv sync (s3torchconnector must be added to pyproject.toml dependencies)" >&2
-    echo "  Or: uv sync" >&2
-    exit 1
-fi
-
-# ── Tunables (override via env) ───────────────────────────────────────────────
-# NP    = MPI ranks — more ranks write more files in parallel
-# FORCE = set to 1 to skip the pre-flight "files already exist" warning
-NP=${NP:-8}
-FORCE=${FORCE:-0}
-
-BUCKET="${BUCKET:-mlp-s3torch}"
-S3_PREFIX="test-run/unet3d/train"
-LIST_URI="s3://${BUCKET}/${S3_PREFIX}/"
-EXPECTED_FILES=168
-
-RUN_DIR="/tmp/dlio-s3torch-datagen-$(date +%Y%m%d_%H%M%S)"
-mkdir -p "$RUN_DIR"
-
-echo ""
-echo "════════════════════════════════════════════════════════"
-echo "  DLIO Datagen — s3torchconnector + MinIO  (unet3d h100)"
-echo "════════════════════════════════════════════════════════"
-echo "  Bucket   : $BUCKET"
-echo "  Prefix   : $S3_PREFIX"
-echo "  Endpoint : $AWS_ENDPOINT_URL"
-echo "  Files    : $EXPECTED_FILES × ~140 MB NPZ"
-echo "  MPI ranks: $NP   (override: NP=4 bash $0)"
-echo "  Run dir  : $RUN_DIR"
-echo "════════════════════════════════════════════════════════"
-echo ""
-
-# ── Pre-flight: warn if files already exist ───────────────────────────────────
-# s3torchconnector has no standalone listing API — use s3dlio for bucket checks.
-echo "Checking for existing data: $LIST_URI ..."
-FILE_COUNT=$(python3 - <<PYEOF
-import os, sys
-os.environ.setdefault("AWS_REGION", "us-east-1")
-import s3dlio
-files = s3dlio.list("s3://${BUCKET}/${S3_PREFIX}/", recursive=True)
-print(len(files))
-PYEOF
-)
-
-if [[ "$FILE_COUNT" -gt 0 && "$FORCE" -eq 0 ]]; then
-    echo ""
-    echo "⚠️   WARNING: $FILE_COUNT files already exist in $LIST_URI"
-    echo "    Datagen will overwrite them."
-    echo "    To skip this warning: FORCE=1 bash $0"
-    echo "    To clean up first:    bash tests/object-store/dlio_s3torch_cleanup.sh"
-    echo ""
-    read -r -p "Continue anyway? [y/N] " REPLY
-    if [[ ! "$REPLY" =~ ^[Yy]$ ]]; then
-        echo "Aborted."
-        exit 0
-    fi
-elif [[ "$FILE_COUNT" -gt 0 ]]; then
-    echo "⚠️   $FILE_COUNT files already exist — FORCE=1 set, overwriting"
-else
-    echo "✅  Bucket is empty — proceeding with datagen"
-fi
-echo ""
-
-# ── Data generation method ────────────────────────────────────────────────────
-# ALWAYS force dgen-py. We hard-assign here (not :=) so we override any
-# DLIO_DATA_GEN=numpy that might be set in the caller's shell environment.
-# dgen-py is 155x faster than NumPy and is the ONLY supported default.
-# If dgen-py is not installed this will fail fast with a clear error message.
-DLIO_DATA_GEN=dgen
-export DLIO_DATA_GEN
-
-echo "── data generation ────────────────────────────────────────"
-echo "  DLIO_DATA_GEN = $DLIO_DATA_GEN  (forced — dgen-py only)"
-echo "───────────────────────────────────────────────────────────"
-echo ""
-
-# ── Run datagen ───────────────────────────────────────────────────────────────
-DLIO_S3_IMPLEMENTATION=mlp \
-mpirun -np "$NP" --allow-run-as-root \
-    --mca btl ^vader \
-    -x DLIO_DATA_GEN \
-    "$DLIO_BIN" \
-    workload=unet3d_h100_s3torch_datagen \
-    "++hydra.run.dir=$RUN_DIR" \
-    ++hydra.output_subdir=null \
-    --config-dir="$REPO_ROOT/configs/dlio"
-
-echo ""
-
-# ── Post-flight: verify file count ────────────────────────────────────────────
-echo "Verifying generated files ..."
-FOUND=$(python3 - <<PYEOF
-import os, sys
-os.environ.setdefault("AWS_REGION", "us-east-1")
-import s3dlio
-files = s3dlio.list("s3://${BUCKET}/${S3_PREFIX}/", recursive=True)
-print(len(files))
-PYEOF
-)
-
-if [[ "$FOUND" -ne "$EXPECTED_FILES" ]]; then
-    echo "⚠️   File count: $FOUND (expected $EXPECTED_FILES) — some files may have been skipped or failed"
-else
-    echo "✅  Datagen complete — $FOUND / $EXPECTED_FILES files confirmed in $LIST_URI"
-fi
-echo "    DLIO logs: $RUN_DIR"
diff --git a/tests/object-store/old-archive/dlio_s3torch_train.sh b/tests/object-store/old-archive/dlio_s3torch_train.sh
deleted file mode 100755
index 6bbfd4b5..00000000
--- a/tests/object-store/old-archive/dlio_s3torch_train.sh
+++ /dev/null
@@ -1,128 +0,0 @@
-#!/usr/bin/env bash
-# dlio_s3torch_train.sh
-#
-# Run DLIO training directly via dlio_benchmark — NO mlpstorage wrapper.
-# Assumes data is already in the bucket (run dlio_s3torch_datagen.sh first
-# if needed).
-#
-# Config  : configs/dlio/workload/unet3d_h100_s3torch.yaml
-# Workload: UNet3D h100 — 168 × ~140 MB NPZ, 5 epochs, batch_size=7
-# Storage : s3torchconnector → S3-compatible object storage (endpoint from AWS_ENDPOINT_URL)  bucket: mlp-s3torch
-# Data    : s3://mlp-s3torch/test-run/unet3d/train/
-#
-# Prerequisites:
-#   uv sync (s3torchconnector must be added to pyproject.toml dependencies)
-#   (s3dlio is used for pre-flight listing — it must also be installed)
-#
-# MPI vs PyTorch workers — these are different:
-#   NP (--np)         = MPI ranks  = simulated distributed training nodes
-#   read_threads (YAML) = PyTorch DataLoader workers per MPI rank
-#   Total I/O processes = NP × read_threads
-#
-# Environment overrides:
-#   NP=4 bash dlio_s3torch_train.sh        → 4 MPI ranks × 4 threads = 16 readers
-#   NP=1 READ_THREADS=8 bash ...           → 1 rank × 8 threads = 8 readers
-#
-# Usage:
-#   cd /path/to/mlp-storage
-#   bash tests/object-store/dlio_s3torch_train.sh
-
-set -euo pipefail
-
-REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
-cd "$REPO_ROOT"
-
-# ── Credentials ───────────────────────────────────────────────────────────────
-if [[ -f .env ]]; then
-    echo "[env] Loading credentials from .env"
-    set -o allexport
-    source .env  # shellcheck disable=SC1091
-    set +o allexport
-fi
-: "${AWS_ACCESS_KEY_ID:?ERROR: AWS_ACCESS_KEY_ID not set — add it to .env}"
-: "${AWS_SECRET_ACCESS_KEY:?ERROR: AWS_SECRET_ACCESS_KEY not set — add it to .env}"
-: "${AWS_ENDPOINT_URL:?ERROR: AWS_ENDPOINT_URL not set — add it to .env (e.g. http://your-s3-host:9000)}"
-: "${AWS_REGION:=us-east-1}"
-
-# ── Virtual environment ───────────────────────────────────────────────────────
-if [[ ! -f .venv/bin/activate ]]; then
-    echo "ERROR: .venv not found" >&2; exit 1
-fi
-source .venv/bin/activate  # .venv managed by uv (run "uv sync" to set up)
-
-DLIO_BIN=".venv/bin/dlio_benchmark"
-if [[ ! -x "$DLIO_BIN" ]]; then
-    echo "ERROR: $DLIO_BIN not found in venv" >&2; exit 1
-fi
-
-# ── Check s3torchconnector is installed ───────────────────────────────────────
-if ! python3 -c "import s3torchconnector" 2>/dev/null; then
-    echo "ERROR: s3torchconnector is not installed." >&2
-    echo "  Install with: uv sync (s3torchconnector must be added to pyproject.toml dependencies)" >&2
-    echo "  Or: uv sync" >&2
-    exit 1
-fi
-
-# ── Tunables (override via env) ───────────────────────────────────────────────
-# NP = MPI ranks (1 = single process, 4 = 4 simulated nodes, etc.)
-NP=${NP:-1}
-
-BUCKET="${BUCKET:-mlp-s3torch}"
-S3_PREFIX="test-run/unet3d/train"
-
-RUN_DIR="/tmp/dlio-s3torch-train-$(date +%Y%m%d_%H%M%S)"
-mkdir -p "$RUN_DIR"
-
-echo ""
-echo "════════════════════════════════════════════════════════"
-echo "  DLIO Training — s3torchconnector + MinIO  (unet3d h100)"
-echo "════════════════════════════════════════════════════════"
-echo "  Bucket   : $BUCKET"
-echo "  Data     : $S3_PREFIX  (168 × ~140 MB NPZ)"
-echo "  Endpoint : $AWS_ENDPOINT_URL"
-echo "  MPI ranks: $NP   (override: NP=4 bash $0)"
-echo "  Workers  : 4 per rank  (reader.read_threads in YAML)"
-echo "  Epochs   : 5"
-echo "  Batch    : 7"
-echo "  Run dir  : $RUN_DIR"
-echo "════════════════════════════════════════════════════════"
-echo ""
-
-# ── Pre-flight: verify training data exists ───────────────────────────────────
-# s3torchconnector has no standalone listing API — use s3dlio for bucket checks.
-echo "Checking training data: s3://$BUCKET/$S3_PREFIX/ ..."
-FILE_COUNT=$(python3 - <<PYEOF
-import os, sys
-os.environ.setdefault("AWS_REGION", "us-east-1")
-import s3dlio
-files = s3dlio.list("s3://${BUCKET}/${S3_PREFIX}/", recursive=True)
-print(len(files))
-PYEOF
-)
-
-if [[ "$FILE_COUNT" -eq 0 ]]; then
-    echo ""
-    echo "❌  ERROR: No training files found in s3://$BUCKET/$S3_PREFIX/"
-    echo "    Run datagen first to populate the bucket:"
-    echo "      bash tests/object-store/dlio_s3torch_datagen.sh"
-    exit 1
-fi
-
-echo "✅  Found $FILE_COUNT training files — proceeding"
-echo ""
-
-# ── Note on the expected 'valid' listing ──────────────────────────────────────
-# DLIO always tries to list a valid/ path. It will find 0 files and skip it.
-# That is normal — we have train data only. Not an error.
-
-DLIO_S3_IMPLEMENTATION=mlp \
-mpirun -np "$NP" --allow-run-as-root \
-    --mca btl ^vader \
-    "$DLIO_BIN" \
-    workload=unet3d_h100_s3torch \
-    "++hydra.run.dir=$RUN_DIR" \
-    ++hydra.output_subdir=null \
-    --config-dir="$REPO_ROOT/configs/dlio"
-
-echo ""
-echo "✅  Training complete — results in $RUN_DIR"
diff --git a/tests/object-store/old-archive/llama3_8b_checkpoint_minio.yaml b/tests/object-store/old-archive/llama3_8b_checkpoint_minio.yaml
deleted file mode 100644
index c6a4ecf7..00000000
--- a/tests/object-store/old-archive/llama3_8b_checkpoint_minio.yaml
+++ /dev/null
@@ -1,95 +0,0 @@
-# LLaMA 3 8B — minio SDK Checkpointing Config
-#
-# Purpose : Checkpoint-only workload for llama3-8b using the minio Python SDK
-#           for object I/O.
-# Storage : MinIO at https://172.16.1.40:9000  (bucket: chckpt-test1)
-# Data    : s3://chckpt-test1/minio/llama3-8b/
-#
-# Model sizing (ZeRO-3, 8 ranks, fp16 model + fp32 optimizer):
-#   Total model+optimizer: 15 GB + 90 GB = 105 GB
-#   Per-rank write:  105 GB / 8 ranks ≈ 13.1 GB
-#   Per-checkpoint total I/O: ~105 GB write + ~105 GB read = ~210 GB
-#
-# Prerequisites (before running dlio_benchmark):
-#   source /home/eval/Documents/Code/mlp-storage/.env
-#   # ensures AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY are set
-#   # bucket must exist: s3://chckpt-test1/
-#
-# Run directly (8 MPI ranks = 8 simulated GPU processes):
-#   cd /home/eval/Documents/Code/mlp-storage
-#   source .env && source .venv/bin/activate
-#   DLIO_S3_IMPLEMENTATION=mlp \
-#   mpirun -n 8 --allow-run-as-root \
-#     .venv/bin/dlio_benchmark \
-#     workload=llama3_8b_checkpoint_minio \
-#     --config-dir=/home/eval/Documents/Code/mlp-storage/configs/dlio
-#
-# Or use the convenience script:
-#   bash tests/object-store/dlio_minio_checkpoint.sh
-#
-# Override checkpoint count (quick test with 1 checkpoint):
-#   DLIO_S3_IMPLEMENTATION=mlp mpirun -n 8 --allow-run-as-root \
-#     .venv/bin/dlio_benchmark \
-#     workload=llama3_8b_checkpoint_minio \
-#     ++workload.checkpoint.num_checkpoints_write=1 \
-#     ++workload.checkpoint.num_checkpoints_read=1 \
-#     --config-dir=/home/eval/Documents/Code/mlp-storage/configs/dlio
-
-model:
-  name: llama_8b
-  type: transformer
-  num_layers: 32
-  model_datatype: fp16
-  optimizer_datatype: fp32
-  parallelism:
-    pipeline: 1
-    tensor: 1
-    zero_stage: 3
-  transformer:
-    vocab_size: 128256
-    hidden_size: 4096
-    ffn_hidden_size: 14336
-    num_attention_heads: 32
-    num_kv_heads: 8
-
-framework: pytorch
-
-workflow:
-  generate_data: False
-  train: False
-  checkpoint: True
-
-# ---------------------------------------------------------------------------
-# Storage — minio SDK talking to MinIO
-# ---------------------------------------------------------------------------
-storage:
-  storage_type: s3
-  storage_root: chckpt-test1           # S3 bucket name
-
-  # storage_library is read by config.py and injected into storage_options so
-  # that PyTorchObjStoreCheckpointing can find it via
-  # storage_options.get("storage_library"). There is NO default — this field
-  # is REQUIRED for all object storage workloads.
-  storage_library: minio
-
-  storage_options:
-    endpoint_url: https://172.16.1.40:9000
-    region: us-east-1
-    secure: false
-    # Credentials come from environment variables — do NOT hardcode here.
-    # Set these before running:
-    #   export AWS_ACCESS_KEY_ID=...
-    #   export AWS_SECRET_ACCESS_KEY=...
-    # (or: source /home/eval/Documents/Code/mlp-storage/.env)
-
-# ---------------------------------------------------------------------------
-# Checkpoint — full s3:// URI as checkpoint_folder
-# ---------------------------------------------------------------------------
-# checkpoint_folder must be a full s3:// URI when storage_type=s3.
-# PyTorchObjStoreCheckpointing.get_name() calls os.path.join() on this URI
-# and the per-rank suffix to produce the final object key.
-checkpoint:
-  checkpoint_folder: s3://chckpt-test1/minio/llama3-8b
-  time_between_checkpoints: 5
-  num_checkpoints_write: 2
-  num_checkpoints_read: 2
diff --git a/tests/object-store/old-archive/llama3_8b_checkpoint_s3dlio.yaml b/tests/object-store/old-archive/llama3_8b_checkpoint_s3dlio.yaml
deleted file mode 100644
index 71f60803..00000000
--- a/tests/object-store/old-archive/llama3_8b_checkpoint_s3dlio.yaml
+++ /dev/null
@@ -1,94 +0,0 @@
-# LLaMA 3 8B — s3dlio Checkpointing Config
-#
-# Purpose : Checkpoint-only workload for llama3-8b using s3dlio for object I/O.
-# Storage : MinIO at https://172.16.1.40:9000  (bucket: chckpt-test1)
-# Data    : s3://chckpt-test1/s3dlio/llama3-8b/
-#
-# Model sizing (ZeRO-3, 8 ranks, fp16 model + fp32 optimizer):
-#   Total model+optimizer: 15 GB + 90 GB = 105 GB
-#   Per-rank write:  105 GB / 8 ranks ≈ 13.1 GB
-#   Per-checkpoint total I/O: ~105 GB write + ~105 GB read = ~210 GB
-#
-# Prerequisites (before running dlio_benchmark):
-#   source /home/eval/Documents/Code/mlp-storage/.env
-#   # ensures AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY are set
-#   # bucket must exist: s3://chckpt-test1/
-#
-# Run directly (8 MPI ranks = 8 simulated GPU processes):
-#   cd /home/eval/Documents/Code/mlp-storage
-#   source .env && source .venv/bin/activate
-#   DLIO_S3_IMPLEMENTATION=mlp \
-#   mpirun -n 8 --allow-run-as-root \
-#     .venv/bin/dlio_benchmark \
-#     workload=llama3_8b_checkpoint_s3dlio \
-#     --config-dir=/home/eval/Documents/Code/mlp-storage/configs/dlio
-#
-# Or use the convenience script:
-#   bash tests/object-store/dlio_s3dlio_checkpoint.sh
-#
-# Override checkpoint count (quick test with 1 checkpoint):
-#   DLIO_S3_IMPLEMENTATION=mlp mpirun -n 8 --allow-run-as-root \
-#     .venv/bin/dlio_benchmark \
-#     workload=llama3_8b_checkpoint_s3dlio \
-#     ++workload.checkpoint.num_checkpoints_write=1 \
-#     ++workload.checkpoint.num_checkpoints_read=1 \
-#     --config-dir=/home/eval/Documents/Code/mlp-storage/configs/dlio
-
-model:
-  name: llama_8b
-  type: transformer
-  num_layers: 32
-  model_datatype: fp16
-  optimizer_datatype: fp32
-  parallelism:
-    pipeline: 1
-    tensor: 1
-    zero_stage: 3
-  transformer:
-    vocab_size: 128256
-    hidden_size: 4096
-    ffn_hidden_size: 14336
-    num_attention_heads: 32
-    num_kv_heads: 8
-
-framework: pytorch
-
-workflow:
-  generate_data: False
-  train: False
-  checkpoint: True
-
-# ---------------------------------------------------------------------------
-# Storage — s3dlio talking to MinIO
-# ---------------------------------------------------------------------------
-storage:
-  storage_type: s3
-  storage_root: chckpt-test1           # S3 bucket name
-
-  # storage_library is read by config.py and injected into storage_options so
-  # that PyTorchObjStoreCheckpointing can find it via
-  # storage_options.get("storage_library"). There is NO default — this field
-  # is REQUIRED for all object storage workloads.
-  storage_library: s3dlio
-
-  storage_options:
-    endpoint_url: https://172.16.1.40:9000
-    region: us-east-1
-    s3_force_path_style: true
-    # Credentials come from environment variables — do NOT hardcode here.
-    # Set these before running:
-    #   export AWS_ACCESS_KEY_ID=...
-    #   export AWS_SECRET_ACCESS_KEY=...
-    # (or: source /home/eval/Documents/Code/mlp-storage/.env)
-
-# ---------------------------------------------------------------------------
-# Checkpoint — full s3:// URI as checkpoint_folder
-# ---------------------------------------------------------------------------
-# checkpoint_folder must be a full s3:// URI when storage_type=s3.
-# PyTorchObjStoreCheckpointing.get_name() calls os.path.join() on this URI
-# and the per-rank suffix to produce the final object key.
-checkpoint:
-  checkpoint_folder: s3://chckpt-test1/s3dlio/llama3-8b
-  time_between_checkpoints: 5
-  num_checkpoints_write: 2
-  num_checkpoints_read: 2
diff --git a/tests/object-store/old-archive/llama3_8b_checkpoint_s3torch.yaml b/tests/object-store/old-archive/llama3_8b_checkpoint_s3torch.yaml
deleted file mode 100644
index 0c9d9eb4..00000000
--- a/tests/object-store/old-archive/llama3_8b_checkpoint_s3torch.yaml
+++ /dev/null
@@ -1,95 +0,0 @@
-# LLaMA 3 8B — s3torchconnector Checkpointing Config
-#
-# Purpose : Checkpoint-only workload for llama3-8b using the AWS
-#           s3torchconnector library for object I/O.
-# Storage : MinIO at https://172.16.1.40:9000  (bucket: chckpt-test1)
-# Data    : s3://chckpt-test1/s3torch/llama3-8b/
-#
-# Model sizing (ZeRO-3, 8 ranks, fp16 model + fp32 optimizer):
-#   Total model+optimizer: 15 GB + 90 GB = 105 GB
-#   Per-rank write:  105 GB / 8 ranks ≈ 13.1 GB
-#   Per-checkpoint total I/O: ~105 GB write + ~105 GB read = ~210 GB
-#
-# Prerequisites (before running dlio_benchmark):
-#   pip install s3torchconnector        # or s3-torch-connector-builder
-#   source /home/eval/Documents/Code/mlp-storage/.env
-#   # ensures AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY are set
-#   # bucket must exist: s3://chckpt-test1/
-#
-# Run directly (8 MPI ranks = 8 simulated GPU processes):
-#   cd /home/eval/Documents/Code/mlp-storage
-#   source .env && source .venv/bin/activate
-#   DLIO_S3_IMPLEMENTATION=mlp \
-#   mpirun -n 8 --allow-run-as-root \
-#     .venv/bin/dlio_benchmark \
-#     workload=llama3_8b_checkpoint_s3torch \
-#     --config-dir=/home/eval/Documents/Code/mlp-storage/configs/dlio
-#
-# Or use the convenience script:
-#   bash tests/object-store/dlio_s3torch_checkpoint.sh
-#
-# Override checkpoint count (quick test with 1 checkpoint):
-#   DLIO_S3_IMPLEMENTATION=mlp mpirun -n 8 --allow-run-as-root \
-#     .venv/bin/dlio_benchmark \
-#     workload=llama3_8b_checkpoint_s3torch \
-#     ++workload.checkpoint.num_checkpoints_write=1 \
-#     ++workload.checkpoint.num_checkpoints_read=1 \
-#     --config-dir=/home/eval/Documents/Code/mlp-storage/configs/dlio
-
-model:
-  name: llama_8b
-  type: transformer
-  num_layers: 32
-  model_datatype: fp16
-  optimizer_datatype: fp32
-  parallelism:
-    pipeline: 1
-    tensor: 1
-    zero_stage: 3
-  transformer:
-    vocab_size: 128256
-    hidden_size: 4096
-    ffn_hidden_size: 14336
-    num_attention_heads: 32
-    num_kv_heads: 8
-
-framework: pytorch
-
-workflow:
-  generate_data: False
-  train: False
-  checkpoint: True
-
-# ---------------------------------------------------------------------------
-# Storage — s3torchconnector talking to MinIO
-# ---------------------------------------------------------------------------
-storage:
-  storage_type: s3
-  storage_root: chckpt-test1           # S3 bucket name
-
-  # storage_library is read by config.py and injected into storage_options so
-  # that PyTorchObjStoreCheckpointing can find it via
-  # storage_options.get("storage_library"). There is NO default — this field
-  # is REQUIRED for all object storage workloads.
-  storage_library: s3torchconnector
-
-  storage_options:
-    endpoint_url: https://172.16.1.40:9000
-    region: us-east-1
-    # Credentials come from environment variables — do NOT hardcode here.
-    # Set these before running:
-    #   export AWS_ACCESS_KEY_ID=...
-    #   export AWS_SECRET_ACCESS_KEY=...
-    # (or: source /home/eval/Documents/Code/mlp-storage/.env)
-
-# ---------------------------------------------------------------------------
-# Checkpoint — full s3:// URI as checkpoint_folder
-# ---------------------------------------------------------------------------
-# checkpoint_folder must be a full s3:// URI when storage_type=s3.
-# PyTorchObjStoreCheckpointing.get_name() calls os.path.join() on this URI
-# and the per-rank suffix to produce the final object key.
-checkpoint:
-  checkpoint_folder: s3://chckpt-test1/s3torch/llama3-8b
-  time_between_checkpoints: 5
-  num_checkpoints_write: 2
-  num_checkpoints_read: 2
diff --git a/tests/object-store/run_datagen.sh b/tests/object-store/old-archive/run_datagen.sh
similarity index 100%
rename from tests/object-store/run_datagen.sh
rename to tests/object-store/old-archive/run_datagen.sh
diff --git a/tests/object-store/run_training.sh b/tests/object-store/old-archive/run_training.sh
similarity index 100%
rename from tests/object-store/run_training.sh
rename to tests/object-store/old-archive/run_training.sh
diff --git a/tests/object-store/old-archive/s3dlio_performance_analysis.md b/tests/object-store/old-archive/s3dlio_performance_analysis.md
deleted file mode 100644
index 8594635f..00000000
--- a/tests/object-store/old-archive/s3dlio_performance_analysis.md
+++ /dev/null
@@ -1,50 +0,0 @@
-# s3dlio Performance Notes — DLIO Training Workload
-
-**Date:** March 20, 2026  
-**Status:** Historical — issues identified here are substantially resolved in s3dlio v0.9.84.  
-See [dlio_mpi_object_results.md](dlio_mpi_object_results.md) for current benchmark results.
-
----
-
-## Background
-
-During March 2026 testing with DLIO (168 × ~147 MB NPZ files, UNet3D profile, MinIO backend
-at ~1.2 GB/s network ceiling), s3dlio showed lower single-rank throughput than minio-py
-under default settings. Root-cause analysis identified six issues, all of which have since
-been addressed.
-
-## What Was Found and Fixed
-
-Six issues were identified in s3dlio v0.9.82 and earlier:
-
-| # | Issue | Resolution |
-|---|-------|-----------|
-| 1 | Redundant HEAD request per object on the `get_many` code path | Fixed in v0.9.84 |
-| 2 | Range splitting threshold too aggressive for 1 Gbps environments (37 sub-requests per 147 MB file) | Fixed in v0.9.84; `S3DLIO_RANGE_THRESHOLD_MB` env var now correctly controls the `get_many` path |
-| 3 | Tokio runtime thread over-provisioning (32 threads/process × 16 worker processes) | Mitigated: set `S3DLIO_RT_THREADS=8`; architectural fix pending in a future release |
-| 4 | Unnecessary Python-side memory copy in the DLIO NPZ reader (`bytes(data)` discarding zero-copy view) | Fixed in mlp-storage reader: zero-copy `_BytesViewIO` wrapper applied |
-| 5 | Mutex contention during parallel range-chunk assembly | Fixed in v0.9.82 |
-| 6 | O(N²) sort in `get_objects_parallel` for input-order preservation | Fixed in v0.9.82 |
-
-## Outcome
-
-After fixes, s3dlio and minio-py converge to within 1% of each other at NP=4
-(~1087–1097 MB/s), confirming all issues were caused by the above bugs rather than
-any fundamental capability difference between the libraries.
-
-On high-bandwidth systems (10/100 Gbps), s3dlio's adaptive range-splitting provides
-significant advantages that minio-py (which never issues range requests) cannot match.
-The threshold defaults are now better calibrated for typical deployment environments.
-
-## Useful Environment Variables
-
-For 1 Gbps or bandwidth-saturated environments, these env vars can further tune behavior:
-
-```bash
-# Raise range-split threshold above your largest file size to use single-stream GET
-export S3DLIO_RANGE_THRESHOLD_MB=1000
-
-# Reduce Tokio threads per worker process (recommended for high MPI rank counts)
-export S3DLIO_RT_THREADS=8
-```
-
diff --git a/tests/object-store/old-archive/test_dlio_direct_s3dlio.sh b/tests/object-store/old-archive/test_dlio_direct_s3dlio.sh
deleted file mode 100644
index 6fc4e8a3..00000000
--- a/tests/object-store/old-archive/test_dlio_direct_s3dlio.sh
+++ /dev/null
@@ -1,94 +0,0 @@
-#!/usr/bin/env bash
-# test_dlio_direct_s3dlio.sh
-#
-# Run dlio_benchmark DIRECTLY — no mlpstorage wrapper.
-#
-# Purpose : Confirm that s3dlio reads the unet3d h100 dataset from MinIO
-#           without any mlpstorage layer in the way.  All debug prints from
-#           config.py, main.py, storage_factory.py, and obj_store_lib.py go
-#           directly to this terminal — nothing is captured.
-#
-# Data    : 168 × ~140 MB NPZ files already in MinIO bucket mlp-s3dlio at
-#             test-run/unet3d/train/
-#
-# Config  : configs/dlio/workload/unet3d_h100_s3dlio.yaml  (our custom YAML
-#           that includes the full storage section for s3dlio + MinIO).
-#
-# Usage   : bash tests/object-store/test_dlio_direct_s3dlio.sh
-#           Must be run from the mlp-storage repo root.
-
-set -euo pipefail
-
-REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
-cd "$REPO_ROOT"
-
-# ── Credentials ────────────────────────────────────────────────────────────────
-# Load from .env if present; variables already exported in shell take priority.
-if [[ -f .env ]]; then
-    echo "[info] Loading credentials from .env"
-    # shellcheck disable=SC1091
-    set -o allexport
-    source .env
-    set +o allexport
-fi
-
-: "${AWS_ACCESS_KEY_ID:?ERROR: AWS_ACCESS_KEY_ID is not set (source .env or export it)}"
-: "${AWS_SECRET_ACCESS_KEY:?ERROR: AWS_SECRET_ACCESS_KEY is not set (source .env or export it)}"
-
-# ── Virtual environment ────────────────────────────────────────────────────────
-if [[ ! -f .venv/bin/activate ]]; then
-    echo "ERROR: .venv not found — run: cd $REPO_ROOT && python -m venv .venv && uv sync >&2
-    exit 1
-fi
-# shellcheck disable=SC1091
-source .venv/bin/activate  # .venv managed by uv (run "uv sync" to set up)
-
-DLIO_BIN=".venv/bin/dlio_benchmark"
-if [[ ! -x "$DLIO_BIN" ]]; then
-    echo "ERROR: $DLIO_BIN not found" >&2
-    exit 1
-fi
-
-# ── Run directory ──────────────────────────────────────────────────────────────
-RUN_DIR="/tmp/dlio-s3dlio-direct-$(date +%Y%m%d_%H%M%S)"
-mkdir -p "$RUN_DIR"
-
-echo ""
-echo "═══════════════════════════════════════════════════════════════"
-echo "  dlio_benchmark DIRECT — s3dlio → MinIO (unet3d h100)"
-echo "  Config  : configs/dlio/workload/unet3d_h100_s3dlio.yaml"
-echo "  Bucket  : mlp-s3dlio"
-echo "  Data    : test-run/unet3d/train/  (168 × ~140 MB NPZ)"
-echo "  Run dir : $RUN_DIR"
-echo "═══════════════════════════════════════════════════════════════"
-echo ""
-
-# ── Execute ────────────────────────────────────────────────────────────────────
-# DLIO_S3_IMPLEMENTATION=mlp  → ensures our mlp-storage obj_store_lib is used
-#                                (not the upstream dlio s3torchconnector path).
-# -n 1                         → single MPI rank (no distributed needed for test)
-# workload=unet3d_h100_s3dlio  → our custom config in configs/dlio/workload/
-# --config-dir                 → point Hydra at mlp-storage's config tree
-#
-# All stdout goes to terminal — no buffering, no capture.
-
-DLIO_S3_IMPLEMENTATION=mlp \
-mpirun -n 1 --allow-run-as-root \
-    "$DLIO_BIN" \
-    workload=unet3d_h100_s3dlio \
-    "++hydra.run.dir=$RUN_DIR" \
-    ++hydra.output_subdir=dlio_config \
-    --config-dir="$REPO_ROOT/configs/dlio"
-
-EXIT_CODE=$?
-
-echo ""
-if [[ $EXIT_CODE -eq 0 ]]; then
-    echo "✅ dlio_benchmark completed successfully (exit 0)"
-    echo "   Results: $RUN_DIR"
-else
-    echo "❌ dlio_benchmark FAILED (exit $EXIT_CODE)"
-    echo "   Run dir: $RUN_DIR"
-fi
-
-exit $EXIT_CODE
diff --git a/tests/object-store/old-archive/test_dlio_multilib_demo.py b/tests/object-store/old-archive/test_dlio_multilib_demo.py
deleted file mode 100644
index 10433246..00000000
--- a/tests/object-store/old-archive/test_dlio_multilib_demo.py
+++ /dev/null
@@ -1,678 +0,0 @@
-#!/usr/bin/env python3
-"""
-DLIO Multi-Library Benchmark Demo
-
-Demonstrates two DLIO-driven workloads across s3dlio, minio, and s3torchconnector.
-I/O is handled by DLIO (via mlpstorage), NOT by the direct native APIs — this is
-specifically to show how each library performs when used as DLIO's storage backend.
-
-Workload 1 — TRAINING
-  Phase 0: cleanup  — delete existing dlio-train/* objects from the library's bucket
-  Phase 1: datagen  — DLIO generates 100 × 128 MiB NPZ objects and writes them to S3
-  Phase 2: train    — DLIO reads all objects over 2 full epochs
-
-Workload 2 — CHECKPOINT
-  Model: llama3-8b, 8 simulated ranks, open mode → ~105 GB / ~97.8 GiB total.
-  (Closest standard DLIO model configuration to the 128 GiB target.)
-  Phase 0: cleanup  — delete existing dlio-ckpt/* objects from the library's bucket
-  Phase 1: save     — DLIO writes 1 checkpoint (8 rank shards × ~13.12 GB each)
-  Phase 2: restore  — DLIO reads the checkpoint back
-
-Credentials are loaded from mlp-storage/.env (same as other test scripts in this folder).
-Each library uses its own dedicated S3 bucket to avoid interference.
-
-Usage:
-  # All libraries, both workloads (default)
-  python test_dlio_multilib_demo.py
-
-  # Single workload
-  python test_dlio_multilib_demo.py --workload training
-  python test_dlio_multilib_demo.py --workload checkpoint
-
-  # Specific library/libraries
-  python test_dlio_multilib_demo.py --library s3dlio
-  python test_dlio_multilib_demo.py --library s3dlio minio
-
-  # Combine flags
-  python test_dlio_multilib_demo.py --workload training --library s3dlio minio
-"""
-
-import os
-import sys
-import time
-import subprocess
-import argparse
-from pathlib import Path
-
-# ── Configuration ───────────────────────────────────────────────────────────────
-
-DEFAULT_LIBRARIES = ['s3dlio', 'minio', 's3torchconnector']
-
-LIBRARY_BUCKETS = {
-    's3dlio':           os.environ.get('BUCKET_S3DLIO', 'bucket-s3dlio'),
-    'minio':            os.environ.get('BUCKET_MINIO', 'bucket-minio'),
-    's3torchconnector': os.environ.get('BUCKET_S3TORCH', 'bucket-s3torch'),
-}
-
-# Workload 1 — Training
-TRAIN_MODEL         = 'unet3d'
-TRAIN_NUM_ACCEL     = 1
-TRAIN_ACCEL_TYPE    = 'a100'
-TRAIN_NUM_FILES     = 100
-TRAIN_SIZE_MiB      = 128
-TRAIN_RECORD_BYTES  = TRAIN_SIZE_MiB * 1024 * 1024   # 134,217,728
-TRAIN_SAMPLES_PER   = 1                               # 1 sample = 1 file
-TRAIN_EPOCHS        = 2
-TRAIN_PREFIX        = 'dlio-train'
-
-# Workload 2 — Checkpoint
-# StreamingCheckpointing uses a fixed 128 MB buffer pool regardless of checkpoint size.
-# ~100 GB single-object checkpoint per library.  At ~0.5 GB/s → ~200s per library.
-CKPT_SIZE_GB        = 16.0           # single streaming object per library
-CKPT_CHUNK_MB       = 32            # 32 MB chunks
-CKPT_NUM_BUFFERS    = 4             # 4 buffers × 32 MB = 128 MB RAM max
-CKPT_PREFIX         = 'dlio-ckpt'
-
-# Per-library checkpoint size overrides.
-# s3torchconnector fails at ~78 GB due to a CRT multipart bug.
-# Re-add {'s3torchconnector': 75.0} here if CKPT_SIZE_GB is raised back toward 100 GB.
-CKPT_SIZE_GB_OVERRIDE = {}
-
-# Shared
-CLIENT_MEM_GB   = 32
-RESULTS_DIR     = '/tmp/dlio_multilib_demo'
-PAUSE_SECONDS   = 30                # wait for S3 eventual consistency between phases
-
-
-# ── Credentials ─────────────────────────────────────────────────────────────────
-
-def load_env_config() -> dict:
-    """Load .env file then let actual env vars override."""
-    env_path = None
-    for candidate in [
-        Path(__file__).parent.parent / '.env',
-        Path(__file__).parent / '.env',
-        Path.cwd() / '.env',
-    ]:
-        if candidate.exists():
-            env_path = candidate
-            break
-
-    config = {}
-    if env_path:
-        with open(env_path) as f:
-            for line in f:
-                line = line.strip()
-                if line and not line.startswith('#') and '=' in line:
-                    key, _, val = line.partition('=')
-                    config[key.strip()] = val.strip()
-        print(f'Loaded credentials from: {env_path}')
-    else:
-        print('No .env file found — using environment variables only')
-
-    for key in ['AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_ENDPOINT_URL', 'AWS_REGION']:
-        if key in os.environ:
-            config[key] = os.environ[key]
-
-    return config
-
-
-def build_env(config: dict, library: str) -> dict:
-    """Subprocess environment: current env + credentials + STORAGE_LIBRARY."""
-    env = os.environ.copy()
-    env.update(config)
-    env['STORAGE_LIBRARY'] = library
-    return env
-
-
-# ── Subprocess helpers ───────────────────────────────────────────────────────────
-
-def pause(seconds: int, reason: str):
-    """Sleep with a simple one-line message."""
-    print(f'\n  Sleeping {seconds}s — {reason}')
-    sys.stdout.flush()
-    time.sleep(seconds)
-
-
-import contextlib
-
-@contextlib.contextmanager
-def _s3_env(config: dict):
-    """Temporarily apply S3 credentials to os.environ for in-process s3dlio calls."""
-    keys = ['AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY',
-            'AWS_ENDPOINT_URL', 'AWS_ENDPOINT_URL_S3', 'AWS_REGION']
-    old = {k: os.environ.get(k) for k in keys}
-    if config.get('AWS_ACCESS_KEY_ID'):
-        os.environ['AWS_ACCESS_KEY_ID'] = config['AWS_ACCESS_KEY_ID']
-    if config.get('AWS_SECRET_ACCESS_KEY'):
-        os.environ['AWS_SECRET_ACCESS_KEY'] = config['AWS_SECRET_ACCESS_KEY']
-    endpoint = config.get('AWS_ENDPOINT_URL')
-    if endpoint:
-        os.environ['AWS_ENDPOINT_URL']    = endpoint
-        os.environ['AWS_ENDPOINT_URL_S3'] = endpoint
-    if config.get('AWS_REGION'):
-        os.environ['AWS_REGION'] = config['AWS_REGION']
-    try:
-        yield
-    finally:
-        for k, v in old.items():
-            if v is None:
-                os.environ.pop(k, None)
-            else:
-                os.environ[k] = v
-
-
-def clean_prefix(bucket: str, prefix: str, config: dict):
-    """Delete all objects under s3://bucket/prefix/ using s3dlio Python API."""
-    import s3dlio
-    uri = f's3://{bucket}/{prefix}/'.rstrip('/') + '/'
-    with _s3_env(config):
-        try:
-            full_uris = s3dlio.list(uri, recursive=True)
-            if not full_uris:
-                print(f'    (nothing to clean at {uri})')
-                return
-            for obj_uri in full_uris:
-                s3dlio.delete(obj_uri)
-            print(f'    Cleaned {len(full_uris)} object(s) at {uri}')
-        except Exception as e:
-            print(f'    (nothing to clean at {uri}: {e})')
-
-
-def list_prefix(bucket: str, prefix: str, config: dict, label: str = '') -> int:
-    """List & count objects under s3://bucket/prefix/ using s3dlio Python API.
-    Returns the number of objects found."""
-    import s3dlio
-    uri = f's3://{bucket}/{prefix}/'.rstrip('/') + '/'
-    tag = f' [{label}]' if label else ''
-    with _s3_env(config):
-        try:
-            full_uris = s3dlio.list(uri, recursive=True)
-            count = len(full_uris)
-            if count:
-                print(f'    s3dlio list {uri}{tag}: {count} object(s)')
-                # Show up to 5 keys (strip the URI prefix for readability)
-                for obj_uri in full_uris[:5]:
-                    print(f'      {obj_uri}')
-                if count > 5:
-                    print(f'      ... ({count - 5} more)')
-            else:
-                print(f'    s3dlio list {uri}{tag}: (empty)')
-            return count
-        except Exception as e:
-            print(f'    s3dlio list {uri}{tag}: error: {e}')
-            return 0
-
-
-def run_phase(label: str, cmd: list, env: dict, timeout_s: int = 3600) -> tuple:
-    """
-    Stream subprocess output live.
-    Returns (returncode, elapsed_seconds, captured_output).
-    Prints each output line indented for readability.
-    """
-    print(f'\n  $ {" ".join(cmd[:8])} {"..." if len(cmd) > 8 else ""}')
-    t_start = time.perf_counter()
-    proc = subprocess.Popen(
-        cmd, env=env,
-        stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
-        text=True, bufsize=1,
-    )
-    captured_lines = []
-    try:
-        for line in proc.stdout:
-            sys.stdout.write(f'    {line}')
-            sys.stdout.flush()
-            captured_lines.append(line)
-        proc.wait(timeout=timeout_s)
-    except subprocess.TimeoutExpired:
-        proc.kill()
-        proc.wait()
-        elapsed = time.perf_counter() - t_start
-        print(f'\n  ❌ {label} timed out after {elapsed:.0f}s')
-        return -1, elapsed, ''.join(captured_lines)
-
-    elapsed = time.perf_counter() - t_start
-    if proc.returncode == 0:
-        print(f'  ✅ {label}: done in {elapsed:.1f}s')
-    else:
-        print(f'  ❌ {label}: FAILED (exit {proc.returncode}) after {elapsed:.1f}s')
-    return proc.returncode, elapsed, ''.join(captured_lines)
-
-
-# ── Workload 1: Training ─────────────────────────────────────────────────────────
-
-def run_training(library: str, config: dict) -> dict:
-    bucket = LIBRARY_BUCKETS[library]
-    env    = build_env(config, library)
-    data_folder = f's3://{bucket}/{TRAIN_PREFIX}'
-    total_gb    = TRAIN_NUM_FILES * TRAIN_SIZE_MiB / 1024.0
-    region      = config.get('AWS_REGION', 'us-east-1')
-
-    print(f'\n── Training  [{library}]  s3://{bucket}/{TRAIN_PREFIX}/ ──')
-    print(f'   {TRAIN_NUM_FILES} × {TRAIN_SIZE_MiB} MiB = {total_gb:.2f} GiB   '
-          f'| {TRAIN_EPOCHS} epochs')
-
-    # Phase 0: cleanup
-    print('\n  Phase 0: Cleanup')
-    clean_prefix(bucket, TRAIN_PREFIX, config)
-
-    # Shared storage params (passed to both datagen and run)
-    storage_params = [
-        f'storage.storage_type=s3',
-        f'storage.storage_root={bucket}',
-        f'storage.storage_library={library}',
-        f'storage.storage_options.endpoint_url={config["AWS_ENDPOINT_URL"]}',
-        f'storage.storage_options.access_key_id={config["AWS_ACCESS_KEY_ID"]}',
-        f'storage.storage_options.secret_access_key={config["AWS_SECRET_ACCESS_KEY"]}',
-        f'storage.storage_options.region={region}',
-        f'storage.storage_options.s3_force_path_style=true',
-        f'dataset.data_folder={data_folder}',
-        f'dataset.num_files_train={TRAIN_NUM_FILES}',
-        f'dataset.num_samples_per_file={TRAIN_SAMPLES_PER}',
-        f'dataset.record_length={TRAIN_RECORD_BYTES}',
-        f'dataset.format=npz',          # required: S3+PyTorch only supports npz/npy
-    ]
-
-    # datagen uses --num-processes (NOT --num-accelerators / --accelerator-type)
-    datagen_flags = [
-        '--model', TRAIN_MODEL,
-        '--num-processes', '8',
-        '--open',
-        '--skip-validation',
-        '--results-dir', RESULTS_DIR,
-    ]
-    # training run uses --num-accelerators + --accelerator-type + --client-host-memory-in-gb
-    run_flags = [
-        '--model', TRAIN_MODEL,
-        '--num-accelerators', str(TRAIN_NUM_ACCEL),
-        '--accelerator-type', TRAIN_ACCEL_TYPE,
-        '--client-host-memory-in-gb', str(CLIENT_MEM_GB),
-        '--open',
-        '--skip-validation',
-        '--results-dir', RESULTS_DIR,
-    ]
-
-    # Phase 1: datagen (write)
-    print(f'\n  Phase 1: datagen — write {TRAIN_NUM_FILES} × {TRAIN_SIZE_MiB} MiB objects')
-    rc_gen = -1; t_gen = 0.0
-    rc_run = -1; t_run = 0.0
-    try:
-        rc_gen, t_gen, _ = run_phase(
-            'datagen',
-            ['mlpstorage', 'training', 'datagen'] + datagen_flags + ['--params'] + storage_params,
-            env,
-        )
-
-        gen_gbps = total_gb / t_gen if rc_gen == 0 and t_gen > 0 else None
-
-        if rc_gen == 0:
-            obj_count = list_prefix(bucket, TRAIN_PREFIX, config, 'after datagen')
-            if obj_count < TRAIN_NUM_FILES:
-                print(f'  ❌ datagen validation FAILED: bucket shows {obj_count} objects, '
-                      f'expected {TRAIN_NUM_FILES}')
-                rc_gen = 1
-            else:
-                pause(PAUSE_SECONDS, 'S3 eventual consistency — new objects must be visible before reads')
-
-        # Phase 2: training run (read × epochs)
-        print(f'\n  Phase 2: train — read {TRAIN_EPOCHS} epochs '
-              f'({total_gb * TRAIN_EPOCHS:.2f} GiB total reads)')
-        if rc_gen != 0:
-            print('  ⚠ Skipping training run — datagen did not produce expected objects')
-        else:
-            rc_run, t_run, _ = run_phase(
-                'training run',
-                ['mlpstorage', 'training', 'run'] + run_flags + ['--params'] + storage_params + [
-                    f'train.epochs={TRAIN_EPOCHS}',
-                    f'train.batch_size=1',
-                    f'reader.batch_size=1',
-                    f'reader.read_threads=8',
-                    f'reader.prefetch_size=4',
-                ],
-                env,
-            )
-    finally:
-        # Always clean up — prevent filling storage between runs
-        print(f'\n  Phase 3: Cleanup (post-run)')
-        clean_prefix(bucket, TRAIN_PREFIX, config)
-        list_prefix(bucket, TRAIN_PREFIX, config, 'after cleanup')
-
-    read_total_gb = total_gb * TRAIN_EPOCHS
-    gen_gbps  = total_gb     / t_gen if rc_gen == 0 and t_gen > 0 else None
-    run_gbps  = read_total_gb / t_run if rc_run == 0 and t_run > 0 else None
-
-    return {
-        'library':    library,
-        'workload':   'training',
-        'dataset_gb': total_gb,
-        'epochs':     TRAIN_EPOCHS,
-        'gen_ok':     rc_gen == 0,
-        'run_ok':     rc_run == 0,
-        'gen_time':   t_gen,
-        'run_time':   t_run,
-        'gen_gbps':   gen_gbps,
-        'run_gbps':   run_gbps,
-    }
-
-
-# ── Workload 2: Checkpoint ────────────────────────────────────────────────────────
-
-def run_checkpoint(library: str, config: dict, network_gbps: float = None) -> dict:
-    """
-    Write a streaming checkpoint via StreamingCheckpointing.save(), then read it
-    back via StreamingCheckpointing.load().  Cleanup happens only after both phases.
-
-    StreamingCheckpointing uses a fixed producer-consumer pipeline:
-      chunk_size × num_buffers = 32 MB × 4 = 128 MB RAM, regardless of checkpoint size.
-    dgen-py generates data in parallel while the library uploads it — memory stays flat.
-    """
-    from mlpstorage_py.checkpointing import StreamingCheckpointing
-
-    bucket      = LIBRARY_BUCKETS[library]
-    env         = build_env(config, library)
-    uri         = f's3://{bucket}/{CKPT_PREFIX}/checkpoint.dat'
-    size_gb     = CKPT_SIZE_GB_OVERRIDE.get(library, CKPT_SIZE_GB)
-    total_bytes = int(size_gb * 1024 ** 3)
-
-    size_note = f'  (capped at {size_gb:.0f} GB for {library})' if library in CKPT_SIZE_GB_OVERRIDE else ''
-    print(f'\n── Checkpoint  [{library}]  {uri} ──')
-    print(f'   Size: {size_gb:.0f} GB  |  backend: {library}{size_note}')
-    print(f'   RAM usage: streaming pipeline ({CKPT_CHUNK_MB} MB chunks '
-          f'× {CKPT_NUM_BUFFERS} buffers = '
-          f'{CKPT_CHUNK_MB * CKPT_NUM_BUFFERS} MB max regardless of checkpoint size)')
-
-    # Apply credentials to os.environ so the storage backend writers can pick them up
-    saved_env = {k: os.environ.get(k) for k in config}
-    for k, v in config.items():
-        os.environ[k] = v
-    os.environ['STORAGE_LIBRARY'] = library
-
-    ok_write = False
-    ok_read  = False
-    t_write  = 0.0
-    t_read   = 0.0
-    write_gbps = None
-    read_gbps  = None
-    try:
-        # Phase 0: cleanup
-        print('\n  Phase 0: Cleanup')
-        clean_prefix(bucket, CKPT_PREFIX, config)
-        list_prefix(bucket, CKPT_PREFIX, config, 'before save')
-        pause(PAUSE_SECONDS, 'storage settling after cleanup')
-
-        # Phase 1: streaming save
-        print(f'\n  Phase 1: StreamingCheckpointing.save() → {uri}')
-        if network_gbps:
-            print(f'   {size_gb:.0f} GB at {network_gbps:.3f} GB/s ({network_gbps*8:.0f} Gbps) → expect ~'
-                  f'{size_gb / network_gbps:.0f}s minimum')
-        else:
-            print(f'   {size_gb:.0f} GB  (no --network-gbits specified; no timing estimate)')
-        checkpoint = StreamingCheckpointing(
-            chunk_size   = CKPT_CHUNK_MB * 1024 * 1024,
-            num_buffers  = CKPT_NUM_BUFFERS,
-            use_dgen     = True,
-            backend      = library,
-            fadvise_mode = 'none',
-        )
-        t_start  = time.perf_counter()
-        result   = checkpoint.save(uri, total_bytes)
-        t_write  = time.perf_counter() - t_start
-
-        io_time    = result.get('io_time', t_write)
-        write_gbps = size_gb / io_time if io_time > 0 else size_gb / t_write
-        gen_gbps   = result.get('gen_throughput_gbps', 0)
-        bottleneck = result.get('bottleneck', '?')
-
-        print(f'  ✅ checkpoint save done in {t_write:.1f}s  '
-              f'({write_gbps:.3f} GB/s I/O  |  {gen_gbps:.1f} GB/s gen  '
-              f'|  bottleneck: {bottleneck})')
-        ok_write = True
-
-        list_prefix(bucket, CKPT_PREFIX, config, 'after save')
-        pause(PAUSE_SECONDS, 'S3 eventual consistency before read')
-
-        # Phase 2: streaming load (read back)
-        print(f'\n  Phase 2: StreamingCheckpointing.load() ← {uri}')
-        if network_gbps:
-            print(f'   {size_gb:.0f} GB at {network_gbps:.3f} GB/s → expect ~'
-                  f'{size_gb / network_gbps:.0f}s minimum')
-        r_start  = time.perf_counter()
-        load_result = checkpoint.load(uri, total_bytes)
-        t_read   = time.perf_counter() - r_start
-
-        r_io_time  = load_result.get('io_time', t_read)
-        read_gbps  = size_gb / r_io_time if r_io_time > 0 else size_gb / t_read
-        print(f'  ✅ checkpoint load done in {t_read:.1f}s  ({read_gbps:.3f} GB/s)')
-        ok_read = True
-
-    except Exception as e:
-        elapsed = time.perf_counter() - (t_start if 't_start' in dir() else time.perf_counter())
-        print(f'  ❌ Checkpoint phase failed after {elapsed:.1f}s: {type(e).__name__}: {e}')
-        import traceback
-        traceback.print_exc()
-    finally:
-        # Cleanup runs after both write and read are done (or on error)
-        print(f'\n  Phase 3: Cleanup (post-run)')
-        clean_prefix(bucket, CKPT_PREFIX, config)
-        list_prefix(bucket, CKPT_PREFIX, config, 'after cleanup')
-        # Restore original env
-        for k, v in saved_env.items():
-            if v is None:
-                os.environ.pop(k, None)
-            else:
-                os.environ[k] = v
-        os.environ.pop('STORAGE_LIBRARY', None)
-
-    return {
-        'library':    library,
-        'workload':   'checkpoint',
-        'size_gb':    size_gb,
-        'ok_write':   ok_write,
-        'ok_read':    ok_read,
-        'ok':         ok_write and ok_read,
-        't_write':    t_write,
-        't_read':     t_read,
-        'write_gbps': write_gbps,
-        'read_gbps':  read_gbps,
-    }
-
-
-# ── Results table ─────────────────────────────────────────────────────────────────
-
-def print_results(training_results: list, checkpoint_results: list):
-    print()
-    print('=' * 96)
-    print('DLIO MULTI-LIBRARY BENCHMARK — RESULTS')
-    print('=' * 96)
-
-    if training_results:
-        total_gb    = TRAIN_NUM_FILES * TRAIN_SIZE_MiB / 1024.0
-        read_total  = total_gb * TRAIN_EPOCHS
-        print()
-        print(f'WORKLOAD 1: TRAINING')
-        print(f'  {TRAIN_NUM_FILES} objects × {TRAIN_SIZE_MiB} MiB = '
-              f'{total_gb:.2f} GiB dataset  |  {TRAIN_EPOCHS} epochs  |  '
-              f'{read_total:.2f} GiB total reads per library')
-        print(f'  {"Library":<22} {"Write GB/s":>12} {"Read GB/s":>12} '
-              f'{"Gen s":>8} {"Train s":>9}  {"Status"}')
-        print(f'  {"-"*22} {"-"*12} {"-"*12} {"-"*8} {"-"*9}  {"-"*6}')
-
-        best_gen  = max((r['gen_gbps'] for r in training_results if r.get('gen_gbps')), default=0)
-        best_read = max((r['run_gbps'] for r in training_results if r.get('run_gbps')), default=0)
-
-        for r in training_results:
-            gen_s  = f"{r['gen_gbps']:.3f}"  if r.get('gen_gbps')  else 'N/A  '
-            read_s = f"{r['run_gbps']:.3f}"  if r.get('run_gbps')  else 'N/A  '
-            gmark  = ' ◀W' if r.get('gen_gbps')  == best_gen  else '   '
-            rmark  = ' ◀R' if r.get('run_gbps')  == best_read else '   '
-            t_gen  = f"{r['gen_time']:.1f}s" if r.get('gen_time') else '-'
-            t_run  = f"{r['run_time']:.1f}s" if r.get('run_time') else '-'
-            status = ('✅' if (r['gen_ok'] and r['run_ok'])
-                      else ('❌ datagen failed' if not r['gen_ok'] else '❌ train failed'))
-            print(f"  {r['library']:<22} {gen_s+gmark:>15} {read_s+rmark:>15} "
-                  f"{t_gen:>8} {t_run:>9}  {status}")
-
-        print()
-        print('  Write GB/s = DLIO datagen throughput (generate + write to S3)')
-        print('  Read GB/s  = DLIO training read throughput (total read GiB / total read time)')
-        print('  ◀W = fastest write   ◀R = fastest read')
-        print()
-        print('  Compare these numbers to the native API results in WRITE_READ_COMPARISON_RESULTS.md')
-        print('  to quantify DLIO overhead vs raw library throughput.')
-
-    if checkpoint_results:
-        print()
-        print(f'WORKLOAD 2: CHECKPOINT  (StreamingCheckpointing — fixed 128 MB RAM)')
-        print(f'  Single object per library via streaming producer-consumer pipeline')
-        print(f'  {CKPT_CHUNK_MB} MB chunks × {CKPT_NUM_BUFFERS} buffers = '
-              f'{CKPT_CHUNK_MB * CKPT_NUM_BUFFERS} MB RAM max regardless of checkpoint size')
-        print(f'  {"Library":<22} {"Size GB":>9} {"Write GB/s":>12} {"Read GB/s":>12}  {"Status"}')
-        print(f'  {"-"*22} {"-"*9} {"-"*12} {"-"*12}  {"-"*6}')
-
-        best_w = max((r['write_gbps'] for r in checkpoint_results if r.get('write_gbps')), default=0)
-        best_r = max((r['read_gbps']  for r in checkpoint_results if r.get('read_gbps')),  default=0)
-
-        for r in checkpoint_results:
-            w_s   = f"{r['write_gbps']:.3f}" if r.get('write_gbps') else 'N/A  '
-            rd_s  = f"{r['read_gbps']:.3f}"  if r.get('read_gbps')  else 'N/A  '
-            wmark = ' ◀W' if r.get('write_gbps') == best_w else '   '
-            rmark = ' ◀R' if r.get('read_gbps')  == best_r else '   '
-            if not r.get('ok_write', r.get('ok')):
-                status = '❌ write failed'
-            elif not r.get('ok_read', True):
-                status = '❌ read failed'
-            else:
-                status = '✅'
-            print(f"  {r['library']:<22} {r['size_gb']:>9.0f} {w_s+wmark:>15} {rd_s+rmark:>15}  {status}")
-
-        print()
-        print('  Write GB/s = I/O throughput from StreamingCheckpointing.save()')
-        print('  Read GB/s  = I/O throughput from StreamingCheckpointing.load() (byte-range GETs, data discarded)')
-        print('  ◀W = fastest write   ◀R = fastest read')
-        print('  dgen-py generates write data concurrently; bottleneck is always I/O, not generation')
-
-    print()
-    print('=' * 96)
-
-
-# ── Preflight checks ──────────────────────────────────────────────────────────────
-
-def preflight(do_checkpoint: bool):
-    ok = True
-
-    # mlpstorage
-    import shutil
-    if not shutil.which('mlpstorage'):
-        print('ERROR: mlpstorage not found in PATH. Activate the virtualenv first.')
-        ok = False
-
-    # StreamingCheckpointing is in-process — no MPI required.
-    # (mlpstorage.checkpointing import verified at import-time above)
-
-    return ok
-
-
-# ── Main ──────────────────────────────────────────────────────────────────────────
-
-def main():
-    parser = argparse.ArgumentParser(
-        description='DLIO multi-library benchmark demo (training + checkpoint)',
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-        epilog="""
-Examples:
-  python test_dlio_multilib_demo.py                                        # all libraries, both workloads
-  python test_dlio_multilib_demo.py --workload training                    # training only
-  python test_dlio_multilib_demo.py --workload checkpoint                  # checkpoint only
-  python test_dlio_multilib_demo.py --library s3dlio                       # single library
-  python test_dlio_multilib_demo.py --library s3dlio minio                 # two libraries
-  python test_dlio_multilib_demo.py --workload training --library s3dlio minio
-  python test_dlio_multilib_demo.py --workload checkpoint --network-gbits 10    # 10 Gbps link → ~80s estimate
-        """,
-    )
-    parser.add_argument(
-        '--workload', choices=['training', 'checkpoint', 'both'], default='both',
-        help='Which workload to run (default: both)',
-    )
-    parser.add_argument(
-        '--library', choices=['s3dlio', 'minio', 's3torchconnector'],
-        nargs='+', dest='libraries', metavar='LIBRARY',
-        help='Library/libraries to test (default: all three)',
-    )
-    parser.add_argument(
-        '--network-gbits', type=float, default=None, metavar='N',
-        help='Network link speed in Gbps (gigabits/s, e.g. 10 for a 10 Gbps link). '
-             'Optional — used only for informational time estimates in the checkpoint '
-             'phase. Does not affect test logic.',
-    )
-    args = parser.parse_args()
-
-    libraries     = args.libraries or DEFAULT_LIBRARIES
-    do_training   = args.workload in ('training', 'both')
-    do_checkpoint = args.workload in ('checkpoint', 'both')
-    # Convert Gbps → GB/s internally (1 byte = 8 bits)
-    network_gbps  = args.network_gbits / 8.0 if args.network_gbits else None
-
-    config = load_env_config()
-    for key in ['AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_ENDPOINT_URL']:
-        if not config.get(key):
-            print(f'ERROR: {key} not set in .env or environment', file=sys.stderr)
-            sys.exit(1)
-
-    if not preflight(do_checkpoint):
-        sys.exit(1)
-
-    # Header
-    total_gb = TRAIN_NUM_FILES * TRAIN_SIZE_MiB / 1024.0
-    print()
-    print('=' * 96)
-    print('DLIO MULTI-LIBRARY BENCHMARK DEMO')
-    print('  I/O through DLIO (mlpstorage) — compares s3dlio, minio, s3torchconnector')
-    print('=' * 96)
-    print(f'  Endpoint:    {config["AWS_ENDPOINT_URL"]}')
-    print(f'  Libraries:   {", ".join(libraries)}')
-    print(f'  Workloads:   {args.workload}')
-    if do_training:
-        print(f'  Training:    {TRAIN_NUM_FILES} × {TRAIN_SIZE_MiB} MiB = '
-              f'{total_gb:.2f} GiB/library  |  {TRAIN_EPOCHS} epochs')
-    if do_checkpoint:
-        net_hint = (f'  |  ~{CKPT_SIZE_GB / network_gbps:.0f}s at {args.network_gbits:.0f} Gbps'
-                    if network_gbps else '')
-        print(f'  Checkpoint:  {CKPT_SIZE_GB:.0f} GB streaming  |  '
-              f'{CKPT_CHUNK_MB} MB chunks × {CKPT_NUM_BUFFERS} buffers = '
-              f'{CKPT_CHUNK_MB * CKPT_NUM_BUFFERS} MB RAM  |  backend per library{net_hint}')
-    print(f'  Buckets:     ' +
-          '  '.join(f'{l}={LIBRARY_BUCKETS[l]}' for l in libraries if l in LIBRARY_BUCKETS))
-    print('=' * 96)
-
-    training_results   = []
-    checkpoint_results = []
-
-    for i, lib in enumerate(libraries):
-        if i > 0:
-            pause(PAUSE_SECONDS, f'cooldown between libraries ({libraries[i-1]} → {lib})')
-        if do_training:
-            result = run_training(lib, config)
-            training_results.append(result)
-        if do_checkpoint:
-            if do_training:
-                pause(PAUSE_SECONDS, 'cooldown between training and checkpoint workloads')
-            result = run_checkpoint(lib, config, network_gbps=network_gbps)
-            checkpoint_results.append(result)
-
-    print_results(training_results, checkpoint_results)
-
-    all_ok = (
-        all(r['gen_ok'] and r['run_ok'] for r in training_results) and
-        all(r['ok'] for r in checkpoint_results)
-    )
-
-    if all_ok:
-        print('✅ All tests passed.')
-        sys.exit(0)
-    else:
-        print('❌ Some tests failed — see output above.')
-        sys.exit(1)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/tests/object-store/old-archive/test_minio_checkpoint.py b/tests/object-store/old-archive/test_minio_checkpoint.py
deleted file mode 100644
index b68c6ad5..00000000
--- a/tests/object-store/old-archive/test_minio_checkpoint.py
+++ /dev/null
@@ -1,145 +0,0 @@
-#!/usr/bin/env python3
-"""MinIO streaming checkpoint test.
-
-Credential precedence: .env file < environment variables < CLI options
-"""
-
-import os
-import sys
-import time
-import argparse
-from pathlib import Path
-
-sys.path.insert(0, str(Path(__file__).parent.parent))
-
-
-def load_env_config():
-    env_path = None
-    for candidate in [
-        Path(__file__).parent.parent / ".env",
-        Path(__file__).parent / ".env",
-        Path.cwd() / ".env",
-    ]:
-        if candidate.exists():
-            env_path = candidate
-            break
-
-    config = {}
-    if env_path:
-        with open(env_path) as f:
-            for line in f:
-                line = line.strip()
-                if line and not line.startswith('#') and '=' in line:
-                    key, _, val = line.partition('=')
-                    config[key.strip()] = val.strip()
-        print(f"Loaded credentials from: {env_path}")
-    else:
-        print("No .env file found, using environment variables")
-
-    for key in ['AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_ENDPOINT_URL', 'AWS_REGION']:
-        if key in os.environ:
-            config[key] = os.environ[key]
-
-    return config
-
-
-def apply_config(config: dict):
-    for key, val in config.items():
-        os.environ[key] = val
-
-
-
-def test_minio_checkpoint(uri: str, size_gb: float, part_size_mb: int, num_parallel: int):
-    from mlpstorage_py.checkpointing import StreamingCheckpointing
-
-    total_bytes = int(size_gb * (1024**3))
-    part_size = part_size_mb * 1024 * 1024
-
-    print("=" * 80)
-    print("MINIO CHECKPOINT TEST")
-    print("=" * 80)
-    print(f"URI:              {uri}")
-    print(f"Size:             {size_gb:.2f} GB")
-    print(f"Part size:        {part_size_mb} MB")
-    print(f"Parallel uploads: {num_parallel}")
-    print("=" * 80)
-    print()
-
-    checkpoint = StreamingCheckpointing(
-        chunk_size=32 * 1024 * 1024,
-        num_buffers=4,
-        use_dgen=True,
-        backend='minio',
-        part_size=part_size,
-        num_parallel_uploads=num_parallel,
-    )
-
-    try:
-        start = time.perf_counter()
-        result = checkpoint.save(uri, total_bytes)
-        elapsed = time.perf_counter() - start
-        io_throughput = result.get('io_throughput_gbps', size_gb / elapsed)
-
-        print()
-        print("=" * 80)
-        print("✅ SUCCESS")
-        print("=" * 80)
-        print(f"Time:             {elapsed:.2f}s")
-        print(f"I/O Throughput:   {io_throughput:.2f} GB/s")
-        print(f"Total Throughput: {size_gb / elapsed:.2f} GB/s")
-        if 'memory_usage_mb' in result:
-            print(f"Memory:           {result['memory_usage_mb']:.1f} MB")
-        print("=" * 80)
-        return True
-    except Exception as e:
-        print()
-        print("=" * 80)
-        print(f"❌ FAILED: {e}")
-        print("=" * 80)
-        import traceback
-        traceback.print_exc()
-        return False
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description='MinIO streaming checkpoint test',
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-    )
-    parser.add_argument('--bucket', default=os.environ.get('S3_BUCKET', 'bucket-minio'), help='S3/MinIO bucket name')
-    parser.add_argument('--key', default=None,
-                        help='Object key (default: auto-generated with timestamp)')
-    parser.add_argument('--s3-uri', default=None,
-                        help='Full S3 URI (overrides --bucket / --key)')
-    parser.add_argument('--size-gb', type=float, default=1.0, help='Checkpoint size in GB')
-    parser.add_argument('--part-size', type=int, default=32, help='Multipart part size in MB')
-    parser.add_argument('--num-parallel', type=int, default=8, help='Number of parallel uploads')
-    parser.add_argument('--endpoint', default=None, help='S3 endpoint URL')
-    parser.add_argument('--access-key', default=None, help='AWS/MinIO access key')
-    parser.add_argument('--secret-key', default=None, help='AWS/MinIO secret key')
-    parser.add_argument('--region', default=None, help='AWS region')
-    args = parser.parse_args()
-
-    config = load_env_config()
-    if args.endpoint:
-        config['AWS_ENDPOINT_URL'] = args.endpoint
-    if args.access_key:
-        config['AWS_ACCESS_KEY_ID'] = args.access_key
-    if args.secret_key:
-        config['AWS_SECRET_ACCESS_KEY'] = args.secret_key
-    if args.region:
-        config['AWS_REGION'] = args.region
-    apply_config(config)
-
-    if args.s3_uri:
-        uri = args.s3_uri
-    else:
-        key = args.key or f"test/minio-checkpoint-{int(time.time())}.dat"
-        uri = f"s3://{args.bucket}/{key}"
-
-    success = test_minio_checkpoint(uri, args.size_gb, args.part_size, args.num_parallel)
-    sys.exit(0 if success else 1)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/tests/object-store/old-archive/test_mlp_minio.sh b/tests/object-store/old-archive/test_mlp_minio.sh
deleted file mode 100755
index d6205222..00000000
--- a/tests/object-store/old-archive/test_mlp_minio.sh
+++ /dev/null
@@ -1,79 +0,0 @@
-#!/bin/bash
-# Test MLP implementation with minio library
-
-set -e
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
-cd "$REPO_ROOT"
-
-# Load .env — env vars already in the shell take precedence
-if [ -f ".env" ]; then
-    while IFS='=' read -r key value; do
-        [[ "$key" =~ ^[[:space:]]*# ]] && continue
-        [[ -z "${key// /}" ]] && continue
-        key="${key// /}"
-        [[ -v "$key" ]] && continue   # skip if already set in environment
-        export "$key"="$value"
-    done < .env
-    echo "Loaded credentials from .env"
-fi
-
-if [[ -z "$AWS_ACCESS_KEY_ID" ]] || [[ -z "$AWS_SECRET_ACCESS_KEY" ]] || [[ -z "$AWS_ENDPOINT_URL" ]]; then
-    echo "ERROR: Missing required S3 credentials"
-    echo ""
-    echo "Set via .env file or environment variables:"
-    echo "  AWS_ACCESS_KEY_ID=your_access_key"
-    echo "  AWS_SECRET_ACCESS_KEY=your_secret_key"
-    echo "  AWS_ENDPOINT_URL=http://your-s3-endpoint:9000"
-    exit 1
-fi
-
-BUCKET="${BUCKET:-mlp-minio}"
-S3_CLI="${S3_CLI:-s3-cli}"
-
-echo "========================================================================"
-echo "TEST: MLP Implementation with minio library"
-echo "========================================================================"
-echo "Bucket:   $BUCKET"
-echo "Endpoint: $AWS_ENDPOINT_URL"
-echo "Library:  minio (MinIO native SDK)"
-echo ""
-
-source .venv/bin/activate  # .venv managed by uv (run "uv sync" to set up)
-echo "Active venv: $(which python)"
-echo "Active mlpstorage: $(which mlpstorage)"
-echo ""
-
-S3_BUCKET="$BUCKET"
-DATA_DIR="test-run/"
-COMMON_PARAMS="dataset.num_files_train=3 dataset.num_samples_per_file=5 dataset.record_length=65536 storage.s3_force_path_style=true"
-s3_params="storage.storage_type=s3 storage.storage_options.storage_library=minio storage.storage_options.endpoint_url=${AWS_ENDPOINT_URL} storage.storage_options.access_key_id=${AWS_ACCESS_KEY_ID} storage.storage_options.secret_access_key=${AWS_SECRET_ACCESS_KEY} storage.storage_root=${S3_BUCKET}"
-
-echo "Step 1: Cleaning bucket..."
-"$S3_CLI" delete -r "s3://${S3_BUCKET}/" 2>/dev/null || true
-echo ""
-
-echo "Step 2: Verifying bucket is empty..."
-"$S3_CLI" ls -r "s3://${S3_BUCKET}/" || true
-echo ""
-
-echo "Step 3: Running data generation..."
-DLIO_S3_IMPLEMENTATION=mlp mlpstorage training datagen \
-  --model unet3d -np 1 -dd "${DATA_DIR}" \
-  --param ${COMMON_PARAMS} ${s3_params}
-
-echo ""
-echo "Step 4: Verifying objects created..."
-"$S3_CLI" ls "s3://${S3_BUCKET}/${DATA_DIR}unet3d/train/"
-echo ""
-
-echo "Step 5: Complete bucket listing..."
-"$S3_CLI" ls -r "s3://${S3_BUCKET}/"
-
-deactivate
-
-echo ""
-echo "========================================================================"
-echo "✅ TEST COMPLETE: MLP + minio"
-echo "========================================================================"
diff --git a/tests/object-store/old-archive/test_mlp_s3dlio.sh b/tests/object-store/old-archive/test_mlp_s3dlio.sh
deleted file mode 100755
index a705aa29..00000000
--- a/tests/object-store/old-archive/test_mlp_s3dlio.sh
+++ /dev/null
@@ -1,111 +0,0 @@
-#!/bin/bash
-# Test MLP implementation with s3dlio library
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
-cd "$REPO_ROOT"
-
-# Load .env — env vars already in the shell take precedence
-if [ -f ".env" ]; then
-    while IFS='=' read -r key value; do
-        [[ "$key" =~ ^[[:space:]]*# ]] && continue
-        [[ -z "${key// /}" ]] && continue
-        key="${key// /}"
-        [[ -v "$key" ]] && continue   # skip if already set in environment
-        export "$key"="$value"
-    done < .env
-    echo "Loaded credentials from .env"
-fi
-
-if [[ -z "$AWS_ACCESS_KEY_ID" ]] || [[ -z "$AWS_SECRET_ACCESS_KEY" ]] || [[ -z "$AWS_ENDPOINT_URL" ]]; then
-    echo "ERROR: Missing required S3 credentials"
-    echo ""
-    echo "Set via .env file or environment variables:"
-    echo "  AWS_ACCESS_KEY_ID=your_access_key"
-    echo "  AWS_SECRET_ACCESS_KEY=your_secret_key"
-    echo "  AWS_ENDPOINT_URL=http://your-s3-endpoint:9000"
-    exit 1
-fi
-
-BUCKET="${BUCKET:-mlp-s3dlio}"
-S3_CLI="${S3_CLI:-s3-cli}"
-
-echo "========================================================================"
-echo "TEST: MLP Implementation with s3dlio"
-echo "========================================================================"
-echo "Bucket:   $BUCKET"
-echo "Endpoint: $AWS_ENDPOINT_URL"
-echo "Library:  s3dlio (our high-performance library)"
-echo ""
-
-source .venv/bin/activate  # .venv managed by uv (run "uv sync" to set up)
-echo "Active venv: $(which python)"
-echo "Active mlpstorage: $(which mlpstorage)"
-echo ""
-
-S3_BUCKET="$BUCKET"
-DATA_DIR="test-run/"
-# Real unet3d h100 workload parameters (unet3d_h100.yaml): 168 files x ~140 MB each
-COMMON_PARAMS="dataset.num_files_train=168 dataset.num_samples_per_file=1 dataset.record_length_bytes=146600628 dataset.record_length_bytes_stdev=0 dataset.record_length_bytes_resize=2097152 storage.s3_force_path_style=true"
-s3_params="storage.storage_type=s3 storage.storage_options.storage_library=s3dlio storage.storage_options.endpoint_url=${AWS_ENDPOINT_URL} storage.storage_options.access_key_id=${AWS_ACCESS_KEY_ID} storage.storage_options.secret_access_key=${AWS_SECRET_ACCESS_KEY} storage.storage_root=${S3_BUCKET}"
-
-echo "Step 1: Cleaning bucket..."
-"$S3_CLI" delete -r "s3://${S3_BUCKET}/" 2>/dev/null || true
-echo ""
-
-echo "Step 2: Verifying bucket is empty..."
-"$S3_CLI" ls -r "s3://${S3_BUCKET}/" || true
-echo ""
-
-echo "Step 3: Running data generation..."
-set +e  # s3dlio compat layer may still have issues — capture result rather than abort
-DLIO_S3_IMPLEMENTATION=mlp mlpstorage training datagen \
-  --model unet3d -np 8 -dd "${DATA_DIR}" \
-  --param ${COMMON_PARAMS} ${s3_params}
-
-RESULT=$?
-set -e
-
-echo ""
-if [ $RESULT -eq 0 ]; then
-    echo "Step 4: Verifying objects created..."
-    "$S3_CLI" ls "s3://${S3_BUCKET}/${DATA_DIR}unet3d/train/"
-    echo ""
-    echo "Step 5: Complete bucket listing..."
-    "$S3_CLI" ls -r "s3://${S3_BUCKET}/"
-    echo ""
-    echo "Step 6: Running training..."
-    set +e
-    export DLIO_S3_IMPLEMENTATION=mlp
-    mlpstorage training run \
-      --model unet3d --allow-run-as-root --skip-validation \
-      --num-accelerators 1 --accelerator-type h100 --client-host-memory-in-gb 512 \
-      --param ${COMMON_PARAMS} ${s3_params} \
-        dataset.data_folder="${DATA_DIR}unet3d"
-
-    TRAIN_RESULT=$?
-    set -e
-    echo ""
-    if [ $TRAIN_RESULT -eq 0 ]; then
-        echo "========================================================================"
-        echo "✅ TEST COMPLETE: MLP + s3dlio (datagen + training)"
-        echo "========================================================================"
-    else
-        echo "========================================================================"
-        echo "❌ TRAINING FAILED: MLP + s3dlio (exit code $TRAIN_RESULT)"
-        echo "========================================================================"
-        deactivate
-        exit $TRAIN_RESULT
-    fi
-else
-    echo "Step 4: Checking if any objects were created despite error..."
-    "$S3_CLI" ls -r "s3://${S3_BUCKET}/" || true
-    echo ""
-    echo "========================================================================"
-    echo "❌ TEST FAILED: MLP + s3dlio (exit code $RESULT)"
-    echo "========================================================================"
-    deactivate
-    exit $RESULT
-fi
-
-deactivate
diff --git a/tests/object-store/old-archive/test_mlp_s3torch.sh b/tests/object-store/old-archive/test_mlp_s3torch.sh
deleted file mode 100755
index 628abd56..00000000
--- a/tests/object-store/old-archive/test_mlp_s3torch.sh
+++ /dev/null
@@ -1,79 +0,0 @@
-#!/bin/bash
-# Test MLP implementation with s3torchconnector library
-
-set -e
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
-cd "$REPO_ROOT"
-
-# Load .env — env vars already in the shell take precedence
-if [ -f ".env" ]; then
-    while IFS='=' read -r key value; do
-        [[ "$key" =~ ^[[:space:]]*# ]] && continue
-        [[ -z "${key// /}" ]] && continue
-        key="${key// /}"
-        [[ -v "$key" ]] && continue   # skip if already set in environment
-        export "$key"="$value"
-    done < .env
-    echo "Loaded credentials from .env"
-fi
-
-if [[ -z "$AWS_ACCESS_KEY_ID" ]] || [[ -z "$AWS_SECRET_ACCESS_KEY" ]] || [[ -z "$AWS_ENDPOINT_URL" ]]; then
-    echo "ERROR: Missing required S3 credentials"
-    echo ""
-    echo "Set via .env file or environment variables:"
-    echo "  AWS_ACCESS_KEY_ID=your_access_key"
-    echo "  AWS_SECRET_ACCESS_KEY=your_secret_key"
-    echo "  AWS_ENDPOINT_URL=http://your-s3-endpoint:9000"
-    exit 1
-fi
-
-BUCKET="${BUCKET:-mlp-s3torch}"
-S3_CLI="${S3_CLI:-s3-cli}"
-
-echo "========================================================================"
-echo "TEST: MLP Implementation with s3torchconnector"
-echo "========================================================================"
-echo "Bucket:   $BUCKET"
-echo "Endpoint: $AWS_ENDPOINT_URL"
-echo "Library:  s3torchconnector (AWS official connector)"
-echo ""
-
-source .venv/bin/activate  # .venv managed by uv (run "uv sync" to set up)
-echo "Active venv: $(which python)"
-echo "Active mlpstorage: $(which mlpstorage)"
-echo ""
-
-S3_BUCKET="$BUCKET"
-DATA_DIR="test-run/"
-COMMON_PARAMS="dataset.num_files_train=3 dataset.num_samples_per_file=5 dataset.record_length=65536 storage.s3_force_path_style=true"
-s3_params="storage.storage_type=s3 storage.storage_options.storage_library=s3torchconnector storage.storage_options.endpoint_url=${AWS_ENDPOINT_URL} storage.storage_options.access_key_id=${AWS_ACCESS_KEY_ID} storage.storage_options.secret_access_key=${AWS_SECRET_ACCESS_KEY} storage.storage_root=${S3_BUCKET}"
-
-echo "Step 1: Cleaning bucket..."
-"$S3_CLI" delete -r "s3://${S3_BUCKET}/" 2>/dev/null || true
-echo ""
-
-echo "Step 2: Verifying bucket is empty..."
-"$S3_CLI" ls -r "s3://${S3_BUCKET}/" || true
-echo ""
-
-echo "Step 3: Running data generation..."
-DLIO_S3_IMPLEMENTATION=mlp mlpstorage training datagen \
-  --model unet3d -np 1 -dd "${DATA_DIR}" \
-  --param ${COMMON_PARAMS} ${s3_params}
-
-echo ""
-echo "Step 4: Verifying objects created..."
-"$S3_CLI" ls "s3://${S3_BUCKET}/${DATA_DIR}unet3d/train/"
-echo ""
-
-echo "Step 5: Complete bucket listing..."
-"$S3_CLI" ls -r "s3://${S3_BUCKET}/"
-
-deactivate
-
-echo ""
-echo "========================================================================"
-echo "✅ TEST COMPLETE: MLP + s3torchconnector"
-echo "========================================================================"
diff --git a/tests/object-store/test_multi_endpoint_s3dlio.py b/tests/object-store/old-archive/test_multi_endpoint_s3dlio.py
similarity index 100%
rename from tests/object-store/test_multi_endpoint_s3dlio.py
rename to tests/object-store/old-archive/test_multi_endpoint_s3dlio.py
diff --git a/tests/object-store/old-archive/test_s3dlio_checkpoint.py b/tests/object-store/old-archive/test_s3dlio_checkpoint.py
deleted file mode 100644
index 75d20f62..00000000
--- a/tests/object-store/old-archive/test_s3dlio_checkpoint.py
+++ /dev/null
@@ -1,219 +0,0 @@
-#!/usr/bin/env python3
-"""
-StreamingCheckpointing with s3dlio backend.
-
-Writes a configurable-size checkpoint to S3 using the streaming producer-consumer
-pipeline: dgen-py generates data in parallel while s3dlio uploads it, keeping
-memory usage constant at ~128 MB regardless of checkpoint size.
-
-Configuration:
-  32 MB chunks, 4 buffers (128 MB pool), fadvise=none
-  300s SIGALRM timeout to detect hung S3 connections early
-
-Credential precedence (lowest → highest):
-  .env file  <  environment variables  <  CLI options
-
-Usage:
-  python test_s3dlio_checkpoint.py --bucket my-bucket
-  python test_s3dlio_checkpoint.py --bucket my-bucket --size-gb 4.0
-  python test_s3dlio_checkpoint.py --s3-uri s3://my-bucket/ckpt/test.dat --size-gb 8.0
-"""
-
-import os
-import sys
-import time
-import signal
-import argparse
-from contextlib import contextmanager
-from pathlib import Path
-
-sys.path.insert(0, str(Path(__file__).parent.parent))
-
-
-def load_env_config() -> dict:
-    """Load config from .env, then let environment variables override."""
-    env_path = None
-    for candidate in [
-        Path(__file__).parent.parent / ".env",
-        Path(__file__).parent / ".env",
-        Path.cwd() / ".env",
-    ]:
-        if candidate.exists():
-            env_path = candidate
-            break
-
-    config = {}
-    if env_path:
-        with open(env_path) as f:
-            for line in f:
-                line = line.strip()
-                if line and not line.startswith('#') and '=' in line:
-                    key, _, val = line.partition('=')
-                    config[key.strip()] = val.strip()
-        print(f"Loaded credentials from: {env_path}")
-    else:
-        print("No .env file found, using environment variables")
-
-    # Environment variables override .env
-    for key in ['AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_ENDPOINT_URL', 'AWS_REGION']:
-        if key in os.environ:
-            config[key] = os.environ[key]
-
-    return config
-
-
-def apply_config(config: dict):
-    for key, val in config.items():
-        os.environ[key] = val
-
-
-class TimeoutException(Exception):
-    pass
-
-
-@contextmanager
-def timeout(seconds: int, message: str = 'Operation timed out'):
-    """SIGALRM-based timeout context manager (Unix only)."""
-    def _handler(signum, frame):
-        raise TimeoutException(message)
-
-    signal.signal(signal.SIGALRM, _handler)
-    signal.alarm(seconds)
-    try:
-        yield
-    finally:
-        signal.alarm(0)
-
-
-def run(s3_uri: str, size_gb: float):
-    from mlpstorage_py.checkpointing import StreamingCheckpointing
-
-    total_bytes = int(size_gb * (1024 ** 3))
-    endpoint = os.environ.get('AWS_ENDPOINT_URL', '(default)')
-    access_key = os.environ.get('AWS_ACCESS_KEY_ID', '')
-
-    print()
-    print("=" * 80)
-    print("S3DLIO STREAMING CHECKPOINT TEST")
-    print("=" * 80)
-    print(f"Endpoint: {endpoint}")
-    print(f"URI:      {s3_uri}")
-    print(f"Size:     {size_gb} GB  ({total_bytes:,} bytes)")
-    print(f"Config:   32 MB chunks, 4 buffers (128 MB pool), fadvise=none")
-    if access_key:
-        print(f"Access:   {access_key[:8]}...{access_key[-4:]}")
-    print("=" * 80)
-    print()
-
-    try:
-        import s3dlio
-        print(f"  s3dlio  {s3dlio.__version__}  ✅")
-    except ImportError:
-        print("  s3dlio  ❌  not installed — pip install s3dlio")
-        sys.exit(1)
-
-    try:
-        import dgen_py
-        print(f"  dgen-py {dgen_py.__version__}  ✅")
-    except ImportError:
-        print("  dgen-py ❌  not installed — pip install dgen-py")
-        sys.exit(1)
-
-    print()
-    checkpoint = StreamingCheckpointing(
-        chunk_size=32 * 1024 * 1024,
-        num_buffers=4,
-        use_dgen=True,
-        backend='s3dlio',
-        fadvise_mode='none',
-    )
-    print("StreamingCheckpointing ready  (backend=s3dlio, 32 MB chunks × 4 buffers)")
-    print()
-    print(f"Writing {size_gb} GB → {s3_uri}  [timeout: 300s]")
-    print()
-
-    start_time = time.perf_counter()
-    try:
-        with timeout(300, f"Write timed out after 300s  (size={size_gb:.2f} GB)"):
-            result = checkpoint.save(s3_uri, total_bytes)
-        elapsed = time.perf_counter() - start_time
-    except TimeoutException as e:
-        elapsed = time.perf_counter() - start_time
-        print(f"\n❌ TIMEOUT after {elapsed:.0f}s: {e}")
-        print("   Check S3 endpoint connectivity and credentials.")
-        sys.exit(1)
-    except Exception as e:
-        elapsed = time.perf_counter() - start_time
-        print(f"\n❌ Error after {elapsed:.1f}s: {type(e).__name__}: {e}")
-        import traceback
-        traceback.print_exc()
-        sys.exit(1)
-
-    print("=" * 80)
-    print("✅ COMPLETED")
-    print("=" * 80)
-    print(f"  Wall time:  {elapsed:.2f}s")
-
-    if result:
-        gen_time = result.get('gen_time', 0)
-        io_time = result.get('io_time', 0)
-        if gen_time:
-            print(f"  Generation: {gen_time:.2f}s  ({result.get('gen_throughput_gbps', 0):.2f} GB/s)")
-        if io_time:
-            print(f"  I/O:        {io_time:.2f}s  ({result.get('io_throughput_gbps', 0):.2f} GB/s)")
-
-    overall = (total_bytes / (1024 ** 3)) / elapsed
-    print(f"  Overall:    {overall:.2f} GB/s")
-    print(f"  URI:        {s3_uri}")
-    print("=" * 80)
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description='StreamingCheckpointing with s3dlio backend',
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-        epilog="""
-Examples:
-  python test_s3dlio_checkpoint.py --bucket my-bucket
-  python test_s3dlio_checkpoint.py --bucket my-bucket --size-gb 4.0
-  python test_s3dlio_checkpoint.py --s3-uri s3://my-bucket/ckpt/test.dat --size-gb 8.0
-        """,
-    )
-    parser.add_argument('--bucket', default=os.environ.get('S3_BUCKET', 'bucket-s3dlio'),
-                        help='S3 bucket name')
-    parser.add_argument('--key', default=None,
-                        help='Object key (default: auto-generated with timestamp)')
-    parser.add_argument('--s3-uri', default=None,
-                        help='Full S3 URI — overrides --bucket and --key')
-    parser.add_argument('--size-gb', type=float, default=1.0,
-                        help='Checkpoint size in GB')
-    parser.add_argument('--endpoint', default=None,
-                        help='S3 endpoint URL (e.g. http://minio-host:9000)')
-    parser.add_argument('--access-key', default=None, help='AWS access key ID')
-    parser.add_argument('--secret-key', default=None, help='AWS secret access key')
-    parser.add_argument('--region', default=None, help='AWS region')
-    args = parser.parse_args()
-
-    # Credential precedence: .env < env vars < CLI
-    config = load_env_config()
-    if args.endpoint:
-        config['AWS_ENDPOINT_URL'] = args.endpoint
-    if args.access_key:
-        config['AWS_ACCESS_KEY_ID'] = args.access_key
-    if args.secret_key:
-        config['AWS_SECRET_ACCESS_KEY'] = args.secret_key
-    if args.region:
-        config['AWS_REGION'] = args.region
-    apply_config(config)
-
-    if args.s3_uri:
-        s3_uri = args.s3_uri
-    else:
-        key = args.key or f"test/checkpoint-{int(time.time())}.dat"
-        s3_uri = f"s3://{args.bucket}/{key}"
-
-    run(s3_uri, args.size_gb)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/tests/object-store/old-archive/test_s3dlio_multilib.sh b/tests/object-store/old-archive/test_s3dlio_multilib.sh
deleted file mode 100644
index 262f23c5..00000000
--- a/tests/object-store/old-archive/test_s3dlio_multilib.sh
+++ /dev/null
@@ -1,104 +0,0 @@
-#!/bin/bash
-set -e
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
-cd "$REPO_ROOT"
-
-# Load .env — env vars already in the shell take precedence
-if [ -f ".env" ]; then
-    while IFS='=' read -r key value; do
-        [[ "$key" =~ ^[[:space:]]*# ]] && continue
-        [[ -z "${key// /}" ]] && continue
-        key="${key// /}"
-        [[ -v "$key" ]] && continue   # skip if already set in environment
-        export "$key"="$value"
-    done < .env
-    echo "Loaded credentials from .env"
-fi
-
-if [[ -z "$AWS_ACCESS_KEY_ID" ]] || [[ -z "$AWS_SECRET_ACCESS_KEY" ]] || [[ -z "$AWS_ENDPOINT_URL" ]]; then
-    echo "ERROR: Missing required S3 credentials"
-    echo ""
-    echo "Set via .env file or environment variables:"
-    echo "  AWS_ACCESS_KEY_ID=your_access_key"
-    echo "  AWS_SECRET_ACCESS_KEY=your_secret_key"
-    echo "  AWS_ENDPOINT_URL=http://your-s3-endpoint:9000"
-    exit 1
-fi
-
-S3_BUCKET="${BUCKET:-pr1-test-s3dlio}"
-S3_CLI="${S3_CLI:-s3-cli}"
-
-echo "========================================================================"
-echo "TEST: Multi-library support - s3dlio backend"
-echo "========================================================================"
-echo "This tests the dpsi fork's built-in multi-library support with s3dlio"
-echo ""
-DATA_DIR="s3dlio-multilib-test"
-NUM_FILES=20
-
-echo "Bucket: ${S3_BUCKET}"
-echo "Library: s3dlio (zero-copy, 20-30 GB/s)"
-echo "Data directory: ${DATA_DIR}"
-echo "Files: ${NUM_FILES}"
-echo ""
-
-# Activate venv
-source .venv/bin/activate  # .venv managed by uv (run "uv sync" to set up)
-echo "Active venv: $(which python)"
-echo ""
-
-echo "Step 1: Clean any old data..."
-"$S3_CLI" rm -r "s3://${S3_BUCKET}/${DATA_DIR}/" 2>/dev/null || true
-echo ""
-
-echo "Step 2: Data generation with s3dlio..."
-# Use storage.storage_library to select s3dlio
-s3_params="storage.storage_type=s3 storage.storage_library=s3dlio storage.storage_options.endpoint_url=${AWS_ENDPOINT_URL} storage.storage_options.access_key_id=${AWS_ACCESS_KEY_ID} storage.storage_options.secret_access_key=${AWS_SECRET_ACCESS_KEY} storage.storage_root=${S3_BUCKET} storage.storage_options.s3_force_path_style=true"
-
-mlpstorage training datagen \
-  --model unet3d \
-  --num-processes 1 \
-  --params dataset.num_files_train=${NUM_FILES} \
-    dataset.data_folder="${DATA_DIR}/unet3d" \
-    $s3_params
-
-if [ $? -ne 0 ]; then
-    echo "❌ Data generation FAILED"
-    exit 1
-fi
-
-echo ""
-echo "✓ Data generation: SUCCESS"
-echo ""
-
-echo "Step 3: Verify S3 data with s3-cli..."
-"$S3_CLI" ls -cr "s3://${S3_BUCKET}/${DATA_DIR}/" | head -10
-echo ""
-
-echo "Step 4: Training (5 epochs) with s3dlio..."
-timeout 300 mlpstorage training run \
-  --model unet3d \
-  --num-accelerators=1 \
-  --accelerator-type=a100 \
-  --client-host-memory-in-gb=4 \
-  --data-dir "${DATA_DIR}/unet3d" \
-  --skip-validation \
-  --params train.epochs=5 \
-    dataset.num_files_train=${NUM_FILES} \
-    dataset.data_folder="${DATA_DIR}/unet3d" \
-    $s3_params
-
-if [ $? -ne 0 ]; then
-    echo "❌ Training FAILED"
-    exit 1
-fi
-
-echo ""
-echo "✓ Training: SUCCESS"
-echo ""
-
-echo "========================================================================"
-echo "✅ MULTI-LIBRARY TEST COMPLETE: s3dlio backend works!"
-echo "========================================================================"
diff --git a/tests/object-store/old-archive/test_s3torch_checkpoint.py b/tests/object-store/old-archive/test_s3torch_checkpoint.py
deleted file mode 100644
index bb210025..00000000
--- a/tests/object-store/old-archive/test_s3torch_checkpoint.py
+++ /dev/null
@@ -1,139 +0,0 @@
-#!/usr/bin/env python3
-"""S3TorchConnector streaming checkpoint test.
-
-Credential precedence: .env file < environment variables < CLI options
-"""
-
-import os
-import sys
-import time
-import argparse
-from pathlib import Path
-
-sys.path.insert(0, str(Path(__file__).parent.parent))
-
-
-def load_env_config():
-    env_path = None
-    for candidate in [
-        Path(__file__).parent.parent / ".env",
-        Path(__file__).parent / ".env",
-        Path.cwd() / ".env",
-    ]:
-        if candidate.exists():
-            env_path = candidate
-            break
-
-    config = {}
-    if env_path:
-        with open(env_path) as f:
-            for line in f:
-                line = line.strip()
-                if line and not line.startswith('#') and '=' in line:
-                    key, _, val = line.partition('=')
-                    config[key.strip()] = val.strip()
-        print(f"Loaded credentials from: {env_path}")
-    else:
-        print("No .env file found, using environment variables")
-
-    for key in ['AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_ENDPOINT_URL', 'AWS_REGION']:
-        if key in os.environ:
-            config[key] = os.environ[key]
-
-    return config
-
-
-def apply_config(config: dict):
-    for key, val in config.items():
-        os.environ[key] = val
-
-
-
-def test_s3torch_checkpoint(uri: str, size_gb: float):
-    from mlpstorage_py.checkpointing import StreamingCheckpointing
-
-    total_bytes = int(size_gb * (1024**3))
-
-    print("=" * 80)
-    print("S3TORCHCONNECTOR CHECKPOINT TEST")
-    print("=" * 80)
-    print(f"URI:       {uri}")
-    print(f"Size:      {size_gb:.2f} GB")
-    print(f"Multipart: Auto-managed by s3torchconnector")
-    print("=" * 80)
-    print()
-
-    checkpoint = StreamingCheckpointing(
-        chunk_size=32 * 1024 * 1024,
-        num_buffers=4,
-        use_dgen=True,
-        backend='s3torchconnector',
-    )
-
-    try:
-        start = time.perf_counter()
-        result = checkpoint.save(uri, total_bytes)
-        elapsed = time.perf_counter() - start
-        io_throughput = result.get('io_throughput_gbps', size_gb / elapsed)
-
-        print()
-        print("=" * 80)
-        print("✅ SUCCESS")
-        print("=" * 80)
-        print(f"Time:             {elapsed:.2f}s")
-        print(f"I/O Throughput:   {io_throughput:.2f} GB/s")
-        print(f"Total Throughput: {size_gb / elapsed:.2f} GB/s")
-        if 'memory_usage_mb' in result:
-            print(f"Memory:           {result['memory_usage_mb']:.1f} MB")
-        print("=" * 80)
-        return True
-    except Exception as e:
-        print()
-        print("=" * 80)
-        print(f"❌ FAILED: {e}")
-        print("=" * 80)
-        import traceback
-        traceback.print_exc()
-        return False
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description='S3TorchConnector streaming checkpoint test',
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-    )
-    parser.add_argument('--bucket', default='bucket-s3torch', help='S3 bucket name')
-    parser.add_argument('--key', default=None,
-                        help='Object key (default: auto-generated with timestamp)')
-    parser.add_argument('--s3-uri', default=None,
-                        help='Full S3 URI (overrides --bucket / --key)')
-    parser.add_argument('--size-gb', type=float, default=1.0, help='Checkpoint size in GB')
-    parser.add_argument('--endpoint', default=None, help='S3 endpoint URL')
-    parser.add_argument('--access-key', default=None, help='AWS/MinIO access key')
-    parser.add_argument('--secret-key', default=None, help='AWS/MinIO secret key')
-    parser.add_argument('--region', default=None, help='AWS region')
-    args = parser.parse_args()
-
-    config = load_env_config()
-    if args.endpoint:
-        config['AWS_ENDPOINT_URL'] = args.endpoint
-    if args.access_key:
-        config['AWS_ACCESS_KEY_ID'] = args.access_key
-    if args.secret_key:
-        config['AWS_SECRET_ACCESS_KEY'] = args.secret_key
-    if args.region:
-        config['AWS_REGION'] = args.region
-    apply_config(config)
-
-    if args.s3_uri:
-        uri = args.s3_uri
-    else:
-        key = args.key or f"test/s3torch-checkpoint-{int(time.time())}.dat"
-        uri = f"s3://{args.bucket}/{key}"
-
-    success = test_s3torch_checkpoint(uri, args.size_gb)
-    sys.exit(0 if success else 1)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/tests/object-store/old-archive/test_training_mpi_sweep.py b/tests/object-store/old-archive/test_training_mpi_sweep.py
deleted file mode 100644
index 6cf9e85d..00000000
--- a/tests/object-store/old-archive/test_training_mpi_sweep.py
+++ /dev/null
@@ -1,512 +0,0 @@
-#!/usr/bin/env python3
-"""
-Training MPI Process Count Sweep
-
-For every (library, N) combination, runs a COMPLETE cycle:
-  1. Cleanup — delete any leftover objects
-  2. Datagen  — generate 100 × 128 MiB NPZ files with N parallel write processes
-  3. Train    — read the dataset across 2 epochs with N MPI accelerators
-  4. Cleanup  — delete the objects for this run
-
-This means datagen is also under test at each N — both write (datagen) and read
-(training) throughput are measured at the same process count.
-
-Libraries:   s3dlio, minio, s3torchconnector  (or a subset via --library)
-Process counts (N):  1, 2, 4                   (or custom via --process-counts)
-
-Hypothesis being tested:
-  Prior runs at 1 accelerator produced ~0.178 GB/s read throughput despite a
-  ~1.2 GB/s network ceiling.  The question is whether:
-    (a) More MPI processes help by adding independent read pipelines, OR
-    (b) The per-process NPZ deserialise + DataLoader IPC pickle dominates regardless.
-
-Usage:
-  # All libraries, 1/2/4 process counts (default)
-  python test_training_mpi_sweep.py
-
-  # Single library
-  python test_training_mpi_sweep.py --library s3dlio
-
-  # Custom process count sweep
-  python test_training_mpi_sweep.py --process-counts 1 2 4 8
-
-  # Quick test: skip datagen phase (requires data already in bucket)
-  python test_training_mpi_sweep.py --skip-datagen
-
-  # Keep objects after run
-  python test_training_mpi_sweep.py --skip-cleanup
-"""
-
-import os
-import sys
-import time
-import subprocess
-import argparse
-from pathlib import Path
-
-# ── Configuration ────────────────────────────────────────────────────────────────
-
-DEFAULT_LIBRARIES      = ['s3dlio', 'minio', 's3torchconnector']
-DEFAULT_PROCESS_COUNTS = [1, 2, 4]
-
-LIBRARY_BUCKETS = {
-    's3dlio':           'bucket-s3dlio',
-    'minio':            'bucket-minio',
-    's3torchconnector': 'bucket-s3torch',
-}
-
-# Training dataset parameters
-TRAIN_MODEL        = 'unet3d'
-TRAIN_ACCEL_TYPE   = 'a100'
-TRAIN_NUM_FILES    = 100
-TRAIN_SIZE_MiB     = 128
-TRAIN_RECORD_BYTES = TRAIN_SIZE_MiB * 1024 * 1024   # 134,217,728
-TRAIN_SAMPLES_PER  = 1
-TRAIN_EPOCHS       = 2
-TRAIN_PREFIX       = 'dlio-train'
-
-# Per-training-run I/O settings (constant across sweep)
-READ_THREADS   = 8
-PREFETCH_SIZE  = 4
-BATCH_SIZE     = 1
-
-CLIENT_MEM_GB  = 32
-RESULTS_DIR    = '/tmp/dlio_mpi_sweep'
-PAUSE_SECONDS  = 30
-
-
-# ── Credentials ──────────────────────────────────────────────────────────────────
-
-def load_env_config() -> dict:
-    env_path = None
-    for candidate in [
-        Path(__file__).parent.parent / '.env',
-        Path(__file__).parent / '.env',
-        Path.cwd() / '.env',
-    ]:
-        if candidate.exists():
-            env_path = candidate
-            break
-
-    config = {}
-    if env_path:
-        with open(env_path) as f:
-            for line in f:
-                line = line.strip()
-                if line and not line.startswith('#') and '=' in line:
-                    key, _, val = line.partition('=')
-                    config[key.strip()] = val.strip()
-        print(f'Loaded credentials from: {env_path}')
-    else:
-        print('No .env file found — using environment variables only')
-
-    for key in ['AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_ENDPOINT_URL', 'AWS_REGION']:
-        if key in os.environ:
-            config[key] = os.environ[key]
-
-    return config
-
-
-def build_env(config: dict, library: str) -> dict:
-    env = os.environ.copy()
-    env.update(config)
-    env['STORAGE_LIBRARY'] = library
-    return env
-
-
-# ── Subprocess helpers ────────────────────────────────────────────────────────────
-
-def pause(seconds: int, reason: str):
-    print(f'\n  Sleeping {seconds}s — {reason}')
-    sys.stdout.flush()
-    time.sleep(seconds)
-
-
-def clean_prefix(bucket: str, prefix: str, env: dict):
-    uri = f's3://{bucket}/{prefix}/'
-    result = subprocess.run(
-        ['s3-cli', 'delete', '-r', uri],
-        env=env, capture_output=True, text=True,
-    )
-    if result.returncode == 0:
-        print(f'    Cleaned s3://{bucket}/{prefix}/')
-    else:
-        print(f'    (nothing to clean at s3://{bucket}/{prefix}/)')
-
-
-def list_prefix(bucket: str, prefix: str, env: dict, label: str = ''):
-    uri = f's3://{bucket}/{prefix}/'
-    result = subprocess.run(
-        ['s3-cli', 'list', uri],
-        env=env, capture_output=True, text=True,
-    )
-    lines = [l for l in result.stdout.strip().splitlines() if l.strip()]
-    tag = f' [{label}]' if label else ''
-    if lines:
-        print(f'    s3-cli list {uri}{tag}: {len(lines)} object(s)')
-        for l in lines[:5]:
-            print(f'      {l}')
-        if len(lines) > 5:
-            print(f'      ... ({len(lines) - 5} more)')
-    else:
-        print(f'    s3-cli list {uri}{tag}: (empty)')
-
-
-def run_phase(label: str, cmd: list, env: dict, timeout_s: int = 3600) -> tuple:
-    """Stream subprocess output live. Returns (returncode, elapsed_seconds, captured_output)."""
-    print(f'\n  $ {" ".join(cmd[:8])} {"..." if len(cmd) > 8 else ""}')
-    t_start = time.perf_counter()
-    proc = subprocess.Popen(
-        cmd, env=env,
-        stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
-        text=True, bufsize=1,
-    )
-    captured_lines = []
-    try:
-        for line in proc.stdout:
-            sys.stdout.write(f'    {line}')
-            sys.stdout.flush()
-            captured_lines.append(line)
-        proc.wait(timeout=timeout_s)
-    except subprocess.TimeoutExpired:
-        proc.kill()
-        proc.wait()
-        elapsed = time.perf_counter() - t_start
-        print(f'\n  ❌ {label} timed out after {elapsed:.0f}s')
-        return -1, elapsed, ''.join(captured_lines)
-
-    elapsed = time.perf_counter() - t_start
-    if proc.returncode == 0:
-        print(f'  ✅ {label}: done in {elapsed:.1f}s')
-    else:
-        print(f'  ❌ {label}: FAILED (exit {proc.returncode}) after {elapsed:.1f}s')
-    return proc.returncode, elapsed, ''.join(captured_lines)
-
-
-# ── Storage params builder ────────────────────────────────────────────────────────
-
-def build_storage_params(config: dict, library: str) -> list:
-    bucket      = LIBRARY_BUCKETS[library]
-    data_folder = f's3://{bucket}/{TRAIN_PREFIX}'
-    region      = config.get('AWS_REGION', 'us-east-1')
-    return [
-        f'storage.storage_type=s3',
-        f'storage.storage_root={bucket}',
-        f'storage.storage_options.endpoint_url={config["AWS_ENDPOINT_URL"]}',
-        f'storage.storage_options.access_key_id={config["AWS_ACCESS_KEY_ID"]}',
-        f'storage.storage_options.secret_access_key={config["AWS_SECRET_ACCESS_KEY"]}',
-        f'storage.storage_options.region={region}',
-        f'storage.storage_options.s3_force_path_style=true',
-        f'dataset.data_folder={data_folder}',
-        f'dataset.num_files_train={TRAIN_NUM_FILES}',
-        f'dataset.num_samples_per_file={TRAIN_SAMPLES_PER}',
-        f'dataset.record_length={TRAIN_RECORD_BYTES}',
-        f'dataset.format=npz',
-    ]
-
-
-# ── Single (library, N) cycle ────────────────────────────────────────────────────
-
-def run_one_cycle(library: str, n: int, config: dict,
-                  skip_datagen: bool, skip_cleanup: bool) -> dict:
-    """
-    Full cycle for one (library, process_count) pair:
-      clean → datagen(N) → pause → train(N) → clean
-
-    Returns a result dict with gen_gbps, run_gbps, gen_ok, run_ok.
-    """
-    bucket         = LIBRARY_BUCKETS[library]
-    env            = build_env(config, library)
-    total_gb       = TRAIN_NUM_FILES * TRAIN_SIZE_MiB / 1024.0
-    read_total_gb  = total_gb * TRAIN_EPOCHS
-    storage_params = build_storage_params(config, library)
-
-    result = {
-        'library':       library,
-        'num_processes': n,
-        'gen_ok':        False,
-        'run_ok':        False,
-        'gen_gbps':      None,
-        'run_gbps':      None,
-        'gen_time':      0.0,
-        'run_time':      0.0,
-        'dataset_gb':    total_gb,
-        'epochs':        TRAIN_EPOCHS,
-    }
-
-    print(f'\n{"─"*72}')
-    print(f'  [{library}]  N={n}  |  s3://{bucket}/{TRAIN_PREFIX}/')
-    print(f'{"─"*72}')
-
-    try:
-        # ── Cleanup before ──────────────────────────────────────────────────
-        if not skip_datagen:
-            print('\n  Step 1: Cleanup (pre-run)')
-            clean_prefix(bucket, TRAIN_PREFIX, env)
-
-        # ── Datagen ─────────────────────────────────────────────────────────
-        if skip_datagen:
-            print(f'\n  Step 1: Skipping datagen — using existing data')
-            list_prefix(bucket, TRAIN_PREFIX, env, 'existing')
-            result['gen_ok'] = True
-        else:
-            print(f'\n  Step 2: datagen — {TRAIN_NUM_FILES} × {TRAIN_SIZE_MiB} MiB, '
-                  f'{n} process(es)')
-            datagen_flags = [
-                '--model', TRAIN_MODEL,
-                '--num-processes', str(n),
-                '--open',
-                '--skip-validation',
-                '--results-dir', RESULTS_DIR,
-            ]
-            rc_gen, t_gen, _ = run_phase(
-                f'datagen (N={n})',
-                ['mlpstorage', 'training', 'datagen'] + datagen_flags
-                    + ['--params'] + storage_params,
-                env,
-            )
-            result['gen_ok']   = (rc_gen == 0)
-            result['gen_time'] = t_gen
-            if result['gen_ok']:
-                result['gen_gbps'] = total_gb / t_gen if t_gen > 0 else None
-                list_prefix(bucket, TRAIN_PREFIX, env, 'after datagen')
-                pause(PAUSE_SECONDS, 'S3 eventual consistency before training read')
-            else:
-                print(f'  ❌ datagen failed — skipping training read for this cycle')
-                return result
-
-        # ── Training read ────────────────────────────────────────────────────
-        print(f'\n  Step 3: training run — {TRAIN_EPOCHS} epochs × {total_gb:.2f} GiB, '
-              f'{n} accelerator(s), {READ_THREADS} read threads each')
-        run_flags = [
-            '--model', TRAIN_MODEL,
-            '--num-accelerators', str(n),
-            '--accelerator-type', TRAIN_ACCEL_TYPE,
-            '--client-host-memory-in-gb', str(CLIENT_MEM_GB),
-            '--open',
-            '--skip-validation',
-            '--results-dir', RESULTS_DIR,
-        ]
-        rc_run, t_run, _ = run_phase(
-            f'train (N={n})',
-            ['mlpstorage', 'training', 'run'] + run_flags + ['--params'] + storage_params + [
-                f'train.epochs={TRAIN_EPOCHS}',
-                f'train.batch_size={BATCH_SIZE}',
-                f'reader.batch_size={BATCH_SIZE}',
-                f'reader.read_threads={READ_THREADS}',
-                f'reader.prefetch_size={PREFETCH_SIZE}',
-            ],
-            env,
-        )
-        result['run_ok']   = (rc_run == 0)
-        result['run_time'] = t_run
-        if result['run_ok']:
-            result['run_gbps'] = read_total_gb / t_run if t_run > 0 else None
-
-    finally:
-        # ── Cleanup after ───────────────────────────────────────────────────
-        if not skip_cleanup:
-            print(f'\n  Step 4: Cleanup (post-run)')
-            clean_prefix(bucket, TRAIN_PREFIX, env)
-            list_prefix(bucket, TRAIN_PREFIX, env, 'after cleanup')
-        else:
-            print(f'\n  Skipping cleanup (--skip-cleanup)')
-
-    status = '✅' if result['run_ok'] else '❌'
-    w_s = f"{result['gen_gbps']:.3f} GB/s write" if result.get('gen_gbps') else 'write skipped'
-    r_s = f"{result['run_gbps']:.3f} GB/s read"  if result.get('run_gbps') else 'read FAILED'
-    print(f'\n  {status}  [{library}] N={n}: {w_s}  |  {r_s}')
-    return result
-
-
-# ── Results tables ────────────────────────────────────────────────────────────────
-
-def print_results(all_results: list, process_counts: list):
-    print()
-    print('=' * 100)
-    print('TRAINING MPI PROCESS SWEEP — RESULTS')
-    print('=' * 100)
-    print()
-
-    total_gb   = TRAIN_NUM_FILES * TRAIN_SIZE_MiB / 1024.0
-    read_total = total_gb * TRAIN_EPOCHS
-    print(f'Dataset : {TRAIN_NUM_FILES} × {TRAIN_SIZE_MiB} MiB = {total_gb:.2f} GiB per library')
-    print(f'Reads   : {TRAIN_EPOCHS} epochs = {read_total:.2f} GiB total per cycle')
-    print(f'I/O     : {READ_THREADS} read_threads per MPI process, prefetch {PREFETCH_SIZE}')
-    print(f'Cycle   : clean → datagen(N) → train(N) → clean  (independent for each N)')
-    print()
-
-    libraries_seen = []
-    by_lib = {}
-    for r in all_results:
-        lib = r['library']
-        if lib not in by_lib:
-            by_lib[lib] = {}
-            libraries_seen.append(lib)
-        by_lib[lib][r['num_processes']] = r
-
-    count_headers = '  '.join(f'  N={n}' for n in process_counts)
-    sep = '-' * (26 + len(process_counts) * 12)
-
-    # ── Write throughput ───────────────────────────────────────────────────
-    print(f'  Datagen write throughput (GB/s):')
-    print(f'  {"Library":<24}  {count_headers}')
-    print(f'  {sep}')
-    for lib in libraries_seen:
-        cols = []
-        for n in process_counts:
-            r = by_lib.get(lib, {}).get(n)
-            if r is None:
-                cols.append('    N/A')
-            elif not r.get('gen_ok'):
-                cols.append('   FAIL')
-            elif r.get('gen_gbps') is None:
-                cols.append('   skip')
-            else:
-                cols.append(f'{r["gen_gbps"]:>7.3f}')
-        print(f'  {lib:<24}  ' + '        '.join(cols))
-    print()
-
-    # ── Read throughput ────────────────────────────────────────────────────
-    print(f'  Training read throughput (GB/s):')
-    print(f'  {"Library":<24}  {count_headers}')
-    print(f'  {sep}')
-    for lib in libraries_seen:
-        cols = []
-        for n in process_counts:
-            r = by_lib.get(lib, {}).get(n)
-            if r is None:
-                cols.append('    N/A')
-            elif not r.get('run_ok'):
-                cols.append('   FAIL')
-            else:
-                cols.append(f'{r["run_gbps"]:>7.3f}' if r.get('run_gbps') else '    N/A')
-        print(f'  {lib:<24}  ' + '        '.join(cols))
-    print()
-
-    # ── Scaling vs N=1 ─────────────────────────────────────────────────────
-    if 1 in process_counts:
-        print(f'  Read scaling relative to N=1:')
-        print(f'  {"Library":<24}  {count_headers}')
-        print(f'  {sep}')
-        for lib in libraries_seen:
-            lib_data = by_lib.get(lib, {})
-            baseline = lib_data.get(1, {}).get('run_gbps')
-            cols = []
-            for n in process_counts:
-                gbps = lib_data.get(n, {}).get('run_gbps')
-                if gbps is None:
-                    cols.append('    N/A')
-                elif n == 1:
-                    cols.append(f'{gbps:.3f}  ')
-                elif baseline:
-                    cols.append(f'{gbps / baseline:.2f}×   ')
-                else:
-                    cols.append(f'{gbps:.3f}  ')
-            print(f'  {lib:<24}  ' + '        '.join(cols))
-        print()
-
-    print('  Interpretation:')
-    print('  - ratio > 1.0×: more processes increase throughput (additional I/O pipelines)')
-    print('  - ratio ≈ 1.0×: MPI process count is not the bottleneck')
-    print('  - ratio < 1.0×: more processes hurt (contention or Python overhead dominates)')
-    print()
-    print('=' * 100)
-
-
-# ── Main ──────────────────────────────────────────────────────────────────────────
-
-def main():
-    parser = argparse.ArgumentParser(
-        description='DLIO training sweep: process count for datagen + training',
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-        epilog="""
-Examples:
-  python test_training_mpi_sweep.py                               # all libs, N=1,2,4
-  python test_training_mpi_sweep.py --library s3dlio              # one library
-  python test_training_mpi_sweep.py --process-counts 1 2 4 8     # extended sweep
-  python test_training_mpi_sweep.py --skip-datagen                # skip write phase
-  python test_training_mpi_sweep.py --skip-cleanup                # keep objects
-        """,
-    )
-    parser.add_argument(
-        '--library', choices=['s3dlio', 'minio', 's3torchconnector'],
-        nargs='+', dest='libraries', metavar='LIBRARY',
-        help='Library/libraries to sweep (default: all three)',
-    )
-    parser.add_argument(
-        '--process-counts', type=int, nargs='+', default=DEFAULT_PROCESS_COUNTS,
-        metavar='N',
-        help=f'N values to sweep for both datagen and training (default: {DEFAULT_PROCESS_COUNTS})',
-    )
-    parser.add_argument(
-        '--skip-datagen', action='store_true',
-        help='Skip datagen — use data already present in the bucket',
-    )
-    parser.add_argument(
-        '--skip-cleanup', action='store_true',
-        help='Do not delete training data after each cycle',
-    )
-    args = parser.parse_args()
-
-    libraries      = args.libraries or DEFAULT_LIBRARIES
-    process_counts = sorted(set(args.process_counts))
-
-    config = load_env_config()
-    for key in ['AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_ENDPOINT_URL']:
-        if not config.get(key):
-            print(f'ERROR: {key} not set in .env or environment', file=sys.stderr)
-            sys.exit(1)
-
-    import shutil
-    if not shutil.which('mlpstorage'):
-        print('ERROR: mlpstorage not found in PATH. Activate the virtualenv first.',
-              file=sys.stderr)
-        sys.exit(1)
-
-    total_gb   = TRAIN_NUM_FILES * TRAIN_SIZE_MiB / 1024.0
-    n_cycles   = len(libraries) * len(process_counts)
-
-    print()
-    print('=' * 100)
-    print('TRAINING MPI PROCESS SWEEP')
-    print('=' * 100)
-    print(f'  Endpoint:       {config["AWS_ENDPOINT_URL"]}')
-    print(f'  Libraries:      {", ".join(libraries)}')
-    print(f'  Process counts: {process_counts}')
-    print(f'  Total cycles:   {n_cycles}  ({len(libraries)} libs × {len(process_counts)} N values)')
-    print(f'  Dataset:        {TRAIN_NUM_FILES} × {TRAIN_SIZE_MiB} MiB = {total_gb:.2f} GiB/library')
-    print(f'  Cycle:          {"datagen SKIPPED — existing data" if args.skip_datagen else "clean → datagen(N) → train(N) → clean"}')
-    print(f'  I/O:            {READ_THREADS} read threads per process, prefetch {PREFETCH_SIZE}')
-    print('=' * 100)
-
-    all_results = []
-
-    for lib in libraries:
-        for n in process_counts:
-            if all_results:
-                pause(PAUSE_SECONDS, 'cooldown before next cycle')
-
-            result = run_one_cycle(
-                library      = lib,
-                n            = n,
-                config       = config,
-                skip_datagen = args.skip_datagen,
-                skip_cleanup = args.skip_cleanup,
-            )
-            all_results.append(result)
-
-    print_results(all_results, process_counts)
-
-    failed = [r for r in all_results if not r['run_ok']]
-    if not failed:
-        print('✅ All training runs succeeded.')
-        sys.exit(0)
-    else:
-        names = [f'{r["library"]} N={r["num_processes"]}' for r in failed]
-        print(f'❌ Failed: {", ".join(names)}')
-        sys.exit(1)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/tests/object-store/run_dlrm_bench.sh b/tests/object-store/run_dlrm_bench.sh
new file mode 100755
index 00000000..2f1a0ae2
--- /dev/null
+++ b/tests/object-store/run_dlrm_bench.sh
@@ -0,0 +1,120 @@
+#!/usr/bin/env bash
+# =============================================================================
+# run_dlrm_bench.sh — MLPerf Storage DLRM benchmark runner
+# =============================================================================
+#
+# Usage:
+#   ./run_dlrm_bench.sh <NP> [s3dlio|s3torchconnector|minio] [simulate [log_secs]]
+#
+# NP = number of accelerators / MPI ranks (1, 2, 4, 8)
+#
+# Prerequisites:
+#   - s3-ultra must be running on port 9200 (see start_s3ultra.sh)
+#   - mlp-storage venv must be at /home/eval/Documents/Code/mlp-storage/.venv
+#   - .env file must be present in /home/eval/Documents/Code/mlp-storage/
+#
+# Results are written to:
+#   /home/eval/Documents/Code/mlp-storage/results/dlrm/
+#
+# =============================================================================
+
+set -euo pipefail
+
+NP="${1:?Usage: $0 <NP> [s3dlio|s3torchconnector|minio] [simulate [log_secs]]  (e.g. ./run_dlrm_bench.sh 1 s3dlio simulate 30)}"
+LIBRARY="${2:-s3dlio}"
+SIMULATE="${3:-}"
+SIM_LOG_SECS="${4:-60}"
+
+REPO=/home/eval/Documents/Code/mlp-storage
+RESULTS_DIR="${REPO}/results/dlrm"
+VENV="${REPO}/.venv"
+
+# 64 parquet files, 1M samples each, ~970 MiB each = ~60.6 GiB total
+NUM_FILES=64
+SAMPLES_PER_FILE=1000000
+DATA_FOLDER="data/dlrm/train"
+
+mkdir -p "${RESULTS_DIR}"
+
+echo "============================================================"
+echo "  DLRM benchmark  NP=${NP}  library=${LIBRARY}${SIMULATE:+  SIMULATE}"
+echo "  Results dir: ${RESULTS_DIR}"
+echo "  $(date '+%Y-%m-%d %H:%M:%S')"
+echo "============================================================"
+
+cd "${REPO}"
+set -a; source .env; set +a  # allexport: plain KEY=value lines in .env must be exported to reach the python3 child
+
+RUST_LOG=s3dlio=info \
+"${VENV}/bin/python3" -c "from mlpstorage_py.main import main; main()" \
+    training run \
+    --model dlrm \
+    --accelerator-type b200 \
+    --num-accelerators "${NP}" \
+    --num-client-hosts 1 \
+    --client-host-memory-in-gb 64 \
+    --dlio-bin-path "${VENV}/bin" \
+    --object s3 \
+    --skip-validation \
+    --results-dir "${RESULTS_DIR}" \
+    --params \
+        dataset.num_files_train=${NUM_FILES} \
+        dataset.num_samples_per_file=${SAMPLES_PER_FILE} \
+        dataset.data_folder=${DATA_FOLDER} \
+        storage.storage_options.decode_mode=none \
+        storage.storage_options.storage_library=${LIBRARY} \
+        ${SIMULATE:+storage.storage_options.simulate_io=true} \
+        ${SIMULATE:+storage.storage_options.sim_log_secs=${SIM_LOG_SECS}}
+
+echo ""
+echo "============================================================"
+echo "  Run complete — parsing results"
+echo "============================================================"
+
+# Print throughput from the most recent run's metadata.json (RESULTS_DIR / NUM_FILES are passed in via env)
+RESULTS_DIR="${RESULTS_DIR}" NUM_FILES="${NUM_FILES}" "${VENV}/bin/python3" - <<'PYEOF'
+import json, glob, os
+
+results_dir = os.environ["RESULTS_DIR"]  # same directory the benchmark above wrote to
+files = sorted(glob.glob(f"{results_dir}/**/training_*_metadata.json", recursive=True))
+if not files:
+    print("  No metadata.json found.")
+    exit(0)
+
+latest = files[-1]  # last path in lexicographic order; assumes run paths sort by time — TODO confirm
+d = json.load(open(latest))
+np_ = d.get("num_processes", "?")
+runtime = d.get("runtime", None)
+
+print(f"  Run dir:    {os.path.dirname(latest).split('/')[-1]}")
+print(f"  NP:         {np_}")
+
+if runtime:
+    total_gb = int(os.environ["NUM_FILES"]) * 970 / 1024  # NUM_FILES files × ~970 MiB each, in GiB
+    mbps = total_gb * 1024 / runtime  # GiB → MiB, divided by seconds
+    print(f"  Runtime:    {runtime:.1f} s")
+    print(f"  Throughput: {mbps:.0f} MiB/s  ({total_gb:.1f} GiB / {runtime:.1f} s)")
+else:
+    print("  Runtime not found in metadata")
+
+# Also print DLIO's own summary if it exists
+run_dir = os.path.dirname(latest)
+summary_path = os.path.join(run_dir, "summary.json")
+if os.path.exists(summary_path):
+    s = json.load(open(summary_path))
+    m = s.get("metric", {})
+    au_mean = m.get("train_au_mean_percentage")
+    tput_mean = m.get("train_throughput_mean_samples_per_second")
+    io_mean = m.get("train_io_mean_MB_per_second")
+    au_ok = m.get("train_au_meet_expectation", "?")
+    if au_mean is not None:
+        print(f"  AU mean:    {au_mean:.1f}%  ({au_ok})")
+    if tput_mean is not None:
+        print(f"  Samples/s:  {tput_mean:.0f}")
+    if io_mean is not None:
+        print(f"  DLIO I/O:   {io_mean:.0f} MB/s")
+else:
+    print("  (no summary.json — DLIO may have crashed during finalize)")
+PYEOF
+
+echo "============================================================"
diff --git a/tests/object-store/run_flux_bench.sh b/tests/object-store/run_flux_bench.sh
new file mode 100755
index 00000000..83cd4e0f
--- /dev/null
+++ b/tests/object-store/run_flux_bench.sh
@@ -0,0 +1,131 @@
+#!/usr/bin/env bash
+# =============================================================================
+# run_flux_bench.sh — MLPerf Storage Flux benchmark runner
+# =============================================================================
+#
+# Usage:
+#   ./run_flux_bench.sh <NP> [s3dlio|s3torchconnector|minio] [simulate [log_secs]]
+#
+# NP = number of accelerators / MPI ranks (1, 2, 4, 8)
+#
+# Prerequisites:
+#   - s3-ultra must be running on port 9200 (see start_s3ultra.sh)
+#   - mlp-storage venv must be at /home/eval/Documents/Code/mlp-storage/.venv
+#   - .env file must be present in /home/eval/Documents/Code/mlp-storage/
+#   - Flux Parquet data must be present at dataset.data_folder on the S3 system
+#
+# Flux dataset characteristics:
+#   4296 files, 288 samples/file, ≈ 594 MiB/file (uncompressed)
+#   Columns: t5_encodings (524328×f32), clip_encodings (409×f32),
+#            mean (8232×f32), logvar (8232×f32), timestamp (7×f32)
+#   Full dataset: ~2.4 TiB total
+#   Default run:  64 files (~37 GiB subset)
+#
+# Results are written to:
+#   /home/eval/Documents/Code/mlp-storage/results/flux/
+#
+# =============================================================================
+
+set -euo pipefail
+
+NP="${1:?Usage: $0 <NP> [s3dlio|s3torchconnector|minio] [simulate [log_secs]]  (e.g. ./run_flux_bench.sh 1 s3dlio simulate 30)}"
+LIBRARY="${2:-s3dlio}"
+SIMULATE="${3:-}"
+SIM_LOG_SECS="${4:-60}"
+
+REPO=/home/eval/Documents/Code/mlp-storage
+RESULTS_DIR="${REPO}/results/flux"
+VENV="${REPO}/.venv"
+
+# 64 parquet files, 288 samples each, ~594 MiB each = ~37 GiB subset
+# (full scale: 4296 files = ~2.4 TiB)
+NUM_FILES=64
+SAMPLES_PER_FILE=288
+DATA_FOLDER="data/flux"
+
+mkdir -p "${RESULTS_DIR}"
+
+echo "============================================================"
+echo "  Flux benchmark  NP=${NP}  library=${LIBRARY}${SIMULATE:+  SIMULATE}"
+echo "  Results dir: ${RESULTS_DIR}"
+echo "  Files: ${NUM_FILES} × ${SAMPLES_PER_FILE} samples/file (~594 MiB each)"
+echo "  $(date '+%Y-%m-%d %H:%M:%S')"
+echo "============================================================"
+
+cd "${REPO}"
+set -a; source .env; set +a  # allexport: plain KEY=value lines in .env must be exported to reach the python3 child
+
+RUST_LOG=s3dlio=info \
+"${VENV}/bin/python3" -c "from mlpstorage_py.main import main; main()" \
+    training run \
+    --model flux \
+    --accelerator-type b200 \
+    --num-accelerators "${NP}" \
+    --num-client-hosts 1 \
+    --client-host-memory-in-gb 64 \
+    --dlio-bin-path "${VENV}/bin" \
+    --object s3 \
+    --skip-validation \
+    --results-dir "${RESULTS_DIR}" \
+    --params \
+        dataset.num_files_train=${NUM_FILES} \
+        dataset.num_samples_per_file=${SAMPLES_PER_FILE} \
+        dataset.data_folder=${DATA_FOLDER} \
+        storage.storage_options.decode_mode=none \
+        storage.storage_options.storage_library=${LIBRARY} \
+        ${SIMULATE:+storage.storage_options.simulate_io=true} \
+        ${SIMULATE:+storage.storage_options.sim_log_secs=${SIM_LOG_SECS}}
+
+echo ""
+echo "============================================================"
+echo "  Run complete — parsing results"
+echo "============================================================"
+
+# Print throughput from the most recent run's metadata.json (RESULTS_DIR / NUM_FILES are passed in via env)
+RESULTS_DIR="${RESULTS_DIR}" NUM_FILES="${NUM_FILES}" "${VENV}/bin/python3" - <<'PYEOF'
+import json, glob, os
+
+results_dir = os.environ["RESULTS_DIR"]  # same directory the benchmark above wrote to
+files = sorted(glob.glob(f"{results_dir}/**/training_*_metadata.json", recursive=True))
+if not files:
+    print("  No metadata.json found.")
+    exit(0)
+
+latest = files[-1]  # last path in lexicographic order; assumes run paths sort by time — TODO confirm
+d = json.load(open(latest))
+np_ = d.get("num_processes", "?")
+runtime = d.get("runtime", None)
+
+print(f"  Run dir:    {os.path.dirname(latest).split('/')[-1]}")
+print(f"  NP:         {np_}")
+
+if runtime:
+    # NUM_FILES files × ~594 MiB each, expressed in GiB
+    total_gb = int(os.environ["NUM_FILES"]) * 594 / 1024
+    mbps = total_gb * 1024 / runtime  # GiB → MiB, divided by seconds
+    print(f"  Runtime:    {runtime:.1f} s")
+    print(f"  Throughput: {mbps:.0f} MiB/s  ({total_gb:.1f} GiB / {runtime:.1f} s)")
+else:
+    print("  Runtime not found in metadata")
+
+# Also print DLIO's own summary if it exists
+run_dir = os.path.dirname(latest)
+summary_path = os.path.join(run_dir, "summary.json")
+if os.path.exists(summary_path):
+    s = json.load(open(summary_path))
+    m = s.get("metric", {})
+    au_mean = m.get("train_au_mean_percentage")
+    tput_mean = m.get("train_throughput_mean_samples_per_second")
+    io_mean = m.get("train_io_mean_MB_per_second")
+    au_ok = m.get("train_au_meet_expectation", "?")
+    if au_mean is not None:
+        print(f"  AU mean:    {au_mean:.1f}%  ({au_ok})")
+    if tput_mean is not None:
+        print(f"  Samples/s:  {tput_mean:.0f}")
+    if io_mean is not None:
+        print(f"  DLIO I/O:   {io_mean:.0f} MB/s")
+else:
+    print("  (no summary.json — DLIO may have crashed during finalize)")
+PYEOF
+
+echo "============================================================"
diff --git a/tests/object-store/s3ultra-test-results-20260425.md b/tests/object-store/s3ultra-test-results-20260425.md
deleted file mode 100644
index 7816cd32..00000000
--- a/tests/object-store/s3ultra-test-results-20260425.md
+++ /dev/null
@@ -1,322 +0,0 @@
-# mlp-storage Object-Store Test Results — s3-ultra
-
-**Date:** 2026-04-25  
-**Operator:** AI agent  
-**Storage target:** s3-ultra (local pseudo-S3 server)
-
----
-
-## Test Environment
-
-| Component | Details |
-|-----------|---------|
-| **Storage server** | s3-ultra v0.1.6 |
-| **Server address** | `http://127.0.0.1:9101` |
-| **Bucket** | `mlp-s3dlio` |
-| **Storage library** | **s3dlio v0.9.86** |
-| **CLI tool** | s3-cli (credentials via env vars) |
-| **Package manager** | uv |
-| **Host** | loki-russ (local) |
-
-> **Library used: s3dlio — NOT minio or s3torchconnector.**  
-> Version **0.9.86** was installed in the mlp-storage `.venv` at time of testing.  
-> Verify with: `cd mlp-storage && .venv/bin/pip show s3dlio | grep Version`
-
-### s3-ultra startup command
-
-```bash
-/home/eval/Documents/Code/s3-ultra/target/release/s3-ultra \
-  --port 9101 \
-  --access-key testkey \
-  --secret-key testsecret \
-  --db-path /tmp/s3-ultra-mlp-test
-```
-
-> **Note:** `--mgmt-port` flag causes a panic in this binary (axum router wildcard bug `src/mgmt.rs:167`) — never use it with s3-ultra 0.1.6.
-
-### `.env` used during tests
-
-```bash
-AWS_ACCESS_KEY_ID=testkey
-AWS_SECRET_ACCESS_KEY=testsecret
-AWS_ENDPOINT_URL=http://127.0.0.1:9101
-AWS_REGION=us-east-1
-STORAGE_LIBRARY=s3dlio
-BUCKET=mlp-s3dlio
-```
-
----
-
-## How to Repeat These Tests
-
-These exact steps reproduce the results in this document from scratch.
-
-### 1 — Verify dependencies
-
-```bash
-cd /home/eval/Documents/Code/mlp-storage
-
-# Confirm s3dlio version (must be 0.9.86 or compatible)
-.venv/bin/pip show s3dlio | grep Version
-
-# Confirm s3-ultra binary exists
-ls -lh /home/eval/Documents/Code/s3-ultra/target/release/s3-ultra
-
-# Confirm s3-cli is available
-which s3-cli
-```
-
-### 2 — Start s3-ultra
-
-```bash
-/home/eval/Documents/Code/s3-ultra/target/release/s3-ultra \
-  --port 9101 \
-  --access-key testkey \
-  --secret-key testsecret \
-  --db-path /tmp/s3-ultra-mlp-test &
-
-# Confirm it is listening
-sleep 1 && curl -s http://127.0.0.1:9101/ | head -5
-```
-
-> ⚠️ **Do NOT use `--mgmt-port`** — this flag causes a panic in s3-ultra 0.1.6 (axum router wildcard bug).
-
-### 3 — Create `.env`
-
-Back up the existing `.env` first, then write the s3-ultra config:
-
-```bash
-cp /home/eval/Documents/Code/mlp-storage/.env \
-   /home/eval/Documents/Code/mlp-storage/.env.backup
-
-cat > /home/eval/Documents/Code/mlp-storage/.env << 'EOF'
-AWS_ACCESS_KEY_ID=testkey
-AWS_SECRET_ACCESS_KEY=testsecret
-AWS_ENDPOINT_URL=http://127.0.0.1:9101
-AWS_REGION=us-east-1
-STORAGE_LIBRARY=s3dlio
-BUCKET=mlp-s3dlio
-EOF
-```
-
-### 4 — Create the bucket
-
-```bash
-AWS_ACCESS_KEY_ID=testkey \
-AWS_SECRET_ACCESS_KEY=testsecret \
-AWS_ENDPOINT_URL=http://127.0.0.1:9101 \
-  s3-cli mb s3://mlp-s3dlio
-```
-
-### 5 — Run data generation (one-time)
-
-```bash
-cd /home/eval/Documents/Code/mlp-storage
-bash tests/object-store/run_datagen.sh 2>&1 | tee /tmp/mlp-datagen.log
-```
-
-Generates 168 unet3d NPZ files to `s3://mlp-s3dlio/test-run/unet3d/`. Takes ~2 minutes.
-
-### 6 — Run training benchmark
-
-```bash
-bash tests/object-store/run_training.sh 2>&1 | tee /tmp/mlp-training.log
-```
-
-Runs 5 epochs (24 steps each) against the generated dataset. Takes ~65 seconds.
-
-### 7 — Run checkpointing benchmark
-
-```bash
-NP=8 CHECKPOINTS=2 bash tests/object-store/run_checkpointing.sh 2>&1 | tee /tmp/mlp-checkpoint.log
-```
-
-Saves and restores 2 LLaMA 3 8B checkpoints across 8 simulated ZeRO ranks. Takes ~2.5 minutes.
-
-### 8 — Restore `.env`
-
-```bash
-cp /home/eval/Documents/Code/mlp-storage/.env.backup \
-   /home/eval/Documents/Code/mlp-storage/.env
-```
-
-### 9 — (Optional) Clean up test data
-
-```bash
-set -o allexport; source /home/eval/Documents/Code/mlp-storage/.env.backup; set +o allexport
-# First, re-apply s3-ultra .env for cleanup
-cp <s3ultra-env> /home/eval/Documents/Code/mlp-storage/.env
-bash tests/object-store/run_cleanup.sh
-# Then restore original .env
-```
-
----
-
-## Test 1 — Data Generation (`run_datagen.sh`)
-
-**Script:** `tests/object-store/run_datagen.sh`  
-**Model:** unet3d (MLPerf Storage training dataset)  
-**Start:** 2026-04-25 09:49:57  
-**End:** 2026-04-25 09:51:47  
-**Duration:** ~1 min 50 sec
-
-### Parameters
-
-| Parameter | Value |
-|-----------|-------|
-| Workload | `unet3d_datagen` |
-| Files generated | 168 NPZ files |
-| File size | ~140 MB each (~140 MB × 168 = ~23.5 GB total logical) |
-| Destination | `s3://mlp-s3dlio/test-run/unet3d/` |
-| Generation method | DGEN (dgen-py zero-copy BytesView) |
-| Processes | 1 (NP=1) |
-
-### Output
-
-```
-[OUTPUT] Generation done
-Data Generation Method: DGEN (default)
-  dgen-py zero-copy BytesView — 155x faster than NumPy, 0 MB overhead
-Generating NPZ Data ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 168/168 0:01:44
-```
-
-**Status:** ✅ Complete — 168 files uploaded to `s3://mlp-s3dlio/test-run/unet3d/`
-
----
-
-## Test 2 — Training (`run_training.sh`)
-
-**Script:** `tests/object-store/run_training.sh`  
-**Model:** unet3d_h100 (1 simulated H100 accelerator)  
-**Start:** 2026-04-25 09:52:29  
-**End:** 2026-04-25 09:53:34  
-**Duration:** ~65 sec (5 epochs × ~10 sec each, plus startup)
-
-### Parameters
-
-| Parameter | Value |
-|-----------|-------|
-| Workload | `unet3d_h100` |
-| Simulated accelerators | 1 |
-| Epochs | 5 |
-| Steps per epoch | 24 |
-| Batch size | 7 |
-| Training files | 168 |
-| Dataset path | `s3://mlp-s3dlio/test-run/unet3d/` |
-
-### Per-Epoch Results
-
-| Epoch | Duration | Steps | AU (%) | Throughput (samples/sec) | Compute time/step (s) |
-|-------|----------|-------|--------|--------------------------|----------------------|
-| 1 | 19.94 s | 24 | 81.94 | 16.9766 | 0.3232 ± 0.0001 |
-| 2 | 10.00 s | 24 | 90.40 | 18.7230 | 0.3233 ± 0.0002 |
-| 3 | 9.87 s | 24 | 91.94 | 19.0459 | 0.3232 ± 0.0001 |
-| 4 | 9.74 s | 24 | 92.38 | 19.1415 | 0.3232 ± 0.0001 |
-| 5 | 9.75 s | 24 | 93.26 | 19.3203 | 0.3232 ± 0.0001 |
-
-### Aggregate Metrics
-
-```
-[METRIC] Number of Simulated Accelerators: 1
-[METRIC] Training Accelerator Utilization [AU] (%): 89.9832 (±4.1275)
-[METRIC] Training Throughput (samples/second): 18.6415 (±0.8547)
-[METRIC] Training I/O Throughput (MB/second): 2606.2476 (±119.4992)
-[METRIC] train_au_meet_expectation: fail
-```
-
-> **Note on `fail`:** The MLPerf Storage closed-submission threshold requires ≥ 3500 training files. This test used 168 files (a reduced dataset). Epoch 1 is slower because data is read from s3-ultra; epochs 2–5 benefit from OS page-cache warming.  
-> The benchmark executed fully and all metrics are valid for functional/performance evaluation purposes.
-
-### Validation Warnings
-
-MLPerf closed-submission `INVALID` flags were expected and non-blocking:
-- `storage_library = s3dlio` (custom, not standard)
-- `endpoint_url = http://127.0.0.1:9101` (local s3-ultra, not AWS)
-- `access_key_id` / `secret_access_key` overrides
-- `s3_force_path_style = true`
-- `multiprocessing_context = spawn` (required for Tokio/s3dlio compatibility)
-- `num_files_train = 168` (< 3500 minimum for closed submission)
-
-**Status:** ✅ Complete — all 5 epochs executed successfully
-
----
-
-## Test 3 — Checkpointing (`run_checkpointing.sh`)
-
-**Script:** `tests/object-store/run_checkpointing.sh`  
-**Model:** llama3_8b_checkpoint (LLaMA 3 8B ZeRO-sharded checkpoint)  
-**Start:** 2026-04-25 09:53:52  
-**End:** 2026-04-25 09:56:24  
-**Duration:** ~2 min 32 sec
-
-### Parameters
-
-| Parameter | Value |
-|-----------|-------|
-| Workload | `llama3_8b_checkpoint` |
-| Simulated accelerators (NP) | 8 |
-| Checkpoint cycles | 2 |
-| Checkpoint path | `s3://mlp-s3dlio/s3dlio/llama3-8b/` |
-| Chunk size | 32 MB per chunk |
-| Read workers | 2 (peak RAM ≤ 256 MB) |
-
-### Checkpoint Structure per Cycle
-
-Each checkpoint cycle writes and reads a full ZeRO-sharded LLaMA 3 8B state:
-- 8 × `zero_pp_rank_N_mp_rank_0_model_states.pt` (~1.87 GB each)
-- 8 × `zero_pp_rank_N_mp_rank_0_optim_states.pt` (~11.22 GB each)
-- **Total per checkpoint:** ~104 GB (model + optimizer states × 8 ranks)
-
-### Aggregate Metrics
-
-```
-[METRIC] Number of Simulated Accelerators: 8
-[METRIC] Checkpoint save duration (seconds): 50.5594 (±0.1017)
-[METRIC] Checkpoint save I/O Throughput (GB/second): 2.0709 (±0.0042)
-[METRIC] Checkpoint load duration (seconds): 11.8625 (±0.1422)
-[METRIC] Checkpoint load I/O Throughput (GB/second): 8.8278 (±0.1059)
-```
-
-### Individual File Throughput (representative samples)
-
-| Operation | File type | I/O time | Throughput |
-|-----------|-----------|----------|-----------|
-| Load | model_states (1.87 GB) | ~1.62 s | ~1.16 GB/s |
-| Load | optim_states (11.22 GB) | ~9.55–10.3 s | ~1.09–1.18 GB/s |
-| Load (checkpoint 1, aggregate) | all ranks | 12.0 s | **8.72 GB/s** |
-| Load (checkpoint 2, aggregate) | all ranks | 11.72 s | **8.93 GB/s** |
-
-> **Note:** Aggregate load throughput (8.7–8.9 GB/s) is much higher than per-file throughput (~1.1 GB/s) because all 8 ranks load their shards concurrently using streaming byte-range GETs.
-
-**Status:** ✅ Complete — 2 checkpoint save+load cycles successful
-
----
-
-## Summary
-
-| Test | Status | Key Metric |
-|------|--------|-----------|
-| Data generation | ✅ Pass | 168 files in ~1:50 via DGEN zero-copy |
-| Training | ✅ Pass | 18.64 samples/sec avg, 2606 MB/s I/O throughput |
-| Checkpointing | ✅ Pass | 8.83 GB/s aggregate load, 2.07 GB/s save |
-
-### Observations
-
-1. **s3-ultra works as a drop-in pseudo-S3 backend** for mlp-storage tests without requiring real object storage or network access.
-2. **Training epoch 1 latency** is higher (19.94 s vs ~10 s for epochs 2–5) due to cold s3-ultra reads; subsequent epochs benefit from OS page cache.
-3. **Checkpoint load** (8.83 GB/s aggregate) significantly outperforms save (2.07 GB/s) because 8 ranks read concurrently while write throughput is serialized per-object.
-4. **INVALID warnings** are expected in this configuration — the benchmark is not a closed-submission run (custom endpoint, reduced dataset). All tests executed and produced valid functional results.
-5. **s3dlio `multiprocessing_context=spawn`** is required to avoid Tokio runtime conflicts with Python forking; this is baked into the test scripts.
-
----
-
-## Artifacts
-
-| Artifact | Path |
-|----------|------|
-| Datagen log | `/tmp/mlp-datagen.log` |
-| Training log | `/tmp/mlp-training.log` |
-| Checkpoint log | `/tmp/mlp-checkpoint.log` |
-| Datagen results | `/tmp/mlperf_storage_results/training/unet3d/datagen/20260425_094957/` |
-| Training results | `/tmp/mlperf_storage_results/training/unet3d/run/20260425_095229/` |
-| Checkpoint results | `/tmp/dlio-checkpoint-20260425_095352/` |
diff --git a/tests/object-store/scaling-analysis-2026-04-25.md b/tests/object-store/scaling-analysis-2026-04-25.md
deleted file mode 100644
index 4139ac65..00000000
--- a/tests/object-store/scaling-analysis-2026-04-25.md
+++ /dev/null
@@ -1,186 +0,0 @@
-# S3 Datagen Scaling Analysis — s3dlio vs s3torchconnector vs minio
-
-**Date**: April 25, 2026  
-**System**: Intel Xeon Platinum 8280L (Cascade Lake, 28 cores / 56 threads) — **no SHA-NI**  
-**Server**: s3-ultra local (`http://127.0.0.1:9101`)  
-**Dataset**: retinanet JPEG, 50,000 files × 322,957 bytes = **15,396 MiB** (benchmark subset)  
-**Setting**: `DLIO_MAX_AUTO_THREADS=8` → 8 write_threads/rank for all libraries  
-
----
-
-## Measured Results (28-core test machine, NP=1/2/4/8)
-
-| library | NP | elapsed (s) | throughput (MiB/s) | speedup vs NP=1 | user CPU (s) | %CPU |
-|:---:|:---:|---:|---:|---:|---:|---:|
-| s3dlio | 1 | 30.59 | 503 | 1.00× | 134.2 | 465% |
-| s3dlio | 2 | 19.69 | 782 | 1.55× | 138.0 | 747% |
-| s3dlio | 4 | 16.66 | 924 | 1.84× | 149.1 | 958% |
-| s3dlio | 8 | 14.56 | **1,057** | **2.10×** | 167.7 | 1240% |
-| s3torchconnector | 1 | 32.92 | 468 | 1.00× | 51.6 | 208% |
-| s3torchconnector | 2 | 19.22 | 801 | 1.71× | 53.7 | 368% |
-| s3torchconnector | 4 | 11.80 | 1,305 | 2.79× | 62.1 | 687% |
-| s3torchconnector | 8 | 8.86 | **1,738** | **3.71×** | 83.6 | 1206% |
-| minio | 1 | 53.09 | 290 | 1.00× | 104.4 | 220% |
-| minio | 2 | 29.83 | 516 | 1.78× | 107.2 | 405% |
-| minio | 4 | 22.18 | 694 | 2.39× | 117.9 | 602% |
-| minio | 8 | 17.48 | **881** | **3.04×** | 137.8 | 897% |
-
-### Scaling efficiency (actual / ideal-linear)
-
-| library | NP=1 | NP=2 | NP=4 | NP=8 |
-|:---:|:---:|:---:|:---:|:---:|
-| s3dlio | 100% | 78% | 46% | **26%** |
-| s3torchconnector | 100% | 86% | 70% | **46%** |
-| minio | 100% | 89% | 60% | **38%** |
-
----
-
-## Why s3dlio Scales Poorly on This 28-Core Machine
-
-The key metric is **average CPU cores consumed per rank at NP=1**:
-
-| library | cores needed at NP=1 | cores available per rank at NP=8 | over-subscribed? |
-|:---:|:---:|:---:|:---:|
-| s3dlio | **4.39** | 3.5 | **YES — 1.25×** |
-| s3torchconnector | 1.57 | 3.5 | no — 0.45× |
-| minio | 1.97 | 3.5 | no — 0.56× |
-
-s3dlio genuinely consumes ~4.4 cores per rank at NP=1, primarily due to **software SHA-256
-signing** (this CPU has no SHA-NI instruction set extension). At NP=8 on a 28-core machine,
-each rank is budgeted 28 ÷ 8 = **3.5 cores** — meaning s3dlio is CPU-starved from rank 4
-onward. The other two libraries need only ~1.6–2 cores per rank and have ample headroom at
-all NP levels.
-
-**This is not a Tokio thread design flaw.** s3dlio is right-sized for a larger machine.
-The 28-core test machine simply cannot provide 4.39 cores × 8 ranks = 35 cores worth of
-compute from a 28-core chip.
-
-s3torchconnector's advantage on this machine is that it has a persistent connection pool
-and a non-GIL-bound signing path, making it the most CPU-efficient option on SHA-NI-less
-hardware. minio's poor NP=1 result (GIL-bound PUTs) is rescued somewhat by NP scaling,
-since each process gets its own GIL.
-
----
-
-## Projection: 128-core Production System (NP=8, 16 cores/rank)
-
-On a 128-core machine, the CPU constraint disappears entirely for s3dlio. Each rank now has
-16 cores available vs 4.39 needed — over-provisioned by 3.6×.
-
-### Projected NP=8 throughputs
-
-| library | 28-core NP=8 (measured) | 128-core NP=8 (projected) | efficiency range | why |
-|:---:|:---:|:---:|:---:|:---|
-| **s3dlio** | 1,057 MiB/s (26%) | **2,600–3,600 MiB/s** | 65–90% | CPU bottleneck gone; SHA-256 has 16 cores/rank |
-| **s3torchconnector** | 1,738 MiB/s (46%) | **2,250–3,200 MiB/s** | 60–85% | Low per-rank CPU; may hit network/server ceiling |
-| **minio** | 881 MiB/s (38%) | **1,160–1,740 MiB/s** | 50–75% | GIL-bound per rank; linear if server keeps up |
-
-**Reversal**: s3dlio, which looks weakest on the 28-core test, is projected to be the
-**fastest library at NP=8 on 128 cores**. Its higher per-rank throughput at NP=1 (503 vs
-468 MiB/s) combined with near-linear scaling (once CPU-unconstrained) gives it the
-highest ceiling.
-
----
-
-## CPU Efficiency Summary
-
-| library | CPU-seconds per GiB/s (NP=1) | interpretation |
-|:---:|:---:|:---|
-| s3torchconnector | 113 s/GiB/s | Most CPU-efficient — persistent pool, non-GIL signing |
-| minio | 369 s/GiB/s | GIL-bound; low throughput inflates this ratio |
-| s3dlio | 273 s/GiB/s | High SHA-256 cost on no-SHA-NI CPU; disappears on SHA-NI hardware |
-
----
-
-## Tuning Recommendations for 128-Core Runs
-
-### Environment variable (set before calling `mlpstorage`)
-
-```bash
-# 128-core system, NP=8 — limit Tokio RT threads to match write_threads
-# Default: max(4, num_cpus) = 128 threads/rank × 8 ranks = 1,024 Tokio threads
-# Recommended: match to write_threads (32 on 128-core/NP=8 via auto-formula)
-export S3DLIO_RT_THREADS=32    # exact match to write_threads
-# OR
-export S3DLIO_RT_THREADS=64    # 2× write_threads, headroom for connection management
-```
-
-Why this matters: the auto-formula gives 32 write_threads/rank on 128-core/NP=8 (via
-`max(8, min(16×2, 32))`). The s3dlio Tokio RT default of 128 threads/rank is unnecessary
-for a Python caller driving 32 concurrent uploads — it adds scheduling noise with no
-throughput benefit.
-
-### mlp-storage code change (optional)
-
-`config.py` already computes the right `write_threads` automatically. The only
-quality-of-life improvement would be to auto-propagate `write_threads` into
-`S3DLIO_RT_THREADS` in `obj_store_lib.py` when `storage_library=s3dlio`:
-
-```python
-# In obj_store_lib.py, when initializing s3dlio:
-import os
-os.environ.setdefault('S3DLIO_RT_THREADS', str(write_threads))
-```
-
-This is optional — not a correctness issue.
-
----
-
-## Full Retinanet Datagen: Time Estimates
-
-### Dataset size
-
-```
-Default retinanet: 1,170,301 files × 322,957 bytes = 377,957 MB = 352 GiB
-Benchmark subset:     50,000 files                 =  15,396 MiB
-Scale factor:         1,170,301 / 50,000 = 23.41×
-```
-
-### 28-core machine, NP=8 (extrapolated from measured throughputs)
-
-| library | NP=8 throughput | estimated time (full dataset) |
-|:---:|:---:|:---:|
-| s3torchconnector | 1,738 MiB/s | **207 s (3.5 min)** |
-| s3dlio | 1,057 MiB/s | **341 s (5.7 min)** |
-| minio | 881 MiB/s | **409 s (6.8 min)** |
-
-> Note: these assume throughput is constant with file count. In practice the
-> benchmark overhead (process startup, listing) is amortized across more files,
-> so actual times may be slightly *faster* per MiB at 1.17M files.
-
-### 128-core machine, NP=8 (projected)
-
-| library | throughput range (MiB/s) | time range (s) | time range (min) |
-|:---:|:---:|:---:|:---:|
-| **s3dlio** | 2,600–3,600 | **100–138 s** | **1.7–2.3 min** |
-| **s3torchconnector** | 2,250–3,200 | **113–160 s** | **1.9–2.7 min** |
-| **minio** | 1,160–1,740 | **207–311 s** | **3.5–5.2 min** |
-
-On the 128-core production system s3dlio and s3torchconnector are essentially neck-and-neck
-(both ~2–3 min), with minio meaningfully slower (3.5–5 min). The key uncertainty is whether
-the s3-ultra server — also presumably on a large host — can sustain 2.5–3.5 GB/s of PUT
-throughput. If it becomes the bottleneck first, all three libraries converge at the server
-ceiling.
-
----
-
-## Key Conclusions
-
-1. **s3dlio's poor NP=4/8 scaling on 28 cores is a test-machine artifact**, not a library
-   flaw. The CPU cost of software SHA-256 (4.4 cores/rank) exceeds what a 28-core chip
-   can provide at NP=8. On SHA-NI hardware, or on a ≥96-core machine, this cost either
-   disappears or becomes immaterial.
-
-2. **s3torchconnector is the safe choice for SHA-NI-less hardware at any scale**. Its low
-   per-PUT CPU cost (1.6 cores/rank) leaves plenty of headroom and scales cleanly.
-
-3. **minio scales better than expected with NP** (3.04× at NP=8) because multiprocessing
-   gives each rank an independent GIL. But its single-rank ceiling is hard GIL-limited
-   (~290 MiB/s), so it cannot match the Rust libraries at any scale.
-
-4. **For the official benchmark submission (128-core, NP=8)**: expect 1.7–2.3 min datagen
-   with s3dlio and 1.9–2.7 min with s3torchconnector. Recommend running with
-   `S3DLIO_RT_THREADS=32` to avoid Tokio scheduling overhead.
-
-5. **No mlp-storage code changes are required** for the 128-core run. The existing
-   `write_threads` auto-formula already produces 32 threads/rank at 128-core/NP=8.
diff --git a/tests/object-store/show_results.sh b/tests/object-store/show_results.sh
new file mode 100755
index 00000000..95edc5d2
--- /dev/null
+++ b/tests/object-store/show_results.sh
@@ -0,0 +1,55 @@
+#!/usr/bin/env bash
+# =============================================================================
+# show_results.sh — Print a summary table of all completed DLRM runs
+# =============================================================================
+#
+# Usage: ./show_results.sh
+#
+# =============================================================================
+
+VENV=/home/eval/Documents/Code/mlp-storage/.venv
+
+"${VENV}/bin/python3" - <<'PYEOF'
+import json, glob, os, sys
+
+results_dir = "/home/eval/Documents/Code/mlp-storage/results/dlrm"
+files = sorted(glob.glob(f"{results_dir}/**/training_*_metadata.json", recursive=True))
+
+def load(path):
+    """Read one JSON file, closing the handle as soon as it is parsed."""
+    with open(path) as fh:
+        return json.load(fh)
+
+def fmt(value, spec):
+    """Format a numeric metric with `spec`; return '?' when missing or non-numeric."""
+    is_num = isinstance(value, (int, float)) and not isinstance(value, bool)
+    return format(value, spec) if is_num else "?"
+
+if not files:
+    print("No results found in", results_dir)
+    sys.exit(0)  # the bare exit() helper is only injected by the site module
+
+print(f"{'Run':20s}  {'NP':>3}  {'Runtime(s)':>11}  {'MB/s':>8}  {'AU%':>6}  {'DLIO IO MB/s':>12}  {'Summary':8}")
+print("-" * 85)
+
+for f in files:
+    run_id = os.path.dirname(f).split("/")[-1]
+    d = load(f)
+    np_ = d.get("num_processes", "?")
+    runtime = d.get("runtime")
+
+    # NOTE(review): 64 * 970 (MB) is a hard-coded dataset size — confirm it matches every run listed.
+    mbps = (64 * 970) / runtime if runtime else None
+    rt_str, mbps_str = fmt(runtime or None, ".1f"), fmt(mbps, ".0f")
+
+    # DLIO summary: optional summary.json stored next to the metadata file
+    summary_path = os.path.join(os.path.dirname(f), "summary.json")
+    if os.path.exists(summary_path):
+        m = load(summary_path).get("metric", {})
+        au_str = fmt(m.get("train_au_mean_percentage"), ".1f")
+        io_str = fmt(m.get("train_io_mean_MB_per_second"), ".0f")
+        ok = m.get("train_au_meet_expectation", "?")
+    else:
+        au_str, io_str, ok = "-", "-", "no summary"
+    print(f"{run_id:20s}  {str(np_):>3}  {rt_str:>11}  {mbps_str:>8}  {au_str:>6}  {io_str:>12}  {ok}")
+PYEOF
diff --git a/tests/object-store/sweeps/sweep_dlrm_compute.sh b/tests/object-store/sweeps/sweep_dlrm_compute.sh
new file mode 100755
index 00000000..a79b84a8
--- /dev/null
+++ b/tests/object-store/sweeps/sweep_dlrm_compute.sh
@@ -0,0 +1,156 @@
+#!/usr/bin/env bash
+# =============================================================================
+# sweep_dlrm_compute.sh — DLRM computation_time sweep
+#
+# Phase 1 (this script):
+#   Sweep computation_time at NP=1: 375us, 1ms, 5ms, 10ms
+#   Uses s3dlio Rust-based Parquet generator and s3dlio reader throughout.
+#
+# Dataset: 200 files × 1,536,000 samples ≈ 234 GB in bucket mlp-dlrm
+#   (20% of full 1024-file spec; footer ~3.1 MiB < s3-ultra 4 MiB limit)
+#
+# Usage:
+#   cd /home/eval/Documents/Code/mlp-storage
+#
+#   # Step 1 — generate data (one-time, takes a while):
+#   tests/object-store/sweeps/sweep_dlrm_compute.sh datagen
+#
+#   # Step 2 — run the sweep:
+#   tests/object-store/sweeps/sweep_dlrm_compute.sh
+#
+# After reviewing Phase 1 results, run Phase 2 (NP sweep) separately.
+# =============================================================================
+set -euo pipefail
+
+REPO=/home/eval/Documents/Code/mlp-storage
+VENV="${REPO}/.venv"
+RESULTS_DIR="${REPO}/results/dlrm_sweep"
+PYTHON="${VENV}/bin/python3"
+
+# Dataset: 20% of spec
+NUM_FILES=200
+SAMPLES_PER_FILE=1536000  # 250 RGs × 6144 → ~3.1 MiB footer (under s3-ultra 4 MiB limit)
+DATA_FOLDER="data/dlrm"
+
+# Phase 1: NP=1 only
+NP=1
+
+# computation_time values to sweep (seconds)
+COMP_TIMES=("0.000375" "0.001" "0.005" "0.010")
+COMP_LABELS=("375us"   "1ms"   "5ms"   "10ms")
+
+mkdir -p "${RESULTS_DIR}"
+
+cd "${REPO}"
+source .env
+
+# Override BUCKET to the dlrm-specific bucket
+export BUCKET=mlp-dlrm
+
+# ─── datagen ──────────────────────────────────────────────────────────────────
+if [[ "${1:-}" == "datagen" ]]; then
+    echo "============================================================"
+    echo "  DLRM datagen — s3dlio Rust Parquet generator"
+    echo "  ${NUM_FILES} files x ${SAMPLES_PER_FILE} samples ≈ 234 GB"
+    echo "  Bucket: ${BUCKET}  Path: ${DATA_FOLDER}"
+    echo "  $(date '+%Y-%m-%d %H:%M:%S')"
+    echo "============================================================"
+    # The size in the banner is a literal: update it whenever NUM_FILES or SAMPLES_PER_FILE change.
+    RUST_LOG=s3dlio=info \
+    "${PYTHON}" -c "from mlpstorage_py.main import main; main()" \
+        training datagen \
+        --model dlrm \
+        --num-processes 1 \
+        --dlio-bin-path "${VENV}/bin" \
+        --object s3 \
+        --skip-validation \
+        --open \
+        --results-dir "${RESULTS_DIR}" \
+        --params \
+            "dataset.num_files_train=${NUM_FILES}" \
+            "dataset.num_samples_per_file=${SAMPLES_PER_FILE}" \
+            "dataset.data_folder=${DATA_FOLDER}" \
+            storage.storage_options.decode_mode=none \
+            storage.storage_options.storage_library=s3dlio
+
+    echo "============================================================"
+    echo "  Datagen complete: $(date '+%Y-%m-%d %H:%M:%S')"
+    echo "============================================================"
+    exit 0
+fi
+
+# ─── Phase 1 sweep: computation_time at NP=1 ─────────────────────────────────
+SUMMARY_TSV="${RESULTS_DIR}/sweep_compute_NP1_$(date '+%Y%m%d_%H%M%S').tsv"
+echo -e "computation_time\tlabel\tNP\tau_pct\tsamples_per_sec\tio_mb_per_sec\tau_met" \
+    > "${SUMMARY_TSV}"
+
+for i in "${!COMP_TIMES[@]}"; do
+    CT="${COMP_TIMES[$i]}"
+    LABEL="${COMP_LABELS[$i]}"
+
+    echo ""
+    echo "============================================================"
+    echo "  computation_time=${CT} (${LABEL})  NP=${NP}"
+    echo "  $(date '+%Y-%m-%d %H:%M:%S')"
+    echo "============================================================"
+
+    RUN_RESULTS="${RESULTS_DIR}/run_ct${LABEL}_NP${NP}"
+    mkdir -p "${RUN_RESULTS}"
+
+    RUST_LOG=s3dlio=info \
+    "${PYTHON}" -c "from mlpstorage_py.main import main; main()" \
+        training run \
+        --model dlrm \
+        --accelerator-type b200 \
+        --num-accelerators "${NP}" \
+        --num-client-hosts 1 \
+        --client-host-memory-in-gb 47 \
+        --dlio-bin-path "${VENV}/bin" \
+        --object s3 \
+        --skip-validation \
+        --open \
+        --results-dir "${RUN_RESULTS}" \
+        --params \
+            dataset.num_files_train=${NUM_FILES} \
+            dataset.num_samples_per_file=${SAMPLES_PER_FILE} \
+            dataset.data_folder=${DATA_FOLDER} \
+            train.computation_time=${CT} \
+            storage.storage_options.decode_mode=none \
+            storage.storage_options.storage_library=s3dlio
+
+    # Parse summary.json and append one row to the TSV. Only stdout is redirected,
+    # so Python warnings/tracebacks reach the console instead of corrupting the TSV.
+    "${PYTHON}" - "${CT}" "${LABEL}" "${NP}" "${RUN_RESULTS}" \
+        >> "${SUMMARY_TSV}" <<'PYEOF'
+import json, glob, sys
+
+ct, label, np_, run_results = sys.argv[1:5]
+files = sorted(glob.glob(f"{run_results}/**/summary.json", recursive=True))
+if not files:
+    print(f"{ct}\t{label}\t{np_}\tN/A\tN/A\tN/A\tN/A")
+    sys.exit(0)
+
+with open(files[-1]) as fh:  # last path in sorted order
+    m = json.load(fh).get("metric", {})
+
+au   = m.get("train_au_mean_percentage",                 "N/A")
+sps  = m.get("train_throughput_mean_samples_per_second", "N/A")
+ioMB = m.get("train_io_mean_MB_per_second",              "N/A")
+met  = m.get("train_au_meet_expectation",                "N/A")
+
+def fmt(v): return f"{v:.2f}" if isinstance(v, float) else str(v)
+print(f"{ct}\t{label}\t{np_}\t{fmt(au)}\t{fmt(sps)}\t{fmt(ioMB)}\t{met}")
+PYEOF
+
+done
+
+# ─── summary table ────────────────────────────────────────────────────────────
+echo ""
+echo "============================================================"
+echo "  Phase 1 complete — computation_time sweep at NP=1"
+echo "  Results: ${SUMMARY_TSV}"
+echo "============================================================"
+echo ""
+column -t -s $'\t' "${SUMMARY_TSV}"
+echo ""
+echo "Next: review AU and I/O columns, pick 1-2 values, then run Phase 2 (NP sweep)."
diff --git a/tests/object-store/sweeps/sweep_dlrm_np.sh b/tests/object-store/sweeps/sweep_dlrm_np.sh
new file mode 100755
index 00000000..6a0ec338
--- /dev/null
+++ b/tests/object-store/sweeps/sweep_dlrm_np.sh
@@ -0,0 +1,122 @@
+#!/usr/bin/env bash
+# =============================================================================
+# sweep_dlrm_np.sh — DLRM NP (num-accelerators) sweep — Phase 2
+#
+# Sweeps NP=1,2,4,8 at two computation_time values (1ms and 5ms) that were
+# selected from the Phase 1 compute-time sweep results.
+#
+#   1ms  → I/O-bound baseline  (AU ~20% at NP=1, storage bottleneck)
+#   5ms  → balanced / AU sweet spot (AU ~79% at NP=1)
+#
+# All runs use a single host (127.0.0.1); NP controls both mpirun -n and
+# the --num-accelerators argument passed to the mlpstorage_py wrapper.
+#
+# Dataset: 200 files × 1,536,000 samples  (bucket: mlp-dlrm / data/dlrm)
+#
+# Usage:
+#   cd /home/eval/Documents/Code/mlp-storage
+#   bash tests/object-store/sweeps/sweep_dlrm_np.sh 2>&1
+# =============================================================================
+set -euo pipefail
+
+REPO=/home/eval/Documents/Code/mlp-storage
+VENV="${REPO}/.venv"
+RESULTS_DIR="${REPO}/results/dlrm_sweep"
+PYTHON="${VENV}/bin/python3"
+
+# Dataset (matches Phase 1)
+NUM_FILES=200
+SAMPLES_PER_FILE=1536000
+DATA_FOLDER="data/dlrm"
+
+# Fixed computation_time values chosen from Phase 1 results
+COMP_TIMES=("0.001" "0.005")
+COMP_LABELS=("1ms"  "5ms")
+
+# NP sweep
+NP_VALUES=(1 2 4 8)
+
+mkdir -p "${RESULTS_DIR}"
+
+cd "${REPO}"
+source .env
+export BUCKET=mlp-dlrm
+
+SUMMARY_TSV="${RESULTS_DIR}/sweep_np_$(date '+%Y%m%d_%H%M%S').tsv"
+echo -e "computation_time\tlabel\tNP\tau_pct\tsamples_per_sec\tio_mb_per_sec\tau_met" \
+    > "${SUMMARY_TSV}"
+
+for NP in "${NP_VALUES[@]}"; do
+    for i in "${!COMP_TIMES[@]}"; do
+        CT="${COMP_TIMES[$i]}"
+        LABEL="${COMP_LABELS[$i]}"
+
+        echo ""
+        echo "============================================================"
+        echo "  computation_time=${CT} (${LABEL})  NP=${NP}"
+        echo "  $(date '+%Y-%m-%d %H:%M:%S')"
+        echo "============================================================"
+
+        RUN_RESULTS="${RESULTS_DIR}/run_ct${LABEL}_NP${NP}"
+        mkdir -p "${RUN_RESULTS}"
+
+        RUST_LOG=s3dlio=info \
+        "${PYTHON}" -c "from mlpstorage_py.main import main; main()" \
+            training run \
+            --model dlrm \
+            --accelerator-type b200 \
+            --num-accelerators "${NP}" \
+            --num-client-hosts 1 \
+            --client-host-memory-in-gb 47 \
+            --dlio-bin-path "${VENV}/bin" \
+            --object s3 \
+            --skip-validation \
+            --open \
+            --results-dir "${RUN_RESULTS}" \
+            --params \
+                dataset.num_files_train=${NUM_FILES} \
+                dataset.num_samples_per_file=${SAMPLES_PER_FILE} \
+                dataset.data_folder=${DATA_FOLDER} \
+                train.computation_time=${CT} \
+                storage.storage_options.decode_mode=none \
+                storage.storage_options.storage_library=s3dlio
+
+        # Parse summary.json and append one row to the TSV. Only stdout is redirected,
+        # so Python warnings/tracebacks reach the console instead of corrupting the TSV.
+        "${PYTHON}" - "${CT}" "${LABEL}" "${NP}" "${RUN_RESULTS}" \
+            >> "${SUMMARY_TSV}" <<'PYEOF'
+import json, glob, sys
+
+ct, label, np_, run_results = sys.argv[1:5]
+files = sorted(glob.glob(f"{run_results}/**/summary.json", recursive=True))
+if not files:
+    print(f"{ct}\t{label}\t{np_}\tN/A\tN/A\tN/A\tN/A")
+    sys.exit(0)
+
+with open(files[-1]) as fh:  # last path in sorted order
+    m = json.load(fh).get("metric", {})
+
+au   = m.get("train_au_mean_percentage",                 "N/A")
+sps  = m.get("train_throughput_mean_samples_per_second", "N/A")
+ioMB = m.get("train_io_mean_MB_per_second",              "N/A")
+met  = m.get("train_au_meet_expectation",                "N/A")
+
+def fmt(v): return f"{v:.2f}" if isinstance(v, float) else str(v)
+print(f"{ct}\t{label}\t{np_}\t{fmt(au)}\t{fmt(sps)}\t{fmt(ioMB)}\t{met}")
+PYEOF
+
+    done
+done
+
+# ─── summary table ────────────────────────────────────────────────────────────
+echo ""
+echo "============================================================"
+echo "  Phase 2 complete — NP sweep (1ms + 5ms compute time)"
+echo "  Results: ${SUMMARY_TSV}"
+echo "============================================================"
+echo ""
+column -t -s $'\t' "${SUMMARY_TSV}"
+echo ""
+echo "Expected pattern:"
+echo "  1ms: AU stays low (I/O-bound), throughput scales with NP until storage saturates"
+echo "  5ms: AU stays high (~80%), throughput scales linearly with NP (compute-bound)"
diff --git a/tests/object-store/sweeps/sweep_flux.sh b/tests/object-store/sweeps/sweep_flux.sh
new file mode 100755
index 00000000..c8b4ec14
--- /dev/null
+++ b/tests/object-store/sweeps/sweep_flux.sh
@@ -0,0 +1,160 @@
+#!/usr/bin/env bash
+# Flux read-thread × NP scaling sweep
+# NP ∈ {1,2,4,8}, read_threads ∈ {1,2,4,8}  → 16 combos
+# (NP=8, RT=8) is gated on (NP=4, RT=4) passing
+#
+# Fixed params across all runs:
+#   computation_time = 0.05 s
+#   coalesce_rgs     = 1
+#   prefetch_workers = 2
+#   dataset.num_files_train = 500
+#
+# Usage: bash sweep_flux.sh [--logdir DIR]
+#        (default logdir: ./sweep_logs/<timestamp>)
+
+set -uo pipefail
+
+LOGDIR=""
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --logdir) LOGDIR="$2"; shift 2;;
+        *) echo "Unknown arg: $1"; exit 1;;
+    esac
+done
+[[ -z "$LOGDIR" ]] && LOGDIR="./sweep_logs/$(date +%Y%m%d_%H%M%S)"
+mkdir -p "$LOGDIR"
+
+SUMMARY="$LOGDIR/summary.tsv"
+printf "NP\tRT\texitcode\tthroughput_GBs\tAU_pct\tduration_s\tlog\n" > "$SUMMARY"
+
+log_and_echo() { echo "$1" | tee -a "$2"; }
+
+run_combo() {  # run_combo NP RT — run one sweep point, log it, append a row to $SUMMARY
+    local np=$1  # passed to --num-accelerators
+    local rt=$2  # passed as the reader.read_threads override
+    local logfile="$LOGDIR/np${np}_rt${rt}.log"
+    local t_start t_end duration exitcode throughput au
+
+    {
+        echo ""
+        echo "========================================================"
+        echo "  NP=${np}  read_threads=${rt}  started: $(date)"
+        echo "========================================================"
+    } | tee "$logfile"  # no -a here: starts a fresh per-combo log
+
+    t_start=$(date +%s)
+
+    uv run mlpstorage training run \
+        --model flux \
+        --num-accelerators "$np" \
+        --accelerator-type b200 \
+        --client-host-memory-in-gb 47 \
+        --object s3 \
+        --skip-validation \
+        --open \
+        --params \
+            dataset.num_files_train=500 \
+            "train.computation_time=0.05" \
+            "storage.storage_options.coalesce_rgs=1" \
+            "storage.storage_options.prefetch_workers=2" \
+            "reader.read_threads=${rt}" \
+        2>&1 | tee -a "$logfile"
+    exitcode=${PIPESTATUS[0]}  # status of the `uv run` stage, not of tee
+
+    t_end=$(date +%s)
+    duration=$(( t_end - t_start ))  # wall-clock seconds
+
+    # Extract throughput: last "<digits>.<digits> GB/s" in the log (a decimal point is required)
+    throughput=$(grep -oP '\d+\.\d+\s*GB/s' "$logfile" 2>/dev/null \
+                 | tail -1 | grep -oP '\d+\.\d+' || true)
+    if [[ -z "$throughput" ]]; then
+        # no GB/s figure: take the last "<digits>.<digits> MB/s" and divide by 1024
+        local mbs
+        mbs=$(grep -oP '\d+\.\d+\s*MB/s' "$logfile" 2>/dev/null \
+              | tail -1 | grep -oP '\d+\.\d+' || true)
+        [[ -n "$mbs" ]] && throughput=$(awk "BEGIN{printf \"%.3f\", $mbs/1024}") || throughput="N/A"  # N/A when no MB/s figure was found (or awk fails)
+    fi
+
+    # Extract accelerator utilisation: last decimal number on log lines matching (case-insensitively) "accelerator<any char>util" or "AU" followed by "=" / ":"
+    au=$(grep -iP 'accelerator.util|AU\s*[=:]\s*' "$logfile" 2>/dev/null \
+         | grep -oP '\d+\.\d+' | tail -1 || true)
+    [[ -z "$au" ]] && au="N/A"
+
+    local status="OK"
+    [[ $exitcode -ne 0 ]] && status="FAIL"
+    # Append one row to $SUMMARY: np, rt, exitcode(status), throughput, AU, duration, log path
+    printf "%-4s\t%-4s\t%s(%s)\t%-14s\t%-8s\t%-12s\t%s\n" \
+        "$np" "$rt" "$exitcode" "$status" \
+        "${throughput}" "${au}" "${duration}" "${logfile}" >> "$SUMMARY"
+
+    {
+        echo ""
+        echo "  Finished: $(date)  exit=${exitcode}  duration=${duration}s"
+        echo "  throughput=${throughput} GB/s  AU=${au}%"
+        echo "========================================================"
+    } | tee -a "$logfile"
+
+    return $exitcode  # propagate the run's status so callers can test `if run_combo ...`
+}
+
+# ── Print plan ────────────────────────────────────────────────────────────────
+echo ""
+echo "========================================================"
+echo "  Flux scaling sweep  —  $(date)"
+echo "  LOGDIR: $LOGDIR"
+echo "  Fixed: computation_time=0.05  coalesce_rgs=1  prefetch_workers=2"
+echo "  NP ∈ {1,2,4,8}  ×  read_threads ∈ {1,2,4,8}"
+echo "  (NP=8, RT=8) gated on (NP=4, RT=4) passing"
+echo "========================================================"
+echo ""
+
+NPS=(1 2 4 8)
+RTS=(1 2 4 8)
+np4_rt4_ok=false
+total=0
+passed=0
+
+for np in "${NPS[@]}"; do
+    for rt in "${RTS[@]}"; do
+        # Gate: skip (8,8) here — handled below
+        [[ $np -eq 8 && $rt -eq 8 ]] && continue
+
+        total=$(( total + 1 ))
+        echo ""
+        echo "─── Combo ${total}/15 : NP=${np}  RT=${rt} ───"
+
+        if run_combo "$np" "$rt"; then
+            passed=$(( passed + 1 ))
+            [[ $np -eq 4 && $rt -eq 4 ]] && np4_rt4_ok=true
+        else
+            echo "  *** NP=${np} RT=${rt} FAILED — continuing sweep ***"
+        fi
+    done
+done
+
+# ── Gate: (NP=8, RT=8) ────────────────────────────────────────────────────────
+echo ""
+echo "========================================================"
+if $np4_rt4_ok; then
+    echo "  GATE: NP=4 RT=4 PASSED → running NP=8 RT=8"
+    echo "========================================================"
+    total=$(( total + 1 ))
+    if run_combo 8 8; then
+        passed=$(( passed + 1 ))
+    fi
+else
+    echo "  GATE: NP=4 RT=4 did NOT pass → SKIPPING NP=8 RT=8"
+    echo "========================================================"
+    printf "%-4s\t%-4s\t%s\t%-14s\t%-8s\t%-12s\t%s\n" \
+        "8" "8" "SKIPPED" "N/A" "N/A" "N/A" "gated_on_4x4" >> "$SUMMARY"
+fi
+
+# ── Final summary ─────────────────────────────────────────────────────────────
+echo ""
+echo "========================================================"
+echo "  SWEEP COMPLETE  —  $(date)"
+echo "  Passed: ${passed}/${total}"
+echo "  Summary: $SUMMARY"
+echo "========================================================"
+echo ""
+cat "$SUMMARY"
diff --git a/tests/object-store/sweeps/sweep_retinanet_np.sh b/tests/object-store/sweeps/sweep_retinanet_np.sh
new file mode 100755
index 00000000..c9a2cf87
--- /dev/null
+++ b/tests/object-store/sweeps/sweep_retinanet_np.sh
@@ -0,0 +1,292 @@
+#!/usr/bin/env bash
+# =============================================================================
+# sweep_retinanet_np.sh — RetinaNet NP (num-accelerators) scaling sweep
+#
+# Sweeps NP=1, 2, 4 using the B200 computation_time (0.04755 s).
+# NP=8 is intentionally excluded — co-located s3-ultra saturates at NP≥4.
+#
+# Dataset : s3://mlp-retinanet/data/retinanet/  (50,000 JPEG files ≈ 15.0 GiB)
+# Format  : JPEG, 1 sample/file, ~315 KiB/file
+# Model   : retinanet, B200 accelerator
+# AU goal : ≥ 0.85 (85%)
+#
+# Key difference from UNet3D:
+#   RetinaNet uses many small objects (315 KiB × 50,000) vs few large objects
+#   (140 MiB × 7,200 for UNet3D). The iterable DataLoader path
+#   (TorchIterableDatasetSimple) issues 64 × NP concurrent GETs, which is
+#   essential for saturating the storage backend with small objects.
+#
+# Results per run are written to  results/retinanet_np_sweep/<timestamp>/
+# A TSV summary row is appended after each run, printed at the end.
+# A Markdown results doc is auto-generated at the end of the sweep.
+#
+# Usage:
+#   cd /home/eval/Documents/Code/mlp-storage
+#   bash tests/object-store/sweeps/sweep_retinanet_np.sh 2>&1 | tee sweep_retinanet_$(date +%Y%m%d_%H%M%S).log
+#
+#   # Full MLPerf dataset (must have been generated with NUM_FILES=1170301):
+#   NUM_FILES=1170301 bash tests/object-store/sweeps/sweep_retinanet_np.sh 2>&1 | tee ...
+# =============================================================================
+set -euo pipefail
+
+REPO=/home/eval/Documents/Code/mlp-storage
+VENV="${REPO}/.venv"
+PYTHON="${VENV}/bin/python3"
+
+SWEEP_TS=$(date '+%Y%m%d_%H%M%S')
+RESULTS_BASE="${REPO}/results/retinanet_np_sweep"
+RESULTS_DIR="${RESULTS_BASE}/${SWEEP_TS}"
+mkdir -p "${RESULTS_DIR}"
+
+# ── Dataset parameters (must match the generated dataset) ────────────────────
+NUM_FILES="${NUM_FILES:-50000}"          # full MLPerf: 1170301
+SAMPLES_PER_FILE=1
+DATA_FOLDER="data/retinanet"
+STORAGE_ROOT="${STORAGE_ROOT:-mlp-retinanet}"
+COMP_TIME="0.04755"   # B200 retinanet computation time
+
+# ── NP values to sweep ────────────────────────────────────────────────────────
+NP_VALUES=(1 2 4)   # NP=8 excluded — co-located s3-ultra saturates at NP≥4
+
+# ── Load s3-ultra credentials ─────────────────────────────────────────────────
+if [[ ! -f "${REPO}/.env.s3-ultra" ]]; then
+    echo "ERROR: ${REPO}/.env.s3-ultra not found" >&2; exit 1
+fi
+set -o allexport
+source "${REPO}/.env.s3-ultra"
+set +o allexport
+unset BUCKET   # prevent env BUCKET from leaking into mlpstorage
+
+# ── Activate venv ─────────────────────────────────────────────────────────────
+source "${VENV}/bin/activate"
+
+# ── TSV header ────────────────────────────────────────────────────────────────
+SUMMARY_TSV="${RESULTS_DIR}/sweep_retinanet_np_${SWEEP_TS}.tsv"
+printf "NP\tau_pct\tsamples_per_sec\tio_mb_per_sec\twall_s\tau_met\n" \
+    > "${SUMMARY_TSV}"
+
+# ── Size estimate (322957 bytes per file ≈ 315 KiB) and start-up banner ──────
+TOTAL_MIB=$(( NUM_FILES * 322957 / 1024 / 1024 ))
+
+echo ""
+echo "════════════════════════════════════════════════════════════════"
+echo "  RetinaNet NP Scaling Sweep"
+echo "  NP values : ${NP_VALUES[*]}"
+echo "  Dataset   : s3://${STORAGE_ROOT}/${DATA_FOLDER}  (${NUM_FILES} files ≈ ${TOTAL_MIB} MiB)"
+echo "  Format    : JPEG, 1 sample/file, ~315 KiB/file"
+echo "  ct        : ${COMP_TIME} s  (B200)"
+echo "  DataLoader: TorchIterableDatasetSimple (64 in-flight GETs/worker)"
+echo "  Results   : ${RESULTS_DIR}"
+echo "  Started   : $(date '+%Y-%m-%d %H:%M:%S')"
+echo "════════════════════════════════════════════════════════════════"
+echo ""
+
+for NP in "${NP_VALUES[@]}"; do
+    RUN_DIR="${RESULTS_DIR}/NP${NP}"
+    mkdir -p "${RUN_DIR}"
+
+    echo ""
+    echo "────────────────────────────────────────────────────────────────"
+    echo "  NP=${NP}   $(date '+%Y-%m-%d %H:%M:%S')"
+    echo "────────────────────────────────────────────────────────────────"
+
+    t_start=$(date +%s)
+
+    RUST_LOG=s3dlio=info \
+    "${PYTHON}" -c "from mlpstorage_py.main import main; main()" \
+        training run \
+        --model retinanet \
+        --accelerator-type b200 \
+        --num-accelerators "${NP}" \
+        --num-client-hosts 1 \
+        --client-host-memory-in-gb 47 \
+        --dlio-bin-path "${VENV}/bin" \
+        --object s3 \
+        --skip-validation \
+        --open \
+        --results-dir "${RUN_DIR}" \
+        --params \
+            storage.storage_root=${STORAGE_ROOT} \
+            dataset.num_files_train=${NUM_FILES} \
+            dataset.num_samples_per_file=${SAMPLES_PER_FILE} \
+            dataset.data_folder=${DATA_FOLDER} \
+            train.computation_time=${COMP_TIME} \
+            storage.storage_options.storage_library=s3dlio
+
+    t_end=$(date +%s)
+    wall=$(( t_end - t_start ))
+
+    # ── Parse summary.json → append one TSV row (stdout only) ─────────────────
+    "${PYTHON}" - "${NP}" "${wall}" "${RUN_DIR}" \
+        >> "${SUMMARY_TSV}" <<'PYEOF'
+import json, glob, sys
+# Only stdout feeds the TSV; stderr stays on the console so a traceback is visible and cannot corrupt the rows.
+np_, wall, run_dir = sys.argv[1], sys.argv[2], sys.argv[3]
+
+files = sorted(glob.glob(f"{run_dir}/**/summary.json", recursive=True))
+if not files:
+    print(f"{np_}\tN/A\tN/A\tN/A\t{wall}\tN/A")
+    sys.exit(0)
+# Several matches are possible; the lexicographically last path is the one read.
+with open(files[-1]) as fh:
+    m = json.load(fh).get("metric", {})
+
+au  = m.get("train_au_mean_percentage",                 None)
+sps = m.get("train_throughput_mean_samples_per_second", None)
+ioMB = m.get("train_io_mean_MB_per_second",             None)
+met = m.get("train_au_meet_expectation",                "N/A")
+# Numbers are rendered with fixed decimals; any non-numeric value (e.g. a missing key) becomes "N/A".
+def fmt(v, digits=2):
+    return f"{v:.{digits}f}" if isinstance(v, (int, float)) else "N/A"
+
+print(f"{np_}\t{fmt(au)}\t{fmt(sps)}\t{fmt(ioMB)}\t{wall}\t{met}")
+PYEOF
+
+    echo "  NP=${NP} done  (wall=${wall}s)"
+    echo "  Results: ${RUN_DIR}"
+done
+
+# ── Print TSV summary ─────────────────────────────────────────────────────────
+echo ""
+echo "════════════════════════════════════════════════════════════════"
+echo "  Sweep complete — $(date '+%Y-%m-%d %H:%M:%S')"
+echo ""
+echo "  TSV summary:"
+cat "${SUMMARY_TSV}"
+echo "════════════════════════════════════════════════════════════════"
+
+# ── Auto-generate Markdown results doc ───────────────────────────────────────
+MD_OUT="${REPO}/docs/RetinaNet_NP_Scaling_Results.md"
+
+"${PYTHON}" - "${SWEEP_TS}" "${COMP_TIME}" "${SUMMARY_TSV}" \
+             "${STORAGE_ROOT}/${DATA_FOLDER}" "${NUM_FILES}" \
+             "${MD_OUT}" <<'PYEOF'
+import sys, csv, datetime
+
+ts, ct, tsv_path, path, nfiles_str, md_out = sys.argv[1:]
+nfiles = int(nfiles_str)
+record_bytes = 322957
+total_mib = nfiles * record_bytes // (1024 * 1024)
+
+rows = []
+with open(tsv_path) as fh:
+    reader = csv.DictReader(fh, delimiter='\t')
+    for row in reader:
+        rows.append(row)
+
+date_str = datetime.datetime.strptime(ts, "%Y%m%d_%H%M%S").strftime("%Y-%m-%d %H:%M")
+
+def pass_fail(v):
+    if v in ("True", True):  return "✅ PASS"
+    if v in ("False", False): return "❌ FAIL"
+    return "—"
+
+lines = []
+lines.append(f"# RetinaNet NP Scaling Results")
+lines.append(f"")
+lines.append(f"**Sweep date**: {date_str}")
+lines.append(f"")
+lines.append(f"---")
+lines.append(f"")
+lines.append(f"## Test Environment")
+lines.append(f"")
+lines.append(f"| Parameter | Value |")
+lines.append(f"|-----------|-------|")
+lines.append(f"| Host | 24 vCPU VM (with hyperthreading), 48 GB RAM |")
+lines.append(f"| Object storage | s3-ultra (`http://127.0.0.1:9000`, co-located on test host) |")
+lines.append(f"| Bucket / path | `{path}` |")
+lines.append(f"| Dataset | {nfiles:,} JPEG files × 1 sample/file (≈ {total_mib:,} MiB) |")
+lines.append(f"| Record length | 322,957 bytes (~315 KiB / file) |")
+lines.append(f"| Batch size | 24 |")
+lines.append(f"| Read threads | 8 |")
+lines.append(f"| `computation_time` | {ct} s  (B200) |")
+lines.append(f"| DataLoader | `TorchIterableDatasetSimple` (64 in-flight GETs/worker) |")
+lines.append(f"| Epochs | 8 |")
+lines.append(f"| AU target | ≥ 85% |")
+lines.append(f"| Model config | `retinanet_b200.yaml` |")
+lines.append(f"| MPI invocation | `mpirun -n NP -host 127.0.0.1:NP` |")
+lines.append(f"")
+lines.append(f"> **⚠️ Co-located test configuration.** The s3-ultra storage server and all benchmark")
+lines.append(f"> processes run on the **same** 24 vCPU / 48 GB RAM host, sharing CPU cores, memory,")
+lines.append(f"> and the loopback network interface. In a real deployment storage would be a dedicated")
+lines.append(f"> remote system; the CPU/memory pressure that limits scaling here would not apply.")
+lines.append(f">")
+lines.append(f"> **AU (Accelerator Utilization)** — fraction of wall time the simulated GPU was")
+lines.append(f"> computing rather than waiting for I/O. AU ≥ 85% is the target threshold for")
+lines.append(f"> retinanet.")
+lines.append(f"")
+lines.append(f"---")
+lines.append(f"")
+lines.append(f"## NP Scaling Results")
+lines.append(f"")
+lines.append(f"| NP | AU% | Samples/s | I/O MiB/s | Wall time (s) | AU ≥ 85%? |")
+lines.append(f"|----|-----|-----------|-----------|---------------|-----------|")
+for r in rows:
+    pf = pass_fail(r.get("au_met", "N/A"))
+    lines.append(
+        f"| {r['NP']} "
+        f"| {r['au_pct']} "
+        f"| {r['samples_per_sec']} "
+        f"| {r['io_mb_per_sec']} "
+        f"| {r['wall_s']} "
+        f"| {pf} |"
+    )
+lines.append(f"")
+lines.append(f"---")
+lines.append(f"")
+lines.append(f"## Scaling Analysis")
+lines.append(f"")
+if len(rows) >= 2:
+    try:
+        au1  = float(rows[0]['au_pct'])
+        au2  = float(rows[1]['au_pct']) if len(rows) > 1 else None
+        au4  = float(rows[2]['au_pct']) if len(rows) > 2 else None
+        sps1 = float(rows[0]['samples_per_sec'])
+        sps2 = float(rows[1]['samples_per_sec']) if len(rows) > 1 else None
+        sps4 = float(rows[2]['samples_per_sec']) if len(rows) > 2 else None
+
+        lines.append(f"### Throughput Scaling Efficiency")
+        lines.append(f"")
+        lines.append(f"| Transition | Samples/s | Ideal | Efficiency |")
+        lines.append(f"|------------|-----------|-------|------------|")
+        if sps2 is not None:
+            eff = sps2 / (sps1 * 2) * 100
+            lines.append(f"| NP=1 → NP=2 | {sps1:.1f} → {sps2:.1f} | {sps1*2:.1f} | {eff:.1f}% |")
+        if sps4 is not None:
+            eff4 = sps4 / (sps1 * 4) * 100
+            lines.append(f"| NP=1 → NP=4 | {sps1:.1f} → {sps4:.1f} | {sps1*4:.1f} | {eff4:.1f}% |")
+        lines.append(f"")
+    except (ValueError, IndexError):
+        lines.append(f"*(throughput scaling table: parse error — check TSV)*")
+        lines.append(f"")
+
+lines.append(f"### Key Observations")
+lines.append(f"")
+lines.append(f"1. **NP=1 baseline** — establishes single-accelerator AU and throughput floor.")
+lines.append(f"   RetinaNet I/O is dominated by many small GETs (~315 KiB × files-per-worker);")
+lines.append(f"   the `TorchIterableDatasetSimple` path with 64 in-flight GETs/worker is")
+lines.append(f"   essential to keep the storage backend saturated.")
+lines.append(f"2. **NP=2 scaling** — first scaling step; both AU and throughput should improve")
+lines.append(f"   if the NP=1 run was I/O-bound (AU < 85%).")
+lines.append(f"3. **NP=4** — highest tested NP; co-located s3-ultra competes for CPU at this")
+lines.append(f"   level. If AU plateaus or degrades, the bottleneck has shifted from I/O to")
+lines.append(f"   SHA-256 signing CPU on this Cascade Lake host (no SHA-NI instruction).")
+lines.append(f"")
+lines.append(f"---")
+lines.append(f"")
+lines.append(f"## Raw Results Location")
+lines.append(f"")
+lines.append(f"Full per-run output in `results/retinanet_np_sweep/{ts}/NP{{1,2,4}}/` —")
+lines.append(f"each contains `summary.json`, per-epoch logs, and DLIO output.")
+
+with open(md_out, 'w') as fh:
+    fh.write('\n'.join(lines) + '\n')
+
+print(f"Markdown written to: {md_out}")
+PYEOF
+
+echo "════════════════════════════════════════════════════════════════"
+echo "  Markdown results doc: ${MD_OUT}"
+echo "  TSV summary         : ${SUMMARY_TSV}"
+echo "  Finished            : $(date '+%Y-%m-%d %H:%M:%S')"
+echo "════════════════════════════════════════════════════════════════"
diff --git a/tests/object-store/sweeps/sweep_unet3d_np.sh b/tests/object-store/sweeps/sweep_unet3d_np.sh
new file mode 100755
index 00000000..0f9bf859
--- /dev/null
+++ b/tests/object-store/sweeps/sweep_unet3d_np.sh
@@ -0,0 +1,270 @@
+#!/usr/bin/env bash
+# =============================================================================
+# sweep_unet3d_np.sh — UNet3D NP (num-accelerators) scaling sweep
+#
+# Sweeps NP=1, 2, 4 at the B200 computation_time (0.162 s = H100 ÷ 2).
+# NP=8 is intentionally excluded — co-located s3-ultra saturates at NP≥4.
+#
+# Dataset : s3://mlp-unet3d/data/unet3d/  (7,200 NPZ files ≈ 984 GiB)
+# Model   : unet3d, B200 accelerator, computation_time=0.162 s
+# AU goal : ≥ 0.90 (90%)
+#
+# Results per run are written to  results/unet3d_np_sweep/<timestamp>/
+# A TSV summary row is appended after each run, printed at the end.
+# A Markdown results doc is auto-generated at the end of the sweep.
+#
+# Usage:
+#   cd /home/eval/Documents/Code/mlp-storage
+#   bash tests/object-store/sweeps/sweep_unet3d_np.sh 2>&1 | tee sweep_unet3d_$(date +%Y%m%d_%H%M%S).log
+# =============================================================================
+set -euo pipefail
+
+REPO=/home/eval/Documents/Code/mlp-storage
+VENV="${REPO}/.venv"
+PYTHON="${VENV}/bin/python3"
+
+SWEEP_TS=$(date '+%Y%m%d_%H%M%S')
+RESULTS_BASE="${REPO}/results/unet3d_np_sweep"
+RESULTS_DIR="${RESULTS_BASE}/${SWEEP_TS}"
+mkdir -p "${RESULTS_DIR}"
+
+# ── Dataset parameters (must match the generated dataset) ────────────────────
+NUM_FILES=7200
+SAMPLES_PER_FILE=1
+DATA_FOLDER="data/unet3d"
+STORAGE_ROOT="${STORAGE_ROOT:-mlp-unet3d}"   # override: STORAGE_ROOT=mlp-flux bash sweep_unet3d_np.sh
+COMP_TIME="0.162"   # B200: H100 (0.323 s) ÷ 2
+
+# ── NP values to sweep ────────────────────────────────────────────
+NP_VALUES=(1 2 4)   # NP=8 excluded — co-located s3-ultra saturates at NP≥4
+
+# ── Load s3-ultra credentials ───────────────────────────────────────────────
+# NOTE: .env.s3-ultra sets BUCKET=mlp-flux (its default).  BUCKET is unset right
+# after sourcing — storage.storage_root is passed on the CLI instead, so the
+# correct bucket is always used regardless of what the env file contains.
+if [[ ! -f "${REPO}/.env.s3-ultra" ]]; then
+    echo "ERROR: ${REPO}/.env.s3-ultra not found" >&2; exit 1
+fi
+set -o allexport
+source "${REPO}/.env.s3-ultra"
+set +o allexport
+unset BUCKET   # prevent env BUCKET from leaking into mlpstorage
+
+# ── Activate venv ─────────────────────────────────────────────────────────────
+source "${VENV}/bin/activate"
+
+# ── TSV header ────────────────────────────────────────────────────────────────
+SUMMARY_TSV="${RESULTS_DIR}/sweep_unet3d_np_${SWEEP_TS}.tsv"
+printf "NP\tau_pct\tsamples_per_sec\tio_mb_per_sec\twall_s\tau_met\n" \
+    > "${SUMMARY_TSV}"
+
+echo ""
+echo "════════════════════════════════════════════════════════════════"
+echo "  UNet3D NP Scaling Sweep"
+echo "  NP values : ${NP_VALUES[*]}"
+echo "  Dataset   : s3://${STORAGE_ROOT}/${DATA_FOLDER}  (${NUM_FILES} files)"
+echo "  ct        : ${COMP_TIME} s  (B200 = H100 ÷ 2)"
+echo "  Results   : ${RESULTS_DIR}"
+echo "  Started   : $(date '+%Y-%m-%d %H:%M:%S')"
+echo "════════════════════════════════════════════════════════════════"
+echo ""
+
+for NP in "${NP_VALUES[@]}"; do
+    RUN_DIR="${RESULTS_DIR}/NP${NP}"
+    mkdir -p "${RUN_DIR}"
+
+    echo ""
+    echo "────────────────────────────────────────────────────────────────"
+    echo "  NP=${NP}   $(date '+%Y-%m-%d %H:%M:%S')"
+    echo "────────────────────────────────────────────────────────────────"
+
+    t_start=$(date +%s)
+
+    RUST_LOG=s3dlio=info \
+    "${PYTHON}" -c "from mlpstorage_py.main import main; main()" \
+        training run \
+        --model unet3d \
+        --accelerator-type b200 \
+        --num-accelerators "${NP}" \
+        --num-client-hosts 1 \
+        --client-host-memory-in-gb 47 \
+        --dlio-bin-path "${VENV}/bin" \
+        --object s3 \
+        --skip-validation \
+        --open \
+        --results-dir "${RUN_DIR}" \
+        --params \
+            storage.storage_root=${STORAGE_ROOT} \
+            dataset.num_files_train=${NUM_FILES} \
+            dataset.num_samples_per_file=${SAMPLES_PER_FILE} \
+            dataset.data_folder=${DATA_FOLDER} \
+            train.computation_time=${COMP_TIME} \
+            storage.storage_options.decode_mode=none \
+            storage.storage_options.storage_library=s3dlio
+
+    t_end=$(date +%s)
+    wall=$(( t_end - t_start ))
+
+    # ── Parse summary.json → append one TSV row (stdout only) ─────────────
+    "${PYTHON}" - "${NP}" "${wall}" "${RUN_DIR}" \
+        >> "${SUMMARY_TSV}" <<'PYEOF'
+import json, glob, sys
+# Only stdout feeds the TSV; stderr stays on the console so a traceback is visible and cannot corrupt the rows.
+np_, wall, run_dir = sys.argv[1], sys.argv[2], sys.argv[3]
+
+files = sorted(glob.glob(f"{run_dir}/**/summary.json", recursive=True))
+if not files:
+    print(f"{np_}\tN/A\tN/A\tN/A\t{wall}\tN/A")
+    sys.exit(0)
+# Several matches are possible; the lexicographically last path is the one read.
+with open(files[-1]) as fh:
+    m = json.load(fh).get("metric", {})
+
+au   = m.get("train_au_mean_percentage",                 None)
+sps  = m.get("train_throughput_mean_samples_per_second", None)
+ioMB = m.get("train_io_mean_MB_per_second",              None)
+met  = m.get("train_au_meet_expectation",                "N/A")
+# Numbers are rendered with fixed decimals; any non-numeric value (e.g. a missing key) becomes "N/A".
+def fmt(v, digits=2):
+    return f"{v:.{digits}f}" if isinstance(v, (int, float)) else "N/A"
+
+print(f"{np_}\t{fmt(au)}\t{fmt(sps,1)}\t{fmt(ioMB,1)}\t{wall}\t{met}")
+PYEOF
+
+done
+
+# ── Print summary table ───────────────────────────────────────────────────────
+echo ""
+echo "════════════════════════════════════════════════════════════════"
+echo "  UNet3D NP Sweep — Summary"
+echo "════════════════════════════════════════════════════════════════"
+column -t -s $'\t' "${SUMMARY_TSV}"
+echo ""
+
+# ── Auto-generate Markdown results doc (Python stdout → MD_OUT) ──────────────
+MD_OUT="${RESULTS_DIR}/UNet3D_NP_Scaling_Results_${SWEEP_TS}.md"
+# Only stdout is captured into the document; stderr stays on the console so errors are visible.
+"${PYTHON}" - "${SUMMARY_TSV}" "${SWEEP_TS}" "${COMP_TIME}" \
+    "${NUM_FILES}" "${STORAGE_ROOT}/${DATA_FOLDER}" \
+    > "${MD_OUT}" <<'PYEOF'
+import csv, sys, datetime
+# argv: TSV path, sweep timestamp, computation_time, file count, bucket/path (all strings).
+tsv_path, ts, ct, nfiles, path = sys.argv[1:]
+
+rows = []
+with open(tsv_path) as f:
+    reader = csv.DictReader(f, delimiter='\t')
+    for row in reader:
+        rows.append(row)
+
+date_str = datetime.datetime.strptime(ts, "%Y%m%d_%H%M%S").strftime("%B %d, %Y")
+# Map the TSV's au_met text to a PASS/FAIL badge; any other value is passed through unchanged.
+def pass_fail(met):
+    if met == "True" or met is True:
+        return "✅ PASS"
+    if met == "False" or met is False:
+        return "❌ FAIL"
+    return met
+
+lines = []
+lines.append(f"# UNet3D Training — NP Scaling Study")
+lines.append(f"")
+lines.append(f"**Sweep date**: {date_str}")
+lines.append(f"")
+lines.append(f"---")
+lines.append(f"")
+lines.append(f"## Test Environment")
+lines.append(f"")
+lines.append(f"| Parameter | Value |")
+lines.append(f"|-----------|-------|")
+lines.append(f"| Host | 24 vCPU VM (with hyperthreading), 48 GB RAM |")
+lines.append(f"| Object storage | s3-ultra (`http://127.0.0.1:9000`, co-located on test host) |")
+lines.append(f"| Bucket / path | `{path}` |")
+lines.append(f"| Dataset | {nfiles} NPZ files × 1 sample/file (≈ 984 GiB) |")
+lines.append(f"| Record length | 146,600,628 bytes avg (σ = 68,341,808) |")
+lines.append(f"| Batch size | 7 |")
+lines.append(f"| Read threads | 4 |")
+lines.append(f"| `computation_time` | {ct} s  (B200 = H100 0.323 s ÷ 2) |")
+lines.append(f"| `decode_mode` | `none` |")
+lines.append(f"| Epochs | 5 |")
+lines.append(f"| AU target | ≥ 90% |")
+lines.append(f"| Model config | `unet3d_b200.yaml` |")
+lines.append(f"| MPI invocation | `mpirun -n NP -host 127.0.0.1:NP` |")
+lines.append(f"")
+lines.append(f"> **⚠️ Co-located test configuration.** The s3-ultra storage server and all benchmark")
+lines.append(f"> processes run on the **same** 24 vCPU / 48 GB RAM host, sharing CPU cores, memory,")
+lines.append(f"> and the loopback network interface. In a real deployment storage would be a dedicated")
+lines.append(f"> remote system; the CPU/memory pressure that limits scaling here would not apply.")
+lines.append(f">")
+lines.append(f"> **AU (Accelerator Utilization)** — fraction of wall time the simulated GPU was")
+lines.append(f"> computing rather than waiting for I/O. AU ≥ 90% is the target threshold for a")
+lines.append(f"> \"pass\" on unet3d.")
+lines.append(f"")
+lines.append(f"---")
+lines.append(f"")
+lines.append(f"## NP Scaling Results")
+lines.append(f"")
+lines.append(f"| NP | AU% | Samples/s | I/O MiB/s | Wall time (s) | AU ≥ 90%? |")
+lines.append(f"|----|-----|-----------|-----------|---------------|-----------|")
+for r in rows:
+    pf = pass_fail(r.get("au_met", "N/A"))
+    lines.append(
+        f"| {r['NP']} "
+        f"| {r['au_pct']} "
+        f"| {r['samples_per_sec']} "
+        f"| {r['io_mb_per_sec']} "
+        f"| {r['wall_s']} "
+        f"| {pf} |"
+    )
+lines.append(f"")
+lines.append(f"---")
+lines.append(f"")
+lines.append(f"## Scaling Analysis")
+lines.append(f"")
+if len(rows) >= 2:
+    try:
+        au1 = float(rows[0]['au_pct'])
+        au2 = float(rows[1]['au_pct']) if len(rows) > 1 else None
+        au4 = float(rows[2]['au_pct']) if len(rows) > 2 else None
+        sps1 = float(rows[0]['samples_per_sec'])
+        sps2 = float(rows[1]['samples_per_sec']) if len(rows) > 1 else None
+        sps4 = float(rows[2]['samples_per_sec']) if len(rows) > 2 else None
+
+        lines.append(f"### Throughput Scaling Efficiency")
+        lines.append(f"")
+        lines.append(f"| Transition | Samples/s | Ideal | Efficiency |")
+        lines.append(f"|------------|-----------|-------|------------|")
+        if sps2 is not None:
+            eff = sps2 / (sps1 * 2) * 100
+            lines.append(f"| NP=1 → NP=2 | {sps1:.1f} → {sps2:.1f} | {sps1*2:.1f} | {eff:.1f}% |")
+        if sps4 is not None:
+            eff4 = sps4 / (sps1 * 4) * 100
+            lines.append(f"| NP=1 → NP=4 | {sps1:.1f} → {sps4:.1f} | {sps1*4:.1f} | {eff4:.1f}% |")
+        lines.append(f"")
+    except (ValueError, IndexError, ZeroDivisionError):
+        lines.append(f"*(throughput scaling table: parse error — check TSV)*")
+        lines.append(f"")
+
+lines.append(f"### Key Observations")
+lines.append(f"")
+lines.append(f"1. **NP=1 baseline** — establishes single-accelerator AU and throughput floor.")
+lines.append(f"2. **NP=2 scaling** — first scaling step; throughput should nearly double if I/O-bound,")
+lines.append(f"   or AU should improve if NP=1 was CPU-throttled by co-located s3-ultra.")
+lines.append(f"3. **NP=4** — highest tested NP; co-located s3-ultra competes for CPU at this level.")
+lines.append(f"   If AU drops or throughput plateaus relative to NP=2, storage bandwidth is saturated.")
+lines.append(f"")
+lines.append(f"---")
+lines.append(f"")
+lines.append(f"## Raw Results Location")
+lines.append(f"")
+lines.append(f"Full per-run output in `results/unet3d_np_sweep/{ts}/NP{{1,2,4}}/` —")
+lines.append(f"each contains `summary.json`, per-epoch logs, and DLIO output.")
+
+print('\n'.join(lines))
+PYEOF
+
+echo "════════════════════════════════════════════════════════════════"
+echo "  Markdown results doc: ${MD_OUT}"
+echo "  TSV summary         : ${SUMMARY_TSV}"
+echo "  Finished            : $(date '+%Y-%m-%d %H:%M:%S')"
+echo "════════════════════════════════════════════════════════════════"
diff --git a/tests/object-store/test_retinanet.sh b/tests/object-store/test_retinanet.sh
new file mode 100755
index 00000000..19a7827a
--- /dev/null
+++ b/tests/object-store/test_retinanet.sh
@@ -0,0 +1,77 @@
+#!/usr/bin/env bash
+# =============================================================================
+# test_retinanet.sh — Single-run smoke test for RetinaNet training benchmark
+#
+# True smoke test: NP=1, 200 files, 1 epoch — fast end-to-end sanity check
+# that the pipeline works at all before turning up the heat.
+#
+# The [DATALOADER] log line should show:
+#   TorchIterableDatasetSimple(bulk-prefetch, N workers)
+# and the [INFO] streaming lines should show small chunk counts,
+# confirming the bounded sliding-window path (not thundering-herd) is active.
+#
+# Prerequisites:
+#   - s3-ultra running           (bash s3-ultra/scripts/start_s3ultra2.sh)
+#   - Dataset already generated  (bash tests/object-store/gen_retinanet_jpeg.sh)
+#
+# Usage:
+#   cd /home/eval/Documents/Code/mlp-storage
+#   bash tests/object-store/test_retinanet.sh
+#
+#   # Override NP or file count:
+#   NP=2 bash tests/object-store/test_retinanet.sh
+#   NP=1 NUM_FILES=50000 bash tests/object-store/test_retinanet.sh
+# =============================================================================
+set -euo pipefail
+
+REPO=/home/eval/Documents/Code/mlp-storage
+NP="${NP:-1}"
+NUM_FILES="${NUM_FILES:-200}"           # smoke test: just 200 files; full dataset has 500k
+DATA_FOLDER="data/retinanet"
+STORAGE_ROOT="${STORAGE_ROOT:-mlp-retinanet}"
+
+cd "${REPO}"
+
+# Load credentials; unset BUCKET so env never controls the target bucket
+set -o allexport; source .env.s3-ultra; set +o allexport
+unset BUCKET
+
+source .venv/bin/activate
+
+echo ""
+echo "════════════════════════════════════════════════════════"
+echo "  RetinaNet Smoke Test"
+echo "  NP=${NP}   Bucket: s3://${STORAGE_ROOT}/${DATA_FOLDER}/"
+echo "  Files: ${NUM_FILES}   Endpoint: ${AWS_ENDPOINT_URL}"
+echo "  Started: $(date '+%Y-%m-%d %H:%M:%S')"
+echo "════════════════════════════════════════════════════════"
+echo ""
+
+RUST_LOG=s3dlio=info \
+.venv/bin/python3 -c "from mlpstorage_py.main import main; main()" \
+    training run \
+    --model retinanet \
+    --accelerator-type b200 \
+    --num-accelerators "${NP}" \
+    --num-client-hosts 1 \
+    --client-host-memory-in-gb 47 \
+    --dlio-bin-path "${REPO}/.venv/bin" \
+    --object s3 \
+    --skip-validation \
+    --open \
+    --params \
+        storage.storage_root="${STORAGE_ROOT}" \
+        dataset.num_files_train="${NUM_FILES}" \
+        dataset.num_samples_per_file=1 \
+        dataset.data_folder="${DATA_FOLDER}" \
+        train.computation_time=0.04755 \
+        train.epochs=1 \
+        storage.storage_options.storage_library=s3dlio \
+    2>&1
+
+echo ""
+echo "════════════════════════════════════════════════════════"
+echo "  test_retinanet.sh complete — $(date '+%Y-%m-%d %H:%M:%S')"
+echo "  Check [DATALOADER] lines above for:"
+echo "    TorchIterableDatasetSimple(bulk-prefetch, N workers)"
+echo "════════════════════════════════════════════════════════"
diff --git a/tests/object-store/test_unet3d.sh b/tests/object-store/test_unet3d.sh
new file mode 100755
index 00000000..d1f8231f
--- /dev/null
+++ b/tests/object-store/test_unet3d.sh
@@ -0,0 +1,43 @@
+#!/usr/bin/env bash
+# Quick single-run test for UNet3D training benchmark (NP=1, B200)
+# Dataset: s3://mlp-unet3d/data/unet3d/  (7,200 NPZ files ~984 GiB)
+#
+# Usage:
+#   cd /home/eval/Documents/Code/mlp-storage
+#   bash tests/object-store/test_unet3d.sh
+#
+#   # Override NP:
+#   NP=2 bash tests/object-store/test_unet3d.sh
+set -euo pipefail
+
+REPO=/home/eval/Documents/Code/mlp-storage
+NP="${NP:-1}"
+
+cd "${REPO}"
+# Load credentials only — unset BUCKET so env never controls the target bucket
+set -o allexport; source .env.s3-ultra; set +o allexport
+unset BUCKET
+
+source .venv/bin/activate
+
+RUST_LOG=s3dlio=info \
+.venv/bin/python3 -c "from mlpstorage_py.main import main; main()" \
+    training run \
+    --model unet3d \
+    --accelerator-type b200 \
+    --num-accelerators "${NP}" \
+    --num-client-hosts 1 \
+    --client-host-memory-in-gb 47 \
+    --dlio-bin-path "${REPO}/.venv/bin" \
+    --object s3 \
+    --skip-validation \
+    --open \
+    --params \
+        storage.storage_root="${STORAGE_ROOT:-mlp-unet3d}" \
+        dataset.num_files_train=7200 \
+        dataset.num_samples_per_file=1 \
+        dataset.data_folder=data/unet3d \
+        train.computation_time=0.162 \
+        storage.storage_options.decode_mode=none \
+        storage.storage_options.storage_library=s3dlio \
+    2>&1
diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py
index 236a2f5b..43f5206d 100755
--- a/tests/unit/test_cli.py
+++ b/tests/unit/test_cli.py
@@ -722,3 +722,90 @@ def test_skips_none_values(self, tmp_path):
         result = apply_yaml_config_overrides(args)
         assert result.debug is True  # Should not be overwritten
         assert result.loops == 5
+
+class TestParseArgumentsStorageFlagConsolidation:
+    """Regression tests for issue #367.
+
+    The CLI parser must not crash when a subcommand that doesn't define
+    --file / --object (reports, history, lockfile) is invoked, and must
+    still correctly consolidate those flags into data_access_protocol on
+    benchmark subcommands that do define them (training, checkpointing,
+    vectordb, kvcache).
+    """
+
+    @staticmethod
+    def _run(monkeypatch, argv):
+        """Invoke parse_arguments() with a synthetic sys.argv."""
+        from mlpstorage_py.cli_parser import parse_arguments
+        monkeypatch.setattr(sys, "argv", argv)
+        return parse_arguments()
+
+    # --- non-benchmark subcommands: must not raise AttributeError ---
+
+    def test_reportgen_does_not_crash_without_storage_flags(self, monkeypatch, tmp_path):
+        """Regression test for #367: `reports reportgen` must parse cleanly."""
+        args = self._run(
+            monkeypatch,
+            ["mlpstorage", "reports", "reportgen", "--results-dir", str(tmp_path)],
+        )
+        assert args.program == "reports"
+        assert args.command == "reportgen"
+        assert not hasattr(args, "file")
+        assert not hasattr(args, "object")
+
+    def test_history_does_not_crash_without_storage_flags(self, monkeypatch):
+        """`history show` must parse cleanly (no --file/--object on this parser)."""
+        args = self._run(monkeypatch, ["mlpstorage", "history", "show"])
+        assert args.program == "history"
+        assert args.command == "show"
+        assert not hasattr(args, "file")
+        assert not hasattr(args, "object")
+
+    def test_lockfile_does_not_crash_without_storage_flags(self, monkeypatch):
+        """`lockfile generate` must parse cleanly (no --file/--object on this parser)."""
+        args = self._run(monkeypatch, ["mlpstorage", "lockfile", "generate"])
+        assert args.program == "lockfile"
+        assert not hasattr(args, "file")
+        assert not hasattr(args, "object")
+
+    # --- benchmark subcommands: existing consolidation must still work ---
+
+    def test_training_run_consolidates_file_flag(self, monkeypatch, tmp_path):
+        """`training run --file` should set data_access_protocol='file'."""
+        args = self._run(
+            monkeypatch,
+            [
+                "mlpstorage", "training", "run",
+                "--model", "unet3d",
+                "--hosts", "localhost",
+                "--num-accelerators", "1",
+                "--accelerator-type", "h100",
+                "--client-host-memory-in-gb", "64",
+                "--data-dir", str(tmp_path / "data"),
+                "--results-dir", str(tmp_path / "results"),
+                "--file",
+            ],
+        )
+        assert args.data_access_protocol == "file"
+        assert not hasattr(args, "file")
+        assert not hasattr(args, "object")
+
+    def test_training_run_consolidates_object_flag(self, monkeypatch, tmp_path):
+        """`training run --object s3` should set data_access_protocol='s3'."""
+        args = self._run(
+            monkeypatch,
+            [
+                "mlpstorage", "training", "run",
+                "--model", "unet3d",
+                "--hosts", "localhost",
+                "--num-accelerators", "1",
+                "--accelerator-type", "h100",
+                "--client-host-memory-in-gb", "64",
+                "--data-dir", str(tmp_path / "data"),
+                "--results-dir", str(tmp_path / "results"),
+                "--object", "s3",
+            ],
+        )
+        assert args.data_access_protocol == "s3"
+        assert not hasattr(args, "file")
+        assert not hasattr(args, "object")
diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py
index b6a324f2..418abf3d 100755
--- a/tests/unit/test_utils.py
+++ b/tests/unit/test_utils.py
@@ -579,6 +579,61 @@ def test_uneven_distribution(self, mock_logger):
         # 7 processes across 3 hosts: 3, 2, 2 distribution
         assert '-n 7' in result
 
+    def test_mpi_btl_auto_by_default_single_host(self, mock_logger):
+        """No --mca btl flag is added in auto mode (default)."""
+        result = generate_mpi_prefix_cmd(
+            mpi_cmd=MPIRUN,
+            hosts=['host1'],  # exactly one host
+            num_processes=4,
+            oversubscribe=False,
+            allow_run_as_root=False,
+            params=None,
+            logger=mock_logger  # mpi_btl is deliberately not passed, so its default applies
+        )
+        assert '--mca btl' not in result  # substring check: no BTL option in the returned value
+
+    def test_mpi_btl_tcp_single_host(self, mock_logger):
+        """--mca btl tcp,self is added when mpi_btl='tcp'."""
+        result = generate_mpi_prefix_cmd(
+            mpi_cmd=MPIRUN,
+            hosts=['host1'],  # exactly one host
+            num_processes=4,
+            oversubscribe=False,
+            allow_run_as_root=False,
+            params=None,
+            logger=mock_logger,
+            mpi_btl='tcp'  # explicit BTL selection under test
+        )
+        assert '--mca btl tcp,self' in result  # 'self' is expected alongside the requested 'tcp'
+
+    def test_mpi_btl_vader_single_host(self, mock_logger):
+        """--mca btl vader,self is added when mpi_btl='vader'."""
+        result = generate_mpi_prefix_cmd(
+            mpi_cmd=MPIRUN,
+            hosts=['host1'],  # exactly one host
+            num_processes=4,
+            oversubscribe=False,
+            allow_run_as_root=False,
+            params=None,
+            logger=mock_logger,
+            mpi_btl='vader'  # explicit BTL selection under test
+        )
+        assert '--mca btl vader,self' in result  # 'self' is expected alongside the requested 'vader'
+
+    def test_mpi_btl_not_applied_for_multihost(self, mock_logger):
+        """--mca btl flags are never applied for multi-host runs."""
+        result = generate_mpi_prefix_cmd(
+            mpi_cmd=MPIRUN,
+            hosts=['host1', 'host2'],  # two hosts: the multi-host case
+            num_processes=8,
+            oversubscribe=False,
+            allow_run_as_root=False,
+            params=None,
+            logger=mock_logger,
+            mpi_btl='tcp'  # requested explicitly, yet expected to have no effect here
+        )
+        assert '--mca btl' not in result  # substring check: no BTL option despite mpi_btl='tcp'
+
 
 class TestCommandExecutor:
     """Tests for CommandExecutor class."""
diff --git a/uv.lock b/uv.lock
index adc1a57e..1be0c356 100755
--- a/uv.lock
+++ b/uv.lock
@@ -1,9 +1,10 @@
 version = 1
 requires-python = "==3.12.*"
 resolution-markers = [
-    "sys_platform == 'win32'",
-    "sys_platform == 'emscripten'",
-    "sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "sys_platform == 'linux'",
+]
+supported-markers = [
+    "sys_platform == 'linux'",
 ]
 
 [[package]]
@@ -26,7 +27,7 @@ name = "argon2-cffi"
 version = "25.1.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "argon2-cffi-bindings" },
+    { name = "argon2-cffi-bindings", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/0e/89/ce5af8a7d472a67cc819d5d998aa8c82c5d860608c4db9f46f1162d7dab9/argon2_cffi-25.1.0.tar.gz", hash = "sha256:694ae5cc8a42f4c4e2bf2ca0e64e51e23a040c6a517a85074683d3959e1346c1", size = 45706 }
 wheels = [
@@ -38,20 +39,14 @@ name = "argon2-cffi-bindings"
 version = "25.1.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "cffi" },
+    { name = "cffi", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/5c/2d/db8af0df73c1cf454f71b2bbe5e356b8c1f8041c979f505b3d3186e520a9/argon2_cffi_bindings-25.1.0.tar.gz", hash = "sha256:b957f3e6ea4d55d820e40ff76f450952807013d361a65d7f28acc0acbf29229d", size = 1783441 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/1d/57/96b8b9f93166147826da5f90376e784a10582dd39a393c99bb62cfcf52f0/argon2_cffi_bindings-25.1.0-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:aecba1723ae35330a008418a91ea6cfcedf6d31e5fbaa056a166462ff066d500", size = 54121 },
-    { url = "https://files.pythonhosted.org/packages/0a/08/a9bebdb2e0e602dde230bdde8021b29f71f7841bd54801bcfd514acb5dcf/argon2_cffi_bindings-25.1.0-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:2630b6240b495dfab90aebe159ff784d08ea999aa4b0d17efa734055a07d2f44", size = 29177 },
-    { url = "https://files.pythonhosted.org/packages/b6/02/d297943bcacf05e4f2a94ab6f462831dc20158614e5d067c35d4e63b9acb/argon2_cffi_bindings-25.1.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:7aef0c91e2c0fbca6fc68e7555aa60ef7008a739cbe045541e438373bc54d2b0", size = 31090 },
     { url = "https://files.pythonhosted.org/packages/c1/93/44365f3d75053e53893ec6d733e4a5e3147502663554b4d864587c7828a7/argon2_cffi_bindings-25.1.0-cp39-abi3-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1e021e87faa76ae0d413b619fe2b65ab9a037f24c60a1e6cc43457ae20de6dc6", size = 81246 },
     { url = "https://files.pythonhosted.org/packages/09/52/94108adfdd6e2ddf58be64f959a0b9c7d4ef2fa71086c38356d22dc501ea/argon2_cffi_bindings-25.1.0-cp39-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d3e924cfc503018a714f94a49a149fdc0b644eaead5d1f089330399134fa028a", size = 87126 },
     { url = "https://files.pythonhosted.org/packages/72/70/7a2993a12b0ffa2a9271259b79cc616e2389ed1a4d93842fac5a1f923ffd/argon2_cffi_bindings-25.1.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:c87b72589133f0346a1cb8d5ecca4b933e3c9b64656c9d175270a000e73b288d", size = 80343 },
     { url = "https://files.pythonhosted.org/packages/78/9a/4e5157d893ffc712b74dbd868c7f62365618266982b64accab26bab01edc/argon2_cffi_bindings-25.1.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:1db89609c06afa1a214a69a462ea741cf735b29a57530478c06eb81dd403de99", size = 86777 },
-    { url = "https://files.pythonhosted.org/packages/74/cd/15777dfde1c29d96de7f18edf4cc94c385646852e7c7b0320aa91ccca583/argon2_cffi_bindings-25.1.0-cp39-abi3-win32.whl", hash = "sha256:473bcb5f82924b1becbb637b63303ec8d10e84c8d241119419897a26116515d2", size = 27180 },
-    { url = "https://files.pythonhosted.org/packages/e2/c6/a759ece8f1829d1f162261226fbfd2c6832b3ff7657384045286d2afa384/argon2_cffi_bindings-25.1.0-cp39-abi3-win_amd64.whl", hash = "sha256:a98cd7d17e9f7ce244c0803cad3c23a7d379c301ba618a5fa76a67d116618b98", size = 31715 },
-    { url = "https://files.pythonhosted.org/packages/42/b9/f8d6fa329ab25128b7e98fd83a3cb34d9db5b059a9847eddb840a0af45dd/argon2_cffi_bindings-25.1.0-cp39-abi3-win_arm64.whl", hash = "sha256:b0fdbcf513833809c882823f98dc2f931cf659d9a1429616ac3adebb49f5db94", size = 27149 },
 ]
 
 [[package]]
@@ -59,8 +54,8 @@ name = "astunparse"
 version = "1.6.3"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "six" },
-    { name = "wheel" },
+    { name = "six", marker = "sys_platform == 'linux'" },
+    { name = "wheel", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/f3/af/4182184d3c338792894f34a62672919db7ca008c89abee9b564dd34d8029/astunparse-1.6.3.tar.gz", hash = "sha256:5ad93a8456f0d084c3456d059fd9a92cce667963232cbf763eac3bc5b7940872", size = 18290 }
 wheels = [
@@ -90,12 +85,10 @@ name = "cffi"
 version = "2.0.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "pycparser", marker = "implementation_name != 'PyPy'" },
+    { name = "pycparser", marker = "implementation_name != 'PyPy' and sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/eb/56/b1ba7935a17738ae8453301356628e8147c79dbb825bcbc73dc7401f9846/cffi-2.0.0.tar.gz", hash = "sha256:44d1b5909021139fe36001ae048dbdde8214afa20200eda0f64c068cac5d5529", size = 523588 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/ea/47/4f61023ea636104d4f16ab488e268b93008c3d0bb76893b1b31db1f96802/cffi-2.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6d02d6655b0e54f54c4ef0b94eb6be0607b70853c45ce98bd278dc7de718be5d", size = 185271 },
-    { url = "https://files.pythonhosted.org/packages/df/a2/781b623f57358e360d62cdd7a8c681f074a71d445418a776eef0aadb4ab4/cffi-2.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8eca2a813c1cb7ad4fb74d368c2ffbbb4789d377ee5bb8df98373c2cc0dee76c", size = 181048 },
     { url = "https://files.pythonhosted.org/packages/ff/df/a4f0fbd47331ceeba3d37c2e51e9dfc9722498becbeec2bd8bc856c9538a/cffi-2.0.0-cp312-cp312-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:21d1152871b019407d8ac3985f6775c079416c282e431a4da6afe7aefd2bccbe", size = 212529 },
     { url = "https://files.pythonhosted.org/packages/d5/72/12b5f8d3865bf0f87cf1404d8c374e7487dcf097a1c91c436e72e6badd83/cffi-2.0.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b21e08af67b8a103c71a250401c78d5e0893beff75e28c53c98f4de42f774062", size = 220097 },
     { url = "https://files.pythonhosted.org/packages/c2/95/7a135d52a50dfa7c882ab0ac17e8dc11cec9d55d2c18dda414c051c5e69e/cffi-2.0.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:1e3a615586f05fc4065a8b22b8152f0c1b00cdbc60596d187c2a74f9e3036e4e", size = 207983 },
@@ -103,9 +96,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/78/2d/7fa73dfa841b5ac06c7b8855cfc18622132e365f5b81d02230333ff26e9e/cffi-2.0.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3e17ed538242334bf70832644a32a7aae3d83b57567f9fd60a26257e992b79ba", size = 219572 },
     { url = "https://files.pythonhosted.org/packages/07/e0/267e57e387b4ca276b90f0434ff88b2c2241ad72b16d31836adddfd6031b/cffi-2.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3925dd22fa2b7699ed2617149842d2e6adde22b262fcbfada50e3d195e4b3a94", size = 222963 },
     { url = "https://files.pythonhosted.org/packages/b6/75/1f2747525e06f53efbd878f4d03bac5b859cbc11c633d0fb81432d98a795/cffi-2.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2c8f814d84194c9ea681642fd164267891702542f028a15fc97d4674b6206187", size = 221361 },
-    { url = "https://files.pythonhosted.org/packages/7b/2b/2b6435f76bfeb6bbf055596976da087377ede68df465419d192acf00c437/cffi-2.0.0-cp312-cp312-win32.whl", hash = "sha256:da902562c3e9c550df360bfa53c035b2f241fed6d9aef119048073680ace4a18", size = 172932 },
-    { url = "https://files.pythonhosted.org/packages/f8/ed/13bd4418627013bec4ed6e54283b1959cf6db888048c7cf4b4c3b5b36002/cffi-2.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:da68248800ad6320861f129cd9c1bf96ca849a2771a59e0344e88681905916f5", size = 183557 },
-    { url = "https://files.pythonhosted.org/packages/95/31/9f7f93ad2f8eff1dbc1c3656d7ca5bfd8fb52c9d786b4dcf19b2d02217fa/cffi-2.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:4671d9dd5ec934cb9a73e7ee9676f9362aba54f7f34910956b84d727b0d73fb6", size = 177762 },
 ]
 
 [[package]]
@@ -114,7 +104,6 @@ version = "3.4.6"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/7b/60/e3bec1881450851b087e301bedc3daa9377a4d45f1c26aa90b0b235e38aa/charset_normalizer-3.4.6.tar.gz", hash = "sha256:1ae6b62897110aa7c79ea2f5dd38d1abca6db663687c0b1ad9aed6f6bae3d9d6", size = 143363 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/e5/62/c0815c992c9545347aeea7859b50dc9044d147e2e7278329c6e02ac9a616/charset_normalizer-3.4.6-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:2ef7fedc7a6ecbe99969cd09632516738a97eeb8bd7258bf8a0f23114c057dab", size = 295154 },
     { url = "https://files.pythonhosted.org/packages/a8/37/bdca6613c2e3c58c7421891d80cc3efa1d32e882f7c4a7ee6039c3fc951a/charset_normalizer-3.4.6-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a4ea868bc28109052790eb2b52a9ab33f3aa7adc02f96673526ff47419490e21", size = 199191 },
     { url = "https://files.pythonhosted.org/packages/6c/92/9934d1bbd69f7f398b38c5dae1cbf9cc672e7c34a4adf7b17c0a9c17d15d/charset_normalizer-3.4.6-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:836ab36280f21fc1a03c99cd05c6b7af70d2697e374c7af0b61ed271401a72a2", size = 218674 },
     { url = "https://files.pythonhosted.org/packages/af/90/25f6ab406659286be929fd89ab0e78e38aa183fc374e03aa3c12d730af8a/charset_normalizer-3.4.6-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f1ce721c8a7dfec21fcbdfe04e8f68174183cf4e8188e0645e92aa23985c57ff", size = 215259 },
@@ -127,29 +116,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/16/50/478cdda782c8c9c3fb5da3cc72dd7f331f031e7f1363a893cdd6ca0f8de0/charset_normalizer-3.4.6-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:695f5c2823691a25f17bc5d5ffe79fa90972cc34b002ac6c843bb8a1720e950d", size = 203751 },
     { url = "https://files.pythonhosted.org/packages/75/fc/cc2fcac943939c8e4d8791abfa139f685e5150cae9f94b60f12520feaa9b/charset_normalizer-3.4.6-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:231d4da14bcd9301310faf492051bee27df11f2bc7549bc0bb41fef11b82daa2", size = 216563 },
     { url = "https://files.pythonhosted.org/packages/a8/b7/a4add1d9a5f68f3d037261aecca83abdb0ab15960a3591d340e829b37298/charset_normalizer-3.4.6-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a056d1ad2633548ca18ffa2f85c202cfb48b68615129143915b8dc72a806a923", size = 209265 },
-    { url = "https://files.pythonhosted.org/packages/6c/18/c094561b5d64a24277707698e54b7f67bd17a4f857bbfbb1072bba07c8bf/charset_normalizer-3.4.6-cp312-cp312-win32.whl", hash = "sha256:c2274ca724536f173122f36c98ce188fd24ce3dad886ec2b7af859518ce008a4", size = 144229 },
-    { url = "https://files.pythonhosted.org/packages/ab/20/0567efb3a8fd481b8f34f739ebddc098ed062a59fed41a8d193a61939e8f/charset_normalizer-3.4.6-cp312-cp312-win_amd64.whl", hash = "sha256:c8ae56368f8cc97c7e40a7ee18e1cedaf8e780cd8bc5ed5ac8b81f238614facb", size = 154277 },
-    { url = "https://files.pythonhosted.org/packages/15/57/28d79b44b51933119e21f65479d0864a8d5893e494cf5daab15df0247c17/charset_normalizer-3.4.6-cp312-cp312-win_arm64.whl", hash = "sha256:899d28f422116b08be5118ef350c292b36fc15ec2daeb9ea987c89281c7bb5c4", size = 142817 },
     { url = "https://files.pythonhosted.org/packages/2a/68/687187c7e26cb24ccbd88e5069f5ef00eba804d36dde11d99aad0838ab45/charset_normalizer-3.4.6-py3-none-any.whl", hash = "sha256:947cf925bc916d90adba35a64c82aace04fa39b46b52d4630ece166655905a69", size = 61455 },
 ]
 
-[[package]]
-name = "colorama"
-version = "0.4.6"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697 }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 },
-]
-
 [[package]]
 name = "coverage"
 version = "7.13.5"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/9d/e0/70553e3000e345daff267cec284ce4cbf3fc141b6da229ac52775b5428f1/coverage-7.13.5.tar.gz", hash = "sha256:c81f6515c4c40141f83f502b07bbfa5c240ba25bbe73da7b33f1e5b6120ff179", size = 915967 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/a0/c3/a396306ba7db865bf96fc1fb3b7fd29bcbf3d829df642e77b13555163cd6/coverage-7.13.5-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:460cf0114c5016fa841214ff5564aa4864f11948da9440bc97e21ad1f4ba1e01", size = 219554 },
-    { url = "https://files.pythonhosted.org/packages/a6/16/a68a19e5384e93f811dccc51034b1fd0b865841c390e3c931dcc4699e035/coverage-7.13.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0e223ce4b4ed47f065bfb123687686512e37629be25cc63728557ae7db261422", size = 219908 },
     { url = "https://files.pythonhosted.org/packages/29/72/20b917c6793af3a5ceb7fb9c50033f3ec7865f2911a1416b34a7cfa0813b/coverage-7.13.5-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:6e3370441f4513c6252bf042b9c36d22491142385049243253c7e48398a15a9f", size = 251419 },
     { url = "https://files.pythonhosted.org/packages/8c/49/cd14b789536ac6a4778c453c6a2338bc0a2fb60c5a5a41b4008328b9acc1/coverage-7.13.5-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:03ccc709a17a1de074fb1d11f217342fb0d2b1582ed544f554fc9fc3f07e95f5", size = 254159 },
     { url = "https://files.pythonhosted.org/packages/9d/00/7b0edcfe64e2ed4c0340dac14a52ad0f4c9bd0b8b5e531af7d55b703db7c/coverage-7.13.5-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3f4818d065964db3c1c66dc0fbdac5ac692ecbc875555e13374fdbe7eedb4376", size = 255270 },
@@ -160,9 +135,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/37/a6/f79fb37aa104b562207cc23cb5711ab6793608e246cae1e93f26b2236ed9/coverage-7.13.5-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:9d44d7aa963820b1b971dbecd90bfe5fe8f81cff79787eb6cca15750bd2f79b9", size = 255404 },
     { url = "https://files.pythonhosted.org/packages/75/f0/ed15262a58ec81ce457ceb717b7f78752a1713556b19081b76e90896e8d4/coverage-7.13.5-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:7132bed4bd7b836200c591410ae7d97bf7ae8be6fc87d160b2bd881df929e7bf", size = 250903 },
     { url = "https://files.pythonhosted.org/packages/0f/e9/9129958f20e7e9d4d56d51d42ccf708d15cac355ff4ac6e736e97a9393d2/coverage-7.13.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a698e363641b98843c517817db75373c83254781426e94ada3197cabbc2c919c", size = 252780 },
-    { url = "https://files.pythonhosted.org/packages/a4/d7/0ad9b15812d81272db94379fe4c6df8fd17781cc7671fdfa30c76ba5ff7b/coverage-7.13.5-cp312-cp312-win32.whl", hash = "sha256:bdba0a6b8812e8c7df002d908a9a2ea3c36e92611b5708633c50869e6d922fdf", size = 222093 },
-    { url = "https://files.pythonhosted.org/packages/29/3d/821a9a5799fac2556bcf0bd37a70d1d11fa9e49784b6d22e92e8b2f85f18/coverage-7.13.5-cp312-cp312-win_amd64.whl", hash = "sha256:d2c87e0c473a10bffe991502eac389220533024c8082ec1ce849f4218dded810", size = 222900 },
-    { url = "https://files.pythonhosted.org/packages/d4/fa/2238c2ad08e35cf4f020ea721f717e09ec3152aea75d191a7faf3ef009a8/coverage-7.13.5-cp312-cp312-win_arm64.whl", hash = "sha256:bf69236a9a81bdca3bff53796237aab096cdbf8d78a66ad61e992d9dac7eb2de", size = 221515 },
     { url = "https://files.pythonhosted.org/packages/9e/ee/a4cf96b8ce1e566ed238f0659ac2d3f007ed1d14b181bcb684e19561a69a/coverage-7.13.5-py3-none-any.whl", hash = "sha256:34b02417cf070e173989b3db962f7ed56d2f644307b2cf9d5a0f258e13084a61", size = 211346 },
 ]
 
@@ -171,7 +143,7 @@ name = "cuda-bindings"
 version = "13.2.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "cuda-pathfinder", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" },
+    { name = "cuda-pathfinder", marker = "sys_platform == 'linux'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/52/c8/b2589d68acf7e3d63e2be330b84bc25712e97ed799affbca7edd7eae25d6/cuda_bindings-13.2.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e865447abfb83d6a98ad5130ed3c70b1fc295ae3eeee39fd07b4ddb0671b6788", size = 5722404 },
@@ -231,37 +203,39 @@ nvtx = [
 
 [[package]]
 name = "dgen-py"
-version = "0.2.3"
+version = "0.2.4"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "zstandard" },
+    { name = "zstandard", marker = "sys_platform == 'linux'" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/ad/9f/e04c2c79bd91937593d79bb480c83c67141922da26ba39cff6d5f38e1673/dgen_py-0.2.3.tar.gz", hash = "sha256:fbebb1fc6b24f77abc78baaec82218c6377c1a84d8caf2f055899c1cee050ecd", size = 208444 }
+sdist = { url = "https://files.pythonhosted.org/packages/2c/ee/f839357750c2229643abf2627b43d0f12d6984e79ba6891522a3aabc52b6/dgen_py-0.2.4.tar.gz", hash = "sha256:a1820092a1ac4a793ceda1db30de66339b7a75fd8e609f6cb6be84c31ecdb625", size = 217909 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/55/42/b24dd7f7794b3a999290fa461d745caf9e1bad07643caf912f575b833b10/dgen_py-0.2.3-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:44eb5b802cf5cb721c76e30d1e94cbf86cc9d64dab44caef127f82fe6f253d6d", size = 392290 },
+    { url = "https://files.pythonhosted.org/packages/2b/91/2dae75d696c0f9e380acc7bcda09ccddb70d27455dab59e0c90424fe5881/dgen_py-0.2.4-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:e49af6efdbd11860f24ce804bd1a1b3b6b71a1f5f5de55b33977f14ad9bc41ab", size = 394488 },
+    { url = "https://files.pythonhosted.org/packages/a9/54/2f7d900bee5be6177a3c7b25fe50699217c722efa0fc2f05a4366bb3cfec/dgen_py-0.2.4-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:8acba9dfc8512e9dcfa1b4496d11b8511a35c7a4611290f769792a250e61a4f7", size = 404759 },
 ]
 
 [[package]]
 name = "dlio-benchmark"
-version = "3.0.1"
-source = { git = "https://github.com/russfellows/dlio_benchmark.git?branch=feat%2Fparquet-dgen-streaming#842fb9b0bd9d26c773433b4d0805922040206b50" }
+version = "3.0.2"
+source = { git = "https://github.com/russfellows/dlio_benchmark.git?rev=21c0723de897add728158943d369abd4b333f7dc#21c0723de897add728158943d369abd4b333f7dc" }
 dependencies = [
-    { name = "dgen-py" },
-    { name = "h5py" },
-    { name = "hydra-core" },
-    { name = "mpi4py" },
-    { name = "numpy" },
-    { name = "omegaconf" },
-    { name = "pandas" },
-    { name = "pillow" },
-    { name = "psutil" },
-    { name = "pyarrow" },
-    { name = "pydftracer" },
-    { name = "pyyaml" },
-    { name = "s3dlio" },
-    { name = "tensorflow" },
-    { name = "torch" },
-    { name = "typing-extensions" },
+    { name = "dgen-py", marker = "sys_platform == 'linux'" },
+    { name = "h5py", marker = "sys_platform == 'linux'" },
+    { name = "hydra-core", marker = "sys_platform == 'linux'" },
+    { name = "mpi4py", marker = "sys_platform == 'linux'" },
+    { name = "numpy", marker = "sys_platform == 'linux'" },
+    { name = "omegaconf", marker = "sys_platform == 'linux'" },
+    { name = "pandas", marker = "sys_platform == 'linux'" },
+    { name = "pillow", marker = "sys_platform == 'linux'" },
+    { name = "psutil", marker = "sys_platform == 'linux'" },
+    { name = "pyarrow", marker = "sys_platform == 'linux'" },
+    { name = "pydftracer", marker = "sys_platform == 'linux'" },
+    { name = "pyyaml", marker = "sys_platform == 'linux'" },
+    { name = "s3dlio", marker = "sys_platform == 'linux'" },
+    { name = "s3torchconnector", marker = "sys_platform == 'linux'" },
+    { name = "tensorflow", marker = "sys_platform == 'linux'" },
+    { name = "torch", marker = "sys_platform == 'linux'" },
+    { name = "typing-extensions", marker = "sys_platform == 'linux'" },
 ]
 
 [[package]]
@@ -304,7 +278,7 @@ name = "google-pasta"
 version = "0.2.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "six" },
+    { name = "six", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/35/4a/0bd53b36ff0323d10d5f24ebd67af2de10a1117f5cf4d7add90df92756f1/google-pasta-0.2.0.tar.gz", hash = "sha256:c9f2c8dfc8f96d0d5808299920721be30c9eec37f2389f28904f454565c8a16e", size = 40430 }
 wheels = [
@@ -316,20 +290,17 @@ name = "grpcio"
 version = "1.80.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "typing-extensions" },
+    { name = "typing-extensions", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/b7/48/af6173dbca4454f4637a4678b67f52ca7e0c1ed7d5894d89d434fecede05/grpcio-1.80.0.tar.gz", hash = "sha256:29aca15edd0688c22ba01d7cc01cb000d72b2033f4a3c72a81a19b56fd143257", size = 12978905 }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/5c/e8/a2b749265eb3415abc94f2e619bbd9e9707bebdda787e61c593004ec927a/grpcio-1.80.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:c624cc9f1008361014378c9d776de7182b11fe8b2e5a81bc69f23a295f2a1ad0", size = 6015616 },
-    { url = "https://files.pythonhosted.org/packages/3e/97/b1282161a15d699d1e90c360df18d19165a045ce1c343c7f313f5e8a0b77/grpcio-1.80.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:f49eddcac43c3bf350c0385366a58f36bed8cc2c0ec35ef7b74b49e56552c0c2", size = 12014204 },
     { url = "https://files.pythonhosted.org/packages/6e/5e/d319c6e997b50c155ac5a8cb12f5173d5b42677510e886d250d50264949d/grpcio-1.80.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d334591df610ab94714048e0d5b4f3dd5ad1bee74dfec11eee344220077a79de", size = 6563866 },
     { url = "https://files.pythonhosted.org/packages/ae/f6/fdd975a2cb4d78eb67769a7b3b3830970bfa2e919f1decf724ae4445f42c/grpcio-1.80.0-cp312-cp312-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:0cb517eb1d0d0aaf1d87af7cc5b801d686557c1d88b2619f5e31fab3c2315921", size = 7273060 },
     { url = "https://files.pythonhosted.org/packages/db/f0/a3deb5feba60d9538a962913e37bd2e69a195f1c3376a3dd44fe0427e996/grpcio-1.80.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4e78c4ac0d97dc2e569b2f4bcbbb447491167cb358d1a389fc4af71ab6f70411", size = 6782121 },
     { url = "https://files.pythonhosted.org/packages/ca/84/36c6dcfddc093e108141f757c407902a05085e0c328007cb090d56646cdf/grpcio-1.80.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2ed770b4c06984f3b47eb0517b1c69ad0b84ef3f40128f51448433be904634cd", size = 7383811 },
     { url = "https://files.pythonhosted.org/packages/7c/ef/f3a77e3dc5b471a0ec86c564c98d6adfa3510d38f8ee99010410858d591e/grpcio-1.80.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:256507e2f524092f1473071a05e65a5b10d84b82e3ff24c5b571513cfaa61e2f", size = 8393860 },
     { url = "https://files.pythonhosted.org/packages/9b/8d/9d4d27ed7f33d109c50d6b5ce578a9914aa68edab75d65869a17e630a8d1/grpcio-1.80.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:9a6284a5d907c37db53350645567c522be314bac859a64a7a5ca63b77bb7958f", size = 7830132 },
-    { url = "https://files.pythonhosted.org/packages/14/e4/9990b41c6d7a44e1e9dee8ac11d7a9802ba1378b40d77468a7761d1ad288/grpcio-1.80.0-cp312-cp312-win32.whl", hash = "sha256:c71309cfce2f22be26aa4a847357c502db6c621f1a49825ae98aa0907595b193", size = 4140904 },
-    { url = "https://files.pythonhosted.org/packages/2f/2c/296f6138caca1f4b92a31ace4ae1b87dab692fc16a7a3417af3bb3c805bf/grpcio-1.80.0-cp312-cp312-win_amd64.whl", hash = "sha256:9fe648599c0e37594c4809d81a9e77bd138cc82eb8baa71b6a86af65426723ff", size = 4880944 },
 ]
 
 [[package]]
@@ -337,18 +308,14 @@ name = "h5py"
 version = "3.16.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "numpy" },
+    { name = "numpy", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/db/33/acd0ce6863b6c0d7735007df01815403f5589a21ff8c2e1ee2587a38f548/h5py-3.16.0.tar.gz", hash = "sha256:a0dbaad796840ccaa67a4c144a0d0c8080073c34c76d5a6941d6818678ef2738", size = 446526 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/c8/c0/5d4119dba94093bbafede500d3defd2f5eab7897732998c04b54021e530b/h5py-3.16.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c5313566f4643121a78503a473f0fb1e6dcc541d5115c44f05e037609c565c4d", size = 3685604 },
-    { url = "https://files.pythonhosted.org/packages/b0/42/c84efcc1d4caebafb1ecd8be4643f39c85c47a80fe254d92b8b43b1eadaf/h5py-3.16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:42b012933a83e1a558c673176676a10ce2fd3759976a0fedee1e672d1e04fc9d", size = 3061940 },
     { url = "https://files.pythonhosted.org/packages/89/84/06281c82d4d1686fde1ac6b0f307c50918f1c0151062445ab3b6fa5a921d/h5py-3.16.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:ff24039e2573297787c3063df64b60aab0591980ac898329a08b0320e0cf2527", size = 5198852 },
     { url = "https://files.pythonhosted.org/packages/9e/e9/1a19e42cd43cc1365e127db6aae85e1c671da1d9a5d746f4d34a50edb577/h5py-3.16.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:dfc21898ff025f1e8e67e194965a95a8d4754f452f83454538f98f8a3fcb207e", size = 5405250 },
     { url = "https://files.pythonhosted.org/packages/b7/8e/9790c1655eabeb85b92b1ecab7d7e62a2069e53baefd58c98f0909c7a948/h5py-3.16.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:698dd69291272642ffda44a0ecd6cd3bda5faf9621452d255f57ce91487b9794", size = 5190108 },
     { url = "https://files.pythonhosted.org/packages/51/d7/ab693274f1bd7e8c5f9fdd6c7003a88d59bedeaf8752716a55f532924fbb/h5py-3.16.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2b2c02b0a160faed5fb33f1ba8a264a37ee240b22e049ecc827345d0d9043074", size = 5419216 },
-    { url = "https://files.pythonhosted.org/packages/03/c1/0976b235cf29ead553e22f2fb6385a8252b533715e00d0ae52ed7b900582/h5py-3.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:96b422019a1c8975c2d5dadcf61d4ba6f01c31f92bbde6e4649607885fe502d6", size = 3182868 },
-    { url = "https://files.pythonhosted.org/packages/14/d9/866b7e570b39070f92d47b0ff1800f0f8239b6f9e45f02363d7112336c1f/h5py-3.16.0-cp312-cp312-win_arm64.whl", hash = "sha256:39c2838fb1e8d97bcf1755e60ad1f3dd76a7b2a475928dc321672752678b96db", size = 2653286 },
 ]
 
 [[package]]
@@ -356,9 +323,9 @@ name = "hydra-core"
 version = "1.3.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "antlr4-python3-runtime" },
-    { name = "omegaconf" },
-    { name = "packaging" },
+    { name = "antlr4-python3-runtime", marker = "sys_platform == 'linux'" },
+    { name = "omegaconf", marker = "sys_platform == 'linux'" },
+    { name = "packaging", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/6d/8e/07e42bc434a847154083b315779b0a81d567154504624e181caf2c71cd98/hydra-core-1.3.2.tar.gz", hash = "sha256:8a878ed67216997c3e9d88a8e72e7b4767e81af37afb4ea3334b269a4390a824", size = 3263494 }
 wheels = [
@@ -388,7 +355,7 @@ name = "jinja2"
 version = "3.1.6"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "markupsafe" },
+    { name = "markupsafe", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115 }
 wheels = [
@@ -400,14 +367,14 @@ name = "keras"
 version = "3.13.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "absl-py" },
-    { name = "h5py" },
-    { name = "ml-dtypes" },
-    { name = "namex" },
-    { name = "numpy" },
-    { name = "optree" },
-    { name = "packaging" },
-    { name = "rich" },
+    { name = "absl-py", marker = "sys_platform == 'linux'" },
+    { name = "h5py", marker = "sys_platform == 'linux'" },
+    { name = "ml-dtypes", marker = "sys_platform == 'linux'" },
+    { name = "namex", marker = "sys_platform == 'linux'" },
+    { name = "numpy", marker = "sys_platform == 'linux'" },
+    { name = "optree", marker = "sys_platform == 'linux'" },
+    { name = "packaging", marker = "sys_platform == 'linux'" },
+    { name = "rich", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/09/e9/400582e5f3dbd815d2a373f7de7717dd1bc8349274e9ac1b9ac47410b123/keras-3.13.2.tar.gz", hash = "sha256:62f0123488ac87c929c988617e14f293f7bc993811837d08bb37eff77adc85a9", size = 1155875 }
 wheels = [
@@ -420,15 +387,10 @@ version = "18.1.1"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/6e/5c/ca35e19a4f142adffa27e3d652196b7362fa612243e2b916845d801454fc/libclang-18.1.1.tar.gz", hash = "sha256:a1214966d08d73d971287fc3ead8dfaf82eb07fb197680d8b3859dbbbbf78250", size = 39612 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/4b/49/f5e3e7e1419872b69f6f5e82ba56e33955a74bd537d8a1f5f1eff2f3668a/libclang-18.1.1-1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:0b2e143f0fac830156feb56f9231ff8338c20aecfe72b4ffe96f19e5a1dbb69a", size = 25836045 },
-    { url = "https://files.pythonhosted.org/packages/e2/e5/fc61bbded91a8830ccce94c5294ecd6e88e496cc85f6704bf350c0634b70/libclang-18.1.1-py2.py3-none-macosx_10_9_x86_64.whl", hash = "sha256:6f14c3f194704e5d09769108f03185fce7acaf1d1ae4bbb2f30a72c2400cb7c5", size = 26502641 },
-    { url = "https://files.pythonhosted.org/packages/db/ed/1df62b44db2583375f6a8a5e2ca5432bbdc3edb477942b9b7c848c720055/libclang-18.1.1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:83ce5045d101b669ac38e6da8e58765f12da2d3aafb3b9b98d88b286a60964d8", size = 26420207 },
     { url = "https://files.pythonhosted.org/packages/1d/fc/716c1e62e512ef1c160e7984a73a5fc7df45166f2ff3f254e71c58076f7c/libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl", hash = "sha256:c533091d8a3bbf7460a00cb6c1a71da93bffe148f172c7d03b1c31fbf8aa2a0b", size = 24515943 },
     { url = "https://files.pythonhosted.org/packages/3c/3d/f0ac1150280d8d20d059608cf2d5ff61b7c3b7f7bcf9c0f425ab92df769a/libclang-18.1.1-py2.py3-none-manylinux2014_aarch64.whl", hash = "sha256:54dda940a4a0491a9d1532bf071ea3ef26e6dbaf03b5000ed94dd7174e8f9592", size = 23784972 },
     { url = "https://files.pythonhosted.org/packages/fe/2f/d920822c2b1ce9326a4c78c0c2b4aa3fde610c7ee9f631b600acb5376c26/libclang-18.1.1-py2.py3-none-manylinux2014_armv7l.whl", hash = "sha256:cf4a99b05376513717ab5d82a0db832c56ccea4fd61a69dbb7bccf2dfb207dbe", size = 20259606 },
     { url = "https://files.pythonhosted.org/packages/2d/c2/de1db8c6d413597076a4259cea409b83459b2db997c003578affdd32bf66/libclang-18.1.1-py2.py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:69f8eb8f65c279e765ffd28aaa7e9e364c776c17618af8bff22a8df58677ff4f", size = 24921494 },
-    { url = "https://files.pythonhosted.org/packages/0b/2d/3f480b1e1d31eb3d6de5e3ef641954e5c67430d5ac93b7fa7e07589576c7/libclang-18.1.1-py2.py3-none-win_amd64.whl", hash = "sha256:4dd2d3b82fab35e2bf9ca717d7b63ac990a3519c7e312f19fa8e86dcc712f7fb", size = 26415083 },
-    { url = "https://files.pythonhosted.org/packages/71/cf/e01dc4cc79779cd82d77888a88ae2fa424d93b445ad4f6c02bfc18335b70/libclang-18.1.1-py2.py3-none-win_arm64.whl", hash = "sha256:3f0e1f49f04d3cd198985fea0511576b0aee16f9ff0e0f0cad7f9c57ec3c20e8", size = 22361112 },
 ]
 
 [[package]]
@@ -445,7 +407,7 @@ name = "markdown-it-py"
 version = "4.0.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "mdurl" },
+    { name = "mdurl", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/5b/f5/4ec618ed16cc4f8fb3b701563655a69816155e79e24a17b651541804721d/markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3", size = 73070 }
 wheels = [
@@ -458,17 +420,12 @@ version = "3.0.3"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/7e/99/7690b6d4034fffd95959cbe0c02de8deb3098cc577c67bb6a24fe5d7caa7/markupsafe-3.0.3.tar.gz", hash = "sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698", size = 80313 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/5a/72/147da192e38635ada20e0a2e1a51cf8823d2119ce8883f7053879c2199b5/markupsafe-3.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d53197da72cc091b024dd97249dfc7794d6a56530370992a5e1a08983ad9230e", size = 11615 },
-    { url = "https://files.pythonhosted.org/packages/9a/81/7e4e08678a1f98521201c3079f77db69fb552acd56067661f8c2f534a718/markupsafe-3.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1872df69a4de6aead3491198eaf13810b565bdbeec3ae2dc8780f14458ec73ce", size = 12020 },
     { url = "https://files.pythonhosted.org/packages/1e/2c/799f4742efc39633a1b54a92eec4082e4f815314869865d876824c257c1e/markupsafe-3.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3a7e8ae81ae39e62a41ec302f972ba6ae23a5c5396c8e60113e9066ef893da0d", size = 24332 },
     { url = "https://files.pythonhosted.org/packages/3c/2e/8d0c2ab90a8c1d9a24f0399058ab8519a3279d1bd4289511d74e909f060e/markupsafe-3.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d6dd0be5b5b189d31db7cda48b91d7e0a9795f31430b7f271219ab30f1d3ac9d", size = 22947 },
     { url = "https://files.pythonhosted.org/packages/2c/54/887f3092a85238093a0b2154bd629c89444f395618842e8b0c41783898ea/markupsafe-3.0.3-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:94c6f0bb423f739146aec64595853541634bde58b2135f27f61c1ffd1cd4d16a", size = 21962 },
     { url = "https://files.pythonhosted.org/packages/c9/2f/336b8c7b6f4a4d95e91119dc8521402461b74a485558d8f238a68312f11c/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:be8813b57049a7dc738189df53d69395eba14fb99345e0a5994914a3864c8a4b", size = 23760 },
     { url = "https://files.pythonhosted.org/packages/32/43/67935f2b7e4982ffb50a4d169b724d74b62a3964bc1a9a527f5ac4f1ee2b/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:83891d0e9fb81a825d9a6d61e3f07550ca70a076484292a70fde82c4b807286f", size = 21529 },
     { url = "https://files.pythonhosted.org/packages/89/e0/4486f11e51bbba8b0c041098859e869e304d1c261e59244baa3d295d47b7/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:77f0643abe7495da77fb436f50f8dab76dbc6e5fd25d39589a0f1fe6548bfa2b", size = 23015 },
-    { url = "https://files.pythonhosted.org/packages/2f/e1/78ee7a023dac597a5825441ebd17170785a9dab23de95d2c7508ade94e0e/markupsafe-3.0.3-cp312-cp312-win32.whl", hash = "sha256:d88b440e37a16e651bda4c7c2b930eb586fd15ca7406cb39e211fcff3bf3017d", size = 14540 },
-    { url = "https://files.pythonhosted.org/packages/aa/5b/bec5aa9bbbb2c946ca2733ef9c4ca91c91b6a24580193e891b5f7dbe8e1e/markupsafe-3.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:26a5784ded40c9e318cfc2bdb30fe164bdb8665ded9cd64d500a34fb42067b1c", size = 15105 },
-    { url = "https://files.pythonhosted.org/packages/e5/f1/216fc1bbfd74011693a4fd837e7026152e89c4bcf3e77b6692fba9923123/markupsafe-3.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:35add3b638a5d900e807944a078b51922212fb3dedb01633a8defc4b01a3c85f", size = 13906 },
 ]
 
 [[package]]
@@ -485,11 +442,11 @@ name = "minio"
 version = "7.2.20"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "argon2-cffi" },
-    { name = "certifi" },
-    { name = "pycryptodome" },
-    { name = "typing-extensions" },
-    { name = "urllib3" },
+    { name = "argon2-cffi", marker = "sys_platform == 'linux'" },
+    { name = "certifi", marker = "sys_platform == 'linux'" },
+    { name = "pycryptodome", marker = "sys_platform == 'linux'" },
+    { name = "typing-extensions", marker = "sys_platform == 'linux'" },
+    { name = "urllib3", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/40/df/6dfc6540f96a74125a11653cce717603fd5b7d0001a8e847b3e54e72d238/minio-7.2.20.tar.gz", hash = "sha256:95898b7a023fbbfde375985aa77e2cd6a0762268db79cf886f002a9ea8e68598", size = 136113 }
 wheels = [
@@ -501,54 +458,56 @@ name = "ml-dtypes"
 version = "0.5.4"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "numpy" },
+    { name = "numpy", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/0e/4a/c27b42ed9b1c7d13d9ba8b6905dece787d6259152f2309338aed29b2447b/ml_dtypes-0.5.4.tar.gz", hash = "sha256:8ab06a50fb9bf9666dd0fe5dfb4676fa2b0ac0f31ecff72a6c3af8e22c063453", size = 692314 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/a8/b8/3c70881695e056f8a32f8b941126cf78775d9a4d7feba8abcb52cb7b04f2/ml_dtypes-0.5.4-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:a174837a64f5b16cab6f368171a1a03a27936b31699d167684073ff1c4237dac", size = 676927 },
     { url = "https://files.pythonhosted.org/packages/54/0f/428ef6881782e5ebb7eca459689448c0394fa0a80bea3aa9262cba5445ea/ml_dtypes-0.5.4-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a7f7c643e8b1320fd958bf098aa7ecf70623a42ec5154e3be3be673f4c34d900", size = 5028464 },
     { url = "https://files.pythonhosted.org/packages/3a/cb/28ce52eb94390dda42599c98ea0204d74799e4d8047a0eb559b6fd648056/ml_dtypes-0.5.4-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9ad459e99793fa6e13bd5b7e6792c8f9190b4e5a1b45c63aba14a4d0a7f1d5ff", size = 5009002 },
-    { url = "https://files.pythonhosted.org/packages/f5/f0/0cfadd537c5470378b1b32bd859cf2824972174b51b873c9d95cfd7475a5/ml_dtypes-0.5.4-cp312-cp312-win_amd64.whl", hash = "sha256:c1a953995cccb9e25a4ae19e34316671e4e2edaebe4cf538229b1fc7109087b7", size = 212222 },
-    { url = "https://files.pythonhosted.org/packages/16/2e/9acc86985bfad8f2c2d30291b27cd2bb4c74cea08695bd540906ed744249/ml_dtypes-0.5.4-cp312-cp312-win_arm64.whl", hash = "sha256:9bad06436568442575beb2d03389aa7456c690a5b05892c471215bfd8cf39460", size = 160793 },
 ]
 
 [[package]]
 name = "mlpstorage"
-version = "2.0.0b1"
+version = "3.0.2"
 source = { editable = "." }
 dependencies = [
-    { name = "dlio-benchmark" },
-    { name = "minio" },
-    { name = "packaging" },
-    { name = "psutil" },
-    { name = "pyarrow" },
-    { name = "python-dotenv" },
-    { name = "pyyaml" },
-    { name = "rich" },
-    { name = "s3dlio" },
-    { name = "s3torchconnector" },
+    { name = "dlio-benchmark", marker = "sys_platform == 'linux'" },
+    { name = "minio", marker = "sys_platform == 'linux'" },
+    { name = "packaging", marker = "sys_platform == 'linux'" },
+    { name = "psutil", marker = "sys_platform == 'linux'" },
+    { name = "pyarrow", marker = "sys_platform == 'linux'" },
+    { name = "python-dotenv", marker = "sys_platform == 'linux'" },
+    { name = "pyyaml", marker = "sys_platform == 'linux'" },
+    { name = "rich", marker = "sys_platform == 'linux'" },
+    { name = "s3dlio", marker = "sys_platform == 'linux'" },
+    { name = "s3torchconnector", marker = "sys_platform == 'linux'" },
 ]
 
 [package.optional-dependencies]
 full = [
-    { name = "dlio-benchmark" },
+    { name = "dlio-benchmark", marker = "sys_platform == 'linux'" },
 ]
 test = [
-    { name = "pytest" },
-    { name = "pytest-cov" },
-    { name = "pytest-mock" },
+    { name = "pytest", marker = "sys_platform == 'linux'" },
+    { name = "pytest-cov", marker = "sys_platform == 'linux'" },
+    { name = "pytest-mock", marker = "sys_platform == 'linux'" },
 ]
 vectordb = [
-    { name = "numpy" },
-    { name = "pandas" },
-    { name = "pymilvus" },
-    { name = "tabulate" },
+    { name = "numpy", marker = "sys_platform == 'linux'" },
+    { name = "pandas", marker = "sys_platform == 'linux'" },
+    { name = "pymilvus", marker = "sys_platform == 'linux'" },
+    { name = "tabulate", marker = "sys_platform == 'linux'" },
+]
+
+[package.dev-dependencies]
+dev = [
+    { name = "pytest", marker = "sys_platform == 'linux'" },
 ]
 
 [package.metadata]
 requires-dist = [
-    { name = "dlio-benchmark", git = "https://github.com/russfellows/dlio_benchmark.git?branch=feat%2Fparquet-dgen-streaming" },
-    { name = "dlio-benchmark", marker = "extra == 'full'", git = "https://github.com/russfellows/dlio_benchmark.git?branch=feat%2Fparquet-dgen-streaming" },
+    { name = "dlio-benchmark", git = "https://github.com/russfellows/dlio_benchmark.git?rev=21c0723de897add728158943d369abd4b333f7dc" },
+    { name = "dlio-benchmark", marker = "extra == 'full'", git = "https://github.com/russfellows/dlio_benchmark.git?rev=21c0723de897add728158943d369abd4b333f7dc" },
     { name = "minio", specifier = ">=7.2.20" },
     { name = "numpy", marker = "extra == 'vectordb'", specifier = ">=1.24" },
     { name = "packaging", specifier = ">=21.0" },
@@ -562,27 +521,24 @@ requires-dist = [
     { name = "python-dotenv", specifier = ">=1.0.0" },
     { name = "pyyaml", specifier = ">=6.0" },
     { name = "rich", specifier = ">=13.0" },
-    { name = "s3dlio", specifier = ">=0.9.95" },
+    { name = "s3dlio", specifier = ">=0.9.100" },
     { name = "s3torchconnector", specifier = ">=1.5.0" },
     { name = "tabulate", marker = "extra == 'vectordb'", specifier = ">=0.9" },
 ]
 
+[package.metadata.requires-dev]
+dev = [{ name = "pytest", specifier = ">=9.0.2" }]
+
 [[package]]
 name = "mpi4py"
 version = "4.1.1"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/62/74/28ea85b0b949cad827ea50720e00e814e88c8fd536c27c3c491e4f025724/mpi4py-4.1.1.tar.gz", hash = "sha256:eb2c8489bdbc47fdc6b26ca7576e927a11b070b6de196a443132766b3d0a2a22", size = 500518 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/36/b3/2e7df40608f2188dca16e38f8030add1071f06b1cd94dd8a4e16b9acbd84/mpi4py-4.1.1-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:1586f5d1557abed9cba7e984d18f32e787b353be0986e599974db177ae36329a", size = 1422849 },
-    { url = "https://files.pythonhosted.org/packages/6d/ed/970bd3edc0e614eccc726fa406255b88f728a8bc059e81f96f28d6ede0af/mpi4py-4.1.1-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:ba85e4778d63c750226de95115c92b709f38d7e661be660a275da4f0992ee197", size = 1326982 },
     { url = "https://files.pythonhosted.org/packages/5d/c3/f9a5d1f9ba52ac6386bf3d3550027f42a6b102b0432113cc43294420feb2/mpi4py-4.1.1-cp310-abi3-manylinux1_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:0a8332884626994d9ef48da233dc7a0355f4868dd7ff59f078d5813a2935b930", size = 1373127 },
     { url = "https://files.pythonhosted.org/packages/84/d1/1fe75025df801d817ed49371c719559f742f3f263323442d34dbe3366af3/mpi4py-4.1.1-cp310-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6e0352860f0b3e18bc0dcb47e42e583ccb9472f89752d711a6fca46a38670554", size = 1225134 },
-    { url = "https://files.pythonhosted.org/packages/40/44/d653fec0e4ca8181645da4bfb2763017625e5b3f151b208fadd932cb1766/mpi4py-4.1.1-cp310-abi3-win_amd64.whl", hash = "sha256:0f46dfe666a599e4bd2641116b2b4852a3ed9d37915edf98fae471d666663128", size = 1478863 },
-    { url = "https://files.pythonhosted.org/packages/ff/2c/e201cd4828555f10306a5439875cbd0ecfba766ace01ff5c6df43f795650/mpi4py-4.1.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d4403a7cec985be9963efc626193e6df3f63f5ada0c26373c28e640e623e56c3", size = 1669517 },
-    { url = "https://files.pythonhosted.org/packages/7b/53/18d978c3a19deecf38217ce54319e6c9162fec3569c4256c039b66eac2f4/mpi4py-4.1.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8a2ffccc9f3a8c7c957403faad594d650c60234ac08cbedf45beaa96602debe9", size = 1454721 },
     { url = "https://files.pythonhosted.org/packages/ee/15/b908d1d23a4bd2bd7b2e98de5df23b26e43145119fe294728bf89211b935/mpi4py-4.1.1-cp312-cp312-manylinux1_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:ed3d9b619bf197a290f7fd67eb61b1c2a5c204afd9621651a50dc0b1c1280d45", size = 1448977 },
     { url = "https://files.pythonhosted.org/packages/5d/19/088a2d37e80e0feb7851853b2a71cbe6f9b18bdf0eab680977864ea83aab/mpi4py-4.1.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0699c194db5d95fc2085711e4e0013083bd7ae9a88438e1fd64ddb67e9b0cf9e", size = 1318737 },
-    { url = "https://files.pythonhosted.org/packages/97/3a/526261f39bf096e5ff396d18b76740a58d872425612ff84113dd85c2c08e/mpi4py-4.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:0abf5490c3d49c30542b461bfc5ad88dd7d147a4bdb456b7163640577fdfef88", size = 1725676 },
 ]
 
 [[package]]
@@ -618,17 +574,10 @@ version = "2.4.4"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/d7/9f/b8cef5bffa569759033adda9481211426f12f53299629b410340795c2514/numpy-2.4.4.tar.gz", hash = "sha256:2d390634c5182175533585cc89f3608a4682ccb173cc9bb940b2881c8d6f8fa0", size = 20731587 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/28/05/32396bec30fb2263770ee910142f49c1476d08e8ad41abf8403806b520ce/numpy-2.4.4-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:15716cfef24d3a9762e3acdf87e27f58dc823d1348f765bbea6bef8c639bfa1b", size = 16689272 },
-    { url = "https://files.pythonhosted.org/packages/c5/f3/a983d28637bfcd763a9c7aafdb6d5c0ebf3d487d1e1459ffdb57e2f01117/numpy-2.4.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:23cbfd4c17357c81021f21540da84ee282b9c8fba38a03b7b9d09ba6b951421e", size = 14699573 },
-    { url = "https://files.pythonhosted.org/packages/9b/fd/e5ecca1e78c05106d98028114f5c00d3eddb41207686b2b7de3e477b0e22/numpy-2.4.4-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:8b3b60bb7cba2c8c81837661c488637eee696f59a877788a396d33150c35d842", size = 5204782 },
-    { url = "https://files.pythonhosted.org/packages/de/2f/702a4594413c1a8632092beae8aba00f1d67947389369b3777aed783fdca/numpy-2.4.4-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:e4a010c27ff6f210ff4c6ef34394cd61470d01014439b192ec22552ee867f2a8", size = 6552038 },
     { url = "https://files.pythonhosted.org/packages/7f/37/eed308a8f56cba4d1fdf467a4fc67ef4ff4bf1c888f5fc980481890104b1/numpy-2.4.4-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f9e75681b59ddaa5e659898085ae0eaea229d054f2ac0c7e563a62205a700121", size = 15670666 },
     { url = "https://files.pythonhosted.org/packages/0a/0d/0e3ecece05b7a7e87ab9fb587855548da437a061326fff64a223b6dcb78a/numpy-2.4.4-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:81f4a14bee47aec54f883e0cad2d73986640c1590eb9bfaaba7ad17394481e6e", size = 16645480 },
     { url = "https://files.pythonhosted.org/packages/34/49/f2312c154b82a286758ee2f1743336d50651f8b5195db18cdb63675ff649/numpy-2.4.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:62d6b0f03b694173f9fcb1fb317f7222fd0b0b103e784c6549f5e53a27718c44", size = 17020036 },
     { url = "https://files.pythonhosted.org/packages/7b/e9/736d17bd77f1b0ec4f9901aaec129c00d59f5d84d5e79bba540ef12c2330/numpy-2.4.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fbc356aae7adf9e6336d336b9c8111d390a05df88f1805573ebb0807bd06fd1d", size = 18368643 },
-    { url = "https://files.pythonhosted.org/packages/63/f6/d417977c5f519b17c8a5c3bc9e8304b0908b0e21136fe43bf628a1343914/numpy-2.4.4-cp312-cp312-win32.whl", hash = "sha256:0d35aea54ad1d420c812bfa0385c71cd7cc5bcf7c65fed95fc2cd02fe8c79827", size = 5961117 },
-    { url = "https://files.pythonhosted.org/packages/2d/5b/e1deebf88ff431b01b7406ca3583ab2bbb90972bbe1c568732e49c844f7e/numpy-2.4.4-cp312-cp312-win_amd64.whl", hash = "sha256:b5f0362dc928a6ecd9db58868fca5e48485205e3855957bdedea308f8672ea4a", size = 12320584 },
-    { url = "https://files.pythonhosted.org/packages/58/89/e4e856ac82a68c3ed64486a544977d0e7bdd18b8da75b78a577ca31c4395/numpy-2.4.4-cp312-cp312-win_arm64.whl", hash = "sha256:846300f379b5b12cc769334464656bc882e0735d27d9726568bc932fdc49d5ec", size = 10221450 },
 ]
 
 [[package]]
@@ -672,7 +621,7 @@ name = "nvidia-cudnn-cu13"
 version = "9.19.0.56"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-cublas", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" },
+    { name = "nvidia-cublas", marker = "sys_platform == 'linux'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/f1/84/26025437c1e6b61a707442184fa0c03d083b661adf3a3eecfd6d21677740/nvidia_cudnn_cu13-9.19.0.56-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:6ed29ffaee1176c612daf442e4dd6cfeb6a0caa43ddcbeb59da94953030b1be4", size = 433781201 },
@@ -684,7 +633,7 @@ name = "nvidia-cufft"
 version = "12.0.0.61"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-nvjitlink", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" },
+    { name = "nvidia-nvjitlink", marker = "sys_platform == 'linux'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/8b/ae/f417a75c0259e85c1d2f83ca4e960289a5f814ed0cea74d18c353d3e989d/nvidia_cufft-12.0.0.61-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:2708c852ef8cd89d1d2068bdbece0aa188813a0c934db3779b9b1faa8442e5f5", size = 214053554 },
@@ -714,9 +663,9 @@ name = "nvidia-cusolver"
 version = "12.0.4.66"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-cublas", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" },
-    { name = "nvidia-cusparse", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" },
-    { name = "nvidia-nvjitlink", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" },
+    { name = "nvidia-cublas", marker = "sys_platform == 'linux'" },
+    { name = "nvidia-cusparse", marker = "sys_platform == 'linux'" },
+    { name = "nvidia-nvjitlink", marker = "sys_platform == 'linux'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/c8/c3/b30c9e935fc01e3da443ec0116ed1b2a009bb867f5324d3f2d7e533e776b/nvidia_cusolver-12.0.4.66-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:02c2457eaa9e39de20f880f4bd8820e6a1cfb9f9a34f820eb12a155aa5bc92d2", size = 223467760 },
@@ -728,7 +677,7 @@ name = "nvidia-cusparse"
 version = "12.6.3.3"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-nvjitlink", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" },
+    { name = "nvidia-nvjitlink", marker = "sys_platform == 'linux'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/f8/94/5c26f33738ae35276672f12615a64bd008ed5be6d1ebcb23579285d960a9/nvidia_cusparse-12.6.3.3-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:80bcc4662f23f1054ee334a15c72b8940402975e0eab63178fc7e670aa59472c", size = 162155568 },
@@ -785,8 +734,8 @@ name = "omegaconf"
 version = "2.3.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "antlr4-python3-runtime" },
-    { name = "pyyaml" },
+    { name = "antlr4-python3-runtime", marker = "sys_platform == 'linux'" },
+    { name = "pyyaml", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/09/48/6388f1bb9da707110532cb70ec4d2822858ddfb44f1cdf1233c20a80ea4b/omegaconf-2.3.0.tar.gz", hash = "sha256:d5d4b6d29955cc50ad50c46dc269bcd92c6e00f5f90d23ab5fee7bfca4ba4cc7", size = 3298120 }
 wheels = [
@@ -807,21 +756,16 @@ name = "optree"
 version = "0.19.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "typing-extensions" },
+    { name = "typing-extensions", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/3d/63/7b078bc36d5a206c21b03565a818ede38ff0fbf014e92085ec467ef10adb/optree-0.19.0.tar.gz", hash = "sha256:bc1991a948590756409e76be4e29efd4a487a185056d35db6c67619c19ea27a1", size = 175199 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/2d/bf/5cbbf61a27f94797c3d9786f6230223023a943b60f5e893d52368f10b8b1/optree-0.19.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7ec4b2ce49622c6be2c8634712b6c63cc274835bac89a56e3ab2ca863a32ff4b", size = 418100 },
-    { url = "https://files.pythonhosted.org/packages/00/9e/65899e6470f5df289ccdbe9e228fb0cd0ae45ccda8e32c92d6efae1530ef/optree-0.19.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f0978603623b4b1f794f05f6bbed0645cb7e219f4a5a349b2a2bd4514d84ac82", size = 388582 },
     { url = "https://files.pythonhosted.org/packages/d1/dc/f4826835be660181f1b4444ac92b51dda96d4634d3c2271e14598da7bf2a/optree-0.19.0-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8c9e52c50ed3f3f8b1cf4e47a20a7c5e77175b4f84b2ecf390a76f0d1dd91da6", size = 407457 },
     { url = "https://files.pythonhosted.org/packages/ce/b0/89283ac1dd1ead3aa3d7a6b45a26846f457bded79a83b6828fc1ed9a6db3/optree-0.19.0-cp312-cp312-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:3fe3e5f7a30a7d08ddba0a34e48f5483f6c4d7bb710375434ad3633170c73c48", size = 471230 },
     { url = "https://files.pythonhosted.org/packages/2a/a2/47f620f87b0544b2e0eb0b3c661682bd0ea1c79f6e38f9147bc0f835c973/optree-0.19.0-cp312-cp312-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:8315527e1f14a91173fe6871847da7b949048ec61ff8b3e507fc286e75b0aa3c", size = 469442 },
     { url = "https://files.pythonhosted.org/packages/84/e9/b9ae18404135de53809fb994b754ac0eac838d8c4dfa8a10a811d8dec91d/optree-0.19.0-cp312-cp312-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:938fb15d140ab65148f4e6975048facbef83a9210353fbedd471ac39e7544339", size = 468840 },
     { url = "https://files.pythonhosted.org/packages/0a/e5/a77df15a62b37bb14c81b5757e2a0573f57e7c06d125a410ad2cd7cefb72/optree-0.19.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2b8209570340135a7e586c90f393f3c6359e8a49c40d783196721cc487e51d9c", size = 451408 },
     { url = "https://files.pythonhosted.org/packages/8c/43/1aa431cee19cd98c4229e468767021f9a92195d9431857e28198a3a3ce2f/optree-0.19.0-cp312-cp312-manylinux_2_39_riscv64.whl", hash = "sha256:1397dc925026917531a43fda32054ae1e77e5ed9bf8284bcae6354c19c26e14a", size = 412544 },
-    { url = "https://files.pythonhosted.org/packages/5b/b9/b94fd3a116b80951d692a82f4135ae84b3d78bd1b092250aff76a3366138/optree-0.19.0-cp312-cp312-win32.whl", hash = "sha256:68f58e8f8b75c76c51e61e3dc2d9e94609bafb0e1a6459e6d525ced905cd9a74", size = 312033 },
-    { url = "https://files.pythonhosted.org/packages/9e/7f/31fa1b2311038bfc355ad6e4e4e63d028719cb67fb3ebe6fb76ff2124105/optree-0.19.0-cp312-cp312-win_amd64.whl", hash = "sha256:5c44ca0f579ed3e0ca777a5711d4a6c1b374feacf1bb4fe9cfe85297b0c8d237", size = 335374 },
-    { url = "https://files.pythonhosted.org/packages/09/86/863bc3f42f83113f5c6a5beaf4fec3c3481a76872f3244d0e64fb9ebd3b0/optree-0.19.0-cp312-cp312-win_arm64.whl", hash = "sha256:0461f796b4ade3fab519d821b0fa521f07e2af70206b76aac75fcfdc2e051fca", size = 345868 },
 ]
 
 [[package]]
@@ -830,8 +774,6 @@ version = "3.11.8"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/9d/1b/2024d06792d0779f9dbc51531b61c24f76c75b9f4ce05e6f3377a1814cea/orjson-3.11.8.tar.gz", hash = "sha256:96163d9cdc5a202703e9ad1b9ae757d5f0ca62f4fa0cc93d1f27b0e180cc404e", size = 5603832 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/01/f6/8d58b32ab32d9215973a1688aebd098252ee8af1766c0e4e36e7831f0295/orjson-3.11.8-cp312-cp312-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:1cd0b77e77c95758f8e1100139844e99f3ccc87e71e6fc8e1c027e55807c549f", size = 229233 },
-    { url = "https://files.pythonhosted.org/packages/a9/8b/2ffe35e71f6b92622e8ea4607bf33ecf7dfb51b3619dcfabfd36cbe2d0a5/orjson-3.11.8-cp312-cp312-macosx_15_0_arm64.whl", hash = "sha256:6a3d159d5ffa0e3961f353c4b036540996bf8b9697ccc38261c0eac1fd3347a6", size = 128772 },
     { url = "https://files.pythonhosted.org/packages/27/d2/1f8682ae50d5c6897a563cb96bc106da8c9cb5b7b6e81a52e4cc086679b9/orjson-3.11.8-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76070a76e9c5ae661e2d9848f216980d8d533e0f8143e6ed462807b242e3c5e8", size = 131946 },
     { url = "https://files.pythonhosted.org/packages/52/4b/5500f76f0eece84226e0689cb48dcde081104c2fa6e2483d17ca13685ffb/orjson-3.11.8-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:54153d21520a71a4c82a0dbb4523e468941d549d221dc173de0f019678cf3813", size = 130368 },
     { url = "https://files.pythonhosted.org/packages/da/4e/58b927e08fbe9840e6c920d9e299b051ea667463b1f39a56e668669f8508/orjson-3.11.8-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:469ac2125611b7c5741a0b3798cd9e5786cbad6345f9f400c77212be89563bec", size = 135540 },
@@ -842,9 +784,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/c0/cf/eb284847487821a5d415e54149a6449ba9bfc5872ce63ab7be41b8ec401c/orjson-3.11.8-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:3f262401086a3960586af06c054609365e98407151f5ea24a62893a40d80dbbb", size = 423742 },
     { url = "https://files.pythonhosted.org/packages/44/09/e12423d327071c851c13e76936f144a96adacfc037394dec35ac3fc8d1e8/orjson-3.11.8-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:8e8c6218b614badf8e229b697865df4301afa74b791b6c9ade01d19a9953a942", size = 147806 },
     { url = "https://files.pythonhosted.org/packages/b3/6d/37c2589ba864e582ffe7611643314785c6afb1f83c701654ef05daa8fcc7/orjson-3.11.8-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:093d489fa039ddade2db541097dbb484999fcc65fc2b0ff9819141e2ab364f25", size = 136485 },
-    { url = "https://files.pythonhosted.org/packages/be/c9/135194a02ab76b04ed9a10f68624b7ebd238bbe55548878b11ff15a0f352/orjson-3.11.8-cp312-cp312-win32.whl", hash = "sha256:e0950ed1bcb9893f4293fd5c5a7ee10934fbf82c4101c70be360db23ce24b7d2", size = 131966 },
-    { url = "https://files.pythonhosted.org/packages/ed/9a/9796f8fbe3cf30ce9cb696748dbb535e5c87be4bf4fe2e9ca498ef1fa8cf/orjson-3.11.8-cp312-cp312-win_amd64.whl", hash = "sha256:3cf17c141617b88ced4536b2135c552490f07799f6ad565948ea07bef0dcb9a6", size = 127441 },
-    { url = "https://files.pythonhosted.org/packages/cc/47/5aaf54524a7a4a0dd09dd778f3fa65dd2108290615b652e23d944152bc8e/orjson-3.11.8-cp312-cp312-win_arm64.whl", hash = "sha256:48854463b0572cc87dac7d981aa72ed8bf6deedc0511853dc76b8bbd5482d36d", size = 127364 },
 ]
 
 [[package]]
@@ -861,20 +800,15 @@ name = "pandas"
 version = "3.0.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "numpy" },
-    { name = "python-dateutil" },
-    { name = "tzdata", marker = "sys_platform == 'emscripten' or sys_platform == 'win32'" },
+    { name = "numpy", marker = "sys_platform == 'linux'" },
+    { name = "python-dateutil", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/da/99/b342345300f13440fe9fe385c3c481e2d9a595ee3bab4d3219247ac94e9a/pandas-3.0.2.tar.gz", hash = "sha256:f4753e73e34c8d83221ba58f232433fca2748be8b18dbca02d242ed153945043", size = 4645855 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/f3/b0/c20bd4d6d3f736e6bd6b55794e9cd0a617b858eaad27c8f410ea05d953b7/pandas-3.0.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:232a70ebb568c0c4d2db4584f338c1577d81e3af63292208d615907b698a0f18", size = 10347921 },
-    { url = "https://files.pythonhosted.org/packages/35/d0/4831af68ce30cc2d03c697bea8450e3225a835ef497d0d70f31b8cdde965/pandas-3.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:970762605cff1ca0d3f71ed4f3a769ea8f85fc8e6348f6e110b8fea7e6eb5a14", size = 9888127 },
     { url = "https://files.pythonhosted.org/packages/61/a9/16ea9346e1fc4a96e2896242d9bc674764fb9049b0044c0132502f7a771e/pandas-3.0.2-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:aff4e6f4d722e0652707d7bcb190c445fe58428500c6d16005b02401764b1b3d", size = 10399577 },
     { url = "https://files.pythonhosted.org/packages/c4/a8/3a61a721472959ab0ce865ef05d10b0d6bfe27ce8801c99f33d4fa996e65/pandas-3.0.2-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ef8b27695c3d3dc78403c9a7d5e59a62d5464a7e1123b4e0042763f7104dc74f", size = 10880030 },
     { url = "https://files.pythonhosted.org/packages/da/65/7225c0ea4d6ce9cb2160a7fb7f39804871049f016e74782e5dade4d14109/pandas-3.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:f8d68083e49e16b84734eb1a4dcae4259a75c90fb6e2251ab9a00b61120c06ab", size = 11409468 },
     { url = "https://files.pythonhosted.org/packages/fa/5b/46e7c76032639f2132359b5cf4c785dd8cf9aea5ea64699eac752f02b9db/pandas-3.0.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:32cc41f310ebd4a296d93515fcac312216adfedb1894e879303987b8f1e2b97d", size = 11936381 },
-    { url = "https://files.pythonhosted.org/packages/7b/8b/721a9cff6fa6a91b162eb51019c6243b82b3226c71bb6c8ef4a9bd65cbc6/pandas-3.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:a4785e1d6547d8427c5208b748ae2efb64659a21bd82bf440d4262d02bfa02a4", size = 9744993 },
-    { url = "https://files.pythonhosted.org/packages/d5/18/7f0bd34ae27b28159aa80f2a6799f47fda34f7fb938a76e20c7b7fe3b200/pandas-3.0.2-cp312-cp312-win_arm64.whl", hash = "sha256:08504503f7101300107ecdc8df73658e4347586db5cfdadabc1592e9d7e7a0fd", size = 9056118 },
 ]
 
 [[package]]
@@ -883,17 +817,12 @@ version = "12.1.1"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/1f/42/5c74462b4fd957fcd7b13b04fb3205ff8349236ea74c7c375766d6c82288/pillow-12.1.1.tar.gz", hash = "sha256:9ad8fa5937ab05218e2b6a4cff30295ad35afd2f83ac592e68c0d871bb0fdbc4", size = 46980264 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/07/d3/8df65da0d4df36b094351dce696f2989bec731d4f10e743b1c5f4da4d3bf/pillow-12.1.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ab323b787d6e18b3d91a72fc99b1a2c28651e4358749842b8f8dfacd28ef2052", size = 5262803 },
-    { url = "https://files.pythonhosted.org/packages/d6/71/5026395b290ff404b836e636f51d7297e6c83beceaa87c592718747e670f/pillow-12.1.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:adebb5bee0f0af4909c30db0d890c773d1a92ffe83da908e2e9e720f8edf3984", size = 4657601 },
     { url = "https://files.pythonhosted.org/packages/b1/2e/1001613d941c67442f745aff0f7cc66dd8df9a9c084eb497e6a543ee6f7e/pillow-12.1.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:bb66b7cc26f50977108790e2456b7921e773f23db5630261102233eb355a3b79", size = 6234995 },
     { url = "https://files.pythonhosted.org/packages/07/26/246ab11455b2549b9233dbd44d358d033a2f780fa9007b61a913c5b2d24e/pillow-12.1.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:aee2810642b2898bb187ced9b349e95d2a7272930796e022efaf12e99dccd293", size = 8045012 },
     { url = "https://files.pythonhosted.org/packages/b2/8b/07587069c27be7535ac1fe33874e32de118fbd34e2a73b7f83436a88368c/pillow-12.1.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a0b1cd6232e2b618adcc54d9882e4e662a089d5768cd188f7c245b4c8c44a397", size = 6349638 },
     { url = "https://files.pythonhosted.org/packages/ff/79/6df7b2ee763d619cda2fb4fea498e5f79d984dae304d45a8999b80d6cf5c/pillow-12.1.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7aac39bcf8d4770d089588a2e1dd111cbaa42df5a94be3114222057d68336bd0", size = 7041540 },
     { url = "https://files.pythonhosted.org/packages/2c/5e/2ba19e7e7236d7529f4d873bdaf317a318896bac289abebd4bb00ef247f0/pillow-12.1.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ab174cd7d29a62dd139c44bf74b698039328f45cb03b4596c43473a46656b2f3", size = 6462613 },
     { url = "https://files.pythonhosted.org/packages/03/03/31216ec124bb5c3dacd74ce8efff4cc7f52643653bad4825f8f08c697743/pillow-12.1.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:339ffdcb7cbeaa08221cd401d517d4b1fe7a9ed5d400e4a8039719238620ca35", size = 7166745 },
-    { url = "https://files.pythonhosted.org/packages/1f/e7/7c4552d80052337eb28653b617eafdef39adfb137c49dd7e831b8dc13bc5/pillow-12.1.1-cp312-cp312-win32.whl", hash = "sha256:5d1f9575a12bed9e9eedd9a4972834b08c97a352bd17955ccdebfeca5913fa0a", size = 6328823 },
-    { url = "https://files.pythonhosted.org/packages/3d/17/688626d192d7261bbbf98846fc98995726bddc2c945344b65bec3a29d731/pillow-12.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:21329ec8c96c6e979cd0dfd29406c40c1d52521a90544463057d2aaa937d66a6", size = 7033367 },
-    { url = "https://files.pythonhosted.org/packages/ed/fe/a0ef1f73f939b0eca03ee2c108d0043a87468664770612602c63266a43c4/pillow-12.1.1-cp312-cp312-win_arm64.whl", hash = "sha256:af9a332e572978f0218686636610555ae3defd1633597be015ed50289a03c523", size = 2453811 },
 ]
 
 [[package]]
@@ -911,12 +840,9 @@ version = "7.34.1"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/6b/6b/a0e95cad1ad7cc3f2c6821fcab91671bd5b78bd42afb357bb4765f29bc41/protobuf-7.34.1.tar.gz", hash = "sha256:9ce42245e704cc5027be797c1db1eb93184d44d1cdd71811fb2d9b25ad541280", size = 454708 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/ec/11/3325d41e6ee15bf1125654301211247b042563bcc898784351252549a8ad/protobuf-7.34.1-cp310-abi3-macosx_10_9_universal2.whl", hash = "sha256:d8b2cc79c4d8f62b293ad9b11ec3aebce9af481fa73e64556969f7345ebf9fc7", size = 429247 },
     { url = "https://files.pythonhosted.org/packages/eb/9d/aa69df2724ff63efa6f72307b483ce0827f4347cc6d6df24b59e26659fef/protobuf-7.34.1-cp310-abi3-manylinux2014_aarch64.whl", hash = "sha256:5185e0e948d07abe94bb76ec9b8416b604cfe5da6f871d67aad30cbf24c3110b", size = 325753 },
     { url = "https://files.pythonhosted.org/packages/92/e8/d174c91fd48e50101943f042b09af9029064810b734e4160bbe282fa1caa/protobuf-7.34.1-cp310-abi3-manylinux2014_s390x.whl", hash = "sha256:403b093a6e28a960372b44e5eb081775c9b056e816a8029c61231743d63f881a", size = 340198 },
     { url = "https://files.pythonhosted.org/packages/53/1b/3b431694a4dc6d37b9f653f0c64b0a0d9ec074ee810710c0c3da21d67ba7/protobuf-7.34.1-cp310-abi3-manylinux2014_x86_64.whl", hash = "sha256:8ff40ce8cd688f7265326b38d5a1bed9bfdf5e6723d49961432f83e21d5713e4", size = 324267 },
-    { url = "https://files.pythonhosted.org/packages/85/29/64de04a0ac142fb685fd09999bc3d337943fb386f3a0ec57f92fd8203f97/protobuf-7.34.1-cp310-abi3-win32.whl", hash = "sha256:34b84ce27680df7cca9f231043ada0daa55d0c44a2ddfaa58ec1d0d89d8bf60a", size = 426628 },
-    { url = "https://files.pythonhosted.org/packages/4d/87/cb5e585192a22b8bd457df5a2c16a75ea0db9674c3a0a39fc9347d84e075/protobuf-7.34.1-cp310-abi3-win_amd64.whl", hash = "sha256:e97b55646e6ce5cbb0954a8c28cd39a5869b59090dfaa7df4598a7fba869468c", size = 437901 },
     { url = "https://files.pythonhosted.org/packages/88/95/608f665226bca68b736b79e457fded9a2a38c4f4379a4a7614303d9db3bc/protobuf-7.34.1-py3-none-any.whl", hash = "sha256:bb3812cd53aefea2b028ef42bd780f5b96407247f20c6ef7c679807e9d188f11", size = 170715 },
 ]
 
@@ -926,14 +852,10 @@ version = "7.2.2"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/aa/c6/d1ddf4abb55e93cebc4f2ed8b5d6dbad109ecb8d63748dd2b20ab5e57ebe/psutil-7.2.2.tar.gz", hash = "sha256:0746f5f8d406af344fd547f1c8daa5f5c33dbc293bb8d6a16d80b4bb88f59372", size = 493740 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/e7/36/5ee6e05c9bd427237b11b3937ad82bb8ad2752d72c6969314590dd0c2f6e/psutil-7.2.2-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:ed0cace939114f62738d808fdcecd4c869222507e266e574799e9c0faa17d486", size = 129090 },
-    { url = "https://files.pythonhosted.org/packages/80/c4/f5af4c1ca8c1eeb2e92ccca14ce8effdeec651d5ab6053c589b074eda6e1/psutil-7.2.2-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:1a7b04c10f32cc88ab39cbf606e117fd74721c831c98a27dc04578deb0c16979", size = 129859 },
     { url = "https://files.pythonhosted.org/packages/b5/70/5d8df3b09e25bce090399cf48e452d25c935ab72dad19406c77f4e828045/psutil-7.2.2-cp36-abi3-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:076a2d2f923fd4821644f5ba89f059523da90dc9014e85f8e45a5774ca5bc6f9", size = 155560 },
     { url = "https://files.pythonhosted.org/packages/63/65/37648c0c158dc222aba51c089eb3bdfa238e621674dc42d48706e639204f/psutil-7.2.2-cp36-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b0726cecd84f9474419d67252add4ac0cd9811b04d61123054b9fb6f57df6e9e", size = 156997 },
     { url = "https://files.pythonhosted.org/packages/8e/13/125093eadae863ce03c6ffdbae9929430d116a246ef69866dad94da3bfbc/psutil-7.2.2-cp36-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:fd04ef36b4a6d599bbdb225dd1d3f51e00105f6d48a28f006da7f9822f2606d8", size = 148972 },
     { url = "https://files.pythonhosted.org/packages/04/78/0acd37ca84ce3ddffaa92ef0f571e073faa6d8ff1f0559ab1272188ea2be/psutil-7.2.2-cp36-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:b58fabe35e80b264a4e3bb23e6b96f9e45a3df7fb7eed419ac0e5947c61e47cc", size = 148266 },
-    { url = "https://files.pythonhosted.org/packages/b4/90/e2159492b5426be0c1fef7acba807a03511f97c5f86b3caeda6ad92351a7/psutil-7.2.2-cp37-abi3-win_amd64.whl", hash = "sha256:eb7e81434c8d223ec4a219b5fc1c47d0417b12be7ea866e24fb5ad6e84b3d988", size = 137737 },
-    { url = "https://files.pythonhosted.org/packages/8c/c7/7bb2e321574b10df20cbde462a94e2b71d05f9bbda251ef27d104668306a/psutil-7.2.2-cp37-abi3-win_arm64.whl", hash = "sha256:8c233660f575a5a89e6d4cb65d9f938126312bca76d8fe087b947b3a1aaac9ee", size = 134617 },
 ]
 
 [[package]]
@@ -942,13 +864,10 @@ version = "23.0.1"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/88/22/134986a4cc224d593c1afde5494d18ff629393d74cc2eddb176669f234a4/pyarrow-23.0.1.tar.gz", hash = "sha256:b8c5873e33440b2bc2f4a79d2b47017a89c5a24116c055625e6f2ee50523f019", size = 1167336 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/9a/4b/4166bb5abbfe6f750fc60ad337c43ecf61340fa52ab386da6e8dbf9e63c4/pyarrow-23.0.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:f4b0dbfa124c0bb161f8b5ebb40f1a680b70279aa0c9901d44a2b5a20806039f", size = 34214575 },
-    { url = "https://files.pythonhosted.org/packages/e1/da/3f941e3734ac8088ea588b53e860baeddac8323ea40ce22e3d0baa865cc9/pyarrow-23.0.1-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:7707d2b6673f7de054e2e83d59f9e805939038eebe1763fe811ee8fa5c0cd1a7", size = 35832540 },
     { url = "https://files.pythonhosted.org/packages/88/7c/3d841c366620e906d54430817531b877ba646310296df42ef697308c2705/pyarrow-23.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:86ff03fb9f1a320266e0de855dee4b17da6794c595d207f89bba40d16b5c78b9", size = 44470940 },
     { url = "https://files.pythonhosted.org/packages/2c/a5/da83046273d990f256cb79796a190bbf7ec999269705ddc609403f8c6b06/pyarrow-23.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:813d99f31275919c383aab17f0f455a04f5a429c261cc411b1e9a8f5e4aaaa05", size = 47586063 },
     { url = "https://files.pythonhosted.org/packages/5b/3c/b7d2ebcff47a514f47f9da1e74b7949138c58cfeb108cdd4ee62f43f0cf3/pyarrow-23.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bf5842f960cddd2ef757d486041d57c96483efc295a8c4a0e20e704cbbf39c67", size = 48173045 },
     { url = "https://files.pythonhosted.org/packages/43/b2/b40961262213beaba6acfc88698eb773dfce32ecdf34d19291db94c2bd73/pyarrow-23.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:564baf97c858ecc03ec01a41062e8f4698abc3e6e2acd79c01c2e97880a19730", size = 50621741 },
-    { url = "https://files.pythonhosted.org/packages/f6/70/1fdda42d65b28b078e93d75d371b2185a61da89dda4def8ba6ba41ebdeb4/pyarrow-23.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:07deae7783782ac7250989a7b2ecde9b3c343a643f82e8a4df03d93b633006f0", size = 27620678 },
 ]
 
 [[package]]
@@ -966,17 +885,12 @@ version = "3.23.0"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/8e/a6/8452177684d5e906854776276ddd34eca30d1b1e15aa1ee9cefc289a33f5/pycryptodome-3.23.0.tar.gz", hash = "sha256:447700a657182d60338bab09fdb27518f8856aecd80ae4c6bdddb67ff5da44ef", size = 4921276 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/db/6c/a1f71542c969912bb0e106f64f60a56cc1f0fabecf9396f45accbe63fa68/pycryptodome-3.23.0-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:187058ab80b3281b1de11c2e6842a357a1f71b42cb1e15bce373f3d238135c27", size = 2495627 },
-    { url = "https://files.pythonhosted.org/packages/6e/4e/a066527e079fc5002390c8acdd3aca431e6ea0a50ffd7201551175b47323/pycryptodome-3.23.0-cp37-abi3-macosx_10_9_x86_64.whl", hash = "sha256:cfb5cd445280c5b0a4e6187a7ce8de5a07b5f3f897f235caa11f1f435f182843", size = 1640362 },
     { url = "https://files.pythonhosted.org/packages/50/52/adaf4c8c100a8c49d2bd058e5b551f73dfd8cb89eb4911e25a0c469b6b4e/pycryptodome-3.23.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:67bd81fcbe34f43ad9422ee8fd4843c8e7198dd88dd3d40e6de42ee65fbe1490", size = 2182625 },
     { url = "https://files.pythonhosted.org/packages/5f/e9/a09476d436d0ff1402ac3867d933c61805ec2326c6ea557aeeac3825604e/pycryptodome-3.23.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c8987bd3307a39bc03df5c8e0e3d8be0c4c3518b7f044b0f4c15d1aa78f52575", size = 2268954 },
     { url = "https://files.pythonhosted.org/packages/f9/c5/ffe6474e0c551d54cab931918127c46d70cab8f114e0c2b5a3c071c2f484/pycryptodome-3.23.0-cp37-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aa0698f65e5b570426fc31b8162ed4603b0c2841cbb9088e2b01641e3065915b", size = 2308534 },
     { url = "https://files.pythonhosted.org/packages/18/28/e199677fc15ecf43010f2463fde4c1a53015d1fe95fb03bca2890836603a/pycryptodome-3.23.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:53ecbafc2b55353edcebd64bf5da94a2a2cdf5090a6915bcca6eca6cc452585a", size = 2181853 },
     { url = "https://files.pythonhosted.org/packages/ce/ea/4fdb09f2165ce1365c9eaefef36625583371ee514db58dc9b65d3a255c4c/pycryptodome-3.23.0-cp37-abi3-musllinux_1_2_i686.whl", hash = "sha256:156df9667ad9f2ad26255926524e1c136d6664b741547deb0a86a9acf5ea631f", size = 2342465 },
     { url = "https://files.pythonhosted.org/packages/22/82/6edc3fc42fe9284aead511394bac167693fb2b0e0395b28b8bedaa07ef04/pycryptodome-3.23.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:dea827b4d55ee390dc89b2afe5927d4308a8b538ae91d9c6f7a5090f397af1aa", size = 2267414 },
-    { url = "https://files.pythonhosted.org/packages/59/fe/aae679b64363eb78326c7fdc9d06ec3de18bac68be4b612fc1fe8902693c/pycryptodome-3.23.0-cp37-abi3-win32.whl", hash = "sha256:507dbead45474b62b2bbe318eb1c4c8ee641077532067fec9c1aa82c31f84886", size = 1768484 },
-    { url = "https://files.pythonhosted.org/packages/54/2f/e97a1b8294db0daaa87012c24a7bb714147c7ade7656973fd6c736b484ff/pycryptodome-3.23.0-cp37-abi3-win_amd64.whl", hash = "sha256:c75b52aacc6c0c260f204cbdd834f76edc9fb0d8e0da9fbf8352ef58202564e2", size = 1799636 },
-    { url = "https://files.pythonhosted.org/packages/18/3d/f9441a0d798bf2b1e645adc3265e55706aead1255ccdad3856dbdcffec14/pycryptodome-3.23.0-cp37-abi3-win_arm64.whl", hash = "sha256:11eeeb6917903876f134b56ba11abe95c0b0fd5e3330def218083c7d98bbcb3c", size = 1703675 },
 ]
 
 [[package]]
@@ -1002,14 +916,14 @@ name = "pymilvus"
 version = "2.6.12"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "cachetools" },
-    { name = "grpcio" },
-    { name = "orjson" },
-    { name = "pandas" },
-    { name = "protobuf" },
-    { name = "python-dotenv" },
-    { name = "requests" },
-    { name = "setuptools" },
+    { name = "cachetools", marker = "sys_platform == 'linux'" },
+    { name = "grpcio", marker = "sys_platform == 'linux'" },
+    { name = "orjson", marker = "sys_platform == 'linux'" },
+    { name = "pandas", marker = "sys_platform == 'linux'" },
+    { name = "protobuf", marker = "sys_platform == 'linux'" },
+    { name = "python-dotenv", marker = "sys_platform == 'linux'" },
+    { name = "requests", marker = "sys_platform == 'linux'" },
+    { name = "setuptools", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/2c/d7/c5d1381248a33975ccc864a0f980f93270ecc35354de8646c8a16443cccb/pymilvus-2.6.12.tar.gz", hash = "sha256:8323e990dc305e607fef525498eb779e42940a69e0691dde009cd02d48845f7a", size = 1584521 }
 wheels = [
@@ -1021,11 +935,10 @@ name = "pytest"
 version = "9.0.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "colorama", marker = "sys_platform == 'win32'" },
-    { name = "iniconfig" },
-    { name = "packaging" },
-    { name = "pluggy" },
-    { name = "pygments" },
+    { name = "iniconfig", marker = "sys_platform == 'linux'" },
+    { name = "packaging", marker = "sys_platform == 'linux'" },
+    { name = "pluggy", marker = "sys_platform == 'linux'" },
+    { name = "pygments", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/d1/db/7ef3487e0fb0049ddb5ce41d3a49c235bf9ad299b6a25d5780a89f19230f/pytest-9.0.2.tar.gz", hash = "sha256:75186651a92bd89611d1d9fc20f0b4345fd827c41ccd5c299a868a05d70edf11", size = 1568901 }
 wheels = [
@@ -1037,9 +950,9 @@ name = "pytest-cov"
 version = "7.1.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "coverage" },
-    { name = "pluggy" },
-    { name = "pytest" },
+    { name = "coverage", marker = "sys_platform == 'linux'" },
+    { name = "pluggy", marker = "sys_platform == 'linux'" },
+    { name = "pytest", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/b1/51/a849f96e117386044471c8ec2bd6cfebacda285da9525c9106aeb28da671/pytest_cov-7.1.0.tar.gz", hash = "sha256:30674f2b5f6351aa09702a9c8c364f6a01c27aae0c1366ae8016160d1efc56b2", size = 55592 }
 wheels = [
@@ -1051,7 +964,7 @@ name = "pytest-mock"
 version = "3.15.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "pytest" },
+    { name = "pytest", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/68/14/eb014d26be205d38ad5ad20d9a80f7d201472e08167f0bb4361e251084a9/pytest_mock-3.15.1.tar.gz", hash = "sha256:1849a238f6f396da19762269de72cb1814ab44416fa73a8686deac10b0d87a0f", size = 34036 }
 wheels = [
@@ -1063,7 +976,7 @@ name = "python-dateutil"
 version = "2.9.0.post0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "six" },
+    { name = "six", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432 }
 wheels = [
@@ -1085,16 +998,11 @@ version = "6.0.3"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/05/8e/961c0007c59b8dd7729d542c61a4d537767a59645b82a0b521206e1e25c2/pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f", size = 130960 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/d1/33/422b98d2195232ca1826284a76852ad5a86fe23e31b009c9886b2d0fb8b2/pyyaml-6.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7f047e29dcae44602496db43be01ad42fc6f1cc0d8cd6c83d342306c32270196", size = 182063 },
-    { url = "https://files.pythonhosted.org/packages/89/a0/6cf41a19a1f2f3feab0e9c0b74134aa2ce6849093d5517a0c550fe37a648/pyyaml-6.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fc09d0aa354569bc501d4e787133afc08552722d3ab34836a80547331bb5d4a0", size = 173973 },
     { url = "https://files.pythonhosted.org/packages/ed/23/7a778b6bd0b9a8039df8b1b1d80e2e2ad78aa04171592c8a5c43a56a6af4/pyyaml-6.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9149cad251584d5fb4981be1ecde53a1ca46c891a79788c0df828d2f166bda28", size = 775116 },
     { url = "https://files.pythonhosted.org/packages/65/30/d7353c338e12baef4ecc1b09e877c1970bd3382789c159b4f89d6a70dc09/pyyaml-6.0.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5fdec68f91a0c6739b380c83b951e2c72ac0197ace422360e6d5a959d8d97b2c", size = 844011 },
     { url = "https://files.pythonhosted.org/packages/8b/9d/b3589d3877982d4f2329302ef98a8026e7f4443c765c46cfecc8858c6b4b/pyyaml-6.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ba1cc08a7ccde2d2ec775841541641e4548226580ab850948cbfda66a1befcdc", size = 807870 },
     { url = "https://files.pythonhosted.org/packages/05/c0/b3be26a015601b822b97d9149ff8cb5ead58c66f981e04fedf4e762f4bd4/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8dc52c23056b9ddd46818a57b78404882310fb473d63f17b07d5c40421e47f8e", size = 761089 },
     { url = "https://files.pythonhosted.org/packages/be/8e/98435a21d1d4b46590d5459a22d88128103f8da4c2d4cb8f14f2a96504e1/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:41715c910c881bc081f1e8872880d3c650acf13dfa8214bad49ed4cede7c34ea", size = 790181 },
-    { url = "https://files.pythonhosted.org/packages/74/93/7baea19427dcfbe1e5a372d81473250b379f04b1bd3c4c5ff825e2327202/pyyaml-6.0.3-cp312-cp312-win32.whl", hash = "sha256:96b533f0e99f6579b3d4d4995707cf36df9100d67e0c8303a0c55b27b5f99bc5", size = 137658 },
-    { url = "https://files.pythonhosted.org/packages/86/bf/899e81e4cce32febab4fb42bb97dcdf66bc135272882d1987881a4b519e9/pyyaml-6.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:5fcd34e47f6e0b794d17de1b4ff496c00986e1c83f7ab2fb8fcfe9616ff7477b", size = 154003 },
-    { url = "https://files.pythonhosted.org/packages/1a/08/67bd04656199bbb51dbed1439b7f27601dfb576fb864099c7ef0c3e55531/pyyaml-6.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:64386e5e707d03a7e172c0701abfb7e10f0fb753ee1d773128192742712a98fd", size = 140344 },
 ]
 
 [[package]]
@@ -1102,10 +1010,10 @@ name = "requests"
 version = "2.33.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "certifi" },
-    { name = "charset-normalizer" },
-    { name = "idna" },
-    { name = "urllib3" },
+    { name = "certifi", marker = "sys_platform == 'linux'" },
+    { name = "charset-normalizer", marker = "sys_platform == 'linux'" },
+    { name = "idna", marker = "sys_platform == 'linux'" },
+    { name = "urllib3", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/5f/a4/98b9c7c6428a668bf7e42ebb7c79d576a1c3c1e3ae2d47e674b468388871/requests-2.33.1.tar.gz", hash = "sha256:18817f8c57c6263968bc123d237e3b8b08ac046f5456bd1e307ee8f4250d3517", size = 134120 }
 wheels = [
@@ -1117,8 +1025,8 @@ name = "rich"
 version = "14.3.3"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "markdown-it-py" },
-    { name = "pygments" },
+    { name = "markdown-it-py", marker = "sys_platform == 'linux'" },
+    { name = "pygments", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/b3/c6/f3b320c27991c46f43ee9d856302c70dc2d0fb2dba4842ff739d5f46b393/rich-14.3.3.tar.gz", hash = "sha256:b8daa0b9e4eef54dd8cf7c86c03713f53241884e814f4e2f5fb342fe520f639b", size = 230582 }
 wheels = [
@@ -1127,15 +1035,15 @@ wheels = [
 
 [[package]]
 name = "s3dlio"
-version = "0.9.95"
+version = "0.9.100"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "numpy" },
+    { name = "numpy", marker = "sys_platform == 'linux'" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/13/bf/b17bf94e1fd7c58b2f93d53192b61271f14538b847d98fd40ef2cc652d61/s3dlio-0.9.95.tar.gz", hash = "sha256:55f79071d244cccf7a49714c33c024639a24723dd88c7cac629c63daa89d0d96", size = 1481201 }
+sdist = { url = "https://files.pythonhosted.org/packages/33/98/23ed0451a8668e352206dea740920d85dceefadf0a6d427d1571d17e845e/s3dlio-0.9.100.tar.gz", hash = "sha256:b2d3dc9f037bcef5e2e171ab1988c1be730849730bee6570f484eb0f02c9a862", size = 1564701 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/7c/c3/502a898baa514cf796f11572508f3a78a93574d45ce7d36bcd34e2e7fe40/s3dlio-0.9.95-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:93d4f6d929e743a74428d4a6e944fbb85bd6a9cfffbdc36d6635e89f0919a5ba", size = 10258346 },
-    { url = "https://files.pythonhosted.org/packages/91/4f/d394679708a4fb7c0f362076b7f92a0933201d258a90b6b28f0529dacf98/s3dlio-0.9.95-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9dd5f1d71c3655346a879a5c3e49142c3d916a6df3505a823f983b0b1abb5bd5", size = 10613865 },
+    { url = "https://files.pythonhosted.org/packages/3b/80/e7a16ae10aa9374b29ae7dc175eaba3910f604c2f2d2ae8955488a13c821/s3dlio-0.9.100-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:090f61effc0eec32a876a62a921287961e92aec57eb0f21449bf5a89d9e9ada2", size = 12416760 },
+    { url = "https://files.pythonhosted.org/packages/ce/38/44ad05689f5f66e503eb095b442f37271e74bde1948fadf1312284173ae3/s3dlio-0.9.100-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eb48f3d295071b5226ad6062544003abaa2defadac695424a015db04126f5d57", size = 12842294 },
 ]
 
 [[package]]
@@ -1143,8 +1051,8 @@ name = "s3torchconnector"
 version = "1.5.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "s3torchconnectorclient" },
-    { name = "torch" },
+    { name = "s3torchconnectorclient", marker = "sys_platform == 'linux'" },
+    { name = "torch", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/0f/24/a3422bc7e3d8f2a55a64250a6d5a07416c49d6f5695879445ff72c695612/s3torchconnector-1.5.0.tar.gz", hash = "sha256:44167d8e7bc0fce6d97627fc10aa7e215f4b58e0bb7037e87858c41eefd5b5af", size = 103050 }
 
@@ -1154,8 +1062,6 @@ version = "1.5.0"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/a5/8d/e04febe3e7ff7c91bc4678a16bec1c87674fc9c160c75a8f8745e516e563/s3torchconnectorclient-1.5.0.tar.gz", hash = "sha256:09ffceca1fd025abd8a4a4cbd94b3f70a7c8ccfbf3e0f76337e180f95ce58e61", size = 85516 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/ca/ca/65c66f2b4cc331f3d8fb92961f90edf8e9964fa6890ef7f335fbf9d7989f/s3torchconnectorclient-1.5.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:83ae3c096da011af6e57947d2530814a4f78935bf1336117547984da34e1cdec", size = 2124261 },
-    { url = "https://files.pythonhosted.org/packages/e6/20/629141bf19c24fedda41f9c710e55439d6303784cc1ca8e367367a51e08b/s3torchconnectorclient-1.5.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1eba5cfc67d7e2bd3cd51400105288a979096cfb293c604d19cdd880f960c396", size = 2019312 },
     { url = "https://files.pythonhosted.org/packages/7d/51/288b8857991cffa36b833c7128897766fb84f3a4a60a5cc3dfe6e2546f8a/s3torchconnectorclient-1.5.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7c0d11b4da0271414ffa370718bbbfb5454dac2ad546d89c7c6c49831e2eb7e5", size = 3594664 },
     { url = "https://files.pythonhosted.org/packages/35/d3/9354e5620c3839393ff9afe2435f5e42bb63eb829edd93395cb0a3b1aa39/s3torchconnectorclient-1.5.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0f5277d76b4d1e12cd6f96823cf5911c51a7a614acbabb4ee4133d8caa332df1", size = 3747379 },
 ]
@@ -1183,7 +1089,7 @@ name = "sympy"
 version = "1.14.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "mpmath" },
+    { name = "mpmath", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/83/d3/803453b36afefb7c2bb238361cd4ae6125a569b4db67cd9e79846ba2d68c/sympy-1.14.0.tar.gz", hash = "sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517", size = 7793921 }
 wheels = [
@@ -1204,16 +1110,16 @@ name = "tensorboard"
 version = "2.20.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "absl-py" },
-    { name = "grpcio" },
-    { name = "markdown" },
-    { name = "numpy" },
-    { name = "packaging" },
-    { name = "pillow" },
-    { name = "protobuf" },
-    { name = "setuptools" },
-    { name = "tensorboard-data-server" },
-    { name = "werkzeug" },
+    { name = "absl-py", marker = "sys_platform == 'linux'" },
+    { name = "grpcio", marker = "sys_platform == 'linux'" },
+    { name = "markdown", marker = "sys_platform == 'linux'" },
+    { name = "numpy", marker = "sys_platform == 'linux'" },
+    { name = "packaging", marker = "sys_platform == 'linux'" },
+    { name = "pillow", marker = "sys_platform == 'linux'" },
+    { name = "protobuf", marker = "sys_platform == 'linux'" },
+    { name = "setuptools", marker = "sys_platform == 'linux'" },
+    { name = "tensorboard-data-server", marker = "sys_platform == 'linux'" },
+    { name = "werkzeug", marker = "sys_platform == 'linux'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/9c/d9/a5db55f88f258ac669a92858b70a714bbbd5acd993820b41ec4a96a4d77f/tensorboard-2.20.0-py3-none-any.whl", hash = "sha256:9dc9f978cb84c0723acf9a345d96c184f0293d18f166bb8d59ee098e6cfaaba6", size = 5525680 },
@@ -1225,7 +1131,6 @@ version = "0.7.2"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/7a/13/e503968fefabd4c6b2650af21e110aa8466fe21432cd7c43a84577a89438/tensorboard_data_server-0.7.2-py3-none-any.whl", hash = "sha256:7e0610d205889588983836ec05dc098e80f97b7e7bbff7e994ebb78f578d0ddb", size = 2356 },
-    { url = "https://files.pythonhosted.org/packages/b7/85/dabeaf902892922777492e1d253bb7e1264cadce3cea932f7ff599e53fea/tensorboard_data_server-0.7.2-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:9fe5d24221b29625dbc7328b0436ca7fc1c23de4acf4d272f1180856e32f9f60", size = 4823598 },
     { url = "https://files.pythonhosted.org/packages/73/c6/825dab04195756cf8ff2e12698f22513b3db2f64925bdd41671bfb33aaa5/tensorboard_data_server-0.7.2-py3-none-manylinux_2_31_x86_64.whl", hash = "sha256:ef687163c24185ae9754ed5650eb5bc4d84ff257aabdc33f0cc6f74d8ba54530", size = 6590363 },
 ]
 
@@ -1234,33 +1139,31 @@ name = "tensorflow"
 version = "2.20.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "absl-py" },
-    { name = "astunparse" },
-    { name = "flatbuffers" },
-    { name = "gast" },
-    { name = "google-pasta" },
-    { name = "grpcio" },
-    { name = "h5py" },
-    { name = "keras" },
-    { name = "libclang" },
-    { name = "ml-dtypes" },
-    { name = "numpy" },
-    { name = "opt-einsum" },
-    { name = "packaging" },
-    { name = "protobuf" },
-    { name = "requests" },
-    { name = "setuptools" },
-    { name = "six" },
-    { name = "tensorboard" },
-    { name = "termcolor" },
-    { name = "typing-extensions" },
-    { name = "wrapt" },
-]
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/35/31/47712f425c09cc8b8dba39c6c45aee939c4636a6feb8c81376a4eae653e0/tensorflow-2.20.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:52b122f0232fd7ab10f28d537ce08470d0b6dcac7fff9685432daac7f8a06c8f", size = 200540302 },
+    { name = "absl-py", marker = "sys_platform == 'linux'" },
+    { name = "astunparse", marker = "sys_platform == 'linux'" },
+    { name = "flatbuffers", marker = "sys_platform == 'linux'" },
+    { name = "gast", marker = "sys_platform == 'linux'" },
+    { name = "google-pasta", marker = "sys_platform == 'linux'" },
+    { name = "grpcio", marker = "sys_platform == 'linux'" },
+    { name = "h5py", marker = "sys_platform == 'linux'" },
+    { name = "keras", marker = "sys_platform == 'linux'" },
+    { name = "libclang", marker = "sys_platform == 'linux'" },
+    { name = "ml-dtypes", marker = "sys_platform == 'linux'" },
+    { name = "numpy", marker = "sys_platform == 'linux'" },
+    { name = "opt-einsum", marker = "sys_platform == 'linux'" },
+    { name = "packaging", marker = "sys_platform == 'linux'" },
+    { name = "protobuf", marker = "sys_platform == 'linux'" },
+    { name = "requests", marker = "sys_platform == 'linux'" },
+    { name = "setuptools", marker = "sys_platform == 'linux'" },
+    { name = "six", marker = "sys_platform == 'linux'" },
+    { name = "tensorboard", marker = "sys_platform == 'linux'" },
+    { name = "termcolor", marker = "sys_platform == 'linux'" },
+    { name = "typing-extensions", marker = "sys_platform == 'linux'" },
+    { name = "wrapt", marker = "sys_platform == 'linux'" },
+]
+wheels = [
     { url = "https://files.pythonhosted.org/packages/ec/b4/f028a5de27d0fda10ba6145bc76e40c37ff6d2d1e95b601adb5ae17d635e/tensorflow-2.20.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2bfbfb3dd0e22bffc45fe1e922390d27753e99261fab8a882e802cf98a0e078f", size = 259533109 },
     { url = "https://files.pythonhosted.org/packages/9c/d1/6aa15085d672056d5f08b5f28b1c7ce01c4e12149a23b0c98e3c79d04441/tensorflow-2.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:25265b0bc527e0d54b1e9cc60c44a24f44a809fe27666b905f0466471f9c52ec", size = 620682547 },
-    { url = "https://files.pythonhosted.org/packages/f9/37/b97abb360b551fbf5870a0ee07e39ff9c655e6e3e2f839bc88be81361842/tensorflow-2.20.0-cp312-cp312-win_amd64.whl", hash = "sha256:1590cbf87b6bcbd34d8e9ad70d0c696135e0aa71be31803b27358cf7ed63f8fc", size = 331887041 },
 ]
 
 [[package]]
@@ -1279,24 +1182,22 @@ source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "cuda-bindings", marker = "sys_platform == 'linux'" },
     { name = "cuda-toolkit", extra = ["cublas", "cudart", "cufft", "cufile", "cupti", "curand", "cusolver", "cusparse", "nvjitlink", "nvrtc", "nvtx"], marker = "sys_platform == 'linux'" },
-    { name = "filelock" },
-    { name = "fsspec" },
-    { name = "jinja2" },
-    { name = "networkx" },
+    { name = "filelock", marker = "sys_platform == 'linux'" },
+    { name = "fsspec", marker = "sys_platform == 'linux'" },
+    { name = "jinja2", marker = "sys_platform == 'linux'" },
+    { name = "networkx", marker = "sys_platform == 'linux'" },
     { name = "nvidia-cudnn-cu13", marker = "sys_platform == 'linux'" },
     { name = "nvidia-cusparselt-cu13", marker = "sys_platform == 'linux'" },
     { name = "nvidia-nccl-cu13", marker = "sys_platform == 'linux'" },
     { name = "nvidia-nvshmem-cu13", marker = "sys_platform == 'linux'" },
-    { name = "setuptools" },
-    { name = "sympy" },
+    { name = "setuptools", marker = "sys_platform == 'linux'" },
+    { name = "sympy", marker = "sys_platform == 'linux'" },
     { name = "triton", marker = "sys_platform == 'linux'" },
-    { name = "typing-extensions" },
+    { name = "typing-extensions", marker = "sys_platform == 'linux'" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/6f/8b/69e3008d78e5cee2b30183340cc425081b78afc5eff3d080daab0adda9aa/torch-2.11.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4b5866312ee6e52ea625cd211dcb97d6a2cdc1131a5f15cc0d87eec948f6dd34", size = 80606338 },
     { url = "https://files.pythonhosted.org/packages/13/16/42e5915ebe4868caa6bac83a8ed59db57f12e9a61b7d749d584776ed53d5/torch-2.11.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:f99924682ef0aa6a4ab3b1b76f40dc6e273fca09f367d15a524266db100a723f", size = 419731115 },
     { url = "https://files.pythonhosted.org/packages/1a/c9/82638ef24d7877510f83baf821f5619a61b45568ce21c0a87a91576510aa/torch-2.11.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:0f68f4ac6d95d12e896c3b7a912b5871619542ec54d3649cf48cc1edd4dd2756", size = 530712279 },
-    { url = "https://files.pythonhosted.org/packages/1c/ff/6756f1c7ee302f6d202120e0f4f05b432b839908f9071157302cedfc5232/torch-2.11.0-cp312-cp312-win_amd64.whl", hash = "sha256:fbf39280699d1b869f55eac536deceaa1b60bd6788ba74f399cc67e60a5fab10", size = 114556047 },
 ]
 
 [[package]]
@@ -1317,15 +1218,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614 },
 ]
 
-[[package]]
-name = "tzdata"
-version = "2025.3"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/5e/a7/c202b344c5ca7daf398f3b8a477eeb205cf3b6f32e7ec3a6bac0629ca975/tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7", size = 196772 }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/c7/b0/003792df09decd6849a5e39c28b513c06e84436a54440380862b5aeff25d/tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1", size = 348521 },
-]
-
 [[package]]
 name = "urllib3"
 version = "2.6.3"
@@ -1340,7 +1232,7 @@ name = "werkzeug"
 version = "3.1.7"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "markupsafe" },
+    { name = "markupsafe", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/b5/43/76ded108b296a49f52de6bac5192ca1c4be84e886f9b5c9ba8427d9694fd/werkzeug-3.1.7.tar.gz", hash = "sha256:fb8c01fe6ab13b9b7cdb46892b99b1d66754e1d7ab8e542e865ec13f526b5351", size = 875700 }
 wheels = [
@@ -1352,7 +1244,7 @@ name = "wheel"
 version = "0.46.3"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "packaging" },
+    { name = "packaging", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/89/24/a2eb353a6edac9a0303977c4cb048134959dd2a51b48a269dfc9dde00c8a/wheel-0.46.3.tar.gz", hash = "sha256:e3e79874b07d776c40bd6033f8ddf76a7dad46a7b8aa1b2787a83083519a1803", size = 60605 }
 wheels = [
@@ -1365,17 +1257,12 @@ version = "2.1.2"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/2e/64/925f213fdcbb9baeb1530449ac71a4d57fc361c053d06bf78d0c5c7cd80c/wrapt-2.1.2.tar.gz", hash = "sha256:3996a67eecc2c68fd47b4e3c564405a5777367adfd9b8abb58387b63ee83b21e", size = 81678 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/4c/b6/1db817582c49c7fcbb7df6809d0f515af29d7c2fbf57eb44c36e98fb1492/wrapt-2.1.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ff2aad9c4cda28a8f0653fc2d487596458c2a3f475e56ba02909e950a9efa6a9", size = 61255 },
-    { url = "https://files.pythonhosted.org/packages/a2/16/9b02a6b99c09227c93cd4b73acc3678114154ec38da53043c0ddc1fba0dc/wrapt-2.1.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6433ea84e1cfacf32021d2a4ee909554ade7fd392caa6f7c13f1f4bf7b8e8748", size = 61848 },
     { url = "https://files.pythonhosted.org/packages/af/aa/ead46a88f9ec3a432a4832dfedb84092fc35af2d0ba40cd04aea3889f247/wrapt-2.1.2-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:c20b757c268d30d6215916a5fa8461048d023865d888e437fab451139cad6c8e", size = 121433 },
     { url = "https://files.pythonhosted.org/packages/3a/9f/742c7c7cdf58b59085a1ee4b6c37b013f66ac33673a7ef4aaed5e992bc33/wrapt-2.1.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:79847b83eb38e70d93dc392c7c5b587efe65b3e7afcc167aa8abd5d60e8761c8", size = 123013 },
     { url = "https://files.pythonhosted.org/packages/e8/44/2c3dd45d53236b7ed7c646fcf212251dc19e48e599debd3926b52310fafb/wrapt-2.1.2-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f8fba1bae256186a83d1875b2b1f4e2d1242e8fac0f58ec0d7e41b26967b965c", size = 117326 },
     { url = "https://files.pythonhosted.org/packages/74/e2/b17d66abc26bd96f89dec0ecd0ef03da4a1286e6ff793839ec431b9fae57/wrapt-2.1.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e3d3b35eedcf5f7d022291ecd7533321c4775f7b9cd0050a31a68499ba45757c", size = 121444 },
     { url = "https://files.pythonhosted.org/packages/3c/62/e2977843fdf9f03daf1586a0ff49060b1b2fc7ff85a7ea82b6217c1ae36e/wrapt-2.1.2-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:6f2c5390460de57fa9582bc8a1b7a6c86e1a41dfad74c5225fc07044c15cc8d1", size = 116237 },
     { url = "https://files.pythonhosted.org/packages/88/dd/27fc67914e68d740bce512f11734aec08696e6b17641fef8867c00c949fc/wrapt-2.1.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7dfa9f2cf65d027b951d05c662cc99ee3bd01f6e4691ed39848a7a5fffc902b2", size = 120563 },
-    { url = "https://files.pythonhosted.org/packages/ec/9f/b750b3692ed2ef4705cb305bd68858e73010492b80e43d2a4faa5573cbe7/wrapt-2.1.2-cp312-cp312-win32.whl", hash = "sha256:eba8155747eb2cae4a0b913d9ebd12a1db4d860fc4c829d7578c7b989bd3f2f0", size = 58198 },
-    { url = "https://files.pythonhosted.org/packages/8e/b2/feecfe29f28483d888d76a48f03c4c4d8afea944dbee2b0cd3380f9df032/wrapt-2.1.2-cp312-cp312-win_amd64.whl", hash = "sha256:1c51c738d7d9faa0b3601708e7e2eda9bf779e1b601dce6c77411f2a1b324a63", size = 60441 },
-    { url = "https://files.pythonhosted.org/packages/44/e1/e328f605d6e208547ea9fd120804fcdec68536ac748987a68c47c606eea8/wrapt-2.1.2-cp312-cp312-win_arm64.whl", hash = "sha256:c8e46ae8e4032792eb2f677dbd0d557170a8e5524d22acc55199f43efedd39bf", size = 58836 },
     { url = "https://files.pythonhosted.org/packages/1a/c7/8528ac2dfa2c1e6708f647df7ae144ead13f0a31146f43c7264b4942bf12/wrapt-2.1.2-py3-none-any.whl", hash = "sha256:b8fd6fa2b2c4e7621808f8c62e8317f4aae56e59721ad933bac5239d913cf0e8", size = 43993 },
 ]
 
@@ -1385,8 +1272,6 @@ version = "0.25.0"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/fd/aa/3e0508d5a5dd96529cdc5a97011299056e14c6505b678fd58938792794b1/zstandard-0.25.0.tar.gz", hash = "sha256:7713e1179d162cf5c7906da876ec2ccb9c3a9dcbdffef0cc7f70c3667a205f0b", size = 711513 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/82/fc/f26eb6ef91ae723a03e16eddb198abcfce2bc5a42e224d44cc8b6765e57e/zstandard-0.25.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7b3c3a3ab9daa3eed242d6ecceead93aebbb8f5f84318d82cee643e019c4b73b", size = 795738 },
-    { url = "https://files.pythonhosted.org/packages/aa/1c/d920d64b22f8dd028a8b90e2d756e431a5d86194caa78e3819c7bf53b4b3/zstandard-0.25.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:913cbd31a400febff93b564a23e17c3ed2d56c064006f54efec210d586171c00", size = 640436 },
     { url = "https://files.pythonhosted.org/packages/53/6c/288c3f0bd9fcfe9ca41e2c2fbfd17b2097f6af57b62a81161941f09afa76/zstandard-0.25.0-cp312-cp312-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:011d388c76b11a0c165374ce660ce2c8efa8e5d87f34996aa80f9c0816698b64", size = 5343019 },
     { url = "https://files.pythonhosted.org/packages/1e/15/efef5a2f204a64bdb5571e6161d49f7ef0fffdbca953a615efbec045f60f/zstandard-0.25.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6dffecc361d079bb48d7caef5d673c88c8988d3d33fb74ab95b7ee6da42652ea", size = 5063012 },
     { url = "https://files.pythonhosted.org/packages/b7/37/a6ce629ffdb43959e92e87ebdaeebb5ac81c944b6a75c9c47e300f85abdf/zstandard-0.25.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:7149623bba7fdf7e7f24312953bcf73cae103db8cae49f8154dd1eadc8a29ecb", size = 5394148 },
@@ -1399,7 +1284,4 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/1e/73/316e4010de585ac798e154e88fd81bb16afc5c5cb1a72eeb16dd37e8024a/zstandard-0.25.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:f27662e4f7dbf9f9c12391cb37b4c4c3cb90ffbd3b1fb9284dadbbb8935fa708", size = 5433517 },
     { url = "https://files.pythonhosted.org/packages/5b/60/dd0f8cfa8129c5a0ce3ea6b7f70be5b33d2618013a161e1ff26c2b39787c/zstandard-0.25.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:99c0c846e6e61718715a3c9437ccc625de26593fea60189567f0118dc9db7512", size = 5814292 },
     { url = "https://files.pythonhosted.org/packages/fc/5f/75aafd4b9d11b5407b641b8e41a57864097663699f23e9ad4dbb91dc6bfe/zstandard-0.25.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:474d2596a2dbc241a556e965fb76002c1ce655445e4e3bf38e5477d413165ffa", size = 5360237 },
-    { url = "https://files.pythonhosted.org/packages/ff/8d/0309daffea4fcac7981021dbf21cdb2e3427a9e76bafbcdbdf5392ff99a4/zstandard-0.25.0-cp312-cp312-win32.whl", hash = "sha256:23ebc8f17a03133b4426bcc04aabd68f8236eb78c3760f12783385171b0fd8bd", size = 436922 },
-    { url = "https://files.pythonhosted.org/packages/79/3b/fa54d9015f945330510cb5d0b0501e8253c127cca7ebe8ba46a965df18c5/zstandard-0.25.0-cp312-cp312-win_amd64.whl", hash = "sha256:ffef5a74088f1e09947aecf91011136665152e0b4b359c42be3373897fb39b01", size = 506276 },
-    { url = "https://files.pythonhosted.org/packages/ea/6b/8b51697e5319b1f9ac71087b0af9a40d8a6288ff8025c36486e0c12abcc4/zstandard-0.25.0-cp312-cp312-win_arm64.whl", hash = "sha256:181eb40e0b6a29b3cd2849f825e0fa34397f649170673d385f3598ae17cca2e9", size = 462679 },
 ]