EarthyScience · Shane98c · Jun 23, 2026 · Jun 18, 2026 · Jun 18, 2026 · Jun 18, 2026
diff --git a/.prettierignore b/.prettierignore
@@ -0,0 +1,6 @@
+dist
+node_modules
+tests/data
+
+# Vendored third-party source — keep byte-identical to upstream
+src/vendor
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -45,8 +45,7 @@
   "license": "MIT",
   "dependencies": {
     "@msgpack/msgpack": "^3.1.3",
-    "flatbuffers": "^25.9.23",
-    "fzstd": "^0.1.1"
+    "flatbuffers": "^25.9.23"
   },
   "devDependencies": {
     "@types/node": "^20.0.0",

diff --git a/src/format/flatbuffers/manifest-parser.ts b/src/format/flatbuffers/manifest-parser.ts
@@ -5,6 +5,7 @@
  */
 
 import { ByteBuffer } from "flatbuffers";
+import { decompress } from "../../vendor/fzstd/index.js";
 import { Manifest as FbsManifest } from "./generated/manifest.js";
 import { ArrayManifest as FbsArrayManifest } from "./generated/array-manifest.js";
 import { ChunkRef as FbsChunkRef } from "./generated/chunk-ref.js";
@@ -18,6 +19,70 @@ import {
   type ObjectId8,
 } from "./types.js";
 
+/** compression_algorithm value for zstd dictionary-compressed locations. */
+const COMPRESSION_ALG_ZSTD_DICT = 1;
+
+/** Upper bound on a decompressed location, matching the Rust
+ * `MAX_DECOMPRESSED_LOCATION_SIZE`. */
+const MAX_DECOMPRESSED_LOCATION_SIZE = 1024;
+
+/**
+ * Bytes that a zstd decoder allocates up front for `frame`, read from the frame
+ * header: the advertised frame content size, falling back to the window size.
+ * Mirrors the allocation in the vendored fzstd `rzfh`. Returns `Infinity` for a
+ * missing/short/unrecognized header so callers reject rather than trust it.
+ *
+ * Used to bound allocation *before* decompressing an untrusted
+ * `compressed_location`: the size check on the decompressed output alone is too
+ * late, since the decoder would already have allocated per the advertised size.
+ */
+function zstdFrameAllocSize(frame: Uint8Array): number {
+  // Magic number 0xFD2FB528 (little-endian), then the frame header descriptor.
+  if (
+    frame.length < 5 ||
+    frame[0] !== 0x28 ||
+    frame[1] !== 0xb5 ||
+    frame[2] !== 0x2f ||
+    frame[3] !== 0xfd
+  ) {
+    return Infinity;
+  }
+  const flg = frame[4];
+  const singleSegment = (flg >> 5) & 1;
+  const dictIdFlag = flg & 3;
+  const contentSizeFlag = flg >> 6;
+
+  let pos = 5;
+  let windowSize = 0;
+  if (!singleSegment) {
+    if (pos >= frame.length) return Infinity;
+    const wd = frame[pos++];
+    const base = 1 << (10 + (wd >> 3));
+    windowSize = base + (base >> 3) * (wd & 7);
-    const base = 1 << (10 + (wd >> 3));
-    windowSize = base + (base >> 3) * (wd & 7);
+    const base = 2 ** (10 + (wd >> 3));
+    windowSize = base + (base / 8) * (wd & 7);
-    const base = 1 << (10 + (wd >> 3));
-    windowSize = base + (base >> 3) * (wd & 7);
+    const base = 2 ** (10 + (wd >> 3));
+    windowSize = base + (base / 8) * (wd & 7);
+  }
+  pos += dictIdFlag === 3 ? 4 : dictIdFlag; // skip dictionary id
+
+  // Frame content size: 0/1/2/4/8 bytes per the flags (matching fzstd's rzfh).
+  const fcsBytes = contentSizeFlag ? 1 << contentSizeFlag : singleSegment;
+  let contentSize = 0;
+  if (fcsBytes) {
+    if (pos + fcsBytes > frame.length) return Infinity;
+    for (let i = 0; i < fcsBytes; i++) {
+      contentSize += frame[pos + i] * 2 ** (8 * i);
+    }
+    if (contentSizeFlag === 1) contentSize += 256;
+    if (singleSegment) windowSize = contentSize;
+  }
+  return contentSize || windowSize;
+}
+
+/**
+ * Decode a ChunkRef's `compressed_location` byte vector into a location string.
+ * Returns `null` when this manifest has no location dictionary (locations are
+ * stored uncompressed).
+ */
+type LocationDecoder = (compressed: Uint8Array) => string;
+
 /** Parse a Manifest from FlatBuffer data */
 export function parseManifest(data: Uint8Array): Manifest {
   const bb = new ByteBuffer(data);
@@ -30,20 +95,66 @@ export function parseManifest(data: Uint8Array): Manifest {
     idObj.bb!.bytes().slice(idObj.bb_pos, idObj.bb_pos + 12),
   );
 
+  // Build the location decoder once per manifest from its zstd dictionary.
+  const decodeLocation = makeLocationDecoder(fbsManifest);
+
   // Parse arrays
   const arraysLength = fbsManifest.arraysLength();
   const arrays: ArrayManifest[] = [];
   for (let i = 0; i < arraysLength; i++) {
     const fbsArray = fbsManifest.arrays(i);
     if (fbsArray) {
-      arrays.push(parseArrayManifest(fbsArray));
+      arrays.push(parseArrayManifest(fbsArray, decodeLocation));
     }
   }
 
   return { id, arrays };
 }
 
-function parseArrayManifest(fbsArray: FbsArrayManifest): ArrayManifest {
+/**
+ * Build the function that turns a ChunkRef's `compressed_location` into a
+ * location string, using the manifest's zstd dictionary.
+ *
+ * Mirrors the Rust `Manifest::decompressor`: a dictionary is only used when
+ * `compression_algorithm == ZSTD_DICT` and `location_dictionary` is present.
+ * When a `compressed_location` is encountered without a dictionary, decoding
+ * throws (matching the Rust `MissingLocationCompressionDictionary` error).
+ */
+function makeLocationDecoder(fbsManifest: FbsManifest): LocationDecoder {
+  const dictionary =
+    fbsManifest.compressionAlgorithm() === COMPRESSION_ALG_ZSTD_DICT
+      ? fbsManifest.locationDictionaryArray()
+      : null;
+  // `fatal` makes invalid UTF-8 throw, matching the Rust `String::from_utf8`.
+  const textDecoder = new TextDecoder("utf-8", { fatal: true });
+  return (compressed: Uint8Array): string => {
+    if (!dictionary) {
+      throw new Error(
+        "ChunkRef has a compressed_location but the manifest has no location dictionary",
+      );
+    }
+    // Reject before decompressing: a malformed frame can advertise a huge size
+    // and make the decoder allocate far beyond the bound (or OOM) before the
+    // post-decompress length check below would run.
+    if (zstdFrameAllocSize(compressed) > MAX_DECOMPRESSED_LOCATION_SIZE) {
+      throw new Error(
+        `Compressed location declares a size over ${MAX_DECOMPRESSED_LOCATION_SIZE} bytes`,
+      );
+    }
+    const decompressed = decompress(compressed, undefined, dictionary);
+    if (decompressed.length > MAX_DECOMPRESSED_LOCATION_SIZE) {
+      throw new Error(
+        `Decompressed location exceeds ${MAX_DECOMPRESSED_LOCATION_SIZE} bytes`,
+      );
+    }
+    return textDecoder.decode(decompressed);
+  };
+}
+
+function parseArrayManifest(
+  fbsArray: FbsArrayManifest,
+  decodeLocation: LocationDecoder,
+): ArrayManifest {
   // Parse node_id (required)
   const nodeIdObj = fbsArray.nodeId();
   if (!nodeIdObj)
@@ -58,14 +169,17 @@ function parseArrayManifest(fbsArray: FbsArrayManifest): ArrayManifest {
   for (let i = 0; i < refsLength; i++) {
     const fbsRef = fbsArray.refs(i);
     if (fbsRef) {
-      refs.push(parseChunkRef(fbsRef));
+      refs.push(parseChunkRef(fbsRef, decodeLocation));
     }
   }
 
   return { nodeId, refs };
 }
 
-function parseChunkRef(fbsRef: FbsChunkRef): ChunkRef {
+function parseChunkRef(
+  fbsRef: FbsChunkRef,
+  decodeLocation: LocationDecoder,
+): ChunkRef {
   // Parse index (required, vector of uint32)
   const indexLength = fbsRef.indexLength();
   const index: number[] = [];
@@ -89,8 +203,18 @@ function parseChunkRef(fbsRef: FbsChunkRef): ChunkRef {
       )
     : null;
 
-  // Parse location (optional string)
-  const location = fbsRef.location();
+  // Resolve the virtual location. Per the Rust `ref_to_payload` priority
+  // (chunk_id -> compressed_location -> location), a dictionary-compressed
+  // location takes precedence over the plain `location` string and is decoded
+  // here at parse time. `chunk_id` refs are native, not virtual, so they never
+  // carry a location — skip decoding for them.
+  let location = fbsRef.location();
+  if (chunkId === null) {
+    const compressed = fbsRef.compressedLocationArray();
+    if (compressed) {
+      location = decodeLocation(compressed);
+    }
+  }
 
   // Parse checksum fields
   const checksumEtag = fbsRef.checksumEtag();
@@ -193,12 +317,14 @@ function binarySearchChunkRef(
   return null;
 }
 
-/** Extract the payload type from a ChunkRef */
+/**
+ * Extract the payload type from a ChunkRef.
+ *
+ * Priority matches the Rust `ref_to_payload`: chunk_id (native) ->
+ * location (virtual; already decoded from compressed_location at parse time)
+ * -> inline.
+ */
 export function getChunkPayload(ref: ChunkRef): ChunkPayload {
-  if (ref.inline !== null) {
-    return { type: "inline", data: ref.inline };
-  }
-
   if (ref.chunkId !== null) {
     return {
       type: "native",
@@ -219,5 +345,9 @@ export function getChunkPayload(ref: ChunkRef): ChunkPayload {
     };
   }
 
-  throw new Error("Invalid ChunkRef: no inline, chunkId, or location");
+  if (ref.inline !== null) {
+    return { type: "inline", data: ref.inline };
+  }
+
+  throw new Error("Invalid ChunkRef: no chunkId, location, or inline");
 }
diff --git a/src/format/flatbuffers/repo-parser.ts b/src/format/flatbuffers/repo-parser.ts
@@ -7,7 +7,7 @@
 import { ByteBuffer } from "flatbuffers";
 import * as flexbuffers from "flatbuffers/js/flexbuffers.js";
 import { Repo as FbsRepo } from "./generated/repo.js";
-import { decompress } from "fzstd";
+import { decompress } from "../../vendor/fzstd/index.js";
 import {
   parseHeader,
   getDataAfterHeader,

diff --git a/src/reader/session.ts b/src/reader/session.ts
@@ -7,7 +7,7 @@ import type {
   Storage,
   RequestOptions,
 } from "../storage/storage.js";
-import { decompress } from "fzstd";
+import { decompress } from "../vendor/fzstd/index.js";
 import { LRUCache } from "../cache/lru.js";
 import { singleFlight, type SingleFlight } from "../cache/single-flight.js";
 import {

diff --git a/src/vendor/fzstd/LICENSE b/src/vendor/fzstd/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2020 Arjun Barrett
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/src/vendor/fzstd/README.md b/src/vendor/fzstd/README.md
@@ -0,0 +1,86 @@
+# Vendored `fzstd`
+
+This directory contains a vendored copy of [`fzstd`](https://github.com/101arrowz/fzstd)
+(MIT, © 2020 Arjun Barrett) used for zstd decompression — both the outer
+metadata-file decompression and the dictionary-compressed virtual chunk
+locations in manifests.
+
+## Why it's vendored (not an npm dependency)
+
+- **Released `fzstd` (0.1.1) has no dictionary support.** Its `decompress()`
+  takes no dictionary argument, and icechunk's `compressed_location` fields are
+  zstd frames compressed against a per-manifest dictionary — undecodable
+  without it. Dictionary support exists only in the unmerged
+  [fzstd#18](https://github.com/101arrowz/fzstd/pull/18).
+- **The PR branch can't be cleanly `npm install`ed** — its `package.json`
+  `main`/`module` point at build outputs that aren't committed, and there's no
+  `prepare` hook, so a git/`github:` install resolves to missing files.
+- **`zstdify` (the other dictionary-capable JS zstd lib) silently corrupts
+  data** — it returned wrong bytes (same length, no error) for ~32% of real
+  icechunk metadata fixtures, at both 1.4.0 and `main` (the PR#3 "fix" does not
+  address it). `fzstd` is byte-exact vs the reference `zstd` CLI on all of them.
+
+So we vendor the dictionary-capable source and decompress with it everywhere.
+
+## Upstream source
+
+| | |
+|---|---|
+| Repo | `101arrowz/fzstd` |
+| Branch | `handlerug:push-qxkytkvukoqs` (= the head of fzstd#18) |
+| Commit | `c039e52` "Support loading a dictionary for decompression" |
+| File | `src/index.ts` → `index.ts` here (with a header prepended) |
+| License | `LICENSE` (copied verbatim) |
+
+`index.ts` is `@ts-nocheck`'d and excluded from Prettier (`.prettierignore`) to
+stay byte-identical to upstream apart from the documented patch below.
+
+## Local modifications
+
+There is **one** deliberate change from upstream, marked in `index.ts` with a
+`LOCAL PATCH (icechunk-js)` comment (and summarized in the file header):
+
+- **Dictionary/output-buffer aliasing fix** — in `decompress()`, the
+  no-output-buffer fast path is guarded with `!dic`:
+  `if (!dic && st.w.length == st.u)`. When a dictionary is loaded, `rdic()`
+  replaces `st.w` with the dictionary history; reusing it as the output buffer
+  aliases back-reference reads with output writes and silently corrupts the
+  result whenever the dictionary-content length equals the decompressed size.
+  - Submitted upstream: **https://github.com/handlerug/fzstd/pull/1**
+    (against the fzstd#18 branch). When it merges and a release is published,
+    drop this patch on the next re-vendor.
+  - Regression coverage: `tests/format/flatbuffers/manifest-dictionary.test.ts`
+    ("decodes a frame whose output aliases the dictionary buffer").
+
+> Note: the size-bound (`MAX_DECOMPRESSED_LOCATION_SIZE`) and UTF-8 strictness
+> guards are **not** here — they live in the consumer
+> (`src/format/flatbuffers/manifest-parser.ts`), since bounding untrusted input
+> is the caller's job, not the decoder's.
+
+## Re-vendoring (updating the copy)
+
+```sh
+# Fetch the upstream source for the pinned branch/commit:
+gh api "repos/handlerug/fzstd/contents/src/index.ts?ref=push-qxkytkvukoqs" \
+  --jq '.content' | base64 -d > /tmp/fzstd-index.ts
+```
+
+Then:
+1. Re-prepend the provenance header (top of the current `index.ts`).
+2. **Re-apply the local patch** — add `!dic &&` to the fast-path guard in
+   `decompress()` (search for `st.w.length == st.u`).
+3. Re-run the suite: `npm test` (the aliasing + dictionary tests must pass).
+
+## When this can be deleted
+
+Once fzstd#18 **and** the aliasing fix land upstream and a release is published
+to npm: delete this directory, add `fzstd@<version>` to `dependencies`, and
+repoint the four imports (`src/reader/session.ts`,
+`src/format/flatbuffers/{repo,manifest}-parser.ts`, and the two parser tests).
+
+## Related
+
+- Consumer: `src/format/flatbuffers/manifest-parser.ts` (dictionary decode,
+  decode priority matching the Rust `ref_to_payload`, size/UTF-8 guards).
+- Test fixtures: `tests/fixtures/dictionary/` (generated with the `zstd` CLI;
+  generation commands documented in the test file headers).