Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .prettierignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
dist
node_modules
tests/data

# Vendored third-party source — keep byte-identical to upstream
src/vendor
9 changes: 1 addition & 8 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,7 @@
"license": "MIT",
"dependencies": {
"@msgpack/msgpack": "^3.1.3",
"flatbuffers": "^25.9.23",
"fzstd": "^0.1.1"
"flatbuffers": "^25.9.23"
},
"devDependencies": {
"@types/node": "^20.0.0",
Expand Down
154 changes: 142 additions & 12 deletions src/format/flatbuffers/manifest-parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
*/

import { ByteBuffer } from "flatbuffers";
import { decompress } from "../../vendor/fzstd/index.js";
import { Manifest as FbsManifest } from "./generated/manifest.js";
import { ArrayManifest as FbsArrayManifest } from "./generated/array-manifest.js";
import { ChunkRef as FbsChunkRef } from "./generated/chunk-ref.js";
Expand All @@ -18,6 +19,70 @@ import {
type ObjectId8,
} from "./types.js";

/** compression_algorithm value for zstd dictionary-compressed locations. */
const COMPRESSION_ALG_ZSTD_DICT = 1;

/** Upper bound on a decompressed location, matching the Rust
* `MAX_DECOMPRESSED_LOCATION_SIZE`. */
const MAX_DECOMPRESSED_LOCATION_SIZE = 1024;

/**
* Bytes that a zstd decoder allocates up front for `frame`, read from the frame
* header: the advertised frame content size, falling back to the window size.
* Mirrors the allocation in the vendored fzstd `rzfh`. Returns `Infinity` for a
* missing/short/unrecognized header so callers reject rather than trust it.
*
* Used to bound allocation *before* decompressing an untrusted
* `compressed_location`: the size check on the decompressed output alone is too
* late, since the decoder would already have allocated per the advertised size.
*/
function zstdFrameAllocSize(frame: Uint8Array): number {
// Magic number 0xFD2FB528 (little-endian), then the frame header descriptor.
if (
frame.length < 5 ||
frame[0] !== 0x28 ||
frame[1] !== 0xb5 ||
frame[2] !== 0x2f ||
frame[3] !== 0xfd
) {
return Infinity;
}
const flg = frame[4];
const singleSegment = (flg >> 5) & 1;
const dictIdFlag = flg & 3;
const contentSizeFlag = flg >> 6;

let pos = 5;
let windowSize = 0;
if (!singleSegment) {
if (pos >= frame.length) return Infinity;
const wd = frame[pos++];
const base = 1 << (10 + (wd >> 3));
windowSize = base + (base >> 3) * (wd & 7);

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

security-high high

In JavaScript, bitwise operators (like << and >>) operate on 32-bit signed integers. If the window descriptor wd has a large exponent (e.g., wd >> 3 >= 21), 10 + (wd >> 3) will be >= 31, causing 1 << (10 + (wd >> 3)) to overflow or wrap around to a negative or incorrect value. Similarly, base >> 3 will overflow if base is >= 2^31.

To prevent this and ensure correct handling of large window sizes (especially when validating untrusted input), use standard arithmetic exponentiation (2 ** ...) and division (/ 8) instead of bitwise shifts.

Suggested change
const base = 1 << (10 + (wd >> 3));
windowSize = base + (base >> 3) * (wd & 7);
const base = 2 ** (10 + (wd >> 3));
windowSize = base + (base / 8) * (wd & 7);

}
pos += dictIdFlag === 3 ? 4 : dictIdFlag; // skip dictionary id

// Frame content size: 0/1/2/4/8 bytes per the flags (matching fzstd's rzfh).
const fcsBytes = contentSizeFlag ? 1 << contentSizeFlag : singleSegment;
let contentSize = 0;
if (fcsBytes) {
if (pos + fcsBytes > frame.length) return Infinity;
for (let i = 0; i < fcsBytes; i++) {
contentSize += frame[pos + i] * 2 ** (8 * i);
}
if (contentSizeFlag === 1) contentSize += 256;
if (singleSegment) windowSize = contentSize;
}
return contentSize || windowSize;
}

/**
* Decode a ChunkRef's `compressed_location` byte vector into a location string.
* Returns `null` when this manifest has no location dictionary (locations are
* stored uncompressed).
*/
type LocationDecoder = (compressed: Uint8Array) => string;

/** Parse a Manifest from FlatBuffer data */
export function parseManifest(data: Uint8Array): Manifest {
const bb = new ByteBuffer(data);
Expand All @@ -30,20 +95,66 @@ export function parseManifest(data: Uint8Array): Manifest {
idObj.bb!.bytes().slice(idObj.bb_pos, idObj.bb_pos + 12),
);

// Build the location decoder once per manifest from its zstd dictionary.
const decodeLocation = makeLocationDecoder(fbsManifest);

// Parse arrays
const arraysLength = fbsManifest.arraysLength();
const arrays: ArrayManifest[] = [];
for (let i = 0; i < arraysLength; i++) {
const fbsArray = fbsManifest.arrays(i);
if (fbsArray) {
arrays.push(parseArrayManifest(fbsArray));
arrays.push(parseArrayManifest(fbsArray, decodeLocation));
}
}

return { id, arrays };
}

function parseArrayManifest(fbsArray: FbsArrayManifest): ArrayManifest {
/**
* Build the function that turns a ChunkRef's `compressed_location` into a
* location string, using the manifest's zstd dictionary.
*
* Mirrors the Rust `Manifest::decompressor`: a dictionary is only used when
* `compression_algorithm == ZSTD_DICT` and `location_dictionary` is present.
* When a `compressed_location` is encountered without a dictionary, decoding
* throws (matching the Rust `MissingLocationCompressionDictionary` error).
*/
function makeLocationDecoder(fbsManifest: FbsManifest): LocationDecoder {
const dictionary =
fbsManifest.compressionAlgorithm() === COMPRESSION_ALG_ZSTD_DICT
? fbsManifest.locationDictionaryArray()
: null;
// `fatal` makes invalid UTF-8 throw, matching the Rust `String::from_utf8`.
const textDecoder = new TextDecoder("utf-8", { fatal: true });
return (compressed: Uint8Array): string => {
if (!dictionary) {
throw new Error(
"ChunkRef has a compressed_location but the manifest has no location dictionary",
);
}
// Reject before decompressing: a malformed frame can advertise a huge size
// and make the decoder allocate far beyond the bound (or OOM) before the
// post-decompress length check below would run.
if (zstdFrameAllocSize(compressed) > MAX_DECOMPRESSED_LOCATION_SIZE) {
throw new Error(
`Compressed location declares a size over ${MAX_DECOMPRESSED_LOCATION_SIZE} bytes`,
);
}
const decompressed = decompress(compressed, undefined, dictionary);
if (decompressed.length > MAX_DECOMPRESSED_LOCATION_SIZE) {
throw new Error(
`Decompressed location exceeds ${MAX_DECOMPRESSED_LOCATION_SIZE} bytes`,
);
}
return textDecoder.decode(decompressed);
};
}

function parseArrayManifest(
fbsArray: FbsArrayManifest,
decodeLocation: LocationDecoder,
): ArrayManifest {
// Parse node_id (required)
const nodeIdObj = fbsArray.nodeId();
if (!nodeIdObj)
Expand All @@ -58,14 +169,17 @@ function parseArrayManifest(fbsArray: FbsArrayManifest): ArrayManifest {
for (let i = 0; i < refsLength; i++) {
const fbsRef = fbsArray.refs(i);
if (fbsRef) {
refs.push(parseChunkRef(fbsRef));
refs.push(parseChunkRef(fbsRef, decodeLocation));
}
}

return { nodeId, refs };
}

function parseChunkRef(fbsRef: FbsChunkRef): ChunkRef {
function parseChunkRef(
fbsRef: FbsChunkRef,
decodeLocation: LocationDecoder,
): ChunkRef {
// Parse index (required, vector of uint32)
const indexLength = fbsRef.indexLength();
const index: number[] = [];
Expand All @@ -89,8 +203,18 @@ function parseChunkRef(fbsRef: FbsChunkRef): ChunkRef {
)
: null;

// Parse location (optional string)
const location = fbsRef.location();
// Resolve the virtual location. Per the Rust `ref_to_payload` priority
// (chunk_id -> compressed_location -> location), a dictionary-compressed
// location takes precedence over the plain `location` string and is decoded
// here at parse time. `chunk_id` refs are native, not virtual, so they never
// carry a location — skip decoding for them.
let location = fbsRef.location();
if (chunkId === null) {
const compressed = fbsRef.compressedLocationArray();
if (compressed) {
location = decodeLocation(compressed);
}
}

// Parse checksum fields
const checksumEtag = fbsRef.checksumEtag();
Expand Down Expand Up @@ -193,12 +317,14 @@ function binarySearchChunkRef(
return null;
}

/** Extract the payload type from a ChunkRef */
/**
* Extract the payload type from a ChunkRef.
*
* Priority matches the Rust `ref_to_payload`: chunk_id (native) ->
* location (virtual; already decoded from compressed_location at parse time)
* -> inline.
*/
export function getChunkPayload(ref: ChunkRef): ChunkPayload {
if (ref.inline !== null) {
return { type: "inline", data: ref.inline };
}

if (ref.chunkId !== null) {
return {
type: "native",
Expand All @@ -219,5 +345,9 @@ export function getChunkPayload(ref: ChunkRef): ChunkPayload {
};
}

throw new Error("Invalid ChunkRef: no inline, chunkId, or location");
if (ref.inline !== null) {
return { type: "inline", data: ref.inline };
}

throw new Error("Invalid ChunkRef: no chunkId, location, or inline");
}
2 changes: 1 addition & 1 deletion src/format/flatbuffers/repo-parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import { ByteBuffer } from "flatbuffers";
import * as flexbuffers from "flatbuffers/js/flexbuffers.js";
import { Repo as FbsRepo } from "./generated/repo.js";
import { decompress } from "fzstd";
import { decompress } from "../../vendor/fzstd/index.js";
import {
parseHeader,
getDataAfterHeader,
Expand Down
2 changes: 1 addition & 1 deletion src/reader/session.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import type {
Storage,
RequestOptions,
} from "../storage/storage.js";
import { decompress } from "fzstd";
import { decompress } from "../vendor/fzstd/index.js";
import { LRUCache } from "../cache/lru.js";
import { singleFlight, type SingleFlight } from "../cache/single-flight.js";
import {
Expand Down
21 changes: 21 additions & 0 deletions src/vendor/fzstd/LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2020 Arjun Barrett

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
86 changes: 86 additions & 0 deletions src/vendor/fzstd/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# Vendored `fzstd`

This directory contains a vendored copy of [`fzstd`](https://github.com/101arrowz/fzstd)
(MIT, © 2020 Arjun Barrett) used for zstd decompression — both the outer
metadata-file decompression and the dictionary-compressed virtual chunk
locations in manifests.

## Why it's vendored (not an npm dependency)

- **Released `fzstd` (0.1.1) has no dictionary support.** Its `decompress()`
takes no dictionary argument, and icechunk's `compressed_location` fields are
zstd frames compressed against a per-manifest dictionary — undecodable
without it. Dictionary support exists only in the unmerged
[fzstd#18](https://github.com/101arrowz/fzstd/pull/18).
- **The PR branch can't be cleanly `npm install`ed** — its `package.json`
`main`/`module` point at build outputs that aren't committed, and there's no
`prepare` hook, so a git/`github:` install resolves to missing files.
- **`zstdify` (the other dictionary-capable JS zstd lib) silently corrupts
data** — it returned wrong bytes (same length, no error) for ~32% of real
icechunk metadata fixtures, at both 1.4.0 and `main` (the PR#3 "fix" does not
address it). `fzstd` is byte-exact vs the reference `zstd` CLI on all of them.

So we vendor the dictionary-capable source and decompress with it everywhere.

## Upstream source

| | |
|---|---|
| Repo | `101arrowz/fzstd` |
| Branch | `handlerug:push-qxkytkvukoqs` (= the head of fzstd#18) |
| Commit | `c039e52` "Support loading a dictionary for decompression" |
| File | `src/index.ts` → `index.ts` here (with a header prepended) |
| License | `LICENSE` (copied verbatim) |

`index.ts` is `@ts-nocheck`'d and excluded from Prettier (`.prettierignore`) to
stay byte-identical to upstream apart from the documented patch below.

## Local modifications

There is **one** deliberate change from upstream, marked in `index.ts` with a
`LOCAL PATCH (icechunk-js)` comment (and summarized in the file header):

- **Dictionary/output-buffer aliasing fix** — in `decompress()`, the
no-output-buffer fast path is guarded with `!dic`:
`if (!dic && st.w.length == st.u)`. When a dictionary is loaded, `rdic()`
replaces `st.w` with the dictionary history; reusing it as the output buffer
aliases back-reference reads with output writes and silently corrupts the
result whenever the dictionary-content length equals the decompressed size.
- Submitted upstream: **https://github.com/handlerug/fzstd/pull/1**
(against the fzstd#18 branch). When it merges and a release is published,
drop this patch on the next re-vendor.
- Regression coverage: `tests/format/flatbuffers/manifest-dictionary.test.ts`
("decodes a frame whose output aliases the dictionary buffer").

> Note: the size-bound (`MAX_DECOMPRESSED_LOCATION_SIZE`) and UTF-8 strictness
> guards are **not** here — they live in the consumer
> (`src/format/flatbuffers/manifest-parser.ts`), since bounding untrusted input
> is the caller's job, not the decoder's.

## Re-vendoring (updating the copy)

```sh
# Fetch the upstream source for the pinned branch/commit:
gh api "repos/handlerug/fzstd/contents/src/index.ts?ref=push-qxkytkvukoqs" \
--jq '.content' | base64 -d > /tmp/fzstd-index.ts
```

Then:
1. Re-prepend the provenance header (top of the current `index.ts`).
2. **Re-apply the local patch** — add `!dic &&` to the fast-path guard in
`decompress()` (search for `st.w.length == st.u`).
3. Re-run the suite: `npm test` (the aliasing + dictionary tests must pass).

## When this can be deleted

Once fzstd#18 **and** the aliasing fix land upstream and a release is published
to npm: delete this directory, add `fzstd@<version>` to `dependencies`, and
repoint the four imports (`src/reader/session.ts`,
`src/format/flatbuffers/{repo,manifest}-parser.ts`, and the two parser tests).

## Related

- Consumer: `src/format/flatbuffers/manifest-parser.ts` (dictionary decode,
decode priority matching the Rust `ref_to_payload`, size/UTF-8 guards).
- Test fixtures: `tests/fixtures/dictionary/` (generated with the `zstd` CLI;
generation commands documented in the test file headers).
Loading
Loading