From a395f92a7754c8396a18637943017fb15db47b7f Mon Sep 17 00:00:00 2001 From: Rohit Ghumare Date: Wed, 27 May 2026 17:52:46 +0100 Subject: [PATCH] fix(vector-index): preserve byteOffset + byteLength in base64 round-trip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Buffer.from(b64, 'base64') returns a slice of Node's shared 8KB pool for payloads under 4KB, and new Float32Array(buf.buffer) ignores byteOffset/byteLength — minting a 2048-element view over the entire pool. Same hazard on the encode side when the source Float32Array is itself a sub-view (e.g. .subarray() or a view constructed at an offset into a larger buffer). The encode path now passes byteOffset/byteLength explicitly; decode mints the view at the correct offset with length scaled by Float32Array.BYTES_PER_ELEMENT. Reported as 'dimensions seen on disk: 2048' index-startup crashes in #455 / #469 / #584 / #587. Two regression tests added: - 384-dim x 5 vectors round-trip (within pool threshold, hits the decode bug) - subarray sub-view encode (hits the encode bug) --- src/state/vector-index.ts | 18 ++++++++++++++++-- test/vector-index.test.ts | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 2 deletions(-) diff --git a/src/state/vector-index.ts b/src/state/vector-index.ts index 03c8248e..d4b8bda7 100644 --- a/src/state/vector-index.ts +++ b/src/state/vector-index.ts @@ -1,9 +1,23 @@ +// Pass byteOffset + byteLength explicitly so the round-trip survives +// Node's Buffer pool. Buffer.from(b64, "base64") returns a slice of a +// shared 8KB pool (poolSize), and `new Float32Array(buf.buffer)` ignores +// the slice metadata — it would mint a 2048-element view over the whole +// pool. Same risk on the encode side if the input Float32Array is itself +// a subarray view. Reported as a phantom "2048 dimensions on disk" crash +// in #455 / #469 / #584 / #587. 
function float32ToBase64(arr: Float32Array): string { - return Buffer.from(arr.buffer).toString("base64"); + return Buffer.from(arr.buffer, arr.byteOffset, arr.byteLength).toString( + "base64", + ); } function base64ToFloat32(b64: string): Float32Array { - return new Float32Array(Buffer.from(b64, "base64").buffer); + const buf = Buffer.from(b64, "base64"); + return new Float32Array( + buf.buffer, + buf.byteOffset, + buf.byteLength / Float32Array.BYTES_PER_ELEMENT, + ); } function cosineSimilarity(a: Float32Array, b: Float32Array): number { diff --git a/test/vector-index.test.ts b/test/vector-index.test.ts index 7415e557..317f4f03 100644 --- a/test/vector-index.test.ts +++ b/test/vector-index.test.ts @@ -76,4 +76,43 @@ describe("VectorIndex", () => { const results = index.search(new Float32Array([1, 0, 0])); expect(results[0].score).toBe(0); }); + + it("round-trip preserves dim + identity for pooled-Buffer sizes (#587)", () => { + // 384-dim floats = 1536 bytes, comfortably inside Node's 8KB Buffer + // pool. Without explicit byteOffset/byteLength in the base64 round-trip, + // deserialise reads pool offset 0 and reports the entire pool as a + // 2048-element view, which the live index then rejects with + // "dimensions seen on disk: 2048". 
+ const DIM = 384; + const vecs = Array.from({ length: 5 }, (_, n) => { + const v = new Float32Array(DIM); + for (let i = 0; i < DIM; i++) v[i] = n * 1000 + i; + return v; + }); + vecs.forEach((v, n) => index.add(`obs_${n}`, "ses_1", v)); + + const restored = VectorIndex.deserialize(index.serialize()); + expect(restored.size).toBe(5); + const { mismatches } = restored.validateDimensions(DIM); + expect(mismatches).toEqual([]); + for (let n = 0; n < 5; n++) { + const results = restored.search(vecs[n], 1); + expect(results[0].obsId).toBe(`obs_${n}`); + expect(results[0].score).toBeCloseTo(1.0, 4); + } + }); + + it("preserves bytes when source Float32Array is itself a sliced view (#587)", () => { + // The encode side has the same risk: passing arr.buffer drops the + // slice metadata if arr is a sub-view (subarray / typedArray.set). + const backing = new Float32Array(8); + for (let i = 0; i < 8; i++) backing[i] = i; + const slice = backing.subarray(2, 6); // values 2, 3, 4, 5 + + index.add("obs_slice", "ses_1", slice); + const restored = VectorIndex.deserialize(index.serialize()); + const results = restored.search(new Float32Array([2, 3, 4, 5]), 1); + expect(results[0].obsId).toBe("obs_slice"); + expect(results[0].score).toBeCloseTo(1.0, 4); + }); });