From 60e7d6da4152986cf71af3d4b86950295ca13cbe Mon Sep 17 00:00:00 2001
From: George Petrakis
Date: Thu, 21 May 2026 10:08:38 +0300
Subject: [PATCH] fix(vector-index): respect Buffer slice metadata in base64 round-trip
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Buffer.from(b64, "base64") may return a slice of Node's internal pool
(poolSize=8192). Calling new Float32Array(buf.buffer) ignores byteOffset
and byteLength, producing a 2048-element view over the whole pool
instead of the actual decoded slice — the root cause of the phantom
"dimensions seen on disk: 2048" crashes reported in #455 and #469. Pass
byteOffset/byteLength to both helpers so the Float32Array covers only
the decoded bytes. Add a regression test that serializes and
deserializes 384-dim vectors (1536 bytes, within pool threshold) and
asserts correct dimension and nearest-neighbour identity after reload.

Fixes #584

Signed-off-by: George Petrakis
---
 src/state/vector-index.ts |  5 +++--
 test/vector-index.test.ts | 26 ++++++++++++++++++++++++++
 2 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/src/state/vector-index.ts b/src/state/vector-index.ts
index 03c8248e..b37c2cb8 100644
--- a/src/state/vector-index.ts
+++ b/src/state/vector-index.ts
@@ -1,9 +1,10 @@
 function float32ToBase64(arr: Float32Array): string {
-  return Buffer.from(arr.buffer).toString("base64");
+  return Buffer.from(arr.buffer, arr.byteOffset, arr.byteLength).toString("base64");
 }
 
 function base64ToFloat32(b64: string): Float32Array {
-  return new Float32Array(Buffer.from(b64, "base64").buffer);
+  const buf = Buffer.from(b64, "base64");
+  return new Float32Array(buf.buffer, buf.byteOffset, buf.byteLength / 4);
 }
 
 function cosineSimilarity(a: Float32Array, b: Float32Array): number {
diff --git a/test/vector-index.test.ts b/test/vector-index.test.ts
index 7415e557..8decde32 100644
--- a/test/vector-index.test.ts
+++ b/test/vector-index.test.ts
@@ -76,4 +76,30 @@ describe("VectorIndex", () => {
     const results = index.search(new Float32Array([1, 0, 0]));
     expect(results[0].score).toBe(0);
   });
+
+  it("serialize round-trip preserves dimension and values for pooled-buffer sizes", () => {
+    // 384 floats = 1536 bytes, small enough for Node to pool the decoded Buffer.
+    // Without the byteOffset/byteLength fix, deserialized length becomes 2048
+    // and values read from pool offset 0 instead of the slice.
+    const DIM = 384;
+    const vecs = Array.from({ length: 5 }, (_, n) => {
+      const v = new Float32Array(DIM);
+      for (let i = 0; i < DIM; i++) v[i] = n * 1000 + i;
+      return v;
+    });
+
+    vecs.forEach((v, n) => index.add(`obs_${n}`, "ses_1", v));
+    const restored = VectorIndex.deserialize(index.serialize());
+
+    expect(restored.size).toBe(5);
+    // Dimension must be exactly DIM, not 2048 (Buffer.poolSize / 4).
+    const { mismatches } = restored.validateDimensions(DIM);
+    expect(mismatches).toEqual([]);
+    // Each vector must be its own nearest neighbour after reload.
+    for (let n = 0; n < 5; n++) {
+      const results = restored.search(vecs[n], 1);
+      expect(results[0].obsId).toBe(`obs_${n}`);
+      expect(results[0].score).toBeCloseTo(1.0, 4);
+    }
+  });
 });