Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 16 additions & 2 deletions src/state/vector-index.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,23 @@
// Pass byteOffset + byteLength explicitly so the round-trip survives
// Node's Buffer pool. Buffer.from(b64, "base64") returns a slice of a
// shared 8KB pool (poolSize), and `new Float32Array(buf.buffer)` ignores
// the slice metadata — it would mint a 2048-element view over the whole
// pool. Same risk on the encode side if the input Float32Array is itself
// a sliced view. Reported as a phantom "2048 dimensions on disk" crash
// in #455 / #469 / #584 / #587.
function float32ToBase64(arr: Float32Array): string {
return Buffer.from(arr.buffer).toString("base64");
return Buffer.from(arr.buffer, arr.byteOffset, arr.byteLength).toString(
"base64",
);
}

function base64ToFloat32(b64: string): Float32Array {
return new Float32Array(Buffer.from(b64, "base64").buffer);
const buf = Buffer.from(b64, "base64");
return new Float32Array(
buf.buffer,
buf.byteOffset,
buf.byteLength / Float32Array.BYTES_PER_ELEMENT,
);
}

function cosineSimilarity(a: Float32Array, b: Float32Array): number {
Expand Down
39 changes: 39 additions & 0 deletions test/vector-index.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -76,4 +76,43 @@ describe("VectorIndex", () => {
const results = index.search(new Float32Array([1, 0, 0]));
expect(results[0].score).toBe(0);
});

it("round-trip preserves dim + identity for pooled-Buffer sizes (#587)", () => {
// 384-dim floats = 1536 bytes, comfortably inside Node's 8KB Buffer
// pool. Without explicit byteOffset/byteLength in the base64 round-trip,
// deserialise reads pool offset 0 and reports the entire pool as a
// 2048-element view, which the live index then rejects with
// "dimensions seen on disk: 2048".
const DIM = 384;
const vecs = Array.from({ length: 5 }, (_, n) => {
const v = new Float32Array(DIM);
for (let i = 0; i < DIM; i++) v[i] = n * 1000 + i;
return v;
});
vecs.forEach((v, n) => index.add(`obs_${n}`, "ses_1", v));

const restored = VectorIndex.deserialize(index.serialize());
expect(restored.size).toBe(5);
const { mismatches } = restored.validateDimensions(DIM);
expect(mismatches).toEqual([]);
for (let n = 0; n < 5; n++) {
const results = restored.search(vecs[n], 1);
expect(results[0].obsId).toBe(`obs_${n}`);
expect(results[0].score).toBeCloseTo(1.0, 4);
}
});

it("preserves bytes when source Float32Array is itself a sliced view (#587)", () => {
// The encode side has the same risk: passing arr.buffer drops the
// slice metadata if arr is a sub-view (subarray / typedArray.set).
const backing = new Float32Array(8);
for (let i = 0; i < 8; i++) backing[i] = i;
const slice = backing.subarray(2, 6); // values 2, 3, 4, 5

index.add("obs_slice", "ses_1", slice);
const restored = VectorIndex.deserialize(index.serialize());
const results = restored.search(new Float32Array([2, 3, 4, 5]), 1);
expect(results[0].obsId).toBe("obs_slice");
expect(results[0].score).toBeCloseTo(1.0, 4);
});
});