diff --git a/README.md b/README.md index 1e8cff7..651bb9d 100644 --- a/README.md +++ b/README.md @@ -36,9 +36,11 @@ npm install @textfilters/core ```ts import { createCachedTextProcessor, + createPreparedText, createTextRangePipeline, createTextPipeline, lowerNfkc, + type AllocationAwareRangeScanner, type TextCensor, type TextRangeScanner, } from "@textfilters/core"; @@ -63,15 +65,31 @@ const scanner: TextRangeScanner = ({ text }) => const rangeSafeText = createTextRangePipeline() .use(scanner) .censor("secret message"); + +const allocationAwareScanner: AllocationAwareRangeScanner = { + allocationAware: true, + check: (input) => input.hints.hasDot, + scan: (input, sink) => { + const index = input.codePoints.indexOf("."); + if (index >= 0) sink({ range: [index, index + 1] }); + }, +}; + +const prepared = createPreparedText("a.b"); +const hasRange = allocationAwareScanner.check(prepared); ``` ## API - `createTextPipeline()` - `createTextRangePipeline()` +- `checkTextRanges(value, scanners)` +- `createPreparedText(value)` +- `createTextHints(text, codePoints)` - `createTextScanInput(value)` - `createTextRangeScanResult(ranges, metadata)` - `runTextRangeScanner(scanner, input)` +- `scanPreparedTextRanges(scanner, input, sink)` - `scanTextRanges(value, scanners)` - `createCachedTextProcessor(processor, options)` - `normalizeTextInput(value)` @@ -140,6 +158,22 @@ code point ranges before masking. A scanner can be a function or an object with a `scan()` method. Scanners receive `TextScanInput`, which contains the normalized source text and its code point array. +`PreparedText` extends that input with reusable `TextHints`, including generic +length, ASCII, digit, whitespace, punctuation, and common delimiter facts. +These hints are computed once by `createPreparedText()` and reused across +registered scanners. They are intentionally generic; URL, email, phone, +profanity, spam, and future packages keep their own package-specific detection +logic. + +`AllocationAwareRangeScanner` separates a cheap pre-scan `check()` gate from +sink-based `scan()`. A true `check()` result means the scanner is eligible to +scan the prepared input; it is not itself proof that a range exists. +`scan()` streams `RangeMatch` values into a `RangeMatchSink`; returning `false` +from the sink requests early stop. Use `createTextRangePipeline().check()` or +`scanPreparedTextRanges()` when callers need to confirm an actual emitted +range. Legacy scanner functions and scanner objects remain supported and +continue to return range arrays or `TextRangeScanResult`. + `createTextRangePipeline()` collects ranges from registered scanners, merges overlaps in code point order, and masks once with `censorCodePointRanges()`. This keeps scanner packages independent while diff --git a/src/contracts.ts b/src/contracts.ts index 94573e5..ec4b622 100644 --- a/src/contracts.ts +++ b/src/contracts.ts @@ -61,13 +61,43 @@ export interface TextPipeline { export type TextRange = readonly [start: number, end: number]; export type TextCodePointRange = readonly [start: number, end: number]; +export interface TextHints { + readonly textLength: number; + readonly codePointLength: number; + readonly isEmpty: boolean; + readonly hasAsciiOnly: boolean; + readonly hasNonAscii: boolean; + readonly hasDigit: boolean; + readonly digitCount: number; + readonly hasAsciiLetter: boolean; + readonly hasWhitespace: boolean; + readonly hasPunctuation: boolean; + readonly punctuationCount: number; + readonly hasAtSign: boolean; + readonly hasDot: boolean; + readonly hasSlash: boolean; + readonly hasColon: boolean; + readonly hasPlus: boolean; +} + export interface TextScanInput { readonly text: string; readonly codePoints: readonly string[]; } +export interface PreparedText extends TextScanInput { + readonly hints: TextHints; +} + export type TextRangeScanMetadata = Readonly>; +export interface RangeMatch { + readonly range: TextCodePointRange; + readonly metadata?: TextRangeScanMetadata; +} + +export type RangeMatchSink = (match: RangeMatch) => boolean | void; + export interface TextRangeScanResult { readonly ranges: readonly TextCodePointRange[]; readonly metadata?: TextRangeScanMetadata; @@ -81,12 +111,22 @@ export type TextRangeScannerFunction = ( input: TextScanInput, ) => TextRangeScannerOutput; +export interface AllocationAwareRangeScanner { + readonly name?: string; + readonly allocationAware: true; + check(input: PreparedText): boolean; + scan(input: PreparedText, sink: RangeMatchSink): boolean | void; +} + +export interface LegacyTextRangeScanner { + readonly name?: string; + scan(input: TextScanInput): TextRangeScannerOutput; +} + export type TextRangeScanner = | TextRangeScannerFunction - | { - readonly name?: string; - scan(input: TextScanInput): TextRangeScannerOutput; - }; + | LegacyTextRangeScanner + | AllocationAwareRangeScanner; export interface TextRangePipelineScanResult { readonly text: string; @@ -103,6 +143,7 @@ export interface TextRangePipelineCensorResult { export interface TextRangePipeline { use(scanner: TextRangeScanner): TextRangePipeline; + check(value: unknown): boolean; scan(value: unknown): TextRangePipelineScanResult; censor(value: unknown, mask?: string): string; process(value: unknown, mask?: string): TextRangePipelineCensorResult; diff --git a/src/index.ts b/src/index.ts index 41d157f..ef1067e 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,6 +1,11 @@ export type { + AllocationAwareRangeScanner, CachedTextProcessor, CachedTextProcessorOptions, + LegacyTextRangeScanner, + PreparedText, + RangeMatch, + RangeMatchSink, TextCensor, TextCodePointRange, TextGuard, @@ -13,6 +18,7 @@ export type { TextPipelineProcessedResult, TextPipelineProcessResult, TextRange, + TextHints, TextRangePipeline, TextRangePipelineCensorResult, TextRangePipelineScanResult, @@ -43,9 +49,13 @@ export { export { createTextPipeline } from "./pipeline.js"; export { mergeCodePointRanges, mergeRanges } from "./ranges.js"; export { + checkTextRanges, + createPreparedText, + createTextHints, createTextRangePipeline, createTextRangeScanResult, createTextScanInput, runTextRangeScanner, + scanPreparedTextRanges, scanTextRanges, } from "./scanner.js"; diff --git a/src/scanner.ts b/src/scanner.ts index 9077471..9d574fb 100644 --- a/src/scanner.ts +++ b/src/scanner.ts @@ -1,5 +1,10 @@ import type { + AllocationAwareRangeScanner, + PreparedText, + RangeMatch, + RangeMatchSink, TextCodePointRange, + TextHints, TextRangePipeline, TextRangePipelineCensorResult, TextRangePipelineScanResult, @@ -13,6 +18,10 @@ import { normalizeTextInput } from "./input.js"; import { censorCodePointRanges } from "./masking.js"; import { mergeCodePointRanges } from "./ranges.js"; +const UNICODE_PUNCTUATION_RE = /\p{P}/u; +const UNICODE_DECIMAL_DIGIT_RE = /\p{Decimal_Number}/u; +const preparedTextCache = new WeakSet(); + export function createTextScanInput(value: unknown): TextScanInput { const text = normalizeTextInput(value); return { @@ -21,6 +30,97 @@ export function createTextScanInput(value: unknown): TextScanInput { }; } +export function createPreparedText(value: unknown): PreparedText { + const text = normalizeTextInput(value); + const codePoints = Array.from(text); + const prepared = { + text, + codePoints, + hints: createTextHints(text, codePoints), + }; + preparedTextCache.add(prepared); + return prepared; +} + +export function createTextHints( + text: string, + codePoints: readonly string[] = Array.from(text), +): TextHints { + let digitCount = 0; + let punctuationCount = 0; + let hasAsciiLetter = false; + let hasWhitespace = false; + let hasNonAscii = false; + let hasAtSign = false; + let hasDot = false; + let hasSlash = false; + let hasColon = false; + let hasPlus = false; + + for (const codePoint of codePoints) { + const code = codePoint.codePointAt(0) ?? 0; + + if (isWhitespaceCodePoint(codePoint)) { + hasWhitespace = true; + if (code > 0x7f) { + hasNonAscii = true; + } + continue; + } + + if (UNICODE_DECIMAL_DIGIT_RE.test(codePoint)) { + digitCount++; + if (code > 0x7f) { + hasNonAscii = true; + } + continue; + } + + const delimiterCode = hintDelimiterCodePoint(codePoint, code); + hasAtSign ||= delimiterCode === 0x40; + hasDot ||= delimiterCode === 0x2e; + hasSlash ||= delimiterCode === 0x2f; + hasColon ||= delimiterCode === 0x3a; + hasPlus ||= delimiterCode === 0x2b; + + if (code > 0x7f) { + hasNonAscii = true; + if (UNICODE_PUNCTUATION_RE.test(codePoint)) { + punctuationCount++; + } + continue; + } + + if ((code >= 0x41 && code <= 0x5a) || (code >= 0x61 && code <= 0x7a)) { + hasAsciiLetter = true; + continue; + } + + if (code >= 0x21 && code <= 0x7e) { + punctuationCount++; + } + } + + return { + textLength: text.length, + codePointLength: codePoints.length, + isEmpty: text.length === 0, + hasAsciiOnly: !hasNonAscii, + hasNonAscii, + hasDigit: digitCount > 0, + digitCount, + hasAsciiLetter, + hasWhitespace, + hasPunctuation: punctuationCount > 0, + punctuationCount, + hasAtSign, + hasDot, + hasSlash, + hasColon, + hasPlus, + }; +} + export function createTextRangeScanResult( ranges: readonly TextCodePointRange[], metadata?: TextRangeScanMetadata, @@ -34,6 +134,21 @@ export function runTextRangeScanner( scanner: TextRangeScanner, input: TextScanInput, ): TextRangeScanResult { + if (isAllocationAwareRangeScanner(scanner)) { + const prepared = ensurePreparedText(input); + const matches: RangeMatch[] = []; + scanPreparedTextRanges(scanner, prepared, (match) => { + const snapshot = snapshotRangeMatch(match); + if (snapshot !== undefined) { + matches.push(snapshot); + } + }); + return createTextRangeScanResult( + matches.map((match) => match.range), + scanMetadataFromMatches(matches), + ); + } + const output = typeof scanner === "function" ? scanner(input) : scanner.scan(input); return normalizeTextRangeScannerOutput(output); @@ -56,6 +171,10 @@ export function createTextRangePipeline(): TextRangePipeline { return scanTextRanges(value, scanners); }, + check(value) { + return checkTextRanges(value, scanners); + }, + censor(value, mask) { const result = pipeline.scan(value); return censorCodePointRanges(result.codePoints, result.ranges, mask); @@ -78,7 +197,9 @@ export function scanTextRanges( value: unknown, scanners: readonly TextRangeScanner[], ): TextRangePipelineScanResult { - const input = createTextScanInput(value); + const input = scanners.some(isAllocationAwareRangeScanner) + ? createPreparedText(value) + : createTextScanInput(value); const scanResults = scanners.map((scanner) => runTextRangeScanner(scanner, input), ); @@ -94,6 +215,55 @@ export function scanTextRanges( }; } +export function checkTextRanges( + value: unknown, + scanners: readonly TextRangeScanner[], +): boolean { + const input = createTextScanInput(value); + let prepared: PreparedText | undefined; + + for (const scanner of scanners) { + if (isAllocationAwareRangeScanner(scanner)) { + prepared ??= ensurePreparedText(input); + let found = false; + scanPreparedTextRanges(scanner, prepared, (match) => { + if (snapshotRangeMatch(match) === undefined) { + return true; + } + + found = true; + return false; + }); + if (found) return true; + continue; + } + + if (runTextRangeScanner(scanner, input).ranges.length > 0) return true; + } + + return false; +} + +export function scanPreparedTextRanges( + scanner: AllocationAwareRangeScanner, + input: PreparedText, + sink: RangeMatchSink, +): boolean { + if (!scanner.check(input)) return true; + + let shouldContinue = true; + const stoppingSink: RangeMatchSink = (match) => { + if (!shouldContinue) return false; + + const result = sink(match); + shouldContinue = result !== false; + return shouldContinue; + }; + + const result = scanner.scan(input, stoppingSink); + return result === false ? false : shouldContinue; +} + function normalizeTextRangeScannerOutput( output: TextRangeScannerOutput, ): TextRangeScanResult { @@ -114,3 +284,106 @@ function isTextRangeScanner(scanner: unknown): scanner is TextRangeScanner { typeof scanner.scan === "function") ); } + +function isAllocationAwareRangeScanner( + scanner: TextRangeScanner, +): scanner is AllocationAwareRangeScanner { + return ( + typeof scanner === "object" && + scanner !== null && + "allocationAware" in scanner && + scanner.allocationAware === true && + "check" in scanner && + typeof scanner.check === "function" && + "scan" in scanner && + typeof scanner.scan === "function" + ); +} + +function scanMetadataFromMatches( + matches: readonly RangeMatch[], +): TextRangeScanMetadata | undefined { + const metadata = matches.flatMap((match) => + match.metadata === undefined ? [] : [match.metadata], + ); + + return metadata.length === 0 ? undefined : { matches: metadata }; +} + +function snapshotRangeMatch(match: RangeMatch): RangeMatch | undefined { + const range = normalizeCodePointRange(match.range); + if (range === undefined) return undefined; + + return { + range, + ...(match.metadata === undefined + ? {} + : { metadata: { ...match.metadata } }), + }; +} + +function normalizeCodePointRange( + range: TextCodePointRange, +): TextCodePointRange | undefined { + const start = Math.trunc(Number(range[0])); + const end = Math.trunc(Number(range[1])); + if (!Number.isFinite(start) || !Number.isFinite(end)) return undefined; + if (end <= start) return undefined; + return [start, end]; +} + +function isWhitespaceCodePoint(codePoint: string): boolean { + return /\s/u.test(codePoint); +} + +function hintDelimiterCodePoint(codePoint: string, code: number): number { + if (code <= 0x7f) return code; + + const normalized = codePoint.normalize("NFKC"); + return normalized.length === 1 ? (normalized.codePointAt(0) ?? code) : code; +} + +function ensurePreparedText(input: TextScanInput): PreparedText { + if (preparedTextCache.has(input as PreparedText)) { + return input as PreparedText; + } + + const hints = (input as { readonly hints?: unknown }).hints; + const computedHints = createTextHints(input.text, input.codePoints); + + if (isSameTextHints(hints, computedHints)) { + return input as PreparedText; + } + + return { + ...input, + hints: computedHints, + }; +} + +function isSameTextHints( + actual: unknown, + expected: TextHints, +): actual is TextHints { + if (typeof actual !== "object" || actual === null) return false; + + const hints = actual as Partial; + return ( + hints.textLength === expected.textLength && + hints.codePointLength === expected.codePointLength && + hints.isEmpty === expected.isEmpty && + hints.hasAsciiOnly === expected.hasAsciiOnly && + hints.hasNonAscii === expected.hasNonAscii && + hints.hasDigit === expected.hasDigit && + hints.digitCount === expected.digitCount && + hints.hasAsciiLetter === expected.hasAsciiLetter && + hints.hasWhitespace === expected.hasWhitespace && + hints.hasPunctuation === expected.hasPunctuation && + hints.punctuationCount === expected.punctuationCount && + hints.hasAtSign === expected.hasAtSign && + hints.hasDot === expected.hasDot && + hints.hasSlash === expected.hasSlash && + hints.hasColon === expected.hasColon && + hints.hasPlus === expected.hasPlus + ); +} diff --git a/tests/public-entrypoint.spec.ts b/tests/public-entrypoint.spec.ts index 1ee881d..d62107a 100644 --- a/tests/public-entrypoint.spec.ts +++ b/tests/public-entrypoint.spec.ts @@ -5,7 +5,10 @@ describe("textfilters core public entrypoint", () => { it("keeps the runtime export surface stable", () => { expect(Object.keys(core).sort()).toEqual([ "censorCodePointRanges", + "checkTextRanges", "createCachedTextProcessor", + "createPreparedText", + "createTextHints", "createTextPipeline", "createTextRangePipeline", "createTextRangeScanResult", @@ -22,6 +25,7 @@ describe("textfilters core public entrypoint", () => { "normalizeTextInput", "normalizeVisibleMaskChar", "runTextRangeScanner", + "scanPreparedTextRanges", "scanTextRanges", "stripZeroWidth", "toCodePoints", diff --git a/tests/scanner.spec.ts b/tests/scanner.spec.ts index e94c05e..f2c9f42 100644 --- a/tests/scanner.spec.ts +++ b/tests/scanner.spec.ts @@ -1,10 +1,15 @@ import { describe, expect, it } from "vitest"; import { + checkTextRanges, + createPreparedText, + createTextHints, createTextRangePipeline, createTextRangeScanResult, createTextScanInput, runTextRangeScanner, + scanPreparedTextRanges, scanTextRanges, + type AllocationAwareRangeScanner, type TextRangeScanner, type TextScanInput, } from "../src/index.js"; @@ -19,6 +24,71 @@ describe("textfilters scanner contracts", () => { text: "", codePoints: [], }); + expect("hints" in createTextScanInput("plain")).toBe(false); + }); + + it("creates prepared text with reusable generic hints", () => { + expect(createPreparedText("A+9@example.com/path:tail.")).toEqual({ + text: "A+9@example.com/path:tail.", + codePoints: Array.from("A+9@example.com/path:tail."), + hints: { + textLength: 26, + codePointLength: 26, + isEmpty: false, + hasAsciiOnly: true, + hasNonAscii: false, + hasDigit: true, + digitCount: 1, + hasAsciiLetter: true, + hasWhitespace: false, + hasPunctuation: true, + punctuationCount: 6, + hasAtSign: true, + hasDot: true, + hasSlash: true, + hasColon: true, + hasPlus: true, + }, + }); + + expect(createTextHints("a😀")).toMatchObject({ + textLength: 3, + codePointLength: 2, + hasAsciiOnly: false, + hasNonAscii: true, + }); + + expect(createTextHints("a\u00a0b")).toMatchObject({ + hasAsciiOnly: false, + hasNonAscii: true, + hasWhitespace: true, + }); + + expect(createTextHints("a\u2014b")).toMatchObject({ + hasAsciiOnly: false, + hasNonAscii: true, + hasPunctuation: true, + punctuationCount: 1, + }); + + expect(createTextHints("a1٣")).toMatchObject({ + hasAsciiOnly: false, + hasNonAscii: true, + hasDigit: true, + digitCount: 2, + }); + + expect(createTextHints("@.:/+")).toMatchObject({ + hasAsciiOnly: false, + hasNonAscii: true, + hasPunctuation: true, + punctuationCount: 4, + hasAtSign: true, + hasDot: true, + hasColon: true, + hasSlash: true, + hasPlus: true, + }); }); it("normalizes scanner results and preserves metadata", () => { @@ -53,6 +123,180 @@ describe("textfilters scanner contracts", () => { metadata: { source: "object" }, }); }); + + it("does not compute hints for legacy scanners", () => { + const codePoints = new Proxy(["a", "b", "c"], { + get(target, property, receiver) { + if (property === Symbol.iterator) { + throw new Error("unexpected hint walk"); + } + + return Reflect.get(target, property, receiver); + }, + }); + const scanner: TextRangeScanner = { + scan: () => ({ ranges: [[0, 1]] }), + }; + + expect(runTextRangeScanner(scanner, { text: "abc", codePoints })).toEqual({ + ranges: [[0, 1]], + }); + }); + + it("runs allocation-aware scanner objects through a sink", () => { + const input = createPreparedText("abc hit"); + const scanner: AllocationAwareRangeScanner = { + allocationAware: true, + check: (prepared) => prepared.hints.hasWhitespace, + scan: (_prepared, sink) => { + sink({ range: [4, 7], metadata: { kind: "word" } }); + }, + }; + + expect(runTextRangeScanner(scanner, input)).toEqual({ + ranges: [[4, 7]], + metadata: { matches: [{ kind: "word" }] }, + }); + }); + + it("copies streamed ranges before storing allocation-aware matches", () => { + const input = createPreparedText("abcd"); + const range = [0, 1] as [number, number]; + const scanner: AllocationAwareRangeScanner = { + allocationAware: true, + check: () => true, + scan: (_prepared, sink) => { + sink({ range }); + range[0] = 2; + range[1] = 3; + sink({ range }); + }, + }; + + expect(runTextRangeScanner(scanner, input).ranges).toEqual([ + [0, 1], + [2, 3], + ]); + }); + + it("copies streamed match metadata before storing allocation-aware matches", () => { + const input = createPreparedText("abcd"); + const metadata = { token: "first" }; + const scanner: AllocationAwareRangeScanner = { + allocationAware: true, + check: () => true, + scan: (_prepared, sink) => { + sink({ range: [0, 1], metadata }); + metadata.token = "second"; + sink({ range: [2, 3], metadata }); + metadata.token = "third"; + }, + }; + + expect(runTextRangeScanner(scanner, input).metadata).toEqual({ + matches: [{ token: "first" }, { token: "second" }], + }); + }); + + it("drops streamed metadata when allocation-aware ranges are rejected", () => { + const input = createPreparedText("abc"); + const scanner: AllocationAwareRangeScanner = { + allocationAware: true, + check: () => true, + scan: (_prepared, sink) => { + sink({ range: [2, 2], metadata: { token: "invalid" } }); + sink({ range: [0, 1], metadata: { token: "valid" } }); + }, + }; + + expect(runTextRangeScanner(scanner, input)).toEqual({ + ranges: [[0, 1]], + metadata: { matches: [{ token: "valid" }] }, + }); + }); + + it("recomputes stale or missing hints on plain scan input", () => { + const scanner: AllocationAwareRangeScanner = { + allocationAware: true, + check: (prepared) => prepared.hints.hasAtSign, + scan: (_prepared, sink) => { + sink({ range: [0, 4] }); + }, + }; + const input = { + text: "user@example.com", + codePoints: Array.from("user@example.com"), + hints: createTextHints("plain"), + }; + const inputWithMissingHints = { + text: "user@example.com", + codePoints: Array.from("user@example.com"), + hints: undefined, + }; + + expect(runTextRangeScanner(scanner, input).ranges).toEqual([[0, 4]]); + expect(runTextRangeScanner(scanner, inputWithMissingHints).ranges).toEqual([ + [0, 4], + ]); + }); + + it("preserves legacy object scanners that also expose check helpers", () => { + const input = createPreparedText("abc"); + const scanner = { + check: () => true, + scan: (_input: TextScanInput, _legacyOptions?: unknown) => ({ + ranges: [[0, 3]] as const, + }), + }; + + expect(runTextRangeScanner(scanner, input)).toEqual({ + ranges: [[0, 3]], + }); + expect(checkTextRanges("abc", [scanner])).toBe(true); + }); + + it("stops allocation-aware scanning when the sink returns false", () => { + const input = createPreparedText("one two"); + const seen: string[] = []; + const scanner: AllocationAwareRangeScanner = { + allocationAware: true, + check: () => true, + scan: (_prepared, sink) => { + seen.push("first"); + if (sink({ range: [0, 3] }) === false) return false; + seen.push("second"); + sink({ range: [4, 7] }); + }, + }; + + const completed = scanPreparedTextRanges(scanner, input, () => false); + + expect(completed).toBe(false); + expect(seen).toEqual(["first"]); + }); + + it("latches allocation-aware sink cancellation after false", () => { + const input = createPreparedText("one two"); + const seen: string[] = []; + const scanner: AllocationAwareRangeScanner = { + allocationAware: true, + check: () => true, + scan: (_prepared, sink) => { + seen.push("first"); + sink({ range: [0, 3] }); + seen.push("second"); + sink({ range: [4, 7] }); + }, + }; + + const completed = scanPreparedTextRanges(scanner, input, (match) => { + seen.push(`${match.range[0]}:${match.range[1]}`); + return false; + }); + + expect(completed).toBe(false); + expect(seen).toEqual(["first", "0:3", "second"]); + }); }); describe("textfilters range scanner pipeline", () => { @@ -104,6 +348,7 @@ describe("textfilters range scanner pipeline", () => { it("keeps clean text unchanged when scanners return no ranges", () => { const pipeline = createTextRangePipeline().use(() => []); + expect(pipeline.check("clean")).toBe(false); expect(pipeline.censor("clean")).toBe("clean"); expect(pipeline.scan("clean")).toMatchObject({ text: "clean", @@ -113,6 +358,118 @@ describe("textfilters range scanner pipeline", () => { }); }); + it("checks allocation-aware scanners without collecting matches", () => { + const events: string[] = []; + const scanner: AllocationAwareRangeScanner = { + allocationAware: true, + check: (input) => { + events.push(`check:${input.hints.hasDot}`); + return input.hints.hasDot; + }, + scan: (_input, _sink) => { + events.push("scan"); + }, + }; + + const pipeline = createTextRangePipeline().use(scanner); + + expect(pipeline.check("plain")).toBe(false); + expect(pipeline.check("has.dot")).toBe(false); + expect(events).toEqual(["check:false", "check:true", "scan"]); + }); + + it("checks allocation-aware scanners by stopping after the first emitted range", () => { + const seen: string[] = []; + const scanner: AllocationAwareRangeScanner = { + allocationAware: true, + check: () => true, + scan: (_input, sink) => { + seen.push("first"); + if (sink({ range: [0, 3] }) === false) return false; + seen.push("second"); + sink({ range: [4, 7] }); + }, + }; + + const pipeline = createTextRangePipeline().use(scanner); + + expect(pipeline.check("has hit")).toBe(true); + expect(seen).toEqual(["first"]); + }); + + it("defers prepared check input until an allocation-aware scanner is reached", () => { + const events: string[] = []; + const legacy: TextRangeScanner = { + scan: (input) => { + events.push(`legacy:${"hints" in input}`); + return [[0, 1]]; + }, + }; + const allocationAware: AllocationAwareRangeScanner = { + allocationAware: true, + check: () => { + events.push("allocation-aware-check"); + return true; + }, + scan: (_input, sink) => { + events.push("allocation-aware-scan"); + sink({ range: [2, 3] }); + }, + }; + + const pipeline = createTextRangePipeline().use(legacy).use(allocationAware); + + expect(pipeline.check("a.b")).toBe(true); + expect(events).toEqual(["legacy:false"]); + }); + + it("ignores invalid allocation-aware ranges while checking", () => { + const invalidOnly = createTextRangePipeline().use({ + allocationAware: true, + check: () => true, + scan: (_input, sink) => { + sink({ range: [2, 2] }); + }, + }); + const invalidThenValid = createTextRangePipeline().use({ + allocationAware: true, + check: () => true, + scan: (_input, sink) => { + sink({ range: [2, 2] }); + sink({ range: [0, 1] }); + }, + }); + + expect(invalidOnly.check("abc")).toBe(false); + expect(invalidThenValid.check("abc")).toBe(true); + }); + + it("reuses prepared text hints across registered scanners", () => { + const seenHints: unknown[] = []; + const first: AllocationAwareRangeScanner = { + allocationAware: true, + check: (input) => input.hints.hasDot, + scan: (input, sink) => { + seenHints.push(input.hints); + sink({ range: [0, 1] }); + }, + }; + const second: AllocationAwareRangeScanner = { + allocationAware: true, + check: (input) => input.hints.hasDot, + scan: (input, sink) => { + seenHints.push(input.hints); + sink({ range: [2, 3] }); + }, + }; + + expect(scanTextRanges("a.b", [first, second]).ranges).toEqual([ + [0, 1], + [2, 3], + ]); + expect(seenHints[0]).toBe(seenHints[1]); + }); + it("collects ranges from scanner functions without constructing a pipeline", () => { const scanner = (input: TextScanInput) => input.text.includes("hit") ? [[0, 3] as const] : []; @@ -125,6 +482,20 @@ describe("textfilters range scanner pipeline", () => { }); }); + it("uses plain scan input for legacy-only pipeline scans", () => { + const seenHints: boolean[] = []; + const scanner: TextRangeScanner = { + scan: (input) => { + seenHints.push("hints" in input); + return [[0, 1]]; + }, + }; + + expect(scanTextRanges("abc", [scanner]).ranges).toEqual([[0, 1]]); + expect(checkTextRanges("abc", [scanner])).toBe(true); + expect(seenHints).toEqual([false, false]); + }); + it("rejects invalid scanner registrations", () => { expect(() => createTextRangePipeline().use({} as TextRangeScanner)).toThrow( "scanner must be a function or scanner object",