Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
131 changes: 131 additions & 0 deletions packages/trilium-core/src/migrations/migrations.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,137 @@ export function getMaxMigrationVersion() {

// Migrations should be kept in descending order, so the latest migration is first.
export const MIGRATIONS: (SqlMigration | JsMigration)[] = [
// Add FTS5 full-text index over note blob content so quick search doesn't have
// to scan every blob at query time.
//
// Tokenizer: **trigram**, which indexes every contiguous 3-character window of
// the content. This makes the index a strict *superset* of what the JS
// `findInText` substring/operator semantics ask for — every doc containing
// the search token as a literal substring shows up as a candidate, so
// `*=*`, `*=`, `=*`, and `=` can re-check candidates without false negatives.
// The earlier `unicode61` + prefix wildcards approach only matched word-start
// occurrences ("ello" missed "hello"), which broke the superset property.
//
// Trigram doesn't ship with built-in diacritic folding (the `unicode61`
// `remove_diacritics` option doesn't apply), so diacritic-insensitive content
// searches are out of scope for this migration. Title and attribute matches
// still go through `NoteFlatTextExp`, which normalizes diacritics in JS, so
// most user queries are unaffected; full diacritic-insensitive content search
// can be layered on top later by indexing a pre-normalized column.
//
// The index is also **scoped to blobs that are currently referenced by a
// non-deleted text-content note** — blobs that only exist because they back a
// historical revision or an attachment are skipped, since they aren't reachable
// from the search JOIN anyway and would only bloat the index. Triggers on
// `notes` keep the index in sync as notes are inserted, as their content
// (`blobId`)/type/isDeleted change, or as they're hard-deleted; an
// `AFTER DELETE ON blobs` trigger cleans up any FTS row left behind when the
// blob itself is garbage-collected.
{
    // Creates the `notes_fts` FTS5 (trigram) table, backfills it from blobs that
    // back non-deleted text-like notes, and installs triggers on `notes` and
    // `blobs` that keep it in sync.
    //
    // NOTE(review): `blobId` is declared UNINDEXED, so the `WHERE blobId = ...`
    // lookups against `notes_fts` in the triggers below are presumably linear
    // scans of the FTS content table — confirm this is acceptable on large
    // databases, or consider a side table keyed by blobId.
    version: 239,
    sql: /*sql*/`
        CREATE VIRTUAL TABLE IF NOT EXISTS notes_fts USING fts5(
            blobId UNINDEXED,
            content,
            tokenize = 'trigram'
        );

        -- Backfill: only blobs reachable from current non-deleted text notes.
        -- DISTINCT because content-addressed dedup means one blob can back many
        -- notes; we want one FTS row per indexed blob.
        INSERT INTO notes_fts (blobId, content)
        SELECT DISTINCT b.blobId, b.content
        FROM blobs b
        JOIN notes n ON n.blobId = b.blobId
        WHERE n.type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap', 'spreadsheet')
            AND n.isDeleted = 0
            AND b.content IS NOT NULL
            AND LENGTH(b.content) > 0
            AND LENGTH(b.content) < 2097152
            AND typeof(b.content) = 'text';

        -- When a new note row is inserted, index its blob if it qualifies.
        -- (Re-indexing on content/type/isDeleted changes is handled by the
        -- UPDATE trigger below; restoring a soft-deleted note runs an UPDATE
        -- on \`isDeleted\` and therefore fires there, not here.)
        CREATE TRIGGER IF NOT EXISTS notes_fts_after_note_insert
        AFTER INSERT ON notes
        WHEN new.blobId IS NOT NULL
            AND new.isDeleted = 0
            AND new.type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap', 'spreadsheet')
        BEGIN
            INSERT INTO notes_fts (blobId, content)
            SELECT b.blobId, b.content
            FROM blobs b
            WHERE b.blobId = new.blobId
                AND b.content IS NOT NULL
                AND LENGTH(b.content) > 0
                AND LENGTH(b.content) < 2097152
                AND typeof(b.content) = 'text'
                AND NOT EXISTS (SELECT 1 FROM notes_fts WHERE blobId = new.blobId);
        END;

        -- When a note's content (blobId), type, or isDeleted flag changes, the
        -- set of indexable blobs can shift in either direction. We first remove
        -- the old blob from FTS if nothing else still needs it indexed, then
        -- (re)insert the new blob if it now qualifies and isn't already there.
        CREATE TRIGGER IF NOT EXISTS notes_fts_after_note_update
        AFTER UPDATE OF blobId, type, isDeleted ON notes
        BEGIN
            -- IS NOT (rather than !=) is the NULL-safe comparison: if the note's
            -- blobId is cleared to NULL, != would evaluate to NULL and the old
            -- blob would never be dropped from the index.
            DELETE FROM notes_fts
            WHERE blobId = old.blobId
                AND old.blobId IS NOT NULL
                AND (old.blobId IS NOT new.blobId
                    OR new.isDeleted = 1
                    OR new.type NOT IN ('text', 'code', 'mermaid', 'canvas', 'mindMap', 'spreadsheet'))
                AND NOT EXISTS (
                    SELECT 1 FROM notes
                    WHERE blobId = old.blobId
                        AND noteId != old.noteId
                        AND isDeleted = 0
                        AND type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap', 'spreadsheet')
                );

            INSERT INTO notes_fts (blobId, content)
            SELECT b.blobId, b.content
            FROM blobs b
            WHERE b.blobId = new.blobId
                AND new.blobId IS NOT NULL
                AND new.isDeleted = 0
                AND new.type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap', 'spreadsheet')
                AND b.content IS NOT NULL
                AND LENGTH(b.content) > 0
                AND LENGTH(b.content) < 2097152
                AND typeof(b.content) = 'text'
                AND NOT EXISTS (SELECT 1 FROM notes_fts WHERE blobId = new.blobId);
        END;

        -- Hard-delete of a note row: drop its blob from FTS unless another note
        -- still keeps the same blob indexable.
        CREATE TRIGGER IF NOT EXISTS notes_fts_after_note_delete
        AFTER DELETE ON notes
        WHEN old.blobId IS NOT NULL
        BEGIN
            DELETE FROM notes_fts
            WHERE blobId = old.blobId
                AND NOT EXISTS (
                    SELECT 1 FROM notes
                    WHERE blobId = old.blobId
                        AND isDeleted = 0
                        AND type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap', 'spreadsheet')
                );
        END;

        -- Safety net: if the blob row itself is garbage-collected (no notes,
        -- revisions, or attachments reference it any more), make sure its FTS
        -- entry goes with it.
        CREATE TRIGGER IF NOT EXISTS notes_fts_after_blob_delete
        AFTER DELETE ON blobs
        BEGIN
            DELETE FROM notes_fts WHERE blobId = old.blobId;
        END;
    `
},
// Add description column to revisions table for manual revision comments
{
version: 238,
Expand Down
33 changes: 24 additions & 9 deletions packages/trilium-core/src/routes/api/search.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,14 @@ import { ValidationError } from "../../errors.js";
import becca_service from "../../becca/becca_service.js";
import { getHoistedNoteId } from "../../services/context.js";

// Maximum number of results the quick-search endpoint returns to the dropdown.
// Above this the user is better served by "Show in full search", which renders
// a paginated UI.
const QUICK_SEARCH_MAX_RESULTS = 50;

// Snippet extraction reads the blob for each note — capping it to the first
// batch the dropdown actually displays keeps the endpoint responsive. Results
// past this index are still returned, but with empty snippet strings.
const QUICK_SEARCH_SNIPPET_LIMIT = 15;

function searchFromNote(req: Request<{ noteId: string }>): SearchNoteResult {
const note = becca.getNoteOrThrow(req.params.noteId);

Expand Down Expand Up @@ -57,20 +65,27 @@ function quickSearch(req: Request<{ searchString: string }>) {
ancestorNoteId: hoistedNoteService.isHoistedInHiddenSubtree() ? "root" : hoistedNoteService.getHoistedNoteId()
});

// Execute search with our context
const allSearchResults = searchService.findResultsWithQuery(searchString, searchContext);
const trimmed = allSearchResults.slice(0, 200);

// Extract snippets using highlightedTokens from our context
for (const result of trimmed) {
result.contentSnippet = searchService.extractContentSnippet(result.noteId, searchContext.highlightedTokens);
result.attributeSnippet = searchService.extractAttributeSnippet(result.noteId, searchContext.highlightedTokens);
const trimmed = allSearchResults.slice(0, QUICK_SEARCH_MAX_RESULTS);

// Snippet extraction is the dominant per-result cost; only run it for the
// first batch the dropdown actually displays. Results beyond the limit still
// appear in the dropdown as plain links — explicitly assign empty snippets
// so downstream code (highlighter, API mapper) sees a consistent string shape
// rather than mixing strings with undefined.
for (let i = 0; i < trimmed.length; i++) {
const result = trimmed[i];
if (i < QUICK_SEARCH_SNIPPET_LIMIT) {
result.contentSnippet = searchService.extractContentSnippet(result.noteId, searchContext.highlightedTokens);
result.attributeSnippet = searchService.extractAttributeSnippet(result.noteId, searchContext.highlightedTokens);
} else {
result.contentSnippet = "";
result.attributeSnippet = "";
}
}

// Highlight the results
searchService.highlightSearchResults(trimmed, searchContext.highlightedTokens, searchContext.ignoreInternalAttributes);

// Map to API format
const searchResults = trimmed.map((result) => {
const { title, icon } = becca_service.getNoteTitleAndIcon(result.noteId);
return {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { describe, expect,it } from "vitest";

import NoteContentFulltextExp from "./note_content_fulltext.js";
import NoteContentFulltextExp, { buildFtsMatchQuery } from "./note_content_fulltext.js";

describe("Fuzzy Search Operators", () => {
it("~= operator works with typos", () => {
Expand All @@ -17,3 +17,52 @@ describe("Fuzzy Search Operators", () => {
expect(() => new NoteContentFulltextExp("~*", { tokens: ["wo"] })).toThrow(); // Too short
});
});

describe("buildFtsMatchQuery", () => {
it("translates substring / starts-with / ends-with / exact operators into a trigram phrase AND query", () => {
// All four operators are substring-superset, so trigram FTS can narrow
// candidates and let findInText enforce the precise boundary semantics.
expect(buildFtsMatchQuery("*=*", ["hello", "world"])).toBe(`"hello" "world"`);
expect(buildFtsMatchQuery("=", ["hello"])).toBe(`"hello"`);
expect(buildFtsMatchQuery("*=", ["hello"])).toBe(`"hello"`);
expect(buildFtsMatchQuery("=*", ["hello"])).toBe(`"hello"`);
});

it("returns null for operators FTS trigram can't safely narrow", () => {
// Fuzzy operators can match through typos that don't share trigrams with
// the target; FTS would silently drop those matches.
expect(buildFtsMatchQuery("~=", ["hello"])).toBeNull();
expect(buildFtsMatchQuery("~*", ["hello"])).toBeNull();
// Negation and regex need to see every row.
expect(buildFtsMatchQuery("!=", ["foo"])).toBeNull();
expect(buildFtsMatchQuery("%=", ["foo"])).toBeNull();
});

it("returns null when no usable tokens remain", () => {
expect(buildFtsMatchQuery("*=*", [])).toBeNull();
// Trigram cannot match phrases shorter than 3 codepoints.
expect(buildFtsMatchQuery("*=*", ["a"])).toBeNull();
expect(buildFtsMatchQuery("*=*", ["ab"])).toBeNull();
expect(buildFtsMatchQuery("*=*", ["", " "])).toBeNull();
// Punctuation-only tokens have no alphanumeric codepoint, so they'd
// tokenize to nothing in the trigram index and FTS5 would raise
// `fts5: syntax error` on the empty phrase.
expect(buildFtsMatchQuery("*=*", ["++", "=="])).toBeNull();
});

it("keeps tokens with mixed punctuation and alphanumeric content", () => {
// `v2.0` and similar still carry indexable trigrams (e.g. `v2.`, `2.0`),
// so they're kept rather than being treated as pure punctuation.
expect(buildFtsMatchQuery("*=*", ["v2.0"])).toBe(`"v2.0"`);
});
Comment on lines +41 to +57

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Add a test case to verify that punctuation-only tokens are correctly filtered out and do not cause FTS5 syntax errors.

Suggested change
it("returns null when no usable tokens remain", () => {
expect(buildFtsMatchQuery("*=*", [])).toBeNull();
expect(buildFtsMatchQuery("*=*", ["a"])).toBeNull(); // single char filtered out
expect(buildFtsMatchQuery("*=*", ["", " "])).toBeNull();
});
it("returns null when no usable tokens remain", () => {
expect(buildFtsMatchQuery("*=*", [])).toBeNull();
expect(buildFtsMatchQuery("*=*", ["a"])).toBeNull(); // single char filtered out
expect(buildFtsMatchQuery("*=*", ["", " "])).toBeNull();
expect(buildFtsMatchQuery("*=*", ["++", "=="])).toBeNull(); // punctuation-only filtered out
});

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added in 62084f3, plus an extra case asserting that mixed alphanumeric/punctuation tokens (v2.0) still flow through, so the new alphanumeric filter doesn't accidentally over-filter.


it("filters out tokens shorter than the trigram window but keeps the rest", () => {
expect(buildFtsMatchQuery("*=*", ["a", "hello"])).toBe(`"hello"`);
expect(buildFtsMatchQuery("*=*", ["ok", "hello"])).toBe(`"hello"`);
});

it("escapes embedded double-quotes by doubling", () => {
// FTS5 phrase syntax escapes `"` as `""` inside a quoted phrase.
expect(buildFtsMatchQuery("*=*", [`he"llo`])).toBe(`"he""llo"`);
});
});
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,52 @@ interface ConstructorOpts {

type SearchRow = Pick<NoteRow, "noteId" | "type" | "mime" | "content" | "isProtected">;

/**
 * Translate a Trilium search operator + token list into a notes_fts MATCH query, or
 * return null if the operator can't be safely narrowed by FTS (the caller then falls
 * back to a full blob scan).
 *
 * The FTS5 table uses the `trigram` tokenizer (see migration 239), so a phrase
 * query like `"hello"` matches any document containing the literal substring
 * "hello". That makes the candidate set a superset of what JS `findInText`
 * accepts for substring/start/end/exact operators (`*=*`, `=`, `*=`, `=*`) —
 * `findInText` then re-checks each candidate to enforce the precise operator
 * semantics. For operators where trigram is not a superset — fuzzy
 * (`~=`/`~*`, where typos change every trigram), negation (`!=`), and regex
 * (`%=`) — we return null and the caller scans every blob.
 *
 * Exported for unit testing; the class wraps it as a private method.
 *
 * @param operator - Trilium content-search operator (e.g. `*=*`, `=`, `~=`).
 * @param tokens - Raw search tokens; each is trimmed before use.
 * @returns Space-separated (implicitly ANDed) quoted FTS5 phrases, or null when
 *          the operator is unsupported or no token is usable.
 */
export function buildFtsMatchQuery(operator: string, tokens: string[]): string | null {
    // ~= / ~* tolerate typos that produce no overlapping trigrams with the target,
    // so FTS would silently drop valid matches. != and %= require seeing every row.
    if (operator === "~=" || operator === "~*" ||
        operator === "!=" || operator === "%=") {
        return null;
    }

    // The trigram tokenizer can only match phrases of at least 3 *codepoints* —
    // anything shorter matches no rows. `String.length` counts UTF-16 code units,
    // so a surrogate pair (emoji, CJK Extension B, ...) would be counted twice and
    // let a 2-codepoint token through; because phrases are ANDed, that one token
    // would then empty the whole candidate set. Count codepoints instead.
    // Punctuation-only strings tokenize to nothing and would cause
    // `fts5: syntax error`, so we also require at least one alphanumeric codepoint
    // (any Unicode letter or number, including CJK / Cyrillic). Tokens that fail
    // either check are dropped; if none remain we fall through to the legacy scan
    // via the null return below.
    const hasAlphanumeric = /[\p{L}\p{N}]/u;
    const usableTokens = tokens
        .map((t) => (t ?? "").trim())
        .filter((t) => Array.from(t).length >= 3 && hasAlphanumeric.test(t));
    if (usableTokens.length === 0) {
        return null;
    }

    // FTS5 trigram phrase syntax: each token becomes a quoted phrase (with inner
    // `"` doubled per FTS5's escape rule). Trigram does **not** accept the `*`
    // prefix wildcard. Multiple phrases joined by spaces are implicitly ANDed.
    return usableTokens
        .map((t) => `"${t.replace(/"/g, '""')}"`)
        .join(" ");
}
Comment thread
greptile-apps[bot] marked this conversation as resolved.

class NoteContentFulltextExp extends Expression {
private operator: string;
tokens: string[];
Expand Down Expand Up @@ -79,13 +125,34 @@ class NoteContentFulltextExp extends Expression {

const resultNoteSet = new NoteSet();

// Search through notes with content
for (const row of getSql().iterateRows<SearchRow>(`
SELECT noteId, type, mime, content, isProtected
FROM notes JOIN blobs USING (blobId)
WHERE type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap', 'spreadsheet')
AND isDeleted = 0
AND LENGTH(content) < ${MAX_SEARCH_CONTENT_SIZE}`)) {
// Narrow candidates through the notes_fts inverted index when the operator
// allows it. FTS5 (trigram tokenizer, see migration 239) returns the blobIds
// whose content contains every usable search token as a literal substring,
// turning what would be a full-blob scan into a small candidate set that
// findInText re-checks to enforce precise operator semantics. Operators the
// trigram index can't safely narrow (fuzzy, negation, regex) fall back to
// the legacy unfiltered scan.
//
// Protected notes store encrypted ciphertext in blobs, so FTS can't see
// their plaintext — they're always included as candidates and decrypted
// inside findInText. Their typically small count keeps the speedup intact.
const ftsQuery = this.buildFtsMatchQuery();

const baseSql = `
SELECT notes.noteId, notes.type, notes.mime, blobs.content, notes.isProtected
FROM notes JOIN blobs USING (blobId)
WHERE notes.type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap', 'spreadsheet')
AND notes.isDeleted = 0
AND LENGTH(blobs.content) < ${MAX_SEARCH_CONTENT_SIZE}`;

const sql = ftsQuery
? `${baseSql}
AND (notes.isProtected = 1
OR blobs.blobId IN (SELECT blobId FROM notes_fts WHERE notes_fts MATCH ?))`
: baseSql;

const params = ftsQuery ? [ftsQuery] : [];

for (const row of getSql().iterateRows<SearchRow>(sql, params)) {
this.findInText(row, inputNoteSet, resultNoteSet);
}
Comment thread
greptile-apps[bot] marked this conversation as resolved.

Expand Down Expand Up @@ -128,6 +195,10 @@ class NoteContentFulltextExp extends Expression {
return resultNoteSet;
}

/**
 * Instance-level convenience around the module-level `buildFtsMatchQuery`:
 * feeds it this expression's operator and tokens.
 *
 * @returns the FTS5 MATCH string, or null when the module-level helper
 *          returns null.
 */
private buildFtsMatchQuery(): string | null {
    const { operator, tokens } = this;
    return buildFtsMatchQuery(operator, tokens);
}

/**
* Helper method to check if a single word appears as an exact match in text
* @param wordToFind - The word to search for (should be normalized)
Expand Down