From 1fe49ed9c1f32e92f44d4009bc9c3f8099eac28e Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 5 May 2026 00:41:42 +0800 Subject: [PATCH 1/5] =?UTF-8?q?=E2=9C=A8=20feat(parser=5Frouting):=20add?= =?UTF-8?q?=20per-file=20process=20options=20and=20canonical=20basename=20?= =?UTF-8?q?dedup?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - introduce process_options string (i/t/e/!/F/R/S) for per-document multimodal, KG and chunking control - add filename hint support for [ENGINE-OPTIONS], [OPTIONS] and [ENGINE] forms - extend LIGHTRAG_PARSER rules with engine-options suffix for default processing options - add canonical_basename field for stable dedup and doc_id generation while preserving user-visible file_path with hints - deprecate addon_params["enable_multimodal_pipeline"] in favor of per-document process_options - update analyze_multimodal to gate VLM analysis by process_options and log opt-in/sidecar mismatches - skip entity/relation extraction when process_options "!" is set, keeping chunks for naive/mix retrieval - add chunking strategy selection (F/R/S) with fallback logging for unstructured legacy paths - persist process_options and canonical_basename through full_docs and doc_status metadata - update document routing, pipeline enqueue, storage backends and tests for new fields and dedup logic --- docs/FileProcessingConfiguration-zh.md | 88 +++++- lightrag/addon_params.py | 27 +- lightrag/api/routers/document_routes.py | 8 +- lightrag/base.py | 41 ++- lightrag/constants.py | 31 ++ lightrag/kg/json_doc_status_impl.py | 26 +- lightrag/parser_routing.py | 316 +++++++++++++++++--- lightrag/pipeline.py | 321 +++++++++++++++++---- lightrag/utils_pipeline.py | 41 ++- tests/test_document_routes_docx_archive.py | 60 ++++ tests/test_pipeline_release_closure.py | 245 +++++++++++++++- 11 files changed, 1056 insertions(+), 148 deletions(-) diff --git a/docs/FileProcessingConfiguration-zh.md b/docs/FileProcessingConfiguration-zh.md index b0b8d9f8dc..2a5f1c77b2 100644 --- a/docs/FileProcessingConfiguration-zh.md +++ b/docs/FileProcessingConfiguration-zh.md @@ -15,7 +15,7 @@ ## 修改默认内容抽取方式 -使用环境变量 `LIGHTRAG_PARSER`可以给不同的文件后最配资默认的文件内容提取方式: +使用环境变量 `LIGHTRAG_PARSER`可以给不同的文件后缀配置默认的文件内容提取方式以及默认的处理选项: ```bash LIGHTRAG_PARSER=pdf:mineru,docx:docling,pptx:docling,xlsx:docling,*:legacy @@ -28,6 +28,7 @@ DOCLING_ENDPOINT=http://localhost:8081/v1/convert/file/async ```text 后缀:引擎,后缀:引擎,*:legacy 后缀:引擎;后缀:引擎;*:legacy +后缀:引擎-选项 # 在引擎后追加默认处理选项(见下文“处理选项”一节) ``` 注意事项: @@ -35,9 +36,10 @@ DOCLING_ENDPOINT=http://localhost:8081/v1/convert/file/async - 左侧匹配的是文件后缀,不是完整文件名;应写 `pdf:mineru`,不要写 `*.pdf:mineru`。 - 规则可以使用英文逗号 `,` 或分号 `;` 分隔。 - 规则按从左到右的顺序检查;优先规则放在前面;通配符规则应放在最后。 -- 启动时会严格校验规则:未知内容提取引擎、错误后缀写法、显式使用不支持的后缀、外部引擎缺少 endpoint 都会导致启动失败。 +- 启动时会严格校验规则:未知内容提取引擎、错误后缀写法、显式使用不支持的后缀、外部引擎缺少 endpoint、处理选项中的非法字符都会导致启动失败。 - 通配符规则只会让引擎处理其能力表支持的后缀。例如 `*:mineru;html:docling` 中,MinerU 只接管 MinerU 支持的后缀,`html` 会继续匹配到后续 `docling` 规则。 - 如果所有规则都不可用,文件内容提取方式会回退到 `legacy`,如果legacy也不支持对应的文件后缀,会向系统加一个错误条目,上传文件保留在`INPUT`目录。 +- 引擎后缀 `-选项` 部分作为该规则匹配文件的默认 `process_options`,会被文件名 hint 中的 `[...]` 覆盖。例如 `LIGHTRAG_PARSER=docx:native-iet` 表示所有 `.docx` 默认采用 `native` 引擎并开启图、表、公式分析。 ## 对单文件指定内容抽取方式 @@ -50,7 +52,60 @@ memo.[native].docx report.[legacy].pdf ``` -文件名 hint 的优先级高于 `LIGHTRAG_PARSER`。如果指定的引擎不支持该后缀,系统会回退到默认规则继续选择可用引擎。如果所有规则都不可用,文件内容提取方式会回退到 `legacy`,如果legacy也不支持对应的文件后缀,会向系统加一个错误条目,上传文件保留在`INPUT`目录。 +中括号内的内容支持三种形式: + +```text +[ENGINE] # 仅指定引擎,处理选项使用默认或 LIGHTRAG_PARSER 提供的默认 +[ENGINE-OPTIONS] # 引擎 + 处理选项 
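+[native-iet]     # ENGINE-OPTIONS 形式示例：native 引擎并开启图/表/公式分析（与下文示例 my-proposal.[native-iet].docx 对应）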
+[OPTIONS] # 仅指定处理选项，引擎仍按 LIGHTRAG_PARSER / 默认规则解析
+```
+
+解析规则：中括号内含 `-` 时，`-` 之前的首段作为引擎候选，匹配支持的引擎名则其余部分作为选项串，否则整段视为选项串；不含 `-` 时，整段能整体匹配引擎名（`mineru` / `native` / `docling` / `legacy`）则视为只指定引擎，否则整段视为选项串。文件名 hint 的优先级高于 `LIGHTRAG_PARSER`。如果指定的引擎不支持该后缀，系统会回退到默认规则继续选择可用引擎；如果所有规则都不可用，文件内容提取方式会回退到 `legacy`；如果 legacy 也不支持对应的文件后缀，会向系统加一个错误条目，上传文件保留在 `INPUT` 目录。
+
+## 处理选项
+
+处理选项控制单个文件在多模态分析、知识图谱构建和文本分块上的行为。所有选项都是可选的；缺省值见下表。同一文件最多指定一种分块方式（`F` / `R` / `S`），其它选项可任意组合。
+
+| 选项 | 类型 | 默认 | 含义 |
+| --- | --- | --- | --- |
+| `i` | 多模态 | 关闭 | 启用图像分析（VLM） |
+| `t` | 多模态 | 关闭 | 启用表格分析（VLM） |
+| `e` | 多模态 | 关闭 | 启用公式分析（VLM） |
+| `!` | 流水线 | 关闭 | 禁止实体/关系抽取，不构建知识图谱（仅保留 chunks 向量索引，naive / mix 检索仍可用） |
+| `F` | 分块 | 默认 | 固定长度或按分隔符机械分割（按分隔符分割时块不重叠） |
+| `R` | 分块 | — | 递归语义分块（优先按段落、句子分割）；当前版本回退至 `F`，行为等同于固定分块 |
+| `S` | 分块 | — | 标题语义分块（优先按标题分割，标题块不重叠）；要求 `native` 抽取出的结构化输出，否则降级到 `F` |
+
+举例：
+
+```text
+my-proposal.[native-iet].docx # 使用 native 引擎，开启图、表、公式分析
+my-memo.[native-R!].md        # 使用 native 引擎，递归语义分块，禁止知识图谱构建，多模态默认关
+my-proposal.[!].docx          # 使用默认引擎（按 LIGHTRAG_PARSER 解析），仅禁止知识图谱构建
+my-proposal.[mineru].docx     # 使用 MinerU 引擎，多模态、分块、KG 全部默认（即多模态关、F 分块、构建 KG）
+```
+
+校验与解析规则：
+
+- `F`/`R`/`S` 至多出现一个；同一选项重复时只生效一次但不报错。
+- 大小写敏感：分块选项 `F`/`R`/`S` 必须大写，`i`/`t`/`e` 必须小写；`!` 无大小写之分。
+- 中括号内出现非法字符时，整个 hint 失效，引擎按默认规则解析、选项按 `LIGHTRAG_PARSER` 默认或全部默认；同时落日志 warning。
+- 如果文件名 hint 提供了非空选项串，则以 hint 为准；否则使用 `LIGHTRAG_PARSER` 规则中匹配项的默认选项；都没有则使用全部默认。
+- `S` 仅对 `native` 抽取出的结构化结果（interchange JSONL）有效；对 `legacy` 路径或非结构化输出会自动降级到 `F` 并记录 warning。
+
+> 多模态全局开关 `addon_params["enable_multimodal_pipeline"]` 已废弃，相关行为统一由文件级 `i` / `t` / `e` 选项控制。如启动配置仍包含该字段，会在日志输出 deprecation warning 并被忽略。
+
+### 选项作用阶段
+
+处理选项的不同字符在流水线的不同阶段生效，具体如下：
+
+| 选项 | 作用阶段 | 说明 |
+| --- | --- | --- |
+| `i` / `t` / `e` | ANALYZING（VLM 分析） | 决定是否对 sidecar 中的图像 / 表格 / 公式调用 VLM 做摘要分析。**抽取阶段不受影响**：内容提取引擎按文档实际内容输出 `drawings.json` / `tables.json` / `equations.json` sidecar 文件。这样后续仅修改 `i`/`t`/`e` 选项触发"再分析"即可补做 VLM，无须重新解析原始文件。 |
+| `!` | EXTRACTION（实体关系抽取） | 跳过实体/关系抽取与图谱写入；chunks 仍写入向量库以保留 naive / mix 检索能力。 |
+| `F` / `R` / `S` | CHUNKING（文本分块） | 决定使用哪种分块策略；对解析阶段输出无影响。 |
+
+> 模态可用性以"sidecar 文件是否存在"为唯一信号，内容提取引擎不需要在 meta 中声明能力。某文档若没有任何图像/表格/公式，对应 sidecar 不会写入；用户即使开启了 `i`/`t`/`e`，对应模态也只会被静默跳过，但 `analyze_multimodal` 会在该篇文档落一行 INFO 级日志（`[analyze_multimodal] process_options opted into i:drawings ... 
but the parser produced no such sidecar`),便于排查"VLM 为何没跑"。这种情况不会报错。 ## 推荐配置 @@ -70,6 +125,12 @@ report.[legacy].pdf LIGHTRAG_PARSER=docx:native ``` +为 docx 默认开启图、表、公式分析(处理选项默认): + +```bash +LIGHTRAG_PARSER=docx:native-iet +``` + ### 梦幻组合 * 使用Legacy处理md(写在最前面是为了避免用Docling处理md文件) @@ -90,25 +151,28 @@ DOCLING_ENDPOINT=http://localhost:8081/v1/convert/file/async | 字段 | 说明 | | --- | --- | -| `file_path` | 文件名 basename(不含目录)。如果文件名带有支持的处理指引 hint,例如 `abc.[native].docx`,会先规范化为 `abc.docx` 后保存。未提供有效来源时保存为 `unknown_source`;有效文件名的重复判定与内容溯源都基于该字段。 | -| `source_path` | 入队时提供的原始路径(仅当与规范化后的 `file_path` 不同才会写入),供 `native` / `mineru` / `docling` 解析器定位真实文件位置。 | +| `file_path` | 文件名 basename(不含目录),**保留用户提供的原始名(含中括号 hint)**,例如 `abc.[native-iet].docx` 原样写入。未提供有效来源时保存为 `unknown_source`。文件名 hint 不会被剥离,方便管理 UI 直接展示用户原本的命名意图。 | +| `canonical_basename` | 去掉处理提示 hint 后的规范化 basename(例如 `abc.docx`)。文件名查重以此字段为索引 key,保证 `abc.docx` 与 `abc.[native-iet].docx` 视为同一逻辑文档。 | +| `source_path` | 入队时提供的原始路径(仅当含目录分隔符或绝对路径时才写入),供 `native` / `mineru` / `docling` 解析器定位真实文件位置。 | | `format` | 内容格式:`pending_parse`, `raw`, `lightrag`。 | | `content` | `raw` 时保存抽取文本;`pending_parse` 时为空字符串;`lightrag` 时固定为以 `{{LRdoc}}`开头的一段内容摘要。 | | `content_hash` | 内容 MD5,用于跨文件名查重。`format=raw` 取 `sanitize_text_for_encoding` 后文本的 hash;`format=lightrag` 取 `*.blocks.jsonl` 文件 hash;`format=pending_parse` 不写入,待抽取完成后补上。 | -| `lightrag_document_path` | `format=lightrag` 时保存结构化 LightRAG Document 的路径;新记录优先保存为相对 `INPUT_DIR` 的路径,例如 `__parsed__/report.docx.parsed/report.blocks.jsonl`。 | +| `lightrag_document_path` | `format=lightrag` 时保存结构化 LightRAG Document 的路径;新记录优先保存为相对 `INPUT_DIR` 的路径,例如 `__parsed__/report.docx.parsed/report.blocks.jsonl`。注意路径中的子目录与 blocks 文件名都使用规范化 basename(不含 hint)。 | | `parsed_engine` | 实际完成抽取的引擎:`legacy`, `native`, `mineru`, `docling`。对于待抽取文件,也可暂存目标引擎。 | +| `process_options` | 入队时记录的原始处理选项串(不含引擎名和分隔 `-`),例如 `"iet"`、`"R!"`、`""`。下游各阶段以此字段为权威源,决定是否启用图像/表格/公式分析(`i`/`t`/`e`)、是否禁止知识图谱构建(`!`)以及分块方式(`F`/`R`/`S`)。空字符串等价于全部默认值。 | `pending_parse` 表示文件已经入队,但还没有完成抽取。抽取成功后会改写为 `raw` 或 `lightrag`,并补齐 `content_hash`。抽取失败时保留 `pending_parse` 和空 `content`,便于后续排查和重试。 -> `doc_status` 中也会同步保存规范化后的 `file_path`(basename)与 `content_hash`,作为 `get_doc_by_file_basename` / `get_doc_by_content_hash` 的查重索引来源。 +> `doc_status` 中也同步保存原始 `file_path`(含 hint)、`canonical_basename` 与 `content_hash`,作为 `get_doc_by_file_basename` / `get_doc_by_content_hash` 的查重索引来源。`get_doc_by_file_basename` 内部把传入参数先经 `canonicalize_parser_hinted_basename` 规范化后再与 `canonical_basename` 比对,因此 `abc.docx` 与 `abc.[native-iet].docx` 总是命中同一文档。 +> `process_options` 同时镜像写入 `doc_status.metadata["process_options"]`,便于管理 UI 直接展示当前文件的处理策略。 ## 内容提取结果目录结构 `__parsed__` 是输入目录旁的归档与分析结果目录。它同时保存已经处理过的原始文档,以及结构化解析产生的 LightRAG Document 文件和图片等资源。 -- 原始文件归档:`legacy` 本地抽取成功并入队后,原文件会移动到同级 `__parsed__` 目录;`native` / `mineru` / `docling` 会先保留原文件供 pipeline 解析,解析成功并写入 `full_docs` 后再移动到 `__parsed__`。 -- 分析结果目录:结构化解析结果会写入以规范化文件名加 `.parsed` 后缀命名的子目录,避免与归档原文件同名冲突。例如 `report.docx` 的分析结果目录为 `__parsed__/report.docx.parsed/`;`report.[native].docx` 也会写入 `__parsed__/report.docx.parsed/`。 -- 分析结果文件:LightRAG Document blocks 文件使用规范化文件名的主干命名,例如 `__parsed__/report.docx.parsed/report.blocks.jsonl`;同一目录下还可能包含 `report.tables.json`、`report.drawings.json`、`report.equations.json` 和 `report.blocks.assets/` 图片资源目录。 +- 原始文件归档:`legacy` 本地抽取成功并入队后,原文件会移动到同级 `__parsed__` 目录;`native` / `mineru` / `docling` 会先保留原文件供 pipeline 解析,解析成功并写入 `full_docs` 后再移动到 `__parsed__`。**归档时保留原始文件名(含 `[hint]`)**,例如 `report.[native-iet].docx` 归档为 
`__parsed__/report.[native-iet].docx`,便于追溯用户最初的命名与处理选项。 +- 分析结果目录:结构化解析结果会写入以**规范化文件名**(去掉 `[hint]`)加 `.parsed` 后缀命名的子目录,避免与归档原文件同名冲突,并保证当文件名 hint 或处理选项变化时同一逻辑文档继续指向同一目录。例如 `report.docx`、`report.[native].docx`、`report.[native-iet].docx` 的分析结果都写入 `__parsed__/report.docx.parsed/`。 +- 分析结果文件:LightRAG Document blocks 文件以及 sidecar 都使用规范化文件名的主干命名,例如 `__parsed__/report.docx.parsed/report.blocks.jsonl`;同一目录下还可能包含 `report.tables.json`、`report.drawings.json`、`report.equations.json` 和 `report.blocks.assets/` 图片资源目录。**sidecar 是否生成由文档内容决定**:解析器只在文档实际包含表格/图片/公式时写出对应文件。这是模态可用性的唯一信号 —— 引擎不需要在 meta 中声明能力。`i`/`t`/`e` 选项只决定下一阶段是否对已存在的 sidecar 调用 VLM 做摘要分析。 - 解析失败时,原文件不会移动,便于修复配置后重新处理。 - `/documents/scan` 扫描到同名且已 `PROCESSED` 的文件时,该输入文件会被视为已处理并移动到 `__parsed__`,不会作为新文档入队。 - `/documents/scan` 同一次扫描中发现多个规范化后同名的文件时,会优先保留带支持引擎 hint 的文件以尊重用户的引擎选择;如果没有任何变体带 hint,则按排序处理第一个文件。其余变体会输出 warning 并移动到 `__parsed__`,避免同批文件互相覆盖。例如 `abc.docx` 和 `abc.[native].docx` 同时存在时只会处理 `abc.[native].docx`。 @@ -122,7 +186,7 @@ DOCLING_ENDPOINT=http://localhost:8081/v1/convert/file/async ### 1) 文件名(basename)查重 - 判断粒度为 basename,不包含目录路径和 workspace 路径。例如 `/data/a.pdf`、`inputs/a.pdf` 和 `a.pdf` 都视为同一个文件名 `a.pdf`。 -- 文件名查重会去掉文件名末尾的支持引擎处理指引 hint,即认为 `abc.docx` 与 `abc.[native].docx` 是文件名重复;不支持的 hint 不会被剥离,例如 `abc.[draft].docx` 仍按原文件名处理。 +- 文件名查重以 `canonical_basename` 为索引:将文件名末尾的支持引擎处理提示 hint 剥离后再比对,因此 `abc.docx`、`abc.[native].docx`、`abc.[native-iet].docx` 之间互相视为同名;不支持的 hint 不会被剥离,例如 `abc.[draft].docx` 仍按原文件名处理。 - 对普通上传、文本接口和核心入队 API,只要 `doc_status` 中已经存在同名文件记录,无论该记录当前处于 `PENDING`、`PARSING`、`ANALYZING`、`PROCESSING`、`FAILED` 还是 `PROCESSED`,同名文件都会被视为重复。 - 对 `/documents/scan` 目录扫描: - 同一次扫描中如果有多个文件规范化后同名,优先处理带支持引擎 hint 的文件;若无任何 hint 变体,则处理排序后的第一个文件,其余文件会归档到 `__parsed__` 并跳过。 @@ -133,7 +197,7 @@ DOCLING_ENDPOINT=http://localhost:8081/v1/convert/file/async - 核心 API `insert` / `ainsert` / `apipeline_enqueue_documents` 仍兼容未传 `file_paths` 的调用;这类文档的 `file_path` 会保存为 `unknown_source`,不会参与文件名查重,文档 ID 继续按文本内容生成。 - 空字符串、`no-file-path` 和 `unknown_source` 都会被视为未知来源;它们不会阻止新的无来源文本入队,也不会作为同名文件互相去重。 -存储后端通过 `get_doc_by_file_basename` 提供 basename 直查能力。`JsonDocStatusStorage` 已经实现了内存级遍历;其它后端目前回落到默认实现(扫描全部状态后比对 basename),将在后续 PR 中补齐原生索引。 +存储后端通过 `get_doc_by_file_basename` 提供 basename 直查能力,内部按 `canonical_basename` 字段比对(传入参数会先经 `canonicalize_parser_hinted_basename` 规范化)。`JsonDocStatusStorage` 已经实现了内存级遍历;其它后端目前回落到默认实现(扫描全部状态后比对 `canonical_basename`),将在后续 PR 中补齐原生索引。 ### 2) 内容 hash 查重 diff --git a/lightrag/addon_params.py b/lightrag/addon_params.py index f5933d3203..b5d62aee95 100644 --- a/lightrag/addon_params.py +++ b/lightrag/addon_params.py @@ -17,7 +17,25 @@ from typing import Any, Callable, Mapping from lightrag.constants import DEFAULT_SUMMARY_LANGUAGE -from lightrag.utils import get_env_value +from lightrag.utils import get_env_value, logger + + +# Keys that used to live in addon_params but have been superseded by +# per-document ``process_options``. We log once when callers still pass them +# so existing configs surface their drift without breaking. +_DEPRECATED_ADDON_PARAM_KEYS: tuple[str, ...] = ("enable_multimodal_pipeline",) +_warned_deprecated_keys: set[str] = set() + + +def _emit_deprecated_addon_warnings(params: Mapping[str, Any]) -> None: + for key in _DEPRECATED_ADDON_PARAM_KEYS: + if key in params and key not in _warned_deprecated_keys: + logger.warning( + f"addon_params['{key}'] is deprecated and ignored; per-document " + f"behaviour is now controlled by filename-hint process_options " + f"(see docs/FileProcessingConfiguration-zh.md)." 
+ ) + _warned_deprecated_keys.add(key) def default_addon_params() -> dict[str, Any]: @@ -32,7 +50,12 @@ def normalize_addon_params(addon_params: Mapping[str, Any] | None) -> dict[str, if addon_params is None: normalized = default_addon_params() elif isinstance(addon_params, Mapping): - normalized = dict(addon_params) + _emit_deprecated_addon_warnings(addon_params) + normalized = { + k: v + for k, v in addon_params.items() + if k not in _DEPRECATED_ADDON_PARAM_KEYS + } else: raise TypeError( "addon_params must be a Mapping or None, got " diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index 5bafc53357..d1ba67b7b9 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -34,7 +34,7 @@ from lightrag.parser_routing import ( canonicalize_parser_hinted_basename, filename_parser_hint, - resolve_file_parser_engine, + resolve_file_parser_directives, ) from lightrag.utils import ( generate_track_id, @@ -1403,7 +1403,7 @@ async def pipeline_enqueue_file( except Exception: file_size = 0 - extraction_engine = resolve_file_parser_engine(file_path) + extraction_engine, process_options = resolve_file_parser_directives(file_path) if extraction_engine != PARSER_ENGINE_LEGACY: try: enqueue_kwargs = { @@ -1412,6 +1412,8 @@ async def pipeline_enqueue_file( "docs_format": FULL_DOCS_FORMAT_PENDING_PARSE, "parsed_engine": extraction_engine, } + if process_options: + enqueue_kwargs["process_options"] = process_options if reprocess_existing_non_processed: enqueue_kwargs["reprocess_existing_non_processed"] = True enqueue_result = await rag.apipeline_enqueue_documents( @@ -1725,6 +1727,8 @@ async def pipeline_enqueue_file( "track_id": track_id, "parsed_engine": PARSER_ENGINE_LEGACY, } + if process_options: + enqueue_kwargs["process_options"] = process_options if reprocess_existing_non_processed: enqueue_kwargs["reprocess_existing_non_processed"] = True enqueue_result = await rag.apipeline_enqueue_documents( diff --git a/lightrag/base.py b/lightrag/base.py index 66c55b78db..a3b9cd7558 100644 --- a/lightrag/base.py +++ b/lightrag/base.py @@ -806,6 +806,12 @@ class DocProcessingStatus: Used together with file_path basename for duplicate detection. Empty for pending_parse records whose content has not been extracted yet. """ + canonical_basename: str | None = None + """Canonical (parser-hint stripped) basename used as the dedup index key. + + Stored alongside the original ``file_path`` so backends can look records + up by ``abc.docx`` even when the user-visible name is ``abc.[native].docx``. + """ """Internal field: indicates if multimodal processing is complete. Not shown in repr() but accessible for debugging.""" def __post_init__(self): @@ -917,9 +923,12 @@ async def get_doc_by_file_basename( ) -> tuple[str, dict[str, Any]] | None: """Get document by file basename (filename without directory). - Used for filename-based deduplication. Backends that can index by - basename should override this for efficiency. The default implementation - scans all documents via get_docs_by_statuses(). + Used for filename-based deduplication. Comparison is performed + against the canonical (parser-hint stripped) basename, so callers may + pass either the original or the canonical form. Backends that can + index by ``canonical_basename`` should override this for efficiency. + The default implementation scans all documents via + :meth:`get_docs_by_statuses`. Args: basename: The filename basename to search for (e.g. "report.pdf"). 
@@ -929,6 +938,11 @@ async def get_doc_by_file_basename( """ if not basename: return None + # Imported lazily to avoid a hard dependency at module-load time and + # because parser_routing already depends on lightrag.constants. + from lightrag.parser_routing import canonicalize_parser_hinted_basename + + target = canonicalize_parser_hinted_basename(basename) try: docs = await self.get_docs_by_statuses(list(DocStatus)) except NotImplementedError: @@ -936,14 +950,23 @@ async def get_doc_by_file_basename( except Exception: return None for doc_id, doc in docs.items(): - existing_path = ( - doc.get("file_path") + stored_canonical = ( + doc.get("canonical_basename") if isinstance(doc, dict) - else getattr(doc, "file_path", None) + else getattr(doc, "canonical_basename", None) ) - if not existing_path: - continue - if Path(str(existing_path)).name == basename: + if not stored_canonical: + existing_path = ( + doc.get("file_path") + if isinstance(doc, dict) + else getattr(doc, "file_path", None) + ) + if not existing_path: + continue + stored_canonical = canonicalize_parser_hinted_basename( + Path(str(existing_path)).name + ) + if stored_canonical == target: return doc_id, (doc if isinstance(doc, dict) else asdict(doc)) return None diff --git a/lightrag/constants.py b/lightrag/constants.py index 4bece5f173..3759be48fc 100644 --- a/lightrag/constants.py +++ b/lightrag/constants.py @@ -168,6 +168,37 @@ } PARSED_DIR_NAME = "__parsed__" # Dir for parsed files (renamed from __enqueued__) +# Per-file processing options carried by filename hints / LIGHTRAG_PARSER rules. +# See docs/FileProcessingConfiguration-zh.md for the full specification. +PROCESS_OPTION_IMAGES = "i" # Enable VLM analysis for drawings/images +PROCESS_OPTION_TABLES = "t" # Enable VLM analysis for tables +PROCESS_OPTION_EQUATIONS = "e" # Enable VLM analysis for equations +PROCESS_OPTION_SKIP_KG = "!" # Skip entity/relation extraction (no KG build) +PROCESS_OPTION_CHUNK_FIXED = "F" # Fixed-length / separator chunking (default) +PROCESS_OPTION_CHUNK_RECURSIVE = ( + "R" # Recursive semantic chunking (currently aliased to F) +) +PROCESS_OPTION_CHUNK_HEADING = "S" # Heading-driven semantic chunking + +PROCESS_OPTION_CHUNK_CHARS = frozenset( + { + PROCESS_OPTION_CHUNK_FIXED, + PROCESS_OPTION_CHUNK_RECURSIVE, + PROCESS_OPTION_CHUNK_HEADING, + } +) +SUPPORTED_PROCESS_OPTIONS = frozenset( + { + PROCESS_OPTION_IMAGES, + PROCESS_OPTION_TABLES, + PROCESS_OPTION_EQUATIONS, + PROCESS_OPTION_SKIP_KG, + PROCESS_OPTION_CHUNK_FIXED, + PROCESS_OPTION_CHUNK_RECURSIVE, + PROCESS_OPTION_CHUNK_HEADING, + } +) + DEFAULT_MAX_PARALLEL_ANALYZE = 2 # Multimodal analysis (VLM) concurrency # Embedding configuration defaults diff --git a/lightrag/kg/json_doc_status_impl.py b/lightrag/kg/json_doc_status_impl.py index ac47e013df..8fc646165f 100644 --- a/lightrag/kg/json_doc_status_impl.py +++ b/lightrag/kg/json_doc_status_impl.py @@ -404,23 +404,33 @@ async def get_doc_by_file_path(self, file_path: str) -> Union[dict[str, Any], No async def get_doc_by_file_basename( self, basename: str ) -> Union[tuple[str, dict[str, Any]], None]: - """Find an existing record whose file_path basename matches. + """Find an existing record whose canonical basename matches. - Compares against the stored file_path's basename so legacy records - that still hold a full path are matched the same way as new records - whose file_path is already a basename. 
+ Compares against the stored ``canonical_basename`` field, falling + back to ``canonicalize_parser_hinted_basename(file_path)`` for + legacy records that pre-date the field. Inputs are likewise + canonicalized so callers can pass either ``abc.docx`` or + ``abc.[native].docx``. """ if not basename: return None if self._storage_lock is None: raise StorageNotInitializedError("JsonDocStatusStorage") + from lightrag.parser_routing import canonicalize_parser_hinted_basename + + target = canonicalize_parser_hinted_basename(basename) async with self._storage_lock: for doc_id, doc_data in self._data.items(): - stored_path = doc_data.get("file_path") - if not stored_path: - continue - if Path(str(stored_path)).name == basename: + stored_canonical = doc_data.get("canonical_basename") + if not stored_canonical: + stored_path = doc_data.get("file_path") + if not stored_path: + continue + stored_canonical = canonicalize_parser_hinted_basename( + Path(str(stored_path)).name + ) + if stored_canonical == target: return doc_id, doc_data return None diff --git a/lightrag/parser_routing.py b/lightrag/parser_routing.py index caf1266f3f..6db10fe9ce 100644 --- a/lightrag/parser_routing.py +++ b/lightrag/parser_routing.py @@ -3,8 +3,9 @@ import fnmatch import os import re +from dataclasses import dataclass from pathlib import Path -from typing import Any +from typing import Any, Literal from lightrag.constants import ( FULL_DOCS_FORMAT_LIGHTRAG, @@ -15,8 +16,18 @@ PARSER_ENGINE_MINERU, PARSER_ENGINE_NATIVE, PARSER_ENGINE_SUFFIX_CAPABILITIES, + PROCESS_OPTION_CHUNK_CHARS, + PROCESS_OPTION_CHUNK_FIXED, + PROCESS_OPTION_CHUNK_HEADING, + PROCESS_OPTION_CHUNK_RECURSIVE, + PROCESS_OPTION_EQUATIONS, + PROCESS_OPTION_IMAGES, + PROCESS_OPTION_SKIP_KG, + PROCESS_OPTION_TABLES, SUPPORTED_PARSER_ENGINES, + SUPPORTED_PROCESS_OPTIONS, ) +from lightrag.utils import logger _PARSER_RULE_SPLIT_RE = re.compile(r"[;,]") _PARSER_ENGINE_ENDPOINT_ENV = { @@ -40,6 +51,119 @@ def normalize_parser_engine(engine: Any) -> str: return str(engine or "").strip().split("-", 1)[0].lower() +# --------------------------------------------------------------------------- +# Per-file processing options (i/t/e/!/F/R/S) +# --------------------------------------------------------------------------- + + +@dataclass(frozen=True) +class ProcessOptions: + """Decoded view of a ``process_options`` string. + + The ``raw`` string is preserved verbatim (with duplicates and ordering) + for storage / audit purposes; boolean flags reflect the deduped logical + state used by the pipeline. + """ + + raw: str = "" + images: bool = False + tables: bool = False + equations: bool = False + skip_kg: bool = False + chunking: Literal["F", "R", "S"] = PROCESS_OPTION_CHUNK_FIXED + + +_PROCESS_OPTION_DEFAULT = ProcessOptions() + + +def sanitize_process_options(options: Any) -> str: + """Strip non-supported characters / hyphen / whitespace from an options string. + + Returns the raw token sequence as-is (no dedup, no reorder) so the + canonical user intent is preserved on disk. Invalid characters are + silently dropped — the caller is expected to have already validated. 
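+
+    Doctest-style illustration of the filtering below:
+
+        >>> sanitize_process_options("i-e t")
+        'iet'
+        >>> sanitize_process_options("R!x")
+        'R!'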
+ """ + if not options: + return "" + return "".join(ch for ch in str(options) if ch in SUPPORTED_PROCESS_OPTIONS) + + +def validate_process_options( + options: str, *, label: str = "process options" +) -> list[str]: + """Return a list of error messages for an options string; empty if valid.""" + errors: list[str] = [] + if not options: + return errors + seen_chunkers: list[str] = [] + for ch in options: + if ch in (" ", "-"): + continue + if ch not in SUPPORTED_PROCESS_OPTIONS: + errors.append(f"{label} contains unsupported character {ch!r}") + continue + if ch in PROCESS_OPTION_CHUNK_CHARS and ch not in seen_chunkers: + seen_chunkers.append(ch) + if len(seen_chunkers) > 1: + errors.append( + f"{label} specifies multiple chunking modes " + f"({'/'.join(seen_chunkers)}); pick one of " + f"{PROCESS_OPTION_CHUNK_FIXED}/{PROCESS_OPTION_CHUNK_RECURSIVE}/{PROCESS_OPTION_CHUNK_HEADING}" + ) + return errors + + +def parse_process_options(options: Any) -> ProcessOptions: + """Decode a process-options string into a :class:`ProcessOptions` view.""" + raw = sanitize_process_options(options) + if not raw: + return _PROCESS_OPTION_DEFAULT + chars = set(raw) + chunking: Literal["F", "R", "S"] = PROCESS_OPTION_CHUNK_FIXED + # Pick the first chunking selector encountered; validate_process_options + # already filters duplicates upstream. + for ch in raw: + if ch in PROCESS_OPTION_CHUNK_CHARS: + chunking = ch # type: ignore[assignment] + break + return ProcessOptions( + raw=raw, + images=PROCESS_OPTION_IMAGES in chars, + tables=PROCESS_OPTION_TABLES in chars, + equations=PROCESS_OPTION_EQUATIONS in chars, + skip_kg=PROCESS_OPTION_SKIP_KG in chars, + chunking=chunking, + ) + + +def split_engine_and_options(bracket_inner: str) -> tuple[str | None, str]: + """Decompose a bracket-hint inner string into ``(engine, options)``. + + Format rules (see docs/FileProcessingConfiguration-zh.md): + - ``ENGINE-OPTIONS``: first ``-``-separated segment is the engine + candidate; the remainder is the options string. + - ``ENGINE``: matches a supported engine name as a whole. + - ``OPTIONS``: anything else is treated as options-only. + """ + inner = (bracket_inner or "").strip() + if not inner: + return None, "" + + if "-" in inner: + head, _, tail = inner.partition("-") + engine_candidate = normalize_parser_engine(head) + if engine_candidate in SUPPORTED_PARSER_ENGINES: + return engine_candidate, tail.strip() + # Unknown engine before "-"; treat the whole thing as opaque options + # (likely invalid — caller will validate downstream). + return None, inner + + engine_candidate = normalize_parser_engine(inner) + if engine_candidate in SUPPORTED_PARSER_ENGINES: + return engine_candidate, "" + return None, inner + + def parser_suffix(file_path: str | Path) -> str: return Path(file_path).suffix.lower().lstrip(".") @@ -72,29 +196,79 @@ def _engine_is_usable( return True -def filename_parser_hint(file_path: str | Path) -> str | None: - m = _PARSER_HINT_RE.search(Path(file_path).name) +def _filename_hint_match( + file_path: str | Path, +) -> tuple[re.Match[str], str, str] | None: + """Locate a supported ``[hint]`` segment in a basename. + + Returns ``(match, engine_or_empty, options)`` when the bracket inner is a + recognised hint per the spec; otherwise ``None``. Both branches require + the options portion to pass :func:`validate_process_options` — + engine-qualified hints with bad option chars (e.g. 
``[native-FR]``, + ``[native-Q]``) fail the same way options-only hints do, so the + documented "invalid characters → whole hint fails → defaults apply" + contract holds for every hint shape. + """ + basename = Path(file_path).name + m = _PARSER_HINT_RE.search(basename) if not m: return None - engine = normalize_parser_engine(m.group(1)) - return engine if engine in SUPPORTED_PARSER_ENGINES else None + engine, options = split_engine_and_options(m.group(1)) + if options: + option_errors = validate_process_options(options) + if option_errors: + logger.warning( + f"[parser_routing] ignoring filename hint {m.group(0)!r} in " + f"{basename!r}: {'; '.join(option_errors)}" + ) + return None + if engine in SUPPORTED_PARSER_ENGINES: + return m, engine, options + if engine is None and options: + return m, "", options + return None + + +def filename_parser_hint(file_path: str | Path) -> str | None: + """Return the engine inferred from a filename hint, or ``None``.""" + found = _filename_hint_match(file_path) + if not found: + return None + _, engine, _ = found + return engine or None + + +def filename_process_options(file_path: str | Path) -> str: + """Return the raw process-options string from a filename hint.""" + found = _filename_hint_match(file_path) + if not found: + return "" + return found[2] + + +def filename_parser_directives(file_path: str | Path) -> tuple[str | None, str]: + """Return ``(engine, options)`` decoded from a filename hint.""" + found = _filename_hint_match(file_path) + if not found: + return None, "" + _, engine, options = found + return (engine or None), options def canonicalize_parser_hinted_basename(file_path: str | Path) -> str: """Return basename with a supported parser hint removed. - Only the final ``.[engine].ext`` segment is stripped, exactly once, and - only when the bracketed value normalizes to a supported parser engine. - Nested hints such as ``name.[native].[mineru].pdf`` therefore become - ``name.[native].pdf`` — additional outer hints are not unwrapped. + Only the final ``.[engine].ext`` (or ``.[engine-options].ext`` / + ``.[options].ext``) segment is stripped, exactly once, and only when the + bracket content is a recognised hint. Nested hints such as + ``name.[native].[mineru].pdf`` therefore become ``name.[native].pdf`` — + additional outer hints are not unwrapped. """ basename = Path(file_path).name - m = _PARSER_HINT_RE.search(basename) - if not m: - return basename - engine = normalize_parser_engine(m.group(1)) - if engine not in SUPPORTED_PARSER_ENGINES: + found = _filename_hint_match(file_path) + if not found: return basename + m, _, _ = found return f"{basename[: m.start()]}{m.group(2)}" @@ -115,6 +289,17 @@ def _rule_pattern_matches_engine_capability(pattern: str, engine: str) -> bool: return any(fnmatch.fnmatch(suffix, pattern) for suffix in supported_suffixes) +def _rule_engine_and_options(engine_hint: str) -> tuple[str, str]: + """Split a ``LIGHTRAG_PARSER`` rule's RHS (``engine[-options]``). + + Returns ``(normalized_engine, options_str)``. Unlike the filename hint + splitter this always treats the first ``-`` as the engine/options + boundary, since ``LIGHTRAG_PARSER`` rules cannot be options-only. 
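+
+    Example: ``"native-iet"`` -> ``("native", "iet")``; a bare ``"legacy"``
+    -> ``("legacy", "")``.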
+ """ + head, _, tail = engine_hint.partition("-") + return normalize_parser_engine(head), tail.strip() + + def validate_parser_routing_config(parser_rules: str | None = None) -> None: """Validate LIGHTRAG_PARSER syntax and required external parser endpoints.""" rules = parser_rules_from_env() if parser_rules is None else parser_rules.strip() @@ -131,7 +316,7 @@ def validate_parser_routing_config(parser_rules: str | None = None) -> None: pattern, engine_hint = item.split(":", 1) pattern = pattern.strip().lower() engine_hint = engine_hint.strip() - engine = normalize_parser_engine(engine_hint) + engine, options_str = _rule_engine_and_options(engine_hint) if not pattern: errors.append(f"{label} has an empty suffix pattern") @@ -162,6 +347,13 @@ def validate_parser_routing_config(parser_rules: str | None = None) -> None: endpoint_env = _PARSER_ENGINE_ENDPOINT_ENV.get(engine) if endpoint_env and not parser_engine_endpoint_configured(engine): errors.append(f"{label} requires {endpoint_env} to be configured") + if options_str: + errors.extend( + f"{label}: {msg}" + for msg in validate_process_options( + options_str, label="process options" + ) + ) if errors: raise ParserRoutingConfigError( @@ -169,6 +361,39 @@ def validate_parser_routing_config(parser_rules: str | None = None) -> None: ) +def _matching_rule_directives( + file_path: str | Path, + *, + parser_rules: str | None, + require_external_endpoint: bool, +) -> tuple[str | None, str]: + """Find the first matching ``LIGHTRAG_PARSER`` rule for ``file_path``. + + Returns ``(engine, options_str)`` where ``engine`` is ``None`` when no + usable rule is found. ``options_str`` is empty when a rule matched but + has no ``-options`` suffix. + """ + suffix = parser_suffix(file_path) + rules = parser_rules_from_env() if parser_rules is None else parser_rules.strip() + if not rules: + return None, "" + for _, item in _iter_parser_rule_items(rules): + if ":" not in item: + continue + pattern, engine_hint = item.split(":", 1) + pattern = pattern.strip().lower() + engine, options_str = _rule_engine_and_options(engine_hint.strip()) + if not fnmatch.fnmatch(suffix, pattern): + continue + if _engine_is_usable( + engine, + suffix, + require_external_endpoint=require_external_endpoint, + ): + return engine, options_str + return None, "" + + def resolve_file_parser_engine( file_path: str | Path, *, @@ -176,32 +401,49 @@ def resolve_file_parser_engine( require_external_endpoint: bool = True, ) -> str: """Resolve the extraction engine for a source file before content extraction.""" + engine, _ = resolve_file_parser_directives( + file_path, + parser_rules=parser_rules, + require_external_endpoint=require_external_endpoint, + ) + return engine + + +def resolve_file_parser_directives( + file_path: str | Path, + *, + parser_rules: str | None = None, + require_external_endpoint: bool = True, +) -> tuple[str, str]: + """Resolve ``(engine, process_options)`` for a source file before extraction. + + Resolution order (mirrors :func:`resolve_file_parser_engine`): + 1. Filename ``[hint]`` — engine and / or options take precedence. + 2. ``LIGHTRAG_PARSER`` rules — first matching rule provides defaults + for whichever of engine / options the filename hint did not + specify. + 3. Default engine ``legacy`` with empty options. 
+ """ suffix = parser_suffix(file_path) - hint = filename_parser_hint(file_path) - if hint and _engine_is_usable( - hint, suffix, require_external_endpoint=require_external_endpoint + hinted_engine, hinted_options = filename_parser_directives(file_path) + if hinted_engine and not _engine_is_usable( + hinted_engine, suffix, require_external_endpoint=require_external_endpoint ): - return hint + # Hinted engine cannot handle this file (e.g. wrong suffix or missing + # endpoint); fall back to rule-based resolution but keep the hinted + # options if any. + hinted_engine = None + + rule_engine, rule_options = _matching_rule_directives( + file_path, + parser_rules=parser_rules, + require_external_endpoint=require_external_endpoint, + ) - rules = parser_rules_from_env() if parser_rules is None else parser_rules.strip() - if rules: - for _, item in _iter_parser_rule_items(rules): - if ":" not in item: - continue - pattern, engine_hint = item.split(":", 1) - pattern = pattern.strip().lower() - engine = normalize_parser_engine(engine_hint) - if not fnmatch.fnmatch(suffix, pattern): - continue - if _engine_is_usable( - engine, - suffix, - require_external_endpoint=require_external_endpoint, - ): - return engine - - return PARSER_ENGINE_LEGACY + engine = hinted_engine or rule_engine or PARSER_ENGINE_LEGACY + options_str = hinted_options or rule_options + return engine, sanitize_process_options(options_str) def resolve_stored_document_parser_engine( diff --git a/lightrag/pipeline.py b/lightrag/pipeline.py index 98025bd5a5..ec28526ff9 100644 --- a/lightrag/pipeline.py +++ b/lightrag/pipeline.py @@ -66,6 +66,7 @@ compute_text_content_hash, doc_status_field, doc_status_value, + document_canonical_key, document_source_key, get_by_path, get_duplicate_doc_by_content_hash, @@ -101,6 +102,7 @@ async def apipeline_enqueue_documents( docs_format: str = FULL_DOCS_FORMAT_RAW, lightrag_document_paths: str | list[str] | None = None, parsed_engine: str | list[str] | None = None, + process_options: str | list[str] | None = None, reprocess_existing_non_processed: bool = False, ) -> str: """ @@ -119,6 +121,10 @@ async def apipeline_enqueue_documents( docs_format: "raw" (default) or "lightrag"; when "lightrag" content may be empty and content-dedup is skipped lightrag_document_paths: paths to LightRAG Document (e.g. .blocks.jsonl dir or base path), when docs_format is lightrag parsed_engine: file extraction engine already used or target engine for pending_parse + process_options: per-document processing options string (i/t/e/!/F/R/S); + accepted as a single string broadcast to every input or as a list + aligned with ``input``. Stored verbatim on ``full_docs`` and + mirrored to ``doc_status.metadata['process_options']``. reprocess_existing_non_processed: allow scan retries to overwrite existing same-name records that have not reached PROCESSED. 
@@ -140,6 +146,8 @@ async def apipeline_enqueue_documents( ) if isinstance(parsed_engine, str): parsed_engine = [parsed_engine] * len(input) + if isinstance(process_options, str): + process_options = [process_options] * len(input) # If file_paths is provided, ensure it matches the number of documents if file_paths is not None: @@ -166,6 +174,10 @@ async def apipeline_enqueue_documents( raise ValueError( "Number of parsed engines must match the number of documents" ) + if process_options is not None and len(process_options) != len(input): + raise ValueError( + "Number of process options must match the number of documents" + ) def _parsed_engine_at(index: int) -> str | None: if parsed_engine is None: @@ -173,6 +185,13 @@ def _parsed_engine_at(index: int) -> str | None: engine = str(parsed_engine[index] or "").strip().lower() return engine or None + def _process_options_at(index: int) -> str: + if process_options is None: + return "" + from lightrag.parser_routing import sanitize_process_options + + return sanitize_process_options(process_options[index]) + # 1. Validate ids and build contents (when lightrag: no content dedup, content may be empty) if ids is not None: if len(ids) != len(input): @@ -180,7 +199,15 @@ def _parsed_engine_at(index: int) -> str | None: if len(ids) != len(set(ids)): raise ValueError("IDs must be unique") + # Two basenames per file: + # - source_keys: user-visible name preserved verbatim, written as + # full_docs.file_path / doc_status.file_path so the UI can render + # the user's original ``[hint]`` choice. + # - canonical_keys: parser-hint stripped basename used for filename + # dedup and as the seed for deterministic doc_ids; written as + # full_docs.canonical_basename / doc_status.canonical_basename. source_keys = [document_source_key(path) for path in file_paths] + canonical_keys = [document_canonical_key(path) for path in file_paths] contents: dict[str, dict[str, Any]] = {} source_to_doc_id: dict[str, str] = {} content_hash_to_doc_id: dict[str, str] = {} @@ -194,6 +221,7 @@ def _add_content( lightrag_document_path: str | None = None, ) -> None: source_key = source_keys[index] + canonical_key = canonical_keys[index] source_path = file_paths[index] # Compute content hash: skip for pending_parse (content extracted later). 
@@ -205,25 +233,25 @@ def _add_content( resolve_lightrag_document_path(lightrag_document_path) ) - known_source = has_known_document_source(source_key) + known_source = has_known_document_source(canonical_key) if ids is not None: doc_id = ids[index] elif known_source: - doc_id = compute_mdhash_id(source_key, prefix="doc-") + doc_id = compute_mdhash_id(canonical_key, prefix="doc-") elif doc_format == FULL_DOCS_FORMAT_RAW: doc_id = compute_mdhash_id(content or "", prefix="doc-") elif content_hash: doc_id = compute_mdhash_id(content_hash, prefix="doc-") else: doc_id = compute_mdhash_id( - f"{source_key}-{track_id}-{index}", prefix="doc-" + f"{canonical_key}-{track_id}-{index}", prefix="doc-" ) - if known_source and source_key in source_to_doc_id: + if known_source and canonical_key in source_to_doc_id: duplicate_attempts.append( { "doc_id": doc_id, - "original_doc_id": source_to_doc_id[source_key], + "original_doc_id": source_to_doc_id[canonical_key], "file_path": source_key, "content_length": len(content or ""), "existing_status": "batch_duplicate", @@ -248,23 +276,31 @@ def _add_content( return if known_source: - source_to_doc_id[source_key] = doc_id + source_to_doc_id[canonical_key] = doc_id if content_hash: content_hash_to_doc_id[content_hash] = doc_id - content_data = { + content_data: dict[str, Any] = { "content": content, "file_path": source_key, + "canonical_basename": canonical_key, "format": doc_format, } if content_hash: content_data["content_hash"] = content_hash - if str(source_path).strip() and str(source_path).strip() != source_key: + # Persist the original path only when it actually carries directory + # information (absolute path or contains a separator); a plain + # basename is already captured by ``file_path``. + raw_source = str(source_path).strip() + if raw_source and (os.sep in raw_source or "/" in raw_source): content_data["source_path"] = source_path if lightrag_document_path: content_data["lightrag_document_path"] = lightrag_document_path if engine := _parsed_engine_at(index): content_data["parsed_engine"] = engine + options_str = _process_options_at(index) + if options_str: + content_data["process_options"] = options_str contents[doc_id] = content_data if is_lightrag_format: @@ -318,21 +354,28 @@ def _add_content( _add_content(i, cleaned_content, FULL_DOCS_FORMAT_RAW) # 2. Generate document initial status (without content) - new_docs: dict[str, Any] = { - id_: { + def _initial_doc_status(content_data: dict[str, Any]) -> dict[str, Any]: + base: dict[str, Any] = { "status": DocStatus.PENDING, "content_summary": get_content_summary(content_data.get("content", "")), "content_length": len(content_data.get("content", "")), "created_at": datetime.now(timezone.utc).isoformat(), "updated_at": datetime.now(timezone.utc).isoformat(), "file_path": content_data["file_path"], + "canonical_basename": content_data.get("canonical_basename"), "track_id": track_id, - **( - {"content_hash": content_data["content_hash"]} - if content_data.get("content_hash") - else {} - ), } + if content_data.get("content_hash"): + base["content_hash"] = content_data["content_hash"] + options_str = content_data.get("process_options") or "" + if options_str: + # Mirror process_options into doc_status.metadata so admin UIs + # can surface the per-document strategy without a full_docs lookup. 
+ base["metadata"] = {"process_options": options_str} + return base + + new_docs: dict[str, Any] = { + id_: _initial_doc_status(content_data) for id_, content_data in contents.items() } @@ -521,6 +564,7 @@ def _add_content( doc_id: { "content": contents[doc_id].get("content", ""), "file_path": contents[doc_id]["file_path"], + "canonical_basename": contents[doc_id].get("canonical_basename"), "format": contents[doc_id].get("format", FULL_DOCS_FORMAT_RAW), } for doc_id in new_docs.keys() @@ -540,6 +584,10 @@ def _add_content( full_docs_data[doc_id]["parsed_engine"] = contents[doc_id][ "parsed_engine" ] + if contents[doc_id].get("process_options"): + full_docs_data[doc_id]["process_options"] = contents[doc_id][ + "process_options" + ] await self.full_docs.upsert(full_docs_data) # Persist data to disk immediately await self.full_docs.index_done_callback() @@ -1100,6 +1148,17 @@ def get_failed_chunk_snapshot() -> tuple[list[str], int]: extraction_meta: dict[str, Any] = {} + # Decode per-document processing options once; later + # stages (multimodal hook / KG extraction) re-read + # them from full_docs as well. + from lightrag.parser_routing import ( + parse_process_options, + ) + + doc_process_opts = parse_process_options( + (content_data or {}).get("process_options", "") + ) + # Try to parse as interchange JSONL (smart extraction output) parsed_interchange = parse_interchange_jsonl( content, self.tokenizer @@ -1118,14 +1177,26 @@ def get_failed_chunk_snapshot() -> tuple[list[str], int]: "format_version" ), "engine": interchange_meta.get("engine"), - "engine_capabilities": interchange_meta.get( - "engine_capabilities", [] - ), "chunking_method": interchange_meta.get( "chunking_method" ), } else: + # Per-document chunking strategy: + # - 'F' (default): use the configured chunking_func + # (chunking_by_token_size). + # - 'S' / 'R': require structured input which the + # legacy text path cannot provide; fall back to + # 'F' and log a warning so the user knows their + # selection had no effect for this document. + if doc_process_opts.chunking != "F": + logger.warning( + f"[chunking] process_options chunking=" + f"{doc_process_opts.chunking!r} requested for d-id: " + f"{doc_id}, file: {file_path}, but no structured " + f"interchange output is available; falling back to " + f"fixed chunking ('F')." + ) # Call chunking function, supporting both sync and async implementations chunking_result = self.chunking_func( self.tokenizer, @@ -1149,6 +1220,11 @@ def get_failed_chunk_snapshot() -> tuple[list[str], int]: extraction_meta = { "extraction_format": "plain_text_chunking", "engine": "legacy", + "chunking_method": ( + "fixed_token_fallback" + if doc_process_opts.chunking != "F" + else "fixed_token" + ), } # Multimodal post-process hook entrypoint: @@ -1308,18 +1384,29 @@ def get_failed_chunk_snapshot() -> tuple[list[str], int]: # Execute first stage tasks await asyncio.gather(*first_stage_tasks) - # Stage 2: Process entity relation graph (after text_chunks are saved) - entity_relation_task = asyncio.create_task( - self._process_extract_entities( - chunks, pipeline_status, pipeline_status_lock + # Stage 2: Process entity relation graph (after text_chunks are saved). + # When the user opted out via process_options '!', skip + # entity/relation extraction entirely; chunks remain in + # the vector store so naive / mix retrieval still works. + if doc_process_opts.skip_kg: + logger.info( + f"[skip_kg] process_options '!' 
set for d-id: {doc_id}; " + f"skipping entity/relation extraction" + ) + chunk_results = [] + extraction_meta["skip_kg"] = True + else: + entity_relation_task = asyncio.create_task( + self._process_extract_entities( + chunks, pipeline_status, pipeline_status_lock + ) + ) + chunk_results = await entity_relation_task + chunk_results = augment_chunk_results_with_mm_entities( + chunk_results=chunk_results, + mm_specs=mm_specs, + file_path=file_path, ) - ) - chunk_results = await entity_relation_task - chunk_results = augment_chunk_results_with_mm_entities( - chunk_results=chunk_results, - mm_specs=mm_specs, - file_path=file_path, - ) file_extraction_stage_ok = True except Exception as e: @@ -1407,25 +1494,30 @@ def get_failed_chunk_snapshot() -> tuple[list[str], int]: "User cancelled" ) - # Use chunk_results from entity_relation_task - await merge_nodes_and_edges( - chunk_results=chunk_results, # result collected from entity_relation_task - knowledge_graph_inst=self.chunk_entity_relation_graph, - entity_vdb=self.entities_vdb, - relationships_vdb=self.relationships_vdb, - global_config=self._build_global_config(), - full_entities_storage=self.full_entities, - full_relations_storage=self.full_relations, - doc_id=doc_id, - pipeline_status=pipeline_status, - pipeline_status_lock=pipeline_status_lock, - llm_response_cache=self.llm_response_cache, - entity_chunks_storage=self.entity_chunks, - relation_chunks_storage=self.relation_chunks, - current_file_number=current_file_number, - total_files=total_files, - file_path=file_path, - ) + # Use chunk_results from entity_relation_task. + # When skip_kg is set, chunk_results is empty so + # there are no nodes/edges to merge — but we + # still need to flush the chunks_vdb / text_chunks + # writes (already done above) and reach PROCESSED. + if not doc_process_opts.skip_kg: + await merge_nodes_and_edges( + chunk_results=chunk_results, # result collected from entity_relation_task + knowledge_graph_inst=self.chunk_entity_relation_graph, + entity_vdb=self.entities_vdb, + relationships_vdb=self.relationships_vdb, + global_config=self._build_global_config(), + full_entities_storage=self.full_entities, + full_relations_storage=self.full_relations, + doc_id=doc_id, + pipeline_status=pipeline_status, + pipeline_status_lock=pipeline_status_lock, + llm_response_cache=self.llm_response_cache, + entity_chunks_storage=self.entity_chunks, + relation_chunks_storage=self.relation_chunks, + current_file_number=current_file_number, + total_files=total_files, + file_path=file_path, + ) # Record processing end time processing_end_time = int(time.time()) @@ -1766,11 +1858,25 @@ async def analyze_multimodal( doc_id: str, file_path: str, parsed_data: dict[str, Any], + *, + process_options: str | None = None, ) -> dict[str, Any]: """Phase 2: Multimodal analysis (VLM). Writes llm_analyze_result and analyze_time to LightRAG Document. - Uses vlm_llm_model_func (VLM role). When Ray-Anything merges, bind VLM model here. - Default: no-op, returns parsed_data unchanged. + + Per-document ``i`` / ``t`` / ``e`` flags from + ``full_docs.process_options`` decide which modalities are sent to the + VLM. Sidecars are always written by the parser regardless of these + flags so toggling options later does not require re-parsing — only + the ``llm_analyze_result`` payload is gated here. 
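+
+        For example, ``process_options="it"`` sends drawings and tables to
+        the VLM (when their sidecars exist) and leaves equations alone; an
+        empty options string skips VLM analysis for the document entirely.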
+ + Args: + process_options: Optional override that bypasses the + ``full_docs.process_options`` lookup; primarily used by unit + tests that exercise the VLM analysis path without going + through the enqueue pipeline. """ + from lightrag.parser_routing import parse_process_options + blocks_path = parsed_data.get("blocks_path") if not blocks_path: return parsed_data @@ -1779,6 +1885,56 @@ async def analyze_multimodal( if not block_file.exists(): return parsed_data + # Resolve which modalities the user opted into for this document. + if process_options is None: + try: + content_data = await self.full_docs.get_by_id(doc_id) or {} + except Exception: + content_data = {} + options_str = ( + content_data.get("process_options") + if isinstance(content_data, dict) + else "" + ) or "" + else: + options_str = process_options + process_opts = parse_process_options(options_str) + if not (process_opts.images or process_opts.tables or process_opts.equations): + logger.debug( + f"[analyze_multimodal] no i/t/e options set for d-id: {doc_id}; " + f"skipping VLM analysis" + ) + return parsed_data + + # Diagnose opt-in vs sidecar mismatch up-front so users investigating + # "why did VLM not run on my images" see a one-line INFO per document + # instead of silent skips. Empty sidecars are a normal outcome + # (some documents simply have no images/tables/equations), so this is + # informational rather than a warning. + sidecar_base = str(block_file) + if sidecar_base.endswith(".blocks.jsonl"): + sidecar_base = sidecar_base[: -len(".blocks.jsonl")] + opt_in_missing: list[str] = [] + for opt_char, modality, suffix in ( + ("i", "drawings", ".drawings.json"), + ("t", "tables", ".tables.json"), + ("e", "equations", ".equations.json"), + ): + enabled = { + "i": process_opts.images, + "t": process_opts.tables, + "e": process_opts.equations, + }[opt_char] + if enabled and not Path(sidecar_base + suffix).exists(): + opt_in_missing.append(f"{opt_char}:{modality}") + if opt_in_missing: + logger.info( + f"[analyze_multimodal] process_options opted into " + f"{','.join(opt_in_missing)} for d-id: {doc_id} (file={file_path}), " + f"but the parser produced no such sidecar; VLM analysis skipped " + f"for those modalities." + ) + try: lines = block_file.read_text(encoding="utf-8").splitlines() if not lines: @@ -2041,11 +2197,17 @@ def _normalize_grounded_value(value: Any) -> Any: if base_name.endswith(".blocks.jsonl"): base_name = base_name[: -len(".blocks.jsonl")] sidecars = [ - (Path(base_name + ".drawings.json"), "drawings"), - (Path(base_name + ".tables.json"), "tables"), - (Path(base_name + ".equations.json"), "equations"), + (Path(base_name + ".drawings.json"), "drawings", process_opts.images), + (Path(base_name + ".tables.json"), "tables", process_opts.tables), + ( + Path(base_name + ".equations.json"), + "equations", + process_opts.equations, + ), ] - for sidecar_path, root_key in sidecars: + for sidecar_path, root_key, enabled in sidecars: + if not enabled: + continue if not sidecar_path.exists(): continue try: @@ -2186,6 +2348,15 @@ async def _persist_parsed_full_docs( ``get_doc_by_content_hash`` lookups can dedupe across pending_parse records that did not have a hash at enqueue time. Also patches the existing ``doc_status`` row so both storages stay aligned. + + The original ``pending_parse`` record carries metadata seeded at + enqueue time (``process_options``, ``canonical_basename``, + ``source_path``, ...) that downstream stages still need after parsing. 
+ ``full_docs`` upserts overwrite the entire row, so we merge the + existing record with the new ``record`` payload before upserting: + fresh fields from ``record`` (``content`` / ``format`` / + ``lightrag_document_path`` / ``parsed_engine`` / ``update_time``) + take precedence, while pre-existing fields are preserved. """ fmt = record.get("format") content_hash: str | None = None @@ -2198,7 +2369,11 @@ async def _persist_parsed_full_docs( resolve_lightrag_document_path(blocks_path) ) - payload = dict(record) + existing = await self.full_docs.get_by_id(doc_id) + if isinstance(existing, dict): + payload = {**existing, **record} + else: + payload = dict(record) if content_hash: payload["content_hash"] = content_hash @@ -2970,21 +3145,45 @@ async def _run_multimodal_postprocess_hook( Default behavior is no-op. This method defines a stable extension point for built-in multimodal processors. + + Activates when the per-document ``process_options`` opts into at least + one of ``i`` / ``t`` / ``e``. Per-modality work in subsequent steps + (``_build_mm_chunks_from_sidecars``, ``analyze_multimodal``) decides + whether to act based on whether ``drawings.json`` / ``tables.json`` / + ``equations.json`` actually exist on disk — the parser declares + modality availability by writing those sidecars, not by listing + capabilities in meta. """ - addon_params = self.addon_params - if not addon_params.get("enable_multimodal_pipeline", False): - return chunking_result + from lightrag.parser_routing import parse_process_options extraction_format = extraction_meta.get("extraction_format") - capabilities = set(extraction_meta.get("engine_capabilities", []) or []) if extraction_format != "interchange_jsonl": return chunking_result - if not capabilities.intersection({"i", "e", "t"}): + + try: + content_data = await self.full_docs.get_by_id(doc_id) or {} + except Exception: + content_data = {} + process_opts = parse_process_options( + content_data.get("process_options") + if isinstance(content_data, dict) + else "" + ) + active = { + ch + for ch, enabled in ( + ("i", process_opts.images), + ("t", process_opts.tables), + ("e", process_opts.equations), + ) + if enabled + } + if not active: return chunking_result logger.info( f"[multimodal-hook] enabled for d-id: {doc_id}, file: {file_path}, " - f"engine={extraction_meta.get('engine')}, caps={sorted(capabilities)}" + f"engine={extraction_meta.get('engine')}, opts={sorted(active)}" ) # TODO(multimodal pipeline): diff --git a/lightrag/utils_pipeline.py b/lightrag/utils_pipeline.py index 741b5de684..7539c2dbd8 100644 --- a/lightrag/utils_pipeline.py +++ b/lightrag/utils_pipeline.py @@ -92,14 +92,35 @@ def _normalize_path(candidate: Any) -> str | None: def document_source_key(file_path: Any) -> str: - """Return the filename-level key used for document uniqueness.""" + """Return the user-visible basename for ``full_docs.file_path``. + + Preserves any ``[hint]`` segment so the UI can show users their original + naming intent. Use :func:`document_canonical_key` to get the canonical + form used for filename-based deduplication. + """ + source = str(file_path or "").strip() + if source in PLACEHOLDER_DOCUMENT_SOURCES: + return "unknown_source" + basename = Path(source).name.strip() + if basename in PLACEHOLDER_DOCUMENT_SOURCES: + return "unknown_source" + return basename or "unknown_source" + + +def document_canonical_key(file_path: Any) -> str: + """Return the canonical basename used for filename dedup / doc_id seeding. 
+ + Strips any supported ``[hint]`` segment so ``abc.docx`` and + ``abc.[native-iet].docx`` map to the same key. Returns + ``"unknown_source"`` for placeholder sources. + """ source = str(file_path or "").strip() if source in PLACEHOLDER_DOCUMENT_SOURCES: return "unknown_source" - filename = canonicalize_parser_hinted_basename(source).strip() - if filename in PLACEHOLDER_DOCUMENT_SOURCES: + canonical = canonicalize_parser_hinted_basename(source).strip() + if canonical in PLACEHOLDER_DOCUMENT_SOURCES: return "unknown_source" - return filename or "unknown_source" + return canonical or "unknown_source" def has_known_document_source(source_key: str) -> bool: @@ -165,14 +186,14 @@ def configured_input_dir() -> Path: async def get_existing_doc_by_file_basename( doc_status: DocStatusStorage, file_path: Any ) -> tuple[str, Any] | None: - """Find an existing doc_status record by file basename. + """Find an existing doc_status record by canonical file basename. - Both write and lookup paths feed file_path through ``document_source_key`` - first, so stored basenames are already canonical (parser hints stripped). - Storage backends therefore compare canonical-vs-canonical and do not need - to re-run any normalization themselves. + Stored ``file_path`` values keep any ``[hint]`` segment intact so the UI + can surface the user's original naming. Filename-based dedup, however, + operates on the canonical (hint-stripped) basename so ``abc.docx`` and + ``abc.[native-iet].docx`` are treated as the same logical document. """ - basename = document_source_key(file_path) + basename = document_canonical_key(file_path) if basename == "unknown_source": return None return await doc_status.get_doc_by_file_basename(basename) diff --git a/tests/test_document_routes_docx_archive.py b/tests/test_document_routes_docx_archive.py index 1bf598a02a..34752eb8e2 100644 --- a/tests/test_document_routes_docx_archive.py +++ b/tests/test_document_routes_docx_archive.py @@ -61,6 +61,7 @@ async def apipeline_enqueue_documents( track_id=None, docs_format=None, parsed_engine=None, + process_options=None, reprocess_existing_non_processed=False, ): item = { @@ -69,6 +70,7 @@ async def apipeline_enqueue_documents( "track_id": track_id, "docs_format": docs_format, "parsed_engine": parsed_engine, + "process_options": process_options, } if reprocess_existing_non_processed: item["reprocess_existing_non_processed"] = True @@ -146,6 +148,14 @@ def __init__(self, source_path): self.events = [] self.data = {} + async def get_by_id(self, doc_id): + # ``_persist_parsed_full_docs`` merges with the existing pending_parse + # record so metadata seeded at enqueue time (process_options, + # canonical_basename, ...) survives the parse-result overwrite. These + # tests only seed the row via the parser, so returning None is fine. 
+ record = self.data.get(doc_id) + return dict(record) if record is not None else None + async def upsert(self, data): self.events.append("upsert") self.data.update(data) @@ -249,6 +259,7 @@ async def test_pipeline_enqueue_docx_plain_text_extracts_before_enqueue( "track_id": "track-docx", "docs_format": None, "parsed_engine": "legacy", + "process_options": None, } ] assert not file_path.exists() @@ -272,6 +283,7 @@ async def test_pipeline_enqueue_md_moves_after_enqueue(tmp_path, monkeypatch): "track_id": "track-md", "docs_format": None, "parsed_engine": "legacy", + "process_options": None, } ] assert not file_path.exists() @@ -330,10 +342,58 @@ def _fail_pdf_extract(*args, **kwargs): "track_id": "track-pdf", "docs_format": FULL_DOCS_FORMAT_PENDING_PARSE, "parsed_engine": "mineru", + "process_options": None, } ] +async def test_pipeline_enqueue_passes_process_options_from_filename_hint( + tmp_path, monkeypatch +): + """Filename hint ``[native-iet]`` flows into apipeline_enqueue_documents.""" + monkeypatch.setenv("LIGHTRAG_PARSER", "docx:native") + file_path = tmp_path / "report.[native-iet].docx" + file_path.write_bytes(b"docx-bytes") + rag = _FakeRag() + + success, returned_track_id = await pipeline_enqueue_file( + rag, file_path, "track-options" + ) + + assert success is True + assert returned_track_id == "track-options" + assert rag.enqueued == [ + { + "input": "", + "file_path": str(file_path), + "track_id": "track-options", + "docs_format": FULL_DOCS_FORMAT_PENDING_PARSE, + "parsed_engine": "native", + "process_options": "iet", + } + ] + # Native engine deferral keeps the source file in place for the parser. + assert file_path.exists() + + +async def test_pipeline_enqueue_lightrag_parser_rule_provides_default_options( + tmp_path, monkeypatch +): + """LIGHTRAG_PARSER ``docx:native-iet`` becomes the default ``process_options``.""" + monkeypatch.setenv("LIGHTRAG_PARSER", "docx:native-iet,*:legacy") + file_path = tmp_path / "rule_default.docx" + file_path.write_bytes(b"docx-bytes") + rag = _FakeRag() + + success, _ = await pipeline_enqueue_file(rag, file_path, "track-rule-default") + + assert success is True + assert len(rag.enqueued) == 1 + enqueued = rag.enqueued[0] + assert enqueued["parsed_engine"] == "native" + assert enqueued["process_options"] == "iet" + + async def test_pipeline_index_files_leaves_lightrag_document_docx_batch( tmp_path, monkeypatch ): diff --git a/tests/test_pipeline_release_closure.py b/tests/test_pipeline_release_closure.py index 3a96933a8a..80c647953d 100644 --- a/tests/test_pipeline_release_closure.py +++ b/tests/test_pipeline_release_closure.py @@ -129,6 +129,165 @@ def test_canonicalize_parser_hinted_basename(): canonicalize_parser_hinted_basename("name.[native].[mineru].pdf") == "name.[native].pdf" ) + # New options-only and engine+options forms strip cleanly too. + assert canonicalize_parser_hinted_basename("foo.[!].docx") == "foo.docx" + assert canonicalize_parser_hinted_basename("foo.[native-iet].docx") == "foo.docx" + assert canonicalize_parser_hinted_basename("foo.[mineru-R!].pdf") == "foo.pdf" + # Invalid options-only hint (unknown chars) is left alone. 
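+    # ("xyz" contains no recognised option characters, so the whole bracket
+    # segment fails validation and the basename is left untouched.)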
+ assert canonicalize_parser_hinted_basename("foo.[xyz].docx") == "foo.[xyz].docx" + + +@pytest.mark.offline +def test_filename_parser_directives_decodes_engine_and_options(): + from lightrag.parser_routing import filename_parser_directives + + assert filename_parser_directives("paper.[native-iet].docx") == ("native", "iet") + assert filename_parser_directives("memo.[native-R!].md") == ("native", "R!") + assert filename_parser_directives("report.[!].pdf") == (None, "!") + assert filename_parser_directives("doc.[mineru].docx") == ("mineru", "") + assert filename_parser_directives("foo.docx") == (None, "") + # Unsupported tokens leave the hint untouched and unparsed. + assert filename_parser_directives("foo.[draft].docx") == (None, "") + + +@pytest.mark.offline +def test_filename_hint_rejects_invalid_engine_qualified_options(): + """Engine-qualified hints with bad option chars must fail validation + the same way options-only hints do, so the documented behaviour + "invalid characters → whole hint fails → defaults apply" holds across + all hint shapes (otherwise foo.[native-FR].docx would be canonicalised + even though its options conflict). + """ + from lightrag.parser_routing import ( + canonicalize_parser_hinted_basename, + filename_parser_directives, + ) + + # F+R conflict → hint dropped; engine and options are NOT applied. + assert filename_parser_directives("foo.[native-FR].docx") == (None, "") + # Unknown char Q → hint dropped; engine native is also NOT applied. + assert filename_parser_directives("foo.[native-Q].docx") == (None, "") + + # The basename must remain unchanged so the documented "defaults apply" + # path in the dedup index reflects the literal file the user supplied. + assert ( + canonicalize_parser_hinted_basename("foo.[native-FR].docx") + == "foo.[native-FR].docx" + ) + assert ( + canonicalize_parser_hinted_basename("foo.[native-Q].docx") + == "foo.[native-Q].docx" + ) + + +@pytest.mark.offline +def test_parse_process_options_decodes_flags(): + from lightrag.parser_routing import parse_process_options + + opts = parse_process_options("iet") + assert opts.images and opts.tables and opts.equations + assert not opts.skip_kg + assert opts.chunking == "F" + + opts = parse_process_options("R!") + assert opts.skip_kg and opts.chunking == "R" + assert not opts.images and not opts.tables and not opts.equations + + opts = parse_process_options("S") + assert opts.chunking == "S" + + opts = parse_process_options("") + assert not (opts.images or opts.tables or opts.equations or opts.skip_kg) + assert opts.chunking == "F" + + +@pytest.mark.offline +def test_validate_process_options_rejects_invalid_combos(): + from lightrag.parser_routing import validate_process_options + + assert validate_process_options("iet") == [] + assert validate_process_options("R!") == [] + # F+R conflict is reported. + errs = validate_process_options("FR") + assert any("multiple chunking modes" in m for m in errs) + # Lowercase chunking selectors are not valid. + errs = validate_process_options("f") + assert any("'f'" in m for m in errs) + # Unknown chars are reported individually. + errs = validate_process_options("xyz") + assert sum(1 for m in errs if "unsupported character" in m) == 3 + + +@pytest.mark.offline +def test_lightrag_parser_rule_supports_options_suffix(monkeypatch): + monkeypatch.setenv("MINERU_ENDPOINT", "http://fake-mineru") + monkeypatch.delenv("DOCLING_ENDPOINT", raising=False) + # Valid options suffix passes validation. 
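+    # Worked example of the suffix grammar (hypothetical rule string):
+    #   "pdf:mineru-R!,docx:native-iet,*:legacy"
+    #     pdf  -> mineru engine, chunking mode R, '!' skips KG extraction
+    #     docx -> native engine, i/t/e multimodal analysis enabled
+    #     *    -> legacy engine, empty default options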
+ validate_parser_routing_config("docx:native-iet,*:legacy") + + # Invalid options suffix is rejected with the rule label and message. + with pytest.raises(ParserRoutingConfigError, match="multiple chunking modes"): + validate_parser_routing_config("docx:native-FR,*:legacy") + + with pytest.raises(ParserRoutingConfigError, match="unsupported character"): + validate_parser_routing_config("docx:native-Q,*:legacy") + + +@pytest.mark.offline +def test_resolve_file_parser_directives_priority(monkeypatch): + from lightrag.parser_routing import resolve_file_parser_directives + + monkeypatch.setenv("MINERU_ENDPOINT", "http://fake-mineru") + monkeypatch.setenv("LIGHTRAG_PARSER", "docx:native-iet,*:legacy") + + # Filename hint takes precedence for engine and options. + engine, options = resolve_file_parser_directives("paper.[native-R!].docx") + assert engine == "native" + assert options == "R!" + + # No filename hint → fall through to LIGHTRAG_PARSER defaults for both. + engine, options = resolve_file_parser_directives("plain.docx") + assert engine == "native" + assert options == "iet" + + # Options-only hint keeps engine from rule but uses hinted options. + engine, options = resolve_file_parser_directives("plain.[!].docx") + assert engine == "native" + assert options == "!" + + +@pytest.mark.offline +def test_apipeline_enqueue_persists_process_options(tmp_path): + async def _run(): + rag = _new_rag(tmp_path) + await rag.initialize_storages() + try: + await rag.apipeline_enqueue_documents( + "alpha body", + file_paths="abc.[native-R!].docx", + track_id="track-opts", + process_options="R!", + ) + doc_id = compute_mdhash_id("abc.docx", prefix="doc-") + full_doc = await rag.full_docs.get_by_id(doc_id) + assert full_doc is not None + # full_docs preserves the user-visible name and the canonical key. + assert full_doc["file_path"] == "abc.[native-R!].docx" + assert full_doc.get("canonical_basename") == "abc.docx" + assert full_doc.get("process_options") == "R!" + + status_doc = await rag.doc_status.get_by_id(doc_id) + assert status_doc is not None + metadata = ( + status_doc.get("metadata") + if isinstance(status_doc, dict) + else getattr(status_doc, "metadata", {}) + ) + assert metadata.get("process_options") == "R!" + finally: + await rag.finalize_storages() + + asyncio.run(_run()) @pytest.mark.offline @@ -199,6 +358,10 @@ async def _run(): first_id = compute_mdhash_id("abc.docx", prefix="doc-") first_doc = await rag.full_docs.get_by_id(first_id) assert first_doc is not None + # file_path keeps the user's original basename verbatim, while + # canonical_basename carries the dedup key. + assert first_doc["file_path"] == "abc.docx" + assert first_doc.get("canonical_basename") == "abc.docx" await rag.apipeline_enqueue_documents( "changed body", @@ -208,9 +371,12 @@ async def _run(): assert (await rag.full_docs.get_by_id(first_id))["content"] == "alpha body" failed_docs = await rag.doc_status.get_docs_by_status(DocStatus.FAILED) + # The duplicate record reflects the second attempt's user-visible + # basename (hint preserved); the canonical dedup happened against + # ``abc.docx`` regardless. 
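+        # (``document_source_key`` keeps the hint for display while
+        # ``document_canonical_key`` strips it for the dedup lookup.)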
assert any( getattr(doc, "metadata", {}).get("duplicate_kind") == "filename" - and getattr(doc, "file_path", "") == "abc.docx" + and getattr(doc, "file_path", "") == "abc.[native].docx" for doc in failed_docs.values() ) finally: @@ -238,7 +404,9 @@ async def _run(): result = await rag.adelete_by_doc_id(doc_id) assert result.status == "success" - assert result.file_path == "abc.docx" + # ``file_path`` now preserves the parser-hint segment for UI + # display; canonicalisation only affects the dedup key. + assert result.file_path == "abc.[native].docx" assert result.source_path == source_path finally: await rag.finalize_storages() @@ -501,6 +669,67 @@ async def _run(): asyncio.run(_run()) +@pytest.mark.offline +def test_persist_parsed_full_docs_preserves_pending_metadata(tmp_path): + """``_persist_parsed_full_docs`` must keep process_options / canonical_basename + seeded at enqueue time so downstream stages (analyze_multimodal, + chunking selection, KG-skip) still see the user's original opt-ins after + the parse-result record overwrites the pending_parse row. + """ + + async def _run(): + rag = _new_rag(tmp_path) + await rag.initialize_storages() + try: + await rag.apipeline_enqueue_documents( + "", + file_paths="report.[native-iet!].docx", + docs_format=FULL_DOCS_FORMAT_PENDING_PARSE, + parsed_engine=PARSER_ENGINE_NATIVE, + process_options="iet!", + track_id="track-merge", + ) + doc_id = compute_mdhash_id("report.docx", prefix="doc-") + + pre = await rag.full_docs.get_by_id(doc_id) + assert pre is not None + assert pre.get("process_options") == "iet!" + assert pre.get("canonical_basename") == "report.docx" + assert pre.get("file_path") == "report.[native-iet!].docx" + + # Simulate a parse_* completion: pass only the fresh fields the + # parsers actually emit and verify that pre-existing metadata + # survives the upsert. + await rag._persist_parsed_full_docs( + doc_id, + { + "content": "extracted body", + "file_path": "report.[native-iet!].docx", + "format": "raw", + "parsed_engine": PARSER_ENGINE_NATIVE, + "update_time": 12345, + }, + ) + + post = await rag.full_docs.get_by_id(doc_id) + assert post is not None + # Parser-supplied fields take precedence... + assert post["content"] == "extracted body" + assert post["format"] == "raw" + # ...while metadata seeded at enqueue time is preserved. + assert post.get("process_options") == "iet!" + assert post.get("canonical_basename") == "report.docx" + assert post.get("file_path") == "report.[native-iet!].docx" + # And content_hash is freshly computed from the parsed body. 
+ assert post["content_hash"] == compute_mdhash_id( + "extracted body", prefix="" + ) + finally: + await rag.finalize_storages() + + asyncio.run(_run()) + + @pytest.mark.offline def test_state_machine_upsert_preserves_content_hash(tmp_path): async def _run(): @@ -814,7 +1043,7 @@ async def _retry_vlm(prompt, **kwargs): "blocks_path": str(blocks), "content": "body", } - await rag.analyze_multimodal("doc-1", "demo.pdf", parsed) + await rag.analyze_multimodal("doc-1", "demo.pdf", parsed, process_options="ite") meta = json.loads(blocks.read_text(encoding="utf-8").splitlines()[0]) assert meta.get("analyze_time") @@ -849,7 +1078,9 @@ async def _run(): "blocks_path": str(blocks), "content": "body", } - result = await rag.analyze_multimodal("doc-1", "demo.pdf", parsed) + result = await rag.analyze_multimodal( + "doc-1", "demo.pdf", parsed, process_options="ite" + ) meta = json.loads(blocks.read_text(encoding="utf-8").splitlines()[0]) assert meta.get("analyze_time") @@ -956,7 +1187,7 @@ async def _vlm(_prompt, **_kwargs): "blocks_path": str(blocks), "content": "body", } - await rag.analyze_multimodal("doc-1", "demo.pdf", parsed) + await rag.analyze_multimodal("doc-1", "demo.pdf", parsed, process_options="ite") payload = json.loads(drawings.read_text(encoding="utf-8")) result = payload["drawings"]["id1"]["llm_analyze_result"] @@ -1006,7 +1237,7 @@ async def _vlm(_prompt, **_kwargs): "blocks_path": str(blocks), "content": "body", } - await rag.analyze_multimodal("doc-1", "demo.pdf", parsed) + await rag.analyze_multimodal("doc-1", "demo.pdf", parsed, process_options="ite") payload = json.loads(drawings.read_text(encoding="utf-8")) result = payload["drawings"]["id1"]["llm_analyze_result"] @@ -1197,7 +1428,7 @@ async def _vlm(_prompt, **_kwargs): "blocks_path": str(blocks), "content": "body", } - await rag.analyze_multimodal("doc-1", "demo.pdf", parsed) + await rag.analyze_multimodal("doc-1", "demo.pdf", parsed, process_options="ite") payload = json.loads(tables.read_text(encoding="utf-8")) result = payload["tables"]["id1"]["llm_analyze_result"] From 084485f9755672e3f44a70f3975756cb0fa45b12 Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 5 May 2026 02:17:16 +0800 Subject: [PATCH 2/5] =?UTF-8?q?=E2=9C=A8=20feat(api):=20pipeline=20reentra?= =?UTF-8?q?ncy=20guards=20and=20idempotent=20multimodal=20analyze?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tightens upload/scan/enqueue concurrency rules and makes analyze_multimodal idempotent so users can incrementally enable i/t/e modalities without re-running the VLM on already-analyzed sidecar items. 
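Illustrative sketch of the incremental flow (call shapes and option strings
as exercised by the updated tests; ``rag``, ``doc_id``, ``file_path`` and
``parsed`` assumed in scope, not part of the diff):

```python
# First pass: images only -- drawing sidecar items receive llm_analyze_result.
await rag.analyze_multimodal(doc_id, file_path, parsed, process_options="i")

# Later pass after the user also enables tables: already-analyzed drawing
# items are skipped, so only table sidecar items trigger new VLM calls.
await rag.analyze_multimodal(doc_id, file_path, parsed, process_options="it")
```
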
- pipeline_status gains a ``scanning`` flag; the /documents/scan endpoint acquires it synchronously before scheduling the background task and refuses with status="scanning_skipped_pipeline_busy" when the pipeline is busy or another scan is already in flight - /documents/upload, /documents/text, /documents/texts now reject with HTTP 409 while pipeline_status['busy'] or ['scanning'] is set - Strict name pre-check on upload: same-canonical-basename in INPUT directory or doc_status now raises 409 instead of returning a status="duplicated" 200 payload; clients must DELETE the existing record before re-uploading - apipeline_enqueue_documents adds a last-line RuntimeError guard for busy/scanning state; the reprocess_existing_non_processed parameter is removed from this and pipeline_enqueue_file / pipeline_index_files (recovery of half-finished documents will be handled by a future pipeline-resume branch instead of re-enqueueing) - analyze_multimodal drops the meta.analyze_time early-return; per-item llm_analyze_result presence is checked instead so re-running with new i/t/e options only analyzes the newly enabled modalities; analyze_time becomes the timestamp of the most recent successful pass - WebUI UploadDocumentsDialog maps HTTP 409 with "already contains" / "Status:" detail back to the duplicate-file UI affordance, surfacing other 409 reasons (busy/scanning) verbatim from the server - WebUI lightrag.ts type aligned: DocActionResponse drops 'duplicated', ScanResponse adds 'scanning_skipped_pipeline_busy' - InsertResponse Pydantic Literal narrowed to remove the now-unreachable "duplicated" value - docs/FileProcessingConfiguration-zh.md adds a "并发与重入约束" section and a "流水线启动时的续跑规则" section - New regression tests for busy/scanning rejection at every layer, scanning flag acquire/release lifecycle, and analyze_multimodal per-item idempotency BREAKING CHANGES - HTTP: same-name conflicts on upload/text/texts now return 409 instead of a 200 status="duplicated" payload; clients reading the response status field must catch the 409 error path - Python API: apipeline_enqueue_documents / pipeline_enqueue_file / pipeline_index_files no longer accept reprocess_existing_non_processed - Python API: apipeline_enqueue_documents raises RuntimeError when called while another pipeline run or scan is in progress Co-Authored-By: Claude Opus 4.7 --- docs/FileProcessingConfiguration-zh.md | 66 +++++ lightrag/api/routers/document_routes.py | 265 ++++++++++++++---- lightrag/kg/shared_storage.py | 1 + lightrag/pipeline.py | 113 +++++--- lightrag_webui/src/api/lightrag.ts | 4 +- .../documents/UploadDocumentsDialog.tsx | 31 +- tests/test_document_routes_docx_archive.py | 237 +++++++++++++++- tests/test_pipeline_release_closure.py | 180 ++++++++++++ 8 files changed, 776 insertions(+), 121 deletions(-) diff --git a/docs/FileProcessingConfiguration-zh.md b/docs/FileProcessingConfiguration-zh.md index 2a5f1c77b2..4017c766ab 100644 --- a/docs/FileProcessingConfiguration-zh.md +++ b/docs/FileProcessingConfiguration-zh.md @@ -213,3 +213,69 @@ DOCLING_ENDPOINT=http://localhost:8081/v1/convert/file/async - 存储后端通过 `get_doc_by_content_hash` 进行 hash 直查;命名约定与 `get_doc_by_file_basename` 一致。 > 入队批次内(同一次 `apipeline_enqueue_documents` 调用)也会做 basename 与 content_hash 去重,命中时把后续条目直接写为 `FAILED` 并标记 `existing_status=batch_duplicate`。其中 basename 去重只对有效文件名生效;`unknown_source`、`no-file-path` 和空来源只参与内容 hash 去重。 + +## 并发与重入约束 + +为防止 `scan` / `upload` 与运行中的流水线相互覆盖 `doc_status` / `full_docs` 记录,所有写入入口都以 `pipeline_status["busy"]`(加 scan 自身的 
`pipeline_status["scanning"]` 旗标)为唯一闸门。 + +### 入口行为 + +| 入口 | 流水线 busy 或 scanning 中 | 否则 | +| --- | --- | --- | +| `/documents/upload` / `/documents/file` / `/documents/text` / `/documents/insert*` | 直接返回 HTTP 409 错误,不写文件、不调入队 | 进入下文"严格名字预检"后再保存与入队 | +| `/documents/scan` | 落 warning 后立即返回 `scanning_skipped_pipeline_busy`,不 schedule 后台任务 | 设 `scanning=True` 后 schedule,task 结束统一清旗 | +| `apipeline_enqueue_documents` 内部 | 抛 `RuntimeError("Cannot enqueue while pipeline is busy")` 作为最后一道闸 | 正常入队 | + +> 流水线启动后无法可靠判断 `INPUT` 目录中各文件具体处理到哪个阶段,对运行中的 `doc_status` 做任何修改(哪怕仅"重置 PENDING/FAILED")都可能与 parse / analyze / process 工作线程的写入交错,导致存储记录与实际处理逻辑不一致。规则采用最严:busy 期间一律拒绝写入,让流水线跑完再说。 + +### 严格名字预检(upload 路径,busy=False 时) + +upload 在保存文件前必须双道检查: + +1. **INPUT 目录扫描**:把要保存的 basename 经 `canonicalize_parser_hinted_basename` 规范化,遍历 INPUT 目录里现有任何同 canonical 变体(含 hint / 不含 hint),命中即 409。 +2. **doc_status 查重**:用规范化 basename 调 `get_existing_doc_by_file_basename`,命中即 409。 + +两道都过 → 保存文件 → 调 `apipeline_enqueue_documents`。 + +> 旧版本曾允许 upload 在已有同名记录时悄悄写入 FAILED 重复条目;新规则改为 fail-fast,不在 doc_status 留下任何重复痕迹。如需替换同名文档,请先调用 `/documents/{doc_id}` 的删除接口。 + +### 移除 `reprocess_existing_non_processed` + +旧 `apipeline_enqueue_documents` 的 `reprocess_existing_non_processed=True` 行为会在 scan 时直接删除非 PROCESSED 的旧记录并重建,与本规则相冲突,已整段移除。scan 中遇到非 PROCESSED 同名文件直接归档到 `__parsed__/` 跳过,由"流水线启动时的续跑规则"在下一次流水线启动时统一接管。 + +## 流水线启动时的续跑规则 + +每次 `apipeline_process_enqueue_documents` 起步时,会拉取所有处于 `PARSING` / `ANALYZING` / `PROCESSING` / `PENDING` / `FAILED` 状态的文档继续处理。续跑路径**根据"内容是否已抽取"分流**,保证同一个文档无论之前进度如何,按当前 `process_options` 续跑都有幂等结果。 + +### 判断"内容已抽取" + +读 `full_docs[doc_id]`: + +| `format` | 判定 | +| --- | --- | +| `lightrag` 且 `lightrag_document_path` 文件存在 | ✅ 已抽取 | +| `raw` 且 `content` 非空 | ✅ 已抽取 | +| 其它(含 `pending_parse`、记录缺失) | ❌ 未抽取 | + +### 分支 A:未抽取 + +走完整流水线(`parse_native` / `parse_mineru` / `parse_docling` → `analyze_multimodal` → 分块 → 实体抽取),按 `full_docs.process_options` 决定每一阶段的行为。这是"首次入队"的常规流。 + +### 分支 B:已抽取 + +**一律跳过解析**(不重新调 `parse_*`),从 ANALYZING 阶段重启,并清光旧 chunks / entities 后按当前 `process_options` 重做: + +| 子步骤 | 行为 | +| --- | --- | +| 引擎对比 | 若 `process_options` 隐含的引擎 ≠ `full_docs.parsed_engine`,**仅 warn**,不重新解析。已抽取的内容是不可变事实,重新跑不同引擎会产生不一致。要切换引擎请先 delete 整个文档再重传。 | +| 旧 chunks 清理 | 读 `doc_status.chunks_list`,从 `chunks_vdb` 与 `text_chunks` 全部 delete。理由:流水线产物中无法可靠区分"普通文本块 vs 多模态附加块",按 chunk id 一律重新生成最简单也最可靠 | +| 旧实体 / 关系清理 | 复用 `adelete_by_doc_id` 内部清理逻辑(抽出为 `_purge_doc_chunks_and_kg(doc_id)` helper),删除 `entity_chunks` / `relation_chunks` 中以这些 chunk id 为 source 的条目,并把图谱里因之失去全部源的孤立节点一并删除 | +| `analyze_multimodal` | **不再看 `meta.analyze_time`**:按新 `process_options.{i,t,e}` 与 sidecar 中各 item 的 `llm_analyze_result` 取交集做增量分析(已分析的 item 跳过,新启用的模态从空状态开始分析)。`analyze_time` 改为"最近一次成功分析时间"语义,仅供观测 | +| 重新分块 | 按新 `process_options.chunking` 重跑(interchange path 用 native heading-driven,legacy path 用 fixed) | +| 实体抽取 / KG-skip | 按新 `process_options.skip_kg` 决定 | + +> 这条规则保证:用户改 `i/t/e` 重传同名文档(先删旧 doc 再上传带新 hint 的文件)时,多模态分析能增量补齐;改 `F`/`R`/`S` 时 chunks 与图谱重建;改 `!` 时停掉或恢复 KG 构建。引擎变更被视为"重大变更",统一由 delete + 重传完成,不在续跑路径里隐式发生。 + +### 与"文档重复判定规则"的关系 + +续跑规则只对 `doc_id` 已经存在于 `doc_status` 的文档生效。新文件入队仍然走"并发与重入约束"中的严格名字预检 + "文档重复判定规则"中的 `canonical_basename` / `content_hash` 查重;续跑分支不会被用来"新文件挤掉旧 PROCESSED 记录"。 diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index d1ba67b7b9..d6f71d4d51 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -148,12 +148,15 @@ class 
ScanResponse(BaseModel): """Response model for document scanning operation Attributes: - status: Status of the scanning operation + status: Status of the scanning operation. ``scanning_started`` when + a new background scan has been scheduled; + ``scanning_skipped_pipeline_busy`` when the request was rejected + because indexing or another scan is already running. message: Optional message with additional details track_id: Tracking ID for monitoring scanning progress """ - status: Literal["scanning_started"] = Field( + status: Literal["scanning_started", "scanning_skipped_pipeline_busy"] = Field( description="Status of the scanning operation" ) message: Optional[str] = Field( @@ -310,12 +313,15 @@ class InsertResponse(BaseModel): """Response model for document insertion operations Attributes: - status: Status of the operation (success, duplicated, partial_success, failure) + status: Status of the operation (success, partial_success, failure). + Same-name conflicts are rejected with HTTP 409 rather than being + reported as a "duplicated" 200 response, so this field never + takes that value any more. message: Detailed message describing the operation result track_id: Tracking ID for monitoring processing status """ - status: Literal["success", "duplicated", "partial_success", "failure"] = Field( + status: Literal["success", "partial_success", "failure"] = Field( description="Status of the operation" ) message: str = Field(description="Message describing the operation result") @@ -971,6 +977,55 @@ async def get_existing_doc_by_file_path_candidates( return existing_doc_data +async def _ensure_pipeline_idle(rag: LightRAG) -> None: + """Reject upload/insert/scan calls while indexing or scanning is running. + + The concurrency contract states that no write to ``doc_status`` / + ``full_docs`` may interleave with an in-flight pipeline run. Endpoints + that would otherwise persist new state must call this guard first and + surface a 409 to clients. + + A workspace whose ``pipeline_status`` has never been initialised (via + ``initialize_pipeline_status``) is treated as idle: production code + always runs ``rag.initialize_storages`` during startup, but mocked + test rigs may skip that step. + + Raises: + HTTPException(409): when ``pipeline_status['busy']`` or + ``pipeline_status['scanning']`` is set. + """ + from lightrag.exceptions import PipelineNotInitializedError + from lightrag.kg.shared_storage import get_namespace_data, get_namespace_lock + + try: + pipeline_status = await get_namespace_data( + "pipeline_status", workspace=rag.workspace + ) + except PipelineNotInitializedError: + # Workspace pipeline_status not yet bootstrapped → treat as idle. + return + pipeline_status_lock = get_namespace_lock( + "pipeline_status", workspace=rag.workspace + ) + async with pipeline_status_lock: + if pipeline_status.get("busy"): + raise HTTPException( + status_code=409, + detail=( + "Pipeline is currently busy processing documents. " + "Wait for the running job to finish before submitting new work." + ), + ) + if pipeline_status.get("scanning"): + raise HTTPException( + status_code=409, + detail=( + "Document scan is in progress. " + "Wait for the scan to complete before submitting new work." 
+ ), + ) + + def find_existing_file_by_canonical_basename( input_dir: Path, canonical_basename: str ) -> Path | None: @@ -1375,7 +1430,6 @@ async def pipeline_enqueue_file( rag: LightRAG, file_path: Path, track_id: str = None, - reprocess_existing_non_processed: bool = False, ) -> tuple[bool, str]: """Add a file to the queue for processing @@ -1414,8 +1468,6 @@ async def pipeline_enqueue_file( } if process_options: enqueue_kwargs["process_options"] = process_options - if reprocess_existing_non_processed: - enqueue_kwargs["reprocess_existing_non_processed"] = True enqueue_result = await rag.apipeline_enqueue_documents( "", **enqueue_kwargs ) @@ -1729,8 +1781,6 @@ async def pipeline_enqueue_file( } if process_options: enqueue_kwargs["process_options"] = process_options - if reprocess_existing_non_processed: - enqueue_kwargs["reprocess_existing_non_processed"] = True enqueue_result = await rag.apipeline_enqueue_documents( content, **enqueue_kwargs ) @@ -1832,7 +1882,6 @@ async def pipeline_index_files( rag: LightRAG, file_paths: List[Path], track_id: str = None, - reprocess_existing_non_processed: bool = False, ): """Index multiple files sequentially to avoid high CPU load @@ -1857,7 +1906,6 @@ async def pipeline_index_files( rag, file_path, track_id, - reprocess_existing_non_processed=reprocess_existing_non_processed, ) if success: enqueued = True @@ -1912,6 +1960,26 @@ async def run_scanning_process( doc_manager: DocumentManager instance track_id: Optional tracking ID to pass to all scanned files """ + # The scan endpoint set ``pipeline_status['scanning']=True`` synchronously + # before scheduling this task; we MUST clear it in finally so subsequent + # uploads / scans can proceed even if the body raises. When pipeline_status + # is not initialised (mocked test rigs), the flag was never set so there's + # nothing to clear — track that here to skip the namespace fetch. + from lightrag.exceptions import PipelineNotInitializedError + from lightrag.kg.shared_storage import get_namespace_data, get_namespace_lock + + pipeline_status = None + pipeline_status_lock = None + try: + pipeline_status = await get_namespace_data( + "pipeline_status", workspace=rag.workspace + ) + pipeline_status_lock = get_namespace_lock( + "pipeline_status", workspace=rag.workspace + ) + except PipelineNotInitializedError: + pass + try: new_files = doc_manager.scan_directory_for_new_files() total_files = len(new_files) @@ -1989,13 +2057,17 @@ async def run_scanning_process( # File is new or in non-PROCESSED status, add to processing list valid_files.append(file_path) - # Process valid files (new files + non-PROCESSED status files) + # Process valid files (new files + non-PROCESSED status files). + # The previous reprocess_existing_non_processed=True flag has been + # removed: scan no longer overwrites in-flight or half-finished + # records. Recovery of non-PROCESSED documents is handled by the + # pipeline's resume logic when apipeline_process_enqueue_documents + # picks them up. if valid_files: await pipeline_index_files( rag, valid_files, track_id, - reprocess_existing_non_processed=True, ) if processed_files: logger.info( @@ -2019,6 +2091,13 @@ async def run_scanning_process( except Exception as e: logger.error(f"Error during scanning process: {str(e)}") logger.error(traceback.format_exc()) + finally: + # Always release the scanning flag so future uploads / scans are + # not blocked by a crashed task. Skip when pipeline_status was + # never initialised for this workspace (test rigs). 
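+        # (``busy`` is deliberately left alone here: the scan only owns the
+        # ``scanning`` flag it acquired in the endpoint.)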
+ if pipeline_status is not None and pipeline_status_lock is not None: + async with pipeline_status_lock: + pipeline_status["scanning"] = False async def background_delete_documents( @@ -2235,17 +2314,72 @@ async def scan_for_new_documents(background_tasks: BackgroundTasks): """ Trigger the scanning process for new documents. - This endpoint initiates a background task that scans the input directory for new documents - and processes them. If a scanning process is already running, it returns a status indicating - that fact. + Refuses to start a new scan when ``pipeline_status['busy']`` (indexing + or deletion in flight) or ``pipeline_status['scanning']`` (another + scan already running) is set; in that case returns + ``status='scanning_skipped_pipeline_busy'`` immediately and does not + schedule a background task. The ``scanning`` flag is acquired + synchronously here so a subsequent fast-follow request hits the + guard rather than racing against the not-yet-started task. Returns: ScanResponse: A response object containing the scanning status and track_id """ + from lightrag.exceptions import PipelineNotInitializedError + from lightrag.kg.shared_storage import get_namespace_data, get_namespace_lock + # Generate track_id with "scan" prefix for scanning operation track_id = generate_track_id("scan") - # Start the scanning process in the background with track_id + try: + pipeline_status = await get_namespace_data( + "pipeline_status", workspace=rag.workspace + ) + except PipelineNotInitializedError: + # Workspace pipeline_status not yet bootstrapped (e.g. mocked + # test rigs). Treat as idle and allow the scan to proceed; the + # scanning flag has nowhere to live so it is effectively skipped. + background_tasks.add_task(run_scanning_process, rag, doc_manager, track_id) + return ScanResponse( + status="scanning_started", + message="Scanning process has been initiated in the background", + track_id=track_id, + ) + pipeline_status_lock = get_namespace_lock( + "pipeline_status", workspace=rag.workspace + ) + + # Atomically acquire the scanning flag: if the pipeline is busy or + # another scan is in flight, refuse without scheduling. + async with pipeline_status_lock: + if pipeline_status.get("busy"): + logger.warning( + "Scan request skipped: pipeline is busy processing documents" + ) + return ScanResponse( + status="scanning_skipped_pipeline_busy", + message=( + "Pipeline is currently busy processing documents. " + "Wait for the running job to finish before triggering another scan." + ), + track_id=track_id, + ) + if pipeline_status.get("scanning"): + logger.warning( + "Scan request skipped: another scan is already in progress" + ) + return ScanResponse( + status="scanning_skipped_pipeline_busy", + message=( + "Another scan is already in progress. " + "Wait for it to finish before triggering a new one." + ), + track_id=track_id, + ) + pipeline_status["scanning"] = True + + # Start the scanning process in the background with track_id. The + # task is responsible for clearing the flag in its finally block. background_tasks.add_task(run_scanning_process, rag, doc_manager, track_id) return ScanResponse( status="scanning_started", @@ -2276,13 +2410,17 @@ async def upload_to_input_dir( This endpoint handles two types of duplicate scenarios differently: 1. 
**Filename Duplicate (Synchronous Detection)**: - - Detected immediately before file processing - - File name is treated as the unique document key; an existing - document storage row rejects the upload regardless of status - - Returns `status="duplicated"` with the existing document's track_id - - Two cases: - - If filename exists in document storage: returns existing track_id - - If filename exists in file system only: returns empty track_id ("") + - Detected immediately, before any file is written. + - File name is treated as the unique document key. Both + ``doc_status`` and the INPUT directory are checked under the + canonical (parser-hint stripped) basename so ``abc.docx`` and + ``abc.[native].docx`` map to the same record. + - **HTTP 409** is returned when a same-name record already exists. + The response detail names the conflict source ("Document + storage already contains ..." or "Input directory already + contains ..."). Clients must delete the existing document + (``DELETE /documents/{doc_id}``) before re-uploading; there is + no longer a 200 ``status="duplicated"`` soft-fail response. 2. **Content Duplicate (Asynchronous Detection)**: - Detected during background processing after content extraction @@ -2300,6 +2438,12 @@ async def upload_to_input_dir( - Content extraction is expensive (PDF/DOCX parsing), done asynchronously - This design prevents blocking the client during expensive operations + **Concurrency Constraint:** + - The endpoint refuses with HTTP 409 while + ``pipeline_status['busy']`` (indexing in flight) or + ``pipeline_status['scanning']`` (a scan is running) is set. + Wait for the running job to finish before re-submitting. + Args: background_tasks: FastAPI BackgroundTasks for async processing file (UploadFile): The file to be uploaded. It must have an allowed extension. @@ -2307,12 +2451,17 @@ async def upload_to_input_dir( Returns: InsertResponse: A response object containing the upload status and a message. - status="success": File accepted and queued for processing - - status="duplicated": Filename already exists (see track_id for existing document) Raises: - HTTPException: If the file type is not supported (400), file too large (413), or other errors occur (500). + HTTPException: 400 unsupported file type, 409 same-name conflict + or pipeline busy/scanning, 413 file too large, 500 other errors. """ try: + # Reject upload while pipeline is busy or scanning. The strict + # name pre-check below would otherwise race with in-flight writes + # to doc_status, leading to inconsistent state. + await _ensure_pipeline_idle(rag) + # Sanitize filename to prevent Path Traversal attacks safe_filename = sanitize_filename(file.filename, doc_manager.input_dir) @@ -2345,22 +2494,25 @@ async def upload_to_input_dir( file_path = doc_manager.input_dir / safe_filename - # Check if filename already exists in doc_status storage + # Strict name pre-check. Both the INPUT directory and doc_status + # must be free of any same-canonical-basename record before we + # accept the upload. Replacing an existing document requires an + # explicit DELETE first; we no longer write a "duplicated" 200 + # response that silently no-ops. 
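+        # (Both checks operate on the canonical basename, so a hinted upload
+        # such as ``abc.[native].docx`` collides with a stored ``abc.docx``.)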
existing_doc_data = await get_existing_doc_by_file_path_candidates( rag.doc_status, file_path ) if existing_doc_data: - # Get document status and track_id from existing document status = get_doc_status_value(existing_doc_data) or "unknown" - # Use `or ""` to handle both missing key and None value (e.g., legacy rows without track_id) - existing_track_id = get_doc_track_id(existing_doc_data) - return InsertResponse( - status="duplicated", - message=f"File '{safe_filename}' already exists in document storage (Status: {status}).", - track_id=existing_track_id, + raise HTTPException( + status_code=409, + detail=( + f"Document storage already contains '{safe_filename}' " + f"(Status: {status}). Delete the existing record before re-uploading." + ), ) - # Check if file already exists in file system, using canonical parser-hint names. + # INPUT directory check, using canonical parser-hint names. # Fast path: exact filename match avoids iterdir on large input directories. canonical_filename = normalize_file_path(safe_filename) if file_path.exists(): @@ -2370,10 +2522,13 @@ async def upload_to_input_dir( doc_manager.input_dir, canonical_filename ) if existing_input_file: - return InsertResponse( - status="duplicated", - message=f"File '{safe_filename}' already exists in the input directory.", - track_id="", + raise HTTPException( + status_code=409, + detail=( + f"Input directory already contains a file with the same " + f"canonical basename ('{existing_input_file.name}'). " + f"Remove or rename it before re-uploading." + ), ) # Async streaming write with size check @@ -2457,6 +2612,9 @@ async def insert_text( HTTPException: If an error occurs during text processing (500). """ try: + # Reject text insertion while pipeline is busy or scanning. + await _ensure_pipeline_idle(rag) + # Check if file_source already exists in doc_status storage if not is_valid_file_source(request.file_source): raise HTTPException( @@ -2469,14 +2627,13 @@ async def insert_text( rag.doc_status, normalized_file_source ) if existing_doc_data: - # Get document status and track_id from existing document status = get_doc_status_value(existing_doc_data) or "unknown" - # Use `or ""` to handle both missing key and None value (e.g., legacy rows without track_id) - existing_track_id = get_doc_track_id(existing_doc_data) - return InsertResponse( - status="duplicated", - message=f"File source '{normalized_file_source}' already exists in document storage (Status: {status}).", - track_id=existing_track_id, + raise HTTPException( + status_code=409, + detail=( + f"Document storage already contains '{normalized_file_source}' " + f"(Status: {status}). Delete the existing record before re-inserting." + ), ) # Generate track_id for text insertion @@ -2527,6 +2684,9 @@ async def insert_texts( HTTPException: If an error occurs during text processing (500). """ try: + # Reject batch text insertion while pipeline is busy or scanning. 
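+        # (``_ensure_pipeline_idle`` raises HTTPException(409) before any of
+        # the per-source duplicate checks below can run.)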
+ await _ensure_pipeline_idle(rag) + # Check if any file_sources already exist in doc_status storage if not request.file_sources or len(request.file_sources) != len( request.texts @@ -2558,14 +2718,13 @@ async def insert_texts( rag.doc_status, file_source ) if existing_doc_data: - # Get document status and track_id from existing document status = get_doc_status_value(existing_doc_data) or "unknown" - # Use `or ""` to handle both missing key and None value (e.g., legacy rows without track_id) - existing_track_id = get_doc_track_id(existing_doc_data) - return InsertResponse( - status="duplicated", - message=f"File source '{file_source}' already exists in document storage (Status: {status}).", - track_id=existing_track_id, + raise HTTPException( + status_code=409, + detail=( + f"Document storage already contains '{file_source}' " + f"(Status: {status}). Delete the existing record before re-inserting." + ), ) # Generate track_id for texts insertion diff --git a/lightrag/kg/shared_storage.py b/lightrag/kg/shared_storage.py index 6da563088e..23808b4291 100644 --- a/lightrag/kg/shared_storage.py +++ b/lightrag/kg/shared_storage.py @@ -1289,6 +1289,7 @@ async def initialize_pipeline_status(workspace: str | None = None): { "autoscanned": False, # Auto-scan started "busy": False, # Control concurrent processes + "scanning": False, # /documents/scan in progress (independent of busy) "job_name": "-", # Current job name (indexing files/indexing texts) "job_start": None, # Job start time "docs": 0, # Total number of documents to be indexed diff --git a/lightrag/pipeline.py b/lightrag/pipeline.py index ec28526ff9..bf570d178a 100644 --- a/lightrag/pipeline.py +++ b/lightrag/pipeline.py @@ -65,7 +65,6 @@ compute_file_content_hash, compute_text_content_hash, doc_status_field, - doc_status_value, document_canonical_key, document_source_key, get_by_path, @@ -103,7 +102,6 @@ async def apipeline_enqueue_documents( lightrag_document_paths: str | list[str] | None = None, parsed_engine: str | list[str] | None = None, process_options: str | list[str] | None = None, - reprocess_existing_non_processed: bool = False, ) -> str: """ Pipeline for Processing Documents @@ -125,12 +123,35 @@ async def apipeline_enqueue_documents( accepted as a single string broadcast to every input or as a list aligned with ``input``. Stored verbatim on ``full_docs`` and mirrored to ``doc_status.metadata['process_options']``. - reprocess_existing_non_processed: allow scan retries to overwrite - existing same-name records that have not reached PROCESSED. Returns: str: tracking ID for monitoring processing status + + Raises: + RuntimeError: if the pipeline is already busy or a scan is in + progress. Per the concurrency contract, all writes to + ``doc_status`` / ``full_docs`` must wait for the in-flight + indexing or scanning to finish. Callers from HTTP endpoints + should pre-check ``pipeline_status['busy']`` / + ``pipeline_status['scanning']`` and surface a 409 to clients + long before reaching this last-line guard. """ + # Pipeline-busy guard: refuse to mutate doc_status / full_docs while + # any indexing or scanning job is running. This is the last line of + # defence — endpoints should fail fast with 409 before getting here. 
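+        # (Both flags are read under the namespace lock below, so the check
+        # cannot interleave with an endpoint flipping them concurrently.)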
+ pipeline_status = await get_namespace_data( + "pipeline_status", workspace=self.workspace + ) + pipeline_status_lock = get_namespace_lock( + "pipeline_status", workspace=self.workspace + ) + async with pipeline_status_lock: + if pipeline_status.get("busy") or pipeline_status.get("scanning"): + raise RuntimeError( + "Cannot enqueue while pipeline is busy or scanning; " + "wait for the running job to finish before retrying." + ) + # Generate track_id if not provided if track_id is None or track_id.strip() == "": track_id = generate_track_id("enqueue") @@ -382,18 +403,13 @@ def _initial_doc_status(content_data: dict[str, Any]) -> dict[str, Any]: # 3. Filter out already processed documents # Get docs ids all_new_doc_ids = set(new_docs.keys()) - # Exclude IDs of documents that are already enqueued + # Exclude IDs of documents that are already enqueued. The previous + # ``reprocess_existing_non_processed`` flag has been removed: any + # same-name record (regardless of status) is treated as a duplicate + # here. Recovering half-processed documents is now the job of the + # pipeline's resume logic, which runs in apipeline_process_enqueue_documents + # rather than this enqueue path. unique_new_doc_ids = await self.doc_status.filter_keys(all_new_doc_ids) - reprocess_doc_ids: set[str] = set() - if reprocess_existing_non_processed: - for doc_id in all_new_doc_ids - unique_new_doc_ids: - existing_doc = await self.doc_status.get_by_id(doc_id) - if ( - existing_doc - and doc_status_value(existing_doc) != DocStatus.PROCESSED.value - ): - unique_new_doc_ids.add(doc_id) - reprocess_doc_ids.add(doc_id) for doc_id in list(unique_new_doc_ids): content_data = contents[doc_id] @@ -404,18 +420,6 @@ def _initial_doc_status(content_data: dict[str, Any]) -> dict[str, Any]: ) if match: existing_doc_id, existing_doc = match - if ( - reprocess_existing_non_processed - and doc_status_value(existing_doc) != DocStatus.PROCESSED.value - ): - reprocess_doc_ids.add(doc_id) - if existing_doc_id != doc_id: - await self.doc_status.delete([existing_doc_id]) - try: - await self.full_docs.delete([existing_doc_id]) - except Exception: - pass - continue unique_new_doc_ids.discard(doc_id) duplicate_attempts.append( { @@ -445,8 +449,6 @@ def _initial_doc_status(content_data: dict[str, Any]) -> dict[str, Any]: ) if hash_match: existing_doc_id, existing_doc = hash_match - if existing_doc_id == doc_id and doc_id in reprocess_doc_ids: - continue unique_new_doc_ids.discard(doc_id) duplicate_attempts.append( { @@ -471,8 +473,6 @@ def _initial_doc_status(content_data: dict[str, Any]) -> dict[str, Any]: for doc_id in ignored_ids: if any(attempt.get("doc_id") == doc_id for attempt in duplicate_attempts): continue - if doc_id in reprocess_doc_ids: - continue existing_doc = await self.doc_status.get_by_id(doc_id) duplicate_attempts.append( { @@ -1861,7 +1861,7 @@ async def analyze_multimodal( *, process_options: str | None = None, ) -> dict[str, Any]: - """Phase 2: Multimodal analysis (VLM). Writes llm_analyze_result and analyze_time to LightRAG Document. + """Phase 2: Multimodal analysis (VLM). Writes llm_analyze_result to LightRAG Document. Per-document ``i`` / ``t`` / ``e`` flags from ``full_docs.process_options`` decide which modalities are sent to the @@ -1869,6 +1869,14 @@ async def analyze_multimodal( flags so toggling options later does not require re-parsing — only the ``llm_analyze_result`` payload is gated here. 
+ Idempotent by design: ``meta.analyze_time`` is treated as the + timestamp of the most recent successful pass rather than a + "completed" sentinel, and per-item ``llm_analyze_result`` already + present is not re-computed. This lets users incrementally enable + new modalities (e.g. add ``t`` after a prior ``i``-only pass) and + re-trigger analysis without redundant VLM calls or losing prior + results. + Args: process_options: Optional override that bypasses the ``full_docs.process_options`` lookup; primarily used by unit @@ -1942,13 +1950,12 @@ async def analyze_multimodal( meta = json.loads(lines[0]) if not isinstance(meta, dict) or meta.get("type") != "meta": return parsed_data - if meta.get("analyze_time"): - return parsed_data + # ``analyze_time`` is now the "most recent successful pass" + # timestamp. We refresh it after the body finishes successfully + # rather than using it as an early-return gate, so re-triggering + # analyze_multimodal with newly-enabled i/t/e options proceeds. now_iso = datetime.now(timezone.utc).isoformat() - meta["analyze_time"] = now_iso - lines[0] = json.dumps(meta, ensure_ascii=False) - block_file.write_text("\n".join(lines) + "\n", encoding="utf-8") # Analyze sidecar multimodal items by VLM model role. use_vlm_func = self.role_llm_funcs["vlm"] @@ -2216,12 +2223,26 @@ def _normalize_grounded_value(value: Any) -> Any: if isinstance(items, dict): analyze_tasks = [] valid_keys = [] + skipped_existing = 0 for item_id, item in items.items(): - if isinstance(item, dict): - valid_keys.append(item_id) - analyze_tasks.append( - _analyze_item(root_key, item_id, item) - ) + if not isinstance(item, dict): + continue + # Idempotency: skip items that already have a VLM + # result from a prior pass. A user re-enabling + # additional modalities should not re-spend tokens + # on items that were already analyzed. + if isinstance(item.get("llm_analyze_result"), dict): + skipped_existing += 1 + continue + valid_keys.append(item_id) + analyze_tasks.append(_analyze_item(root_key, item_id, item)) + if skipped_existing: + logger.debug( + f"[analyze_multimodal] {root_key}: " + f"{skipped_existing} item(s) already have " + f"llm_analyze_result, skipping; " + f"{len(analyze_tasks)} item(s) to analyze" + ) analyzed_results = await asyncio.gather( *analyze_tasks, return_exceptions=True ) @@ -2245,6 +2266,14 @@ def _normalize_grounded_value(value: Any) -> Any: f"[analyze_multimodal] failed to write sidecar {sidecar_path}: {sidecar_error}" ) + # Refresh ``meta.analyze_time`` to record the most-recent successful + # pass. This happens after the sidecar loop so a crash mid-loop + # does not falsely advertise completion; on the next run the same + # already-analyzed items will be skipped anyway. 
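+        # (Only line 0 of the blocks file changes; every other line is
+        # rewritten verbatim from the in-memory copy.)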
+ meta["analyze_time"] = now_iso + lines[0] = json.dumps(meta, ensure_ascii=False) + block_file.write_text("\n".join(lines) + "\n", encoding="utf-8") + parsed_data["analyze_time"] = now_iso parsed_data["multimodal_processed"] = True logger.info( diff --git a/lightrag_webui/src/api/lightrag.ts b/lightrag_webui/src/api/lightrag.ts index 947bafcc4c..0f7dbd6a68 100644 --- a/lightrag_webui/src/api/lightrag.ts +++ b/lightrag_webui/src/api/lightrag.ts @@ -201,13 +201,13 @@ export type EntityUpdateResponse = { } export type DocActionResponse = { - status: 'success' | 'partial_success' | 'failure' | 'duplicated' + status: 'success' | 'partial_success' | 'failure' message: string track_id?: string } export type ScanResponse = { - status: 'scanning_started' + status: 'scanning_started' | 'scanning_skipped_pipeline_busy' message: string track_id: string } diff --git a/lightrag_webui/src/components/documents/UploadDocumentsDialog.tsx b/lightrag_webui/src/components/documents/UploadDocumentsDialog.tsx index 16e21e7d0c..2fdd0fdc2b 100644 --- a/lightrag_webui/src/components/documents/UploadDocumentsDialog.tsx +++ b/lightrag_webui/src/components/documents/UploadDocumentsDialog.tsx @@ -103,13 +103,7 @@ export default function UploadDocumentsDialog({ onDocumentsUploaded }: UploadDoc })) }) - if (result.status === 'duplicated') { - uploadErrors[file.name] = t('documentPanel.uploadDocuments.fileUploader.duplicateFile') - setFileErrors(prev => ({ - ...prev, - [file.name]: t('documentPanel.uploadDocuments.fileUploader.duplicateFile') - })) - } else if (result.status !== 'success') { + if (result.status !== 'success') { uploadErrors[file.name] = result.message setFileErrors(prev => ({ ...prev, @@ -124,13 +118,30 @@ export default function UploadDocumentsDialog({ onDocumentsUploaded }: UploadDoc // Handle HTTP errors, including 400 errors let errorMsg = errorMessage(err) + const duplicateFileMsg = t('documentPanel.uploadDocuments.fileUploader.duplicateFile') // If it's an axios error with response data, try to extract more detailed error info if (err && typeof err === 'object' && 'response' in err) { const axiosError = err as { response?: { status: number, data?: { detail?: string } } } - if (axiosError.response?.status === 400) { - // Extract specific error message from backend response - errorMsg = axiosError.response.data?.detail || errorMsg + const status = axiosError.response?.status + const detail = axiosError.response?.data?.detail + if (status === 409) { + // Server now rejects same-name uploads with HTTP 409 instead of + // returning a 200 ``status="duplicated"`` payload. Map the most + // common cases (existing record / file in INPUT dir) back to the + // dedicated "duplicate file" UI affordance, and surface other + // 409 reasons (pipeline busy / scanning) verbatim from the + // server detail so users can tell why they were rejected. 
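+          // (The regexes cover both backend 409 details: "Document storage
+          // already contains ..." and "Input directory already contains ...".)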
+ if ( + typeof detail === 'string' && + (/already contains/i.test(detail) || /Status:/i.test(detail)) + ) { + errorMsg = duplicateFileMsg + } else { + errorMsg = detail || errorMsg + } + } else if (status === 400) { + errorMsg = detail || errorMsg } // Set progress to 100% to display error message diff --git a/tests/test_document_routes_docx_archive.py b/tests/test_document_routes_docx_archive.py index 34752eb8e2..3cc1026d69 100644 --- a/tests/test_document_routes_docx_archive.py +++ b/tests/test_document_routes_docx_archive.py @@ -38,6 +38,33 @@ pytestmark = pytest.mark.offline +@pytest.fixture(autouse=True) +def _ensure_shared_storage_initialized(): + """Initialize the shared_storage module-level dicts before each test. + + The scan endpoint and pipeline-busy guards introduced for the + reentrancy / resume work read ``pipeline_status`` via + ``get_namespace_data``, which raises if shared dicts have never been + initialized. Tests using mocked ``LightRAG`` instances don't run + ``initialize_storages``, so we set up the shared store here and reset + pipeline_status state per-test to avoid leakage. + """ + import importlib + + shared_storage = importlib.import_module("lightrag.kg.shared_storage") + shared_storage.initialize_share_data() + yield + # Reset pipeline_status to a clean state so subsequent tests don't + # inherit ``busy`` / ``scanning`` flags set by prior runs. + if shared_storage._shared_dicts is not None: + for key in list(shared_storage._shared_dicts.keys()): + if key.endswith("pipeline_status") or key == "pipeline_status": + ns = shared_storage._shared_dicts[key] + if isinstance(ns, dict): + ns["busy"] = False + ns["scanning"] = False + + class _FakeDocStatus: def __init__(self): self.docs = {} @@ -62,7 +89,6 @@ async def apipeline_enqueue_documents( docs_format=None, parsed_engine=None, process_options=None, - reprocess_existing_non_processed=False, ): item = { "input": input, @@ -72,8 +98,6 @@ async def apipeline_enqueue_documents( "parsed_engine": parsed_engine, "process_options": process_options, } - if reprocess_existing_non_processed: - item["reprocess_existing_non_processed"] = True self.enqueued.append(item) return track_id @@ -126,6 +150,7 @@ async def apipeline_process_enqueue_documents(self): class _DuplicateUploadRag: def __init__(self, docs_by_path): self.doc_status = _ScanDocStatus(docs_by_path) + self.workspace = f"upload-test-{uuid4().hex}" class _DeleteRag: @@ -501,7 +526,9 @@ async def capture_pipeline(rag_arg, file_paths, track_id, **kwargs): # the plain variant is the one that gets archived. assert len(calls) == 1 assert calls[0]["file_paths"] == [hinted_file] - assert calls[0]["kwargs"] == {"reprocess_existing_non_processed": True} + # The reprocess_existing_non_processed flag was removed; pipeline_index_files + # is now invoked without any extra kwargs. + assert calls[0]["kwargs"] == {} archived_names = { path.name for path in (tmp_path / PARSED_DIR_NAME).iterdir() if path.is_file() } @@ -539,12 +566,16 @@ async def capture_pipeline(rag_arg, file_paths, track_id, **kwargs): await run_scanning_process(rag, doc_manager, "track-scan") + # Non-PROCESSED records are still passed through to pipeline_index_files; + # the now-removed reprocess_existing_non_processed flag is no longer set. + # Recovery of half-finished documents is performed by the pipeline's + # resume logic, not by re-enqueueing them here. 
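+    # (The empty kwargs dict below pins the removal, so a regression that
+    # reintroduces the flag fails loudly.)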
assert calls == [ { "rag": rag, "file_paths": [file_path], "track_id": "track-scan", - "kwargs": {"reprocess_existing_non_processed": True}, + "kwargs": {}, } ] assert file_path.exists() @@ -580,11 +611,14 @@ async def test_upload_rejects_same_name_failed_doc_status_without_full_docs( file=BytesIO(b"replacement docx bytes"), ) - response = await upload_endpoint(_document_routes.BackgroundTasks(), upload_file) - - assert response.status == "duplicated" - assert response.track_id == "track-failed" - assert "Status: failed" in response.message + # Strict name pre-check: same-canonical record in doc_status now raises 409 + # rather than returning a "duplicated" 200 response. Clients must delete + # the existing record before re-uploading. + with pytest.raises(_document_routes.HTTPException) as excinfo: + await upload_endpoint(_document_routes.BackgroundTasks(), upload_file) + assert excinfo.value.status_code == 409 + assert "failed.docx" in excinfo.value.detail + assert "Status: failed" in excinfo.value.detail assert not (tmp_path / "failed.docx").exists() @@ -606,13 +640,188 @@ async def test_upload_rejects_parser_hinted_filesystem_duplicate(tmp_path, monke file=BytesIO(b"replacement docx bytes"), ) - response = await upload_endpoint(_document_routes.BackgroundTasks(), upload_file) - - assert response.status == "duplicated" - assert response.track_id == "" + # Strict name pre-check: an INPUT directory file with the same canonical + # basename now blocks the upload with 409. + with pytest.raises(_document_routes.HTTPException) as excinfo: + await upload_endpoint(_document_routes.BackgroundTasks(), upload_file) + assert excinfo.value.status_code == 409 + assert "existing.docx" in excinfo.value.detail assert not (tmp_path / "existing.[native].docx").exists() +async def test_upload_returns_409_when_pipeline_busy(tmp_path, monkeypatch): + """Upload must refuse with 409 while ``pipeline_status['busy']`` is set, + independent of any name dedup. The strict name pre-check happens AFTER + the busy guard, so the 409 detail is about the pipeline, not the file. 
+ """ + import importlib + + monkeypatch.setattr( + _document_routes, "global_args", SimpleNamespace(max_upload_size=None) + ) + doc_manager = DocumentManager(str(tmp_path)) + rag = _DuplicateUploadRag({}) + + shared_storage = importlib.import_module("lightrag.kg.shared_storage") + await shared_storage.initialize_pipeline_status(workspace=rag.workspace) + pipeline_status = await shared_storage.get_namespace_data( + "pipeline_status", workspace=rag.workspace + ) + pipeline_status["busy"] = True + + router = create_document_routes(rag, doc_manager) + upload_endpoint = [ + route.endpoint + for route in router.routes + if getattr(route, "name", "") == "upload_to_input_dir" + ][-1] + upload_file = _document_routes.UploadFile( + filename="while_busy.docx", + file=BytesIO(b"docx bytes"), + ) + + with pytest.raises(_document_routes.HTTPException) as excinfo: + await upload_endpoint(_document_routes.BackgroundTasks(), upload_file) + assert excinfo.value.status_code == 409 + assert "busy" in excinfo.value.detail.lower() + assert not (tmp_path / "while_busy.docx").exists() + + +async def test_upload_returns_409_when_scanning(tmp_path, monkeypatch): + """Upload must refuse with 409 when a scan is in progress.""" + import importlib + + monkeypatch.setattr( + _document_routes, "global_args", SimpleNamespace(max_upload_size=None) + ) + doc_manager = DocumentManager(str(tmp_path)) + rag = _DuplicateUploadRag({}) + + shared_storage = importlib.import_module("lightrag.kg.shared_storage") + await shared_storage.initialize_pipeline_status(workspace=rag.workspace) + pipeline_status = await shared_storage.get_namespace_data( + "pipeline_status", workspace=rag.workspace + ) + pipeline_status["scanning"] = True + + router = create_document_routes(rag, doc_manager) + upload_endpoint = [ + route.endpoint + for route in router.routes + if getattr(route, "name", "") == "upload_to_input_dir" + ][-1] + upload_file = _document_routes.UploadFile( + filename="while_scanning.docx", + file=BytesIO(b"docx bytes"), + ) + + with pytest.raises(_document_routes.HTTPException) as excinfo: + await upload_endpoint(_document_routes.BackgroundTasks(), upload_file) + assert excinfo.value.status_code == 409 + assert "scan" in excinfo.value.detail.lower() + assert not (tmp_path / "while_scanning.docx").exists() + + +async def test_scan_endpoint_returns_skipped_when_pipeline_busy(tmp_path): + """Scan endpoint must return ``scanning_skipped_pipeline_busy`` and NOT + schedule a background task while the pipeline is busy.""" + import importlib + + doc_manager = DocumentManager(str(tmp_path)) + rag = _ScanRag({}) + + shared_storage = importlib.import_module("lightrag.kg.shared_storage") + await shared_storage.initialize_pipeline_status(workspace=rag.workspace) + pipeline_status = await shared_storage.get_namespace_data( + "pipeline_status", workspace=rag.workspace + ) + pipeline_status["busy"] = True + + router = create_document_routes(rag, doc_manager) + scan_endpoint = [ + route.endpoint + for route in router.routes + if getattr(route, "name", "") == "scan_for_new_documents" + ][-1] + + bg = _document_routes.BackgroundTasks() + response = await scan_endpoint(bg) + + assert response.status == "scanning_skipped_pipeline_busy" + # No background task should have been scheduled. + assert len(bg.tasks) == 0 + # And ``scanning`` is left unchanged at False (we didn't acquire it). 
+ assert pipeline_status.get("scanning") is False + + +async def test_scan_endpoint_returns_skipped_when_already_scanning(tmp_path): + """Scan endpoint must reject overlapping scans by checking the + ``scanning`` flag, not just ``busy``.""" + import importlib + + doc_manager = DocumentManager(str(tmp_path)) + rag = _ScanRag({}) + + shared_storage = importlib.import_module("lightrag.kg.shared_storage") + await shared_storage.initialize_pipeline_status(workspace=rag.workspace) + pipeline_status = await shared_storage.get_namespace_data( + "pipeline_status", workspace=rag.workspace + ) + pipeline_status["scanning"] = True + + router = create_document_routes(rag, doc_manager) + scan_endpoint = [ + route.endpoint + for route in router.routes + if getattr(route, "name", "") == "scan_for_new_documents" + ][-1] + + bg = _document_routes.BackgroundTasks() + response = await scan_endpoint(bg) + + assert response.status == "scanning_skipped_pipeline_busy" + assert len(bg.tasks) == 0 + + +async def test_scan_endpoint_acquires_and_releases_scanning_flag(tmp_path, monkeypatch): + """The scan endpoint must atomically set ``scanning=True`` and + ``run_scanning_process`` must clear it in finally — even when the body + raises — so successive scans aren't permanently blocked. + """ + import importlib + + doc_manager = DocumentManager(str(tmp_path)) + rag = _ScanRag({}) + + shared_storage = importlib.import_module("lightrag.kg.shared_storage") + await shared_storage.initialize_pipeline_status(workspace=rag.workspace) + pipeline_status = await shared_storage.get_namespace_data( + "pipeline_status", workspace=rag.workspace + ) + pipeline_status["busy"] = False + pipeline_status["scanning"] = False + + router = create_document_routes(rag, doc_manager) + scan_endpoint = [ + route.endpoint + for route in router.routes + if getattr(route, "name", "") == "scan_for_new_documents" + ][-1] + + bg = _document_routes.BackgroundTasks() + response = await scan_endpoint(bg) + + # Endpoint scheduled the task and acquired the flag synchronously. + assert response.status == "scanning_started" + assert pipeline_status["scanning"] is True + assert len(bg.tasks) == 1 + + # Run the scheduled task; finally-block must clear the flag. + task = bg.tasks[0] + await task.func(*task.args, **task.kwargs) + assert pipeline_status["scanning"] is False + + def test_delete_file_variants_removes_canonical_hint_variants(tmp_path): parsed_dir = tmp_path / PARSED_DIR_NAME parsed_dir.mkdir() diff --git a/tests/test_pipeline_release_closure.py b/tests/test_pipeline_release_closure.py index 80c647953d..5fed6939c1 100644 --- a/tests/test_pipeline_release_closure.py +++ b/tests/test_pipeline_release_closure.py @@ -290,6 +290,186 @@ async def _run(): asyncio.run(_run()) +@pytest.mark.offline +def test_apipeline_enqueue_rejects_when_pipeline_busy(tmp_path): + """Pipeline busy / scanning state forbids any new enqueue. This is the + last-line guard inside ``apipeline_enqueue_documents``; HTTP endpoints + catch this earlier and return 409, but core API callers must surface the + invariant violation as a RuntimeError. + """ + + async def _run(): + from lightrag.kg.shared_storage import ( + get_namespace_data, + get_namespace_lock, + ) + + rag = _new_rag(tmp_path) + await rag.initialize_storages() + try: + pipeline_status = await get_namespace_data( + "pipeline_status", workspace=rag.workspace + ) + pipeline_status_lock = get_namespace_lock( + "pipeline_status", workspace=rag.workspace + ) + + # Simulate an in-flight indexing job. 
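+            # The guard under test sits at the top of
+            # ``apipeline_enqueue_documents``: it reads the shared
+            # busy / scanning flags under the namespace lock and raises
+            # RuntimeError instead of enqueueing, the core-API twin of
+            # the HTTP layer's 409 response.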
+ async with pipeline_status_lock: + pipeline_status["busy"] = True + try: + with pytest.raises(RuntimeError, match="busy"): + await rag.apipeline_enqueue_documents( + "should not enqueue", + file_paths="busy.txt", + track_id="track-busy", + ) + finally: + async with pipeline_status_lock: + pipeline_status["busy"] = False + + # Same guard fires for in-flight scans. + async with pipeline_status_lock: + pipeline_status["scanning"] = True + try: + with pytest.raises(RuntimeError, match="scanning"): + await rag.apipeline_enqueue_documents( + "should not enqueue", + file_paths="scan.txt", + track_id="track-scan", + ) + finally: + async with pipeline_status_lock: + pipeline_status["scanning"] = False + + # When idle, the same call succeeds — proving the guard is the + # only thing blocking, not some side effect of the test setup. + await rag.apipeline_enqueue_documents( + "now allowed", + file_paths="ok.txt", + track_id="track-ok", + ) + finally: + await rag.finalize_storages() + + asyncio.run(_run()) + + +@pytest.mark.offline +def test_analyze_multimodal_skips_already_analyzed_items(tmp_path): + """Re-running analyze_multimodal must not re-analyze items that already + carry an ``llm_analyze_result`` from a prior pass. This makes + enabling a new modality (e.g. add ``t`` after a prior ``i``-only pass) + cheap: the drawings sidecar is fully populated and skipped, while the + tables sidecar is newly populated. + """ + + async def _run(): + call_count = {"n": 0} + + async def _vlm(_prompt, **_kwargs): + call_count["n"] += 1 + return json.dumps( + { + "name": "Item", + "summary": "ok", + "detail_description": "details", + "grounded": True, + "grounding_reason": "visual_evidence", + } + ) + + rag = _new_rag(tmp_path, vlm_llm_model_func=_vlm) + + # Minimal blocks file with valid meta. + blocks = tmp_path / "demo.blocks.jsonl" + blocks.write_text( + "\n".join( + [ + json.dumps({"type": "meta", "format_version": "1.0"}), + json.dumps({"type": "content", "content": "body"}), + ] + ) + + "\n", + encoding="utf-8", + ) + + # Drawings sidecar with ONE item that already has llm_analyze_result + # (simulates a prior pass). + drawings = tmp_path / "demo.drawings.json" + drawings.write_text( + json.dumps( + { + "version": "1.0", + "drawings": { + "id1": { + "id": "id1", + "caption": "fig1", + "llm_analyze_result": { + "name": "Existing", + "summary": "from prior run", + "detail_description": "kept as-is", + "grounded": True, + }, + } + }, + } + ), + encoding="utf-8", + ) + + # Tables sidecar with one fresh item (no prior result). + tables = tmp_path / "demo.tables.json" + tables.write_text( + json.dumps( + { + "version": "1.0", + "tables": { + "tbl1": { + "id": "tbl1", + "caption": "tbl", + "content": "Header|Row", + } + }, + } + ), + encoding="utf-8", + ) + + parsed = { + "doc_id": "doc-1", + "file_path": "demo.pdf", + "blocks_path": str(blocks), + "content": "body", + } + # Second pass enables BOTH images and tables; drawings should be + # skipped (already analyzed) and only tables should hit the VLM. + await rag.analyze_multimodal("doc-1", "demo.pdf", parsed, process_options="it") + + drawings_payload = json.loads(drawings.read_text(encoding="utf-8")) + existing = drawings_payload["drawings"]["id1"]["llm_analyze_result"] + # Existing result preserved verbatim — VLM was NOT called for this item. 
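+        # The idempotency marker is simply the presence of
+        # ``llm_analyze_result`` on a sidecar item; the meta-level
+        # analyze_time is refreshed afterwards but is never used to
+        # short-circuit a whole pass.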
+ assert existing["name"] == "Existing" + assert existing["summary"] == "from prior run" + + tables_payload = json.loads(tables.read_text(encoding="utf-8")) + new_result = tables_payload["tables"]["tbl1"]["llm_analyze_result"] + # The newly-enabled modality was analyzed. + assert new_result["name"] == "Item" + assert new_result["summary"] == "ok" + + # Exactly ONE VLM call total (for the table). If analyze_time had + # short-circuited the function, the call count would be 0; if the + # idempotency check was missing, it would be 2. + assert call_count["n"] == 1 + + # meta.analyze_time was refreshed to reflect this most-recent pass. + meta = json.loads(blocks.read_text(encoding="utf-8").splitlines()[0]) + assert meta.get("analyze_time") + + asyncio.run(_run()) + + @pytest.mark.offline def test_enqueue_dedupes_by_filename_and_content_hash(tmp_path): async def _run(): From 1dae83bf1ccefde3e269b2ab6bd6088c0ac58d07 Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 5 May 2026 02:41:04 +0800 Subject: [PATCH 3/5] =?UTF-8?q?=E2=9C=A8=20feat(pipeline):=20resume=20alre?= =?UTF-8?q?ady-extracted=20documents=20under=20current=20process=5Foptions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When ``apipeline_process_enqueue_documents`` picks up a half-processed document whose content is already extracted into ``full_docs`` (raw content or LightRAG blocks file present), redo the post-extraction stages cleanly under the *current* ``process_options`` rather than mixing stale and fresh chunks/entities. - New ``LightRAG._purge_doc_chunks_and_kg(doc_id, chunk_ids)`` helper removes a document's chunks from ``chunks_vdb`` / ``text_chunks``, classifies its entity / relation contributions into delete-outright vs rebuild-from-remaining, applies the corresponding cleanup, and rebuilds entries that other documents still source. Does NOT touch ``doc_status`` / ``full_docs`` / ``llm_response_cache`` / pipeline busy state — it is the focused KG-cleanup core suitable for both deletion and resume callers. ``adelete_by_doc_id`` remains unchanged for now (deduplicating it can be a future PR). - ``process_document`` gains a resume guard at the convergence point of the worker-driven and inline parse paths. When content is already extracted, it warns on engine mismatch (extracted content is the source of truth — switching engines requires delete + re-upload), purges any stale chunks recorded in ``chunks_list`` via the new helper, and resets ``status_doc.chunks_list`` / ``chunks_count`` so subsequent state-machine upserts do not re-write stale IDs. - ``parse_native`` already returns existing content for format=lightrag and format=raw without re-parsing, so the resume branch reuses the existing parse-stage dispatch unchanged. - New regression tests: - ``_purge_doc_chunks_and_kg`` is a no-op for empty chunk_ids. - ``_purge_doc_chunks_and_kg`` clears chunks_vdb / text_chunks for a document with no graph contributions yet. - The pipeline calls the purge helper with the previous run's chunk IDs when resuming an already-extracted document. - The pipeline skips the purge when ``chunks_list`` is empty. - ``test_extract_failure_before_chunking_preserves_previous_chunk_snapshot`` renamed to ``..._clears_stale_chunk_snapshot`` and inverted: the previous snapshot is now intentionally not preserved across resume + failure, matching the documented "已抽取文档一律删旧 chunks 重做" rule. 
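Condensed, the resume guard added at the convergence point behaves
roughly like this (a sketch of the control flow only; the
engine-mismatch warning and pipeline-status logging are elided, and
the string literals stand in for the FULL_DOCS_FORMAT_* constants):

    already_extracted = isinstance(content_data, dict) and (
        (content_data.get("format") == "lightrag"
         and content_data.get("lightrag_document_path"))
        or (content_data.get("format") == "raw"
            and (content_data.get("content") or "").strip())
    )
    if already_extracted:
        stale_chunks = {c for c in (status_doc.chunks_list or []) if c}
        if stale_chunks:
            # Always purge old chunks before re-chunking; entities and
            # relations they sourced are deleted or rebuilt from whatever
            # other documents still reference them.
            await self._purge_doc_chunks_and_kg(
                doc_id,
                stale_chunks,
                pipeline_status=pipeline_status,
                pipeline_status_lock=pipeline_status_lock,
            )
        status_doc.chunks_list = []
        status_doc.chunks_count = 0
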
Co-Authored-By: Claude Opus 4.7 --- lightrag/lightrag.py | 411 ++++++++++++++++++++ lightrag/pipeline.py | 92 +++++ tests/test_doc_status_chunk_preservation.py | 20 +- tests/test_pipeline_release_closure.py | 281 +++++++++++++ 4 files changed, 801 insertions(+), 3 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 37ad0c4c26..af8ddb4099 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -2106,6 +2106,417 @@ async def aget_docs_by_ids( # Return the dictionary containing statuses only for the found document IDs return found_statuses + async def _purge_doc_chunks_and_kg( + self, + doc_id: str, + chunk_ids: set[str], + *, + pipeline_status: dict, + pipeline_status_lock: Any, + ) -> None: + """Remove a document's chunks and clean up its knowledge-graph contributions. + + Used by: + - The pipeline resume branch in ``process_document`` when a + document whose content is already extracted is re-processed + under different ``process_options``: chunks must be wiped and + entities/relations rebuilt fresh. + - Future deletion paths that want a focused "purge KG only" + operation without the LLM-cache / doc_status / full_docs + cleanup that ``adelete_by_doc_id`` also performs. + + What this method does: + 1. Reads ``full_entities`` / ``full_relations`` to identify which + graph nodes / edges this document contributed to. + 2. For each affected entity / relation, intersects the doc's + ``chunk_ids`` with the union of chunk-tracking entries + (``entity_chunks`` / ``relation_chunks``) and graph + ``source_id`` lists, then classifies it as either + *delete-outright* (no remaining sources) or *rebuild* + (still references chunks from other documents). + 3. Deletes the chunks themselves from ``chunks_vdb`` and + ``text_chunks``. + 4. For *delete-outright* entries: removes the relationship / + entity from the graph storage, vector storage, and chunk + tracking. + 5. Calls :py:meth:`_insert_done` to persist graph changes + before rebuilding (so the rebuild step sees a consistent + state). + 6. Calls :func:`rebuild_knowledge_from_chunks` to rebuild any + *rebuild* entries from their remaining chunks (so other + documents that also contributed to the same entity / + relation keep their data intact). + 7. Deletes the per-doc ``full_entities`` / ``full_relations`` + index rows so subsequent re-extraction starts fresh. + + Does NOT touch: + - ``doc_status`` / ``full_docs`` records — caller manages those. + - ``llm_response_cache`` — orthogonal to KG cleanup. + - Pipeline busy-flag — assumes the caller already holds the + pipeline (i.e. this runs inside a pipeline run). + + Idempotent: passing an empty ``chunk_ids`` returns immediately + without touching storage. + """ + if not chunk_ids: + return + + # ---- 1. 
Analyze affected entities/relations from full_entities/full_relations ---- + entities_to_delete: set[str] = set() + entities_to_rebuild: dict[str, list[str]] = {} + relationships_to_delete: set[tuple[str, str]] = set() + relationships_to_rebuild: dict[tuple[str, str], list[str]] = {} + entity_chunk_updates: dict[str, list[str]] = {} + relation_chunk_updates: dict[tuple[str, str], list[str]] = {} + + try: + doc_entities_data = await self.full_entities.get_by_id(doc_id) + doc_relations_data = await self.full_relations.get_by_id(doc_id) + + affected_nodes: list[dict[str, Any]] = [] + affected_edges: list[dict[str, Any]] = [] + + if doc_entities_data and "entity_names" in doc_entities_data: + entity_names = doc_entities_data["entity_names"] + nodes_dict = await self.chunk_entity_relation_graph.get_nodes_batch( + entity_names + ) + for entity_name in entity_names: + node_data = nodes_dict.get(entity_name) + if node_data: + if "id" not in node_data: + node_data["id"] = entity_name + affected_nodes.append(node_data) + + if doc_relations_data and "relation_pairs" in doc_relations_data: + relation_pairs = doc_relations_data["relation_pairs"] + edge_pairs_dicts = [ + {"src": pair[0], "tgt": pair[1]} for pair in relation_pairs + ] + edges_dict = await self.chunk_entity_relation_graph.get_edges_batch( + edge_pairs_dicts + ) + for pair in relation_pairs: + src, tgt = pair[0], pair[1] + edge_data = edges_dict.get((src, tgt)) + if edge_data: + if "source" not in edge_data: + edge_data["source"] = src + if "target" not in edge_data: + edge_data["target"] = tgt + affected_edges.append(edge_data) + except Exception as e: + logger.error( + f"[purge] Failed to analyze affected graph elements for {doc_id}: {e}" + ) + raise Exception(f"Failed to analyze graph dependencies: {e}") from e + + # ---- 2. 
Classify entities/relations into delete vs rebuild ---- + try: + for node_data in affected_nodes: + node_label = node_data.get("entity_id") + if not node_label: + continue + + existing_sources: list[str] = [] + graph_sources: list[str] = [] + if self.entity_chunks: + stored_chunks = await self.entity_chunks.get_by_id(node_label) + if stored_chunks and isinstance(stored_chunks, dict): + existing_sources = [ + chunk_id + for chunk_id in stored_chunks.get("chunk_ids", []) + if chunk_id + ] + + if node_data.get("source_id"): + graph_sources = [ + chunk_id + for chunk_id in node_data["source_id"].split(GRAPH_FIELD_SEP) + if chunk_id + ] + + if not existing_sources: + existing_sources = graph_sources + + if not existing_sources: + entities_to_delete.add(node_label) + entity_chunk_updates[node_label] = [] + continue + + remaining_sources = subtract_source_ids(existing_sources, chunk_ids) + graph_references_deleted_chunks = bool( + graph_sources and set(graph_sources) & chunk_ids + ) + + if not remaining_sources: + entities_to_delete.add(node_label) + entity_chunk_updates[node_label] = [] + elif ( + remaining_sources != existing_sources + or graph_references_deleted_chunks + ): + entities_to_rebuild[node_label] = remaining_sources + entity_chunk_updates[node_label] = remaining_sources + + async with pipeline_status_lock: + log_message = ( + f"[purge] {doc_id}: {len(entities_to_rebuild)} entity(ies) " + f"to rebuild, {len(entities_to_delete)} to delete" + ) + logger.info(log_message) + pipeline_status["latest_message"] = log_message + pipeline_status["history_messages"].append(log_message) + + for edge_data in affected_edges: + src = edge_data.get("source") + tgt = edge_data.get("target") + if not src or not tgt or "source_id" not in edge_data: + continue + + edge_tuple = tuple(sorted((src, tgt))) + if ( + edge_tuple in relationships_to_delete + or edge_tuple in relationships_to_rebuild + ): + continue + + existing_sources = [] + graph_sources = [] + if self.relation_chunks: + storage_key = make_relation_chunk_key(src, tgt) + stored_chunks = await self.relation_chunks.get_by_id(storage_key) + if stored_chunks and isinstance(stored_chunks, dict): + existing_sources = [ + chunk_id + for chunk_id in stored_chunks.get("chunk_ids", []) + if chunk_id + ] + + if edge_data.get("source_id"): + graph_sources = [ + chunk_id + for chunk_id in edge_data["source_id"].split(GRAPH_FIELD_SEP) + if chunk_id + ] + + if not existing_sources: + existing_sources = graph_sources + + if not existing_sources: + relationships_to_delete.add(edge_tuple) + relation_chunk_updates[edge_tuple] = [] + continue + + remaining_sources = subtract_source_ids(existing_sources, chunk_ids) + graph_references_deleted_chunks = bool( + graph_sources and set(graph_sources) & chunk_ids + ) + + if not remaining_sources: + relationships_to_delete.add(edge_tuple) + relation_chunk_updates[edge_tuple] = [] + elif ( + remaining_sources != existing_sources + or graph_references_deleted_chunks + ): + relationships_to_rebuild[edge_tuple] = remaining_sources + relation_chunk_updates[edge_tuple] = remaining_sources + + async with pipeline_status_lock: + log_message = ( + f"[purge] {doc_id}: {len(relationships_to_rebuild)} relation(s) " + f"to rebuild, {len(relationships_to_delete)} to delete" + ) + logger.info(log_message) + pipeline_status["latest_message"] = log_message + pipeline_status["history_messages"].append(log_message) + + # Update entity/relation chunk-tracking with the remaining sources. 
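+            # Entries whose remaining list is empty are deliberately
+            # skipped here: they belong to the delete-outright sets and
+            # are removed from graph, vector and chunk-tracking storage
+            # in steps 4-5 below rather than being upserted empty.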
+ current_time = int(time.time()) + if entity_chunk_updates and self.entity_chunks: + entity_upsert_payload = {} + for entity_name, remaining in entity_chunk_updates.items(): + if not remaining: + continue + entity_upsert_payload[entity_name] = { + "chunk_ids": remaining, + "count": len(remaining), + "updated_at": current_time, + } + if entity_upsert_payload: + await self.entity_chunks.upsert(entity_upsert_payload) + + if relation_chunk_updates and self.relation_chunks: + relation_upsert_payload = {} + for edge_tuple, remaining in relation_chunk_updates.items(): + if not remaining: + continue + storage_key = make_relation_chunk_key(*edge_tuple) + relation_upsert_payload[storage_key] = { + "chunk_ids": remaining, + "count": len(remaining), + "updated_at": current_time, + } + if relation_upsert_payload: + await self.relation_chunks.upsert(relation_upsert_payload) + except Exception as e: + logger.error( + f"[purge] Failed to process graph analysis results for {doc_id}: {e}" + ) + raise Exception(f"Failed to process graph dependencies: {e}") from e + + # ---- 3. Delete chunks themselves ---- + try: + await self.chunks_vdb.delete(chunk_ids) + await self.text_chunks.delete(chunk_ids) + async with pipeline_status_lock: + log_message = ( + f"[purge] {doc_id}: deleted {len(chunk_ids)} chunk(s) from storage" + ) + logger.info(log_message) + pipeline_status["latest_message"] = log_message + pipeline_status["history_messages"].append(log_message) + except Exception as e: + logger.error(f"[purge] Failed to delete chunks for {doc_id}: {e}") + raise Exception(f"Failed to delete document chunks: {e}") from e + + # ---- 4. Delete relationships with no remaining sources ---- + if relationships_to_delete: + try: + rel_ids_to_delete = [] + for src, tgt in relationships_to_delete: + rel_ids_to_delete.extend( + [ + compute_mdhash_id(src + tgt, prefix="rel-"), + compute_mdhash_id(tgt + src, prefix="rel-"), + ] + ) + await self.relationships_vdb.delete(rel_ids_to_delete) + await self.chunk_entity_relation_graph.remove_edges( + list(relationships_to_delete) + ) + if self.relation_chunks: + relation_storage_keys = [ + make_relation_chunk_key(src, tgt) + for src, tgt in relationships_to_delete + ] + await self.relation_chunks.delete(relation_storage_keys) + async with pipeline_status_lock: + log_message = ( + f"[purge] {doc_id}: deleted " + f"{len(relationships_to_delete)} relation(s)" + ) + logger.info(log_message) + pipeline_status["latest_message"] = log_message + pipeline_status["history_messages"].append(log_message) + except Exception as e: + logger.error( + f"[purge] Failed to delete relationships for {doc_id}: {e}" + ) + raise Exception(f"Failed to delete relationships: {e}") from e + + # ---- 5. 
Delete entities with no remaining sources ---- + if entities_to_delete: + try: + nodes_edges_dict = ( + await self.chunk_entity_relation_graph.get_nodes_edges_batch( + list(entities_to_delete) + ) + ) + + edges_to_delete: set[tuple[str, str]] = set() + for entity, edges in nodes_edges_dict.items(): + if edges: + for src, tgt in edges: + edges_to_delete.add(tuple(sorted((src, tgt)))) + + if edges_to_delete: + rel_ids_to_delete = [] + for src, tgt in edges_to_delete: + rel_ids_to_delete.extend( + [ + compute_mdhash_id(src + tgt, prefix="rel-"), + compute_mdhash_id(tgt + src, prefix="rel-"), + ] + ) + await self.relationships_vdb.delete(rel_ids_to_delete) + if self.relation_chunks: + relation_storage_keys = [ + make_relation_chunk_key(src, tgt) + for src, tgt in edges_to_delete + ] + await self.relation_chunks.delete(relation_storage_keys) + logger.info( + f"[purge] {doc_id}: cleaned {len(edges_to_delete)} residual " + f"edge(s) from VDB and chunk-tracking storage" + ) + + await self.chunk_entity_relation_graph.remove_nodes( + list(entities_to_delete) + ) + + entity_vdb_ids = [ + compute_mdhash_id(entity, prefix="ent-") + for entity in entities_to_delete + ] + await self.entities_vdb.delete(entity_vdb_ids) + + if self.entity_chunks: + await self.entity_chunks.delete(list(entities_to_delete)) + + async with pipeline_status_lock: + log_message = ( + f"[purge] {doc_id}: deleted " + f"{len(entities_to_delete)} entity(ies)" + ) + logger.info(log_message) + pipeline_status["latest_message"] = log_message + pipeline_status["history_messages"].append(log_message) + except Exception as e: + logger.error(f"[purge] Failed to delete entities for {doc_id}: {e}") + raise Exception(f"Failed to delete entities: {e}") from e + + # ---- 6. Persist pre-rebuild changes ---- + try: + await self._insert_done() + except Exception as e: + logger.error(f"[purge] Failed to persist pre-rebuild changes: {e}") + raise Exception(f"Failed to persist pre-rebuild changes: {e}") from e + + # ---- 7. Rebuild entities/relations that still have remaining sources ---- + if entities_to_rebuild or relationships_to_rebuild: + try: + await rebuild_knowledge_from_chunks( + entities_to_rebuild=entities_to_rebuild, + relationships_to_rebuild=relationships_to_rebuild, + knowledge_graph_inst=self.chunk_entity_relation_graph, + entities_vdb=self.entities_vdb, + relationships_vdb=self.relationships_vdb, + text_chunks_storage=self.text_chunks, + llm_response_cache=self.llm_response_cache, + global_config=self._build_global_config(), + pipeline_status=pipeline_status, + pipeline_status_lock=pipeline_status_lock, + entity_chunks_storage=self.entity_chunks, + relation_chunks_storage=self.relation_chunks, + ) + except Exception as e: + logger.error(f"[purge] Failed to rebuild knowledge from chunks: {e}") + raise Exception(f"Failed to rebuild knowledge graph: {e}") from e + + # ---- 8. 
Delete per-doc full_entities / full_relations index rows ---- + try: + await self.full_entities.delete([doc_id]) + await self.full_relations.delete([doc_id]) + except Exception as e: + logger.error( + f"[purge] Failed to delete full_entities/full_relations rows for {doc_id}: {e}" + ) + raise Exception( + f"Failed to delete from full_entities/full_relations: {e}" + ) from e + async def adelete_by_doc_id( self, doc_id: str, delete_llm_cache: bool = False ) -> DeletionResult: diff --git a/lightrag/pipeline.py b/lightrag/pipeline.py index bf570d178a..6460cff31f 100644 --- a/lightrag/pipeline.py +++ b/lightrag/pipeline.py @@ -47,6 +47,7 @@ from lightrag.extraction.interchange import parse_interchange_jsonl from lightrag.parser_routing import ( canonicalize_parser_hinted_basename, + resolve_file_parser_directives, resolve_stored_document_parser_engine, ) from lightrag.utils import ( @@ -1159,6 +1160,97 @@ def get_failed_chunk_snapshot() -> tuple[list[str], int]: (content_data or {}).get("process_options", "") ) + # ---- Resume guard ---- + # When the pipeline picks up a non-fresh document whose + # content has already been extracted into full_docs, we + # must purge any stale chunks / entities / relations + # from a previous interrupted attempt BEFORE re-running + # chunking + entity extraction under the *current* + # process_options. Skipping this would either leave + # orphaned chunk-IDs in the vector DB or mix old and + # new chunks together, neither of which is safe. + # + # Both pipeline entry points (worker-driven and inline) + # converge here, so this is the single canonical place + # to do the purge regardless of which path got us here. + content_already_extracted = isinstance( + content_data, dict + ) and ( + ( + content_data.get("format") + == FULL_DOCS_FORMAT_LIGHTRAG + and content_data.get("lightrag_document_path") + ) + or ( + content_data.get("format") == FULL_DOCS_FORMAT_RAW + and (content_data.get("content") or "").strip() + ) + ) + stored_chunk_ids = set( + chunk_id + for chunk_id in (status_doc.chunks_list or []) + if isinstance(chunk_id, str) and chunk_id + ) + if content_already_extracted: + # Engine-mismatch warning: changing the parser engine + # after extraction is *not* honoured — the extracted + # content is the source of truth. Users wanting to + # re-extract with a new engine must delete + + # re-upload. + intended_engine, _ = resolve_file_parser_directives( + file_path + ) + stored_engine = ( + content_data.get("parsed_engine") or "" + ).lower() + if ( + intended_engine + and stored_engine + and intended_engine != stored_engine + ): + log_message = ( + f"[resume] {doc_id}: filename hint / " + f"LIGHTRAG_PARSER implies engine=" + f"{intended_engine!r} but full_docs " + f"already has parsed_engine=" + f"{stored_engine!r}; keeping the existing " + f"extraction. Delete + re-upload to " + f"switch engines." 
+ ) + logger.warning(log_message) + async with pipeline_status_lock: + pipeline_status["latest_message"] = log_message + pipeline_status["history_messages"].append( + log_message + ) + + if stored_chunk_ids: + log_message = ( + f"[resume] {doc_id}: purging " + f"{len(stored_chunk_ids)} chunk(s) and " + f"associated KG entries from a previous run " + f"before rebuilding under current " + f"process_options" + ) + logger.info(log_message) + async with pipeline_status_lock: + pipeline_status["latest_message"] = log_message + pipeline_status["history_messages"].append( + log_message + ) + await self._purge_doc_chunks_and_kg( + doc_id, + stored_chunk_ids, + pipeline_status=pipeline_status, + pipeline_status_lock=pipeline_status_lock, + ) + # The status_doc carries chunks_list / chunks_count + # from the prior run; clear them so subsequent + # state-machine upserts don't accidentally + # re-write stale IDs. + status_doc.chunks_list = [] + status_doc.chunks_count = 0 + # Try to parse as interchange JSONL (smart extraction output) parsed_interchange = parse_interchange_jsonl( content, self.tokenizer diff --git a/tests/test_doc_status_chunk_preservation.py b/tests/test_doc_status_chunk_preservation.py index 8cfc6d9db5..8d0d72f871 100644 --- a/tests/test_doc_status_chunk_preservation.py +++ b/tests/test_doc_status_chunk_preservation.py @@ -354,9 +354,20 @@ async def fail_extract(self, chunks, pipeline_status, pipeline_status_lock): @pytest.mark.asyncio -async def test_extract_failure_before_chunking_preserves_previous_chunk_snapshot( +async def test_extract_failure_before_chunking_clears_stale_chunk_snapshot( tmp_path, ): + """The resume branch of ``apipeline_process_enqueue_documents`` purges + any stale ``chunks_list`` from a previous interrupted run *before* + chunking starts (so the new run does not mix old and new chunks). + Therefore, when chunking subsequently fails on the retry, the failed + doc_status reflects the post-purge state — the previous snapshot is + intentionally not preserved any more. + + Earlier this test asserted the opposite ("preserve previous snapshot + across failure"), which conflicted with the documented resume rule + that "已抽取文档一律删掉所有的文本块,重新走多模态分析和实体关系提取". + """ rag = await _build_rag(tmp_path, "extract_failure_pre_chunking", _failing_chunking) try: content = "chunking failure document" @@ -390,8 +401,11 @@ async def test_extract_failure_before_chunking_preserves_previous_chunk_snapshot failed_status = await rag.doc_status.get_by_id(doc_id) assert failed_status is not None assert _status_to_text(failed_status["status"]) == "failed" - assert failed_status.get("chunks_list") == previous_chunks - assert failed_status.get("chunks_count") == len(previous_chunks) + # Resume purged the stale list before chunking; the failure record + # therefore shows zero chunks rather than the previous snapshot. 
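+        # The error_msg assertion below pins the failure to the chunking
+        # sentinel itself, ruling out a crash inside the purge step.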
+ assert failed_status.get("chunks_list") == [] + assert failed_status.get("chunks_count") == 0 + assert "chunking fail sentinel" in (failed_status.get("error_msg") or "") finally: await rag.finalize_storages() diff --git a/tests/test_pipeline_release_closure.py b/tests/test_pipeline_release_closure.py index 5fed6939c1..8bd1643d02 100644 --- a/tests/test_pipeline_release_closure.py +++ b/tests/test_pipeline_release_closure.py @@ -290,6 +290,287 @@ async def _run(): asyncio.run(_run()) +@pytest.mark.offline +def test_purge_doc_chunks_and_kg_is_noop_for_empty_chunks(tmp_path): + """``_purge_doc_chunks_and_kg`` with an empty chunk_ids set must be a + no-op so callers (including the resume branch) can invoke it + unconditionally without first checking for non-empty chunks_list. + """ + + async def _run(): + from lightrag.kg.shared_storage import ( + get_namespace_data, + get_namespace_lock, + ) + + rag = _new_rag(tmp_path) + await rag.initialize_storages() + try: + pipeline_status = await get_namespace_data( + "pipeline_status", workspace=rag.workspace + ) + pipeline_status_lock = get_namespace_lock( + "pipeline_status", workspace=rag.workspace + ) + # Empty set: must return immediately without touching storage. + await rag._purge_doc_chunks_and_kg( + "doc-empty", + set(), + pipeline_status=pipeline_status, + pipeline_status_lock=pipeline_status_lock, + ) + # No exceptions → success. Calling twice in a row is also fine + # since the helper is idempotent on the empty input. + await rag._purge_doc_chunks_and_kg( + "doc-empty", + set(), + pipeline_status=pipeline_status, + pipeline_status_lock=pipeline_status_lock, + ) + finally: + await rag.finalize_storages() + + asyncio.run(_run()) + + +@pytest.mark.offline +def test_purge_doc_chunks_and_kg_clears_chunks_for_unknown_doc(tmp_path): + """When the doc has chunk_ids but no graph contributions yet + (full_entities / full_relations empty), the helper must still clear + the chunks from chunks_vdb / text_chunks without raising. This + exercises the resume path for documents whose previous run was + interrupted between chunking and entity extraction. + """ + + async def _run(): + from lightrag.kg.shared_storage import ( + get_namespace_data, + get_namespace_lock, + ) + + rag = _new_rag(tmp_path) + await rag.initialize_storages() + try: + # Seed text_chunks + chunks_vdb with two stale chunks. + await rag.text_chunks.upsert( + { + "doc-X-chunk-0": { + "content": "stale chunk 0", + "chunk_order_index": 0, + "full_doc_id": "doc-X", + "tokens": 4, + "file_path": "x.txt", + }, + "doc-X-chunk-1": { + "content": "stale chunk 1", + "chunk_order_index": 1, + "full_doc_id": "doc-X", + "tokens": 4, + "file_path": "x.txt", + }, + } + ) + await rag.chunks_vdb.upsert( + { + "doc-X-chunk-0": { + "content": "stale chunk 0", + "chunk_order_index": 0, + "full_doc_id": "doc-X", + "tokens": 4, + "file_path": "x.txt", + }, + "doc-X-chunk-1": { + "content": "stale chunk 1", + "chunk_order_index": 1, + "full_doc_id": "doc-X", + "tokens": 4, + "file_path": "x.txt", + }, + } + ) + await rag.text_chunks.index_done_callback() + await rag.chunks_vdb.index_done_callback() + + pipeline_status = await get_namespace_data( + "pipeline_status", workspace=rag.workspace + ) + pipeline_status_lock = get_namespace_lock( + "pipeline_status", workspace=rag.workspace + ) + + await rag._purge_doc_chunks_and_kg( + "doc-X", + {"doc-X-chunk-0", "doc-X-chunk-1"}, + pipeline_status=pipeline_status, + pipeline_status_lock=pipeline_status_lock, + ) + + # Both chunks gone from text_chunks. 
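+            # chunks_vdb was purged in the same step; the helper raises
+            # on any storage failure, so reaching this assertion also
+            # covers the vector-store side.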
+ remaining = await rag.text_chunks.get_by_ids( + ["doc-X-chunk-0", "doc-X-chunk-1"] + ) + assert remaining == [None, None] + finally: + await rag.finalize_storages() + + asyncio.run(_run()) + + +@pytest.mark.offline +def test_resume_purges_old_chunks_when_content_already_extracted(tmp_path): + """When ``apipeline_process_enqueue_documents`` picks up a document + whose content is already extracted (full_docs.format=raw with content) + and whose doc_status carries a non-empty chunks_list from a previous + half-finished run, the resume branch must call + ``_purge_doc_chunks_and_kg`` with the old chunk-IDs *before* the + chunking and entity-extraction stages run. This test wraps the + helper so we can assert it is invoked exactly once with the expected + inputs, then bails out so we don't have to mock the whole VLM / + entity-extract stack. + """ + + async def _run(): + rag = _new_rag(tmp_path) + await rag.initialize_storages() + try: + doc_id = compute_mdhash_id("resume.txt", prefix="doc-") + + # Seed full_docs as if extraction already completed. + await rag.full_docs.upsert( + { + doc_id: { + "content": "previously extracted body", + "file_path": "resume.txt", + "canonical_basename": "resume.txt", + "format": "raw", + "parsed_engine": "legacy", + "content_hash": "deadbeef", + } + } + ) + # Seed doc_status as PROCESSING with chunks_list from a prior + # half-finished run so the resume branch has something to purge. + stale_chunks = [f"{doc_id}-chunk-{i:03d}" for i in range(2)] + await rag.doc_status.upsert( + { + doc_id: { + "status": DocStatus.PROCESSING, + "content_summary": "previously extracted body", + "content_length": len("previously extracted body"), + "created_at": "2026-01-01T00:00:00+00:00", + "updated_at": "2026-01-01T00:00:01+00:00", + "file_path": "resume.txt", + "canonical_basename": "resume.txt", + "track_id": "track-resume", + "content_hash": "deadbeef", + "chunks_list": stale_chunks, + "chunks_count": len(stale_chunks), + } + } + ) + + # Wrap the helper to record invocations, and raise after the call + # so the test exits cleanly without exercising downstream stages. + calls: list[tuple[str, set[str]]] = [] + original = rag._purge_doc_chunks_and_kg + + class _ResumePurged(Exception): + pass + + async def _wrapped(doc_id_arg, chunk_ids_arg, **kwargs): + calls.append((doc_id_arg, set(chunk_ids_arg))) + # Run the real helper so the side-effects (chunks gone from + # storage) are observable, then short-circuit. + await original(doc_id_arg, chunk_ids_arg, **kwargs) + raise _ResumePurged() + + rag._purge_doc_chunks_and_kg = _wrapped # type: ignore[method-assign] + + # Pipeline will pick up the PROCESSING document, hit the resume + # branch, call our wrapped purge, and our wrapper raises. + await rag.apipeline_process_enqueue_documents() + + # Helper was invoked exactly once with the stale chunk-IDs. + assert len(calls) == 1 + invoked_doc_id, invoked_chunks = calls[0] + assert invoked_doc_id == doc_id + assert invoked_chunks == set(stale_chunks) + finally: + await rag.finalize_storages() + + asyncio.run(_run()) + + +@pytest.mark.offline +def test_resume_skips_purge_when_chunks_list_empty(tmp_path): + """If the doc was extracted but never chunked (chunks_list empty), + the resume branch must NOT call the purge helper — there's nothing + to clean up. 
+ """ + + async def _run(): + rag = _new_rag(tmp_path) + await rag.initialize_storages() + try: + doc_id = compute_mdhash_id("noskip.txt", prefix="doc-") + + await rag.full_docs.upsert( + { + doc_id: { + "content": "fresh body", + "file_path": "noskip.txt", + "canonical_basename": "noskip.txt", + "format": "raw", + "parsed_engine": "legacy", + "content_hash": "fresh", + } + } + ) + await rag.doc_status.upsert( + { + doc_id: { + "status": DocStatus.PARSING, + "content_summary": "fresh body", + "content_length": len("fresh body"), + "created_at": "2026-01-01T00:00:00+00:00", + "updated_at": "2026-01-01T00:00:01+00:00", + "file_path": "noskip.txt", + "canonical_basename": "noskip.txt", + "track_id": "track-noskip", + "content_hash": "fresh", + "chunks_list": [], + "chunks_count": 0, + } + } + ) + + calls: list[tuple[str, set[str]]] = [] + + async def _spy(doc_id_arg, chunk_ids_arg, **kwargs): + calls.append((doc_id_arg, set(chunk_ids_arg))) + # Don't actually purge; just record the call and let the + # pipeline continue past this test boundary. + raise RuntimeError("test stop after purge check") + + rag._purge_doc_chunks_and_kg = _spy # type: ignore[method-assign] + + try: + await rag.apipeline_process_enqueue_documents() + except Exception: + # Whether the pipeline reaches our spy or fails downstream + # doesn't matter for this test; we only care that the spy + # was NOT called for an empty chunks_list. + pass + + assert ( + calls == [] + ), "purge helper should not be called when chunks_list is empty" + finally: + await rag.finalize_storages() + + asyncio.run(_run()) + + @pytest.mark.offline def test_apipeline_enqueue_rejects_when_pipeline_busy(tmp_path): """Pipeline busy / scanning state forbids any new enqueue. This is the From 648cb9ace44e3efa825dfaea7aaddbded55d0758 Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 5 May 2026 02:55:54 +0800 Subject: [PATCH 4/5] =?UTF-8?q?=F0=9F=90=9B=20fix(pipeline):=20apply=20S?= =?UTF-8?q?=20chunking=20option=20to=20native=20parsed=20output?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously the documented ``S`` (heading-driven) chunking option never took effect for the native / mineru / docling path: ``parse_native()`` writes a structured ``*.blocks.jsonl`` and returns the merged plain text as ``parsed_data['content']``, so ``parse_interchange_jsonl`` returned ``None`` against that merged text and the chunking branch unconditionally fell through to fixed chunking with a "no structured interchange output is available" warning. This change adds a heading-aware chunker over the already-written blocks file and wires the ``S`` mode to it when ``parsed_data`` carries a ``blocks_path``. - New ``chunk_lightrag_blocks_by_heading`` helper in ``lightrag/utils_pipeline.py``: groups consecutive content blocks by their ``(level, heading, parent_headings)`` key into a chunk with the heading rendered as a "Heading: " preface, then splits any group whose accumulated tokens exceed ``max_tokens`` along a sliding window with overlap (heading boundaries are hard splits). - ``process_document`` chunking branch in ``lightrag/pipeline.py`` gains a new path: when ``parse_interchange_jsonl`` declines AND ``process_options.chunking == 'S'`` AND ``parsed_data['blocks_path']`` is non-empty, invoke the heading chunker and tag ``extraction_meta['chunking_method'] = 'heading_driven'``. If the heading chunker returns no chunks (corrupt or trivially empty blocks file) the warning + fixed-chunking fallback is preserved. 
- ``R`` mode and ``S`` mode without a structured blocks file remain on the existing fixed-chunking fallback with the original warning. - New regression tests in ``tests/test_pipeline_release_closure.py``: - heading boundary creates a new chunk; consecutive blocks under the same heading concatenate into a single chunk with the heading preface - oversize single-heading group is split into multiple chunks with overlap, never crossing into another heading - missing blocks file returns an empty list (caller falls back to F) - end-to-end: pipeline invokes the heading chunker for a native ``S``-tagged document with a real blocks file Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> --- lightrag/pipeline.py | 72 ++++++- lightrag/utils_pipeline.py | 181 ++++++++++++++++++ tests/test_pipeline_release_closure.py | 254 +++++++++++++++++++++++++ 3 files changed, 501 insertions(+), 6 deletions(-) diff --git a/lightrag/pipeline.py b/lightrag/pipeline.py index 6460cff31f..bba6f00290 100644 --- a/lightrag/pipeline.py +++ b/lightrag/pipeline.py @@ -63,6 +63,7 @@ archive_source_after_full_docs_sync, augment_chunk_results_with_mm_entities, chunk_fields_from_status_doc, + chunk_lightrag_blocks_by_heading, compute_file_content_hash, compute_text_content_hash, doc_status_field, @@ -1255,6 +1256,9 @@ def get_failed_chunk_snapshot() -> tuple[list[str], int]: parsed_interchange = parse_interchange_jsonl( content, self.tokenizer ) + structured_blocks_path = str( + parsed_data.get("blocks_path") or "" + ).strip() if parsed_interchange is not None: interchange_meta, interchange_chunks = ( parsed_interchange @@ -1273,21 +1277,77 @@ def get_failed_chunk_snapshot() -> tuple[list[str], int]: "chunking_method" ), } + elif ( + doc_process_opts.chunking == "S" + and structured_blocks_path + ): + # ``S`` mode against the native / mineru / docling + # structured-output path: chunk by heading + # boundaries straight from the ``*.blocks.jsonl`` + # file produced by the parser. ``content`` here is + # the merged-text view of those blocks (joined + # with blank lines), which has lost the heading + # metadata; the blocks file still has it. + heading_chunks = chunk_lightrag_blocks_by_heading( + structured_blocks_path, + self.tokenizer, + doc_id=doc_id, + max_tokens=self.chunk_token_size, + overlap_tokens=self.chunk_overlap_token_size, + ) + if heading_chunks: + logger.info( + f"[chunking] heading-driven chunking for " + f"d-id: {doc_id}: {len(heading_chunks)} " + f"chunks from {structured_blocks_path}" + ) + chunking_result = heading_chunks + extraction_meta = { + "extraction_format": "heading_driven", + "engine": (content_data or {}).get( + "parsed_engine", "native" + ), + "chunking_method": "heading_driven", + } + else: + logger.warning( + f"[chunking] process_options chunking='S' " + f"requested for d-id: {doc_id}, file: " + f"{file_path}, but {structured_blocks_path!r} " + f"yielded no heading-driven chunks; falling " + f"back to fixed chunking ('F')." + ) + chunking_result = self.chunking_func( + self.tokenizer, + content, + split_by_character, + split_by_character_only, + self.chunk_overlap_token_size, + self.chunk_token_size, + ) + if inspect.isawaitable(chunking_result): + chunking_result = await chunking_result + extraction_meta = { + "extraction_format": "plain_text_chunking", + "engine": "legacy", + "chunking_method": "fixed_token_fallback", + } else: # Per-document chunking strategy: # - 'F' (default): use the configured chunking_func # (chunking_by_token_size). 
- # - 'S' / 'R': require structured input which the - # legacy text path cannot provide; fall back to - # 'F' and log a warning so the user knows their - # selection had no effect for this document. + # - 'S' without a structured blocks file (e.g. + # legacy text path): cannot honour heading + # chunking; fall back to 'F' and warn. + # - 'R': recursive semantic chunking is not yet + # implemented; fall back to 'F' and warn. if doc_process_opts.chunking != "F": logger.warning( f"[chunking] process_options chunking=" f"{doc_process_opts.chunking!r} requested for d-id: " f"{doc_id}, file: {file_path}, but no structured " - f"interchange output is available; falling back to " - f"fixed chunking ('F')." + f"interchange / blocks output is available; " + f"falling back to fixed chunking ('F')." ) # Call chunking function, supporting both sync and async implementations chunking_result = self.chunking_func( diff --git a/lightrag/utils_pipeline.py b/lightrag/utils_pipeline.py index 7539c2dbd8..953b1adca0 100644 --- a/lightrag/utils_pipeline.py +++ b/lightrag/utils_pipeline.py @@ -330,6 +330,187 @@ async def archive_source_after_full_docs_sync(source_path: str) -> str | None: # --------------------------------------------------------------------------- +def chunk_lightrag_blocks_by_heading( + blocks_path: str, + tokenizer: Any, + *, + doc_id: str, + max_tokens: int, + overlap_tokens: int = 100, +) -> list[dict[str, Any]]: + """Heading-driven chunking over a LightRAG Document ``*.blocks.jsonl`` file. + + Implements the documented ``S`` chunking mode for the native / mineru / + docling structured-output path: groups consecutive content blocks under + the same ``heading`` (and ancestor ``parent_headings``) into a chunk, + using the heading boundary as a hard split point. Heading lines are + inserted as a "Heading: <title>" preface inside each chunk so retrieval + contexts stay self-describing. + + Within a single heading group, the accumulated text may still exceed + ``max_tokens``; in that case the group is split into multiple chunks + along block boundaries (each block is the smallest atomic unit) with + ``overlap_tokens`` of overlap. Splits never cross into a different + heading group. + + Each output chunk is a dict ready for downstream consumption (matches + the shape produced by ``parse_interchange_jsonl``): + + { + "chunk_id": "...", + "chunk_order_index": int, + "content": str, + "content_type": "body", + "tokens": int, + "table_chunk_role": "none", + "heading": str, + "parent_headings": list[str], + "level": int, + } + + Returns an empty list if the blocks file is missing or contains only + the meta line. + """ + path = Path(blocks_path) + if not path.exists() or not path.is_file(): + return [] + + # Read blocks; first line is meta (skipped), the rest are content blocks. + content_blocks: list[dict[str, Any]] = [] + with path.open("r", encoding="utf-8") as f: + for i, line in enumerate(f): + text = line.strip() + if not text: + continue + try: + obj = json.loads(text) + except Exception: + continue + if i == 0: + # Meta line; skip. + continue + if obj.get("type") != "content": + continue + content = obj.get("content") + if not isinstance(content, str) or not content.strip(): + continue + content_blocks.append(obj) + + if not content_blocks: + return [] + + def _heading_key(block: dict[str, Any]) -> tuple: + # Group by (level, heading, tuple(parent_headings)) so blocks under + # the same heading boundary share a key. 
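+        # e.g. two consecutive blocks that both carry level=2,
+        # heading="Methods", parent_headings=["Background"] map to the
+        # identical key (2, "Methods", ("Background",)) and therefore
+        # join the same chunk group ("Background" is illustrative).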
+ return ( + int(block.get("level") or 0), + str(block.get("heading") or ""), + tuple(str(p) for p in (block.get("parent_headings") or [])), + ) + + def _emit_chunk( + chunks: list[dict[str, Any]], + *, + text: str, + heading: str, + level: int, + parent_headings: list[str], + ) -> None: + text = text.strip() + if not text: + return + idx = len(chunks) + chunk_id = f"{doc_id}-h-{idx:03d}" + try: + tokens = len(tokenizer.encode(text)) + except Exception: + tokens = max(1, int(len(text) * 0.5)) + chunks.append( + { + "chunk_id": chunk_id, + "chunk_order_index": idx, + "content": text, + "content_type": "body", + "tokens": tokens, + "table_chunk_role": "none", + "heading": heading, + "parent_headings": list(parent_headings), + "level": level, + } + ) + + # Group consecutive blocks by heading, then split each group as needed. + chunks: list[dict[str, Any]] = [] + current_key: tuple | None = None + current_blocks: list[dict[str, Any]] = [] + + def _flush_group() -> None: + if not current_blocks: + return + first = current_blocks[0] + heading = str(first.get("heading") or "") + level = int(first.get("level") or 0) + parent_headings = [ + str(p) for p in (first.get("parent_headings") or []) + ] + # Build text with heading preface to keep retrieval contexts + # self-contained. + body_parts: list[str] = [] + if heading: + body_parts.append(f"Heading: {heading}") + for blk in current_blocks: + body_parts.append(str(blk.get("content") or "").strip()) + full_text = "\n\n".join(p for p in body_parts if p) + + try: + full_tokens_ids = tokenizer.encode(full_text) + except Exception: + full_tokens_ids = [] + + if not full_tokens_ids or len(full_tokens_ids) <= max_tokens: + # Fits in one chunk. + _emit_chunk( + chunks, + text=full_text, + heading=heading, + level=level, + parent_headings=parent_headings, + ) + return + + # Group is too large; split with token-window + overlap, but keep + # heading boundary intact (no cross-heading splits). + step = max(1, max_tokens - max(0, overlap_tokens)) + for start in range(0, len(full_tokens_ids), step): + window = full_tokens_ids[start : start + max_tokens] + try: + segment_text = tokenizer.decode(window).strip() + except Exception: + segment_text = "" + if not segment_text: + continue + _emit_chunk( + chunks, + text=segment_text, + heading=heading, + level=level, + parent_headings=parent_headings, + ) + if start + max_tokens >= len(full_tokens_ids): + break + + for block in content_blocks: + key = _heading_key(block) + if current_key is None or key != current_key: + _flush_group() + current_blocks = [] + current_key = key + current_blocks.append(block) + _flush_group() + + return chunks + + async def load_lightrag_document_content( lightrag_document_path: str, ) -> tuple[str, str]: diff --git a/tests/test_pipeline_release_closure.py b/tests/test_pipeline_release_closure.py index 8bd1643d02..465c9705ad 100644 --- a/tests/test_pipeline_release_closure.py +++ b/tests/test_pipeline_release_closure.py @@ -290,6 +290,260 @@ async def _run(): asyncio.run(_run()) +@pytest.mark.offline +def test_chunk_lightrag_blocks_by_heading_groups_consecutive_blocks(tmp_path): + """``chunk_lightrag_blocks_by_heading`` groups blocks under the same + heading into a single chunk, with the heading rendered as a "Heading:" + preface so retrieval contexts stay self-describing. A heading change + creates a new chunk; ``chunk_order_index`` is sequential. 
+ """ + from lightrag.utils_pipeline import chunk_lightrag_blocks_by_heading + + blocks_path = tmp_path / "doc.blocks.jsonl" + blocks_path.write_text( + "\n".join( + [ + json.dumps({"type": "meta", "format_version": "1.0"}), + json.dumps( + { + "type": "content", + "content": "Intro paragraph one.", + "heading": "Introduction", + "level": 1, + "parent_headings": [], + } + ), + json.dumps( + { + "type": "content", + "content": "Intro paragraph two.", + "heading": "Introduction", + "level": 1, + "parent_headings": [], + } + ), + json.dumps( + { + "type": "content", + "content": "Methods paragraph.", + "heading": "Methods", + "level": 1, + "parent_headings": [], + } + ), + ] + ) + + "\n", + encoding="utf-8", + ) + + chunks = chunk_lightrag_blocks_by_heading( + str(blocks_path), + Tokenizer("mock-tokenizer", _SimpleTokenizerImpl()), + doc_id="doc-test", + max_tokens=10000, + overlap_tokens=0, + ) + + assert len(chunks) == 2 + # First chunk: both intro paragraphs under "Introduction". + assert chunks[0]["chunk_order_index"] == 0 + assert chunks[0]["chunk_id"] == "doc-test-h-000" + assert chunks[0]["heading"] == "Introduction" + assert chunks[0]["content_type"] == "body" + assert "Heading: Introduction" in chunks[0]["content"] + assert "Intro paragraph one." in chunks[0]["content"] + assert "Intro paragraph two." in chunks[0]["content"] + # Heading boundary creates the second chunk. + assert chunks[1]["chunk_order_index"] == 1 + assert chunks[1]["chunk_id"] == "doc-test-h-001" + assert chunks[1]["heading"] == "Methods" + assert "Methods paragraph." in chunks[1]["content"] + # The second chunk does NOT carry over the first heading's content. + assert "Intro paragraph" not in chunks[1]["content"] + + +@pytest.mark.offline +def test_chunk_lightrag_blocks_by_heading_splits_oversize_group(tmp_path): + """If the accumulated text under a single heading exceeds ``max_tokens``, + the group is split into multiple chunks (each carrying the same + heading metadata) using a sliding window with overlap, but splits + never cross into another heading. + """ + from lightrag.utils_pipeline import chunk_lightrag_blocks_by_heading + + long_body = "x" * 200 # mock tokenizer returns one token per char + blocks_path = tmp_path / "long.blocks.jsonl" + blocks_path.write_text( + "\n".join( + [ + json.dumps({"type": "meta", "format_version": "1.0"}), + json.dumps( + { + "type": "content", + "content": long_body, + "heading": "Solo", + "level": 1, + "parent_headings": [], + } + ), + ] + ) + + "\n", + encoding="utf-8", + ) + + chunks = chunk_lightrag_blocks_by_heading( + str(blocks_path), + Tokenizer("mock-tokenizer", _SimpleTokenizerImpl()), + doc_id="doc-long", + max_tokens=80, + overlap_tokens=10, + ) + + # 200-token body under one heading with max_tokens=80 + overlap=10 + # → at least 3 windows. All chunks share the same heading metadata. + assert len(chunks) >= 3 + assert all(chunk["heading"] == "Solo" for chunk in chunks) + # chunk_order_index runs 0..n-1 contiguously. + assert [chunk["chunk_order_index"] for chunk in chunks] == list( + range(len(chunks)) + ) + + +@pytest.mark.offline +def test_chunk_lightrag_blocks_by_heading_returns_empty_for_missing_file(tmp_path): + """Missing or empty blocks file returns ``[]``; the caller falls back + to fixed chunking with a warning. 
+ """ + from lightrag.utils_pipeline import chunk_lightrag_blocks_by_heading + + chunks = chunk_lightrag_blocks_by_heading( + str(tmp_path / "does_not_exist.blocks.jsonl"), + Tokenizer("mock-tokenizer", _SimpleTokenizerImpl()), + doc_id="doc-missing", + max_tokens=1000, + ) + assert chunks == [] + + +@pytest.mark.offline +def test_pipeline_uses_heading_chunker_for_native_S_with_blocks(tmp_path): + """End-to-end: when ``process_options`` sets chunking='S' AND the + parsed_data carries a ``blocks_path`` (native / mineru / docling + structured output), the pipeline must invoke the heading-driven + chunker instead of falling through to fixed chunking. + + Wraps ``chunk_lightrag_blocks_by_heading`` with a spy and seeds a + minimal blocks file + full_docs row + doc_status so we can drive + ``apipeline_process_enqueue_documents`` to the chunking branch. + The spy raises after recording the call so the test exits cleanly + without exercising downstream embedding / entity stages. + """ + + async def _run(): + rag = _new_rag(tmp_path) + await rag.initialize_storages() + try: + doc_id = compute_mdhash_id("native-s.docx", prefix="doc-") + blocks_path = tmp_path / "native-s.docx.parsed" / "native-s.blocks.jsonl" + blocks_path.parent.mkdir(parents=True, exist_ok=True) + blocks_path.write_text( + "\n".join( + [ + json.dumps( + { + "type": "meta", + "format": "lightrag", + "version": "1.0", + "format_version": "1.0", + } + ), + json.dumps( + { + "type": "content", + "content": "Body under heading.", + "heading": "Section A", + "level": 1, + "parent_headings": [], + } + ), + ] + ) + + "\n", + encoding="utf-8", + ) + + await rag.full_docs.upsert( + { + doc_id: { + "content": "{{LRdoc}}Body under heading.", + "file_path": "native-s.docx", + "canonical_basename": "native-s.docx", + "format": "lightrag", + "lightrag_document_path": str(blocks_path), + "parsed_engine": "native", + "process_options": "S", + "content_hash": "lr-1", + } + } + ) + await rag.doc_status.upsert( + { + doc_id: { + "status": DocStatus.PENDING, + "content_summary": "Body under heading.", + "content_length": 20, + "created_at": "2026-01-01T00:00:00+00:00", + "updated_at": "2026-01-01T00:00:01+00:00", + "file_path": "native-s.docx", + "canonical_basename": "native-s.docx", + "track_id": "track-s", + "content_hash": "lr-1", + "chunks_list": [], + "chunks_count": 0, + "metadata": {"process_options": "S"}, + } + } + ) + + calls: list[dict] = [] + import lightrag.pipeline as pipeline_module + + original = pipeline_module.chunk_lightrag_blocks_by_heading + + class _ChunkerCalled(Exception): + pass + + def _spy(blocks_path_arg, tokenizer, **kwargs): + calls.append( + { + "blocks_path": blocks_path_arg, + "doc_id": kwargs.get("doc_id"), + } + ) + # Run the real chunker so chunks_result is non-empty + # (otherwise the pipeline would fall back to F), then + # short-circuit before downstream stages. + result = original(blocks_path_arg, tokenizer, **kwargs) + raise _ChunkerCalled(repr(result)) + + pipeline_module.chunk_lightrag_blocks_by_heading = _spy + try: + await rag.apipeline_process_enqueue_documents() + finally: + pipeline_module.chunk_lightrag_blocks_by_heading = original + + # Spy was invoked exactly once, with our blocks path + doc_id. 
+            assert len(calls) == 1
+            assert calls[0]["doc_id"] == doc_id
+            assert calls[0]["blocks_path"].endswith("native-s.blocks.jsonl")
+        finally:
+            await rag.finalize_storages()
+
+    asyncio.run(_run())
+
+
 @pytest.mark.offline
 def test_purge_doc_chunks_and_kg_is_noop_for_empty_chunks(tmp_path):
     """``_purge_doc_chunks_and_kg`` with an empty chunk_ids set must be a

From 7d654c154fd57e07ccf17c4c2918ef584032e227 Mon Sep 17 00:00:00 2001
From: yangdx <gzdaniel@me.com>
Date: Tue, 5 May 2026 02:57:25 +0800
Subject: [PATCH 5/5] Fix linting

---
 lightrag/utils_pipeline.py             | 4 +---
 tests/test_pipeline_release_closure.py | 4 +---
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/lightrag/utils_pipeline.py b/lightrag/utils_pipeline.py
index 953b1adca0..e3d6350c4f 100644
--- a/lightrag/utils_pipeline.py
+++ b/lightrag/utils_pipeline.py
@@ -450,9 +450,7 @@ def _flush_group() -> None:
         first = current_blocks[0]
         heading = str(first.get("heading") or "")
         level = int(first.get("level") or 0)
-        parent_headings = [
-            str(p) for p in (first.get("parent_headings") or [])
-        ]
+        parent_headings = [str(p) for p in (first.get("parent_headings") or [])]
         # Build text with heading preface to keep retrieval contexts
         # self-contained.
         body_parts: list[str] = []
diff --git a/tests/test_pipeline_release_closure.py b/tests/test_pipeline_release_closure.py
index 465c9705ad..4457176098 100644
--- a/tests/test_pipeline_release_closure.py
+++ b/tests/test_pipeline_release_closure.py
@@ -406,9 +406,7 @@ def test_chunk_lightrag_blocks_by_heading_splits_oversize_group(tmp_path):
     assert len(chunks) >= 3
     assert all(chunk["heading"] == "Solo" for chunk in chunks)
     # chunk_order_index runs 0..n-1 contiguously.
-    assert [chunk["chunk_order_index"] for chunk in chunks] == list(
-        range(len(chunks))
-    )
+    assert [chunk["chunk_order_index"] for chunk in chunks] == list(range(len(chunks)))


 @pytest.mark.offline
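As a closing note for reviewers, a minimal sketch of driving the new
heading chunker directly, outside the pipeline (the blocks content and
file name are hypothetical; ``tokenizer`` is assumed to be any object
exposing ``encode``/``decode``, e.g. the ``Tokenizer`` wrapper the
tests construct):

    import json
    from lightrag.utils_pipeline import chunk_lightrag_blocks_by_heading

    # A tiny two-heading blocks file; the first line is the meta record.
    blocks = [
        {"type": "meta", "format_version": "1.0"},
        {"type": "content", "content": "Intro paragraph.",
         "heading": "Introduction", "level": 1, "parent_headings": []},
        {"type": "content", "content": "Methods paragraph.",
         "heading": "Methods", "level": 1, "parent_headings": []},
    ]
    with open("demo.blocks.jsonl", "w", encoding="utf-8") as f:
        f.write("\n".join(json.dumps(b) for b in blocks) + "\n")

    chunks = chunk_lightrag_blocks_by_heading(
        "demo.blocks.jsonl",
        tokenizer,  # assumed to be constructed by the caller
        doc_id="doc-demo",
        max_tokens=1024,
        overlap_tokens=100,
    )
    # Expect one chunk per heading, each prefaced with "Heading: <title>".
    for c in chunks:
        print(c["chunk_order_index"], c["heading"], c["tokens"])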