Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
154 changes: 142 additions & 12 deletions docs/FileProcessingConfiguration-zh.md

Large diffs are not rendered by default.

27 changes: 25 additions & 2 deletions lightrag/addon_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,25 @@
from typing import Any, Callable, Mapping

from lightrag.constants import DEFAULT_SUMMARY_LANGUAGE
from lightrag.utils import get_env_value
from lightrag.utils import get_env_value, logger


# Keys that used to live in addon_params but have been superseded by
# per-document ``process_options``. We log once when callers still pass them
# so existing configs surface their drift without breaking.
_DEPRECATED_ADDON_PARAM_KEYS: tuple[str, ...] = ("enable_multimodal_pipeline",)
# Process-wide memo of deprecated keys already warned about, so each key is
# logged at most once per process. Plain set, no locking — NOTE(review): a
# concurrent first call could emit a duplicate warning; presumed acceptable.
_warned_deprecated_keys: set[str] = set()


def _emit_deprecated_addon_warnings(params: Mapping[str, Any]) -> None:
    """Warn once per process about deprecated ``addon_params`` keys.

    Checks *params* for keys listed in ``_DEPRECATED_ADDON_PARAM_KEYS`` and
    emits a single warning per key, recording emitted keys in the module-level
    ``_warned_deprecated_keys`` set so repeated calls with the same stale
    config do not spam the log.

    Args:
        params: Caller-supplied addon_params mapping to inspect. Not mutated.
    """
    for key in _DEPRECATED_ADDON_PARAM_KEYS:
        if key in params and key not in _warned_deprecated_keys:
            # Lazy %-style args: the message is only formatted if the warning
            # is actually emitted (standard logging idiom; avoids f-string
            # work on the hot path once handlers filter the level).
            logger.warning(
                "addon_params['%s'] is deprecated and ignored; per-document "
                "behaviour is now controlled by filename-hint process_options "
                "(see docs/FileProcessingConfiguration-zh.md).",
                key,
            )
            _warned_deprecated_keys.add(key)


def default_addon_params() -> dict[str, Any]:
Expand All @@ -32,7 +50,12 @@ def normalize_addon_params(addon_params: Mapping[str, Any] | None) -> dict[str,
if addon_params is None:
normalized = default_addon_params()
elif isinstance(addon_params, Mapping):
normalized = dict(addon_params)
_emit_deprecated_addon_warnings(addon_params)
normalized = {
k: v
for k, v in addon_params.items()
if k not in _DEPRECATED_ADDON_PARAM_KEYS
}
else:
raise TypeError(
"addon_params must be a Mapping or None, got "
Expand Down
273 changes: 218 additions & 55 deletions lightrag/api/routers/document_routes.py

Large diffs are not rendered by default.

41 changes: 32 additions & 9 deletions lightrag/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -806,6 +806,12 @@ class DocProcessingStatus:
Used together with file_path basename for duplicate detection. Empty for
pending_parse records whose content has not been extracted yet.
"""
canonical_basename: str | None = None
"""Canonical (parser-hint stripped) basename used as the dedup index key.

Stored alongside the original ``file_path`` so backends can look records
up by ``abc.docx`` even when the user-visible name is ``abc.[native].docx``.
"""
"""Internal field: indicates if multimodal processing is complete. Not shown in repr() but accessible for debugging."""

def __post_init__(self):
Expand Down Expand Up @@ -917,9 +923,12 @@ async def get_doc_by_file_basename(
) -> tuple[str, dict[str, Any]] | None:
"""Get document by file basename (filename without directory).

Used for filename-based deduplication. Backends that can index by
basename should override this for efficiency. The default implementation
scans all documents via get_docs_by_statuses().
Used for filename-based deduplication. Comparison is performed
against the canonical (parser-hint stripped) basename, so callers may
pass either the original or the canonical form. Backends that can
index by ``canonical_basename`` should override this for efficiency.
The default implementation scans all documents via
:meth:`get_docs_by_statuses`.

Args:
basename: The filename basename to search for (e.g. "report.pdf").
Expand All @@ -929,21 +938,35 @@ async def get_doc_by_file_basename(
"""
if not basename:
return None
# Imported lazily to avoid a hard dependency at module-load time and
# because parser_routing already depends on lightrag.constants.
from lightrag.parser_routing import canonicalize_parser_hinted_basename

target = canonicalize_parser_hinted_basename(basename)
try:
docs = await self.get_docs_by_statuses(list(DocStatus))
except NotImplementedError:
raise
except Exception:
return None
for doc_id, doc in docs.items():
existing_path = (
doc.get("file_path")
stored_canonical = (
doc.get("canonical_basename")
if isinstance(doc, dict)
else getattr(doc, "file_path", None)
else getattr(doc, "canonical_basename", None)
)
if not existing_path:
continue
if Path(str(existing_path)).name == basename:
if not stored_canonical:
existing_path = (
doc.get("file_path")
if isinstance(doc, dict)
else getattr(doc, "file_path", None)
)
if not existing_path:
continue
stored_canonical = canonicalize_parser_hinted_basename(
Path(str(existing_path)).name
)
if stored_canonical == target:
return doc_id, (doc if isinstance(doc, dict) else asdict(doc))
return None

Expand Down
31 changes: 31 additions & 0 deletions lightrag/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,37 @@
}
PARSED_DIR_NAME = "__parsed__" # Dir for parsed files (renamed from __enqueued__)

# Per-file processing options carried by filename hints / LIGHTRAG_PARSER rules.
# See docs/FileProcessingConfiguration-zh.md for the full specification.
PROCESS_OPTION_IMAGES = "i"  # Enable VLM analysis for drawings/images
PROCESS_OPTION_TABLES = "t"  # Enable VLM analysis for tables
PROCESS_OPTION_EQUATIONS = "e"  # Enable VLM analysis for equations
PROCESS_OPTION_SKIP_KG = "!"  # Skip entity/relation extraction (no KG build)
PROCESS_OPTION_CHUNK_FIXED = "F"  # Fixed-length / separator chunking (default)
PROCESS_OPTION_CHUNK_RECURSIVE = "R"  # Recursive semantic chunking (currently aliased to F)
PROCESS_OPTION_CHUNK_HEADING = "S"  # Heading-driven semantic chunking

# The chunking-strategy characters, grouped so callers can detect which (if
# any) chunking mode a hint selects.
PROCESS_OPTION_CHUNK_CHARS = frozenset(
    (
        PROCESS_OPTION_CHUNK_FIXED,
        PROCESS_OPTION_CHUNK_RECURSIVE,
        PROCESS_OPTION_CHUNK_HEADING,
    )
)
# Every option character a filename hint may legally carry: the chunking
# chars plus the VLM toggles and the skip-KG marker.
SUPPORTED_PROCESS_OPTIONS = PROCESS_OPTION_CHUNK_CHARS | frozenset(
    (
        PROCESS_OPTION_IMAGES,
        PROCESS_OPTION_TABLES,
        PROCESS_OPTION_EQUATIONS,
        PROCESS_OPTION_SKIP_KG,
    )
)

DEFAULT_MAX_PARALLEL_ANALYZE = 2  # Multimodal analysis (VLM) concurrency

# Embedding configuration defaults
Expand Down
26 changes: 18 additions & 8 deletions lightrag/kg/json_doc_status_impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,23 +404,33 @@ async def get_doc_by_file_path(self, file_path: str) -> Union[dict[str, Any], No
async def get_doc_by_file_basename(
self, basename: str
) -> Union[tuple[str, dict[str, Any]], None]:
"""Find an existing record whose file_path basename matches.
"""Find an existing record whose canonical basename matches.

Compares against the stored file_path's basename so legacy records
that still hold a full path are matched the same way as new records
whose file_path is already a basename.
Compares against the stored ``canonical_basename`` field, falling
back to ``canonicalize_parser_hinted_basename(file_path)`` for
legacy records that pre-date the field. Inputs are likewise
canonicalized so callers can pass either ``abc.docx`` or
``abc.[native].docx``.
"""
if not basename:
return None
if self._storage_lock is None:
raise StorageNotInitializedError("JsonDocStatusStorage")

from lightrag.parser_routing import canonicalize_parser_hinted_basename

target = canonicalize_parser_hinted_basename(basename)
async with self._storage_lock:
for doc_id, doc_data in self._data.items():
stored_path = doc_data.get("file_path")
if not stored_path:
continue
if Path(str(stored_path)).name == basename:
stored_canonical = doc_data.get("canonical_basename")
if not stored_canonical:
stored_path = doc_data.get("file_path")
if not stored_path:
continue
stored_canonical = canonicalize_parser_hinted_basename(
Path(str(stored_path)).name
)
if stored_canonical == target:
return doc_id, doc_data
return None

Expand Down
1 change: 1 addition & 0 deletions lightrag/kg/shared_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -1289,6 +1289,7 @@ async def initialize_pipeline_status(workspace: str | None = None):
{
"autoscanned": False, # Auto-scan started
"busy": False, # Control concurrent processes
"scanning": False, # /documents/scan in progress (independent of busy)
"job_name": "-", # Current job name (indexing files/indexing texts)
"job_start": None, # Job start time
"docs": 0, # Total number of documents to be indexed
Expand Down
Loading
Loading