Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
154 changes: 142 additions & 12 deletions docs/FileProcessingConfiguration-zh.md

Large diffs are not rendered by default.

27 changes: 25 additions & 2 deletions lightrag/addon_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,25 @@
from typing import Any, Callable, Mapping

from lightrag.constants import DEFAULT_SUMMARY_LANGUAGE
from lightrag.utils import get_env_value
from lightrag.utils import get_env_value, logger


# Keys that used to live in addon_params but have been superseded by
# per-document ``process_options``. We log once when callers still pass them
# so existing configs surface their drift without breaking.
_DEPRECATED_ADDON_PARAM_KEYS: tuple[str, ...] = ("enable_multimodal_pipeline",)
# Process-wide memo of deprecated keys already warned about, so each key is
# logged at most once per process. Plain set, no locking — NOTE(review): a
# concurrent first call could emit a duplicate warning; presumed acceptable.
_warned_deprecated_keys: set[str] = set()


def _emit_deprecated_addon_warnings(params: Mapping[str, Any]) -> None:
    """Warn once per process about deprecated ``addon_params`` keys.

    Checks *params* for keys listed in ``_DEPRECATED_ADDON_PARAM_KEYS`` and
    emits a single warning per key, recording emitted keys in the module-level
    ``_warned_deprecated_keys`` set so repeated calls with the same stale
    config do not spam the log.

    Args:
        params: Caller-supplied addon_params mapping to inspect. Not mutated.
    """
    for key in _DEPRECATED_ADDON_PARAM_KEYS:
        if key in params and key not in _warned_deprecated_keys:
            # Lazy %-style args: the message is only formatted if the warning
            # is actually emitted (standard logging idiom; avoids f-string
            # work on the hot path once handlers filter the level).
            logger.warning(
                "addon_params['%s'] is deprecated and ignored; per-document "
                "behaviour is now controlled by filename-hint process_options "
                "(see docs/FileProcessingConfiguration-zh.md).",
                key,
            )
            _warned_deprecated_keys.add(key)


def default_addon_params() -> dict[str, Any]:
Expand All @@ -32,7 +50,12 @@ def normalize_addon_params(addon_params: Mapping[str, Any] | None) -> dict[str,
if addon_params is None:
normalized = default_addon_params()
elif isinstance(addon_params, Mapping):
normalized = dict(addon_params)
_emit_deprecated_addon_warnings(addon_params)
normalized = {
k: v
for k, v in addon_params.items()
if k not in _DEPRECATED_ADDON_PARAM_KEYS
}
else:
raise TypeError(
"addon_params must be a Mapping or None, got "
Expand Down
273 changes: 218 additions & 55 deletions lightrag/api/routers/document_routes.py

Large diffs are not rendered by default.

41 changes: 32 additions & 9 deletions lightrag/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -806,6 +806,12 @@ class DocProcessingStatus:
Used together with file_path basename for duplicate detection. Empty for
pending_parse records whose content has not been extracted yet.
"""
canonical_basename: str | None = None
"""Canonical (parser-hint stripped) basename used as the dedup index key.

Stored alongside the original ``file_path`` so backends can look records
up by ``abc.docx`` even when the user-visible name is ``abc.[native].docx``.
"""
"""Internal field: indicates if multimodal processing is complete. Not shown in repr() but accessible for debugging."""

def __post_init__(self):
Expand Down Expand Up @@ -917,9 +923,12 @@ async def get_doc_by_file_basename(
) -> tuple[str, dict[str, Any]] | None:
"""Get document by file basename (filename without directory).

Used for filename-based deduplication. Backends that can index by
basename should override this for efficiency. The default implementation
scans all documents via get_docs_by_statuses().
Used for filename-based deduplication. Comparison is performed
against the canonical (parser-hint stripped) basename, so callers may
pass either the original or the canonical form. Backends that can
index by ``canonical_basename`` should override this for efficiency.
The default implementation scans all documents via
:meth:`get_docs_by_statuses`.

Args:
basename: The filename basename to search for (e.g. "report.pdf").
Expand All @@ -929,21 +938,35 @@ async def get_doc_by_file_basename(
"""
if not basename:
return None
# Imported lazily to avoid a hard dependency at module-load time and
# because parser_routing already depends on lightrag.constants.
from lightrag.parser_routing import canonicalize_parser_hinted_basename

target = canonicalize_parser_hinted_basename(basename)
try:
docs = await self.get_docs_by_statuses(list(DocStatus))
except NotImplementedError:
raise
except Exception:
return None
for doc_id, doc in docs.items():
existing_path = (
doc.get("file_path")
stored_canonical = (
doc.get("canonical_basename")
if isinstance(doc, dict)
else getattr(doc, "file_path", None)
else getattr(doc, "canonical_basename", None)
)
if not existing_path:
continue
if Path(str(existing_path)).name == basename:
if not stored_canonical:
existing_path = (
doc.get("file_path")
if isinstance(doc, dict)
else getattr(doc, "file_path", None)
)
if not existing_path:
continue
stored_canonical = canonicalize_parser_hinted_basename(
Path(str(existing_path)).name
)
if stored_canonical == target:
return doc_id, (doc if isinstance(doc, dict) else asdict(doc))
return None

Expand Down
31 changes: 31 additions & 0 deletions lightrag/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,37 @@
}
PARSED_DIR_NAME = "__parsed__" # Dir for parsed files (renamed from __enqueued__)

# Per-file processing options carried by filename hints / LIGHTRAG_PARSER rules.
# See docs/FileProcessingConfiguration-zh.md for the full specification.
PROCESS_OPTION_IMAGES = "i"  # Enable VLM analysis for drawings/images
PROCESS_OPTION_TABLES = "t"  # Enable VLM analysis for tables
PROCESS_OPTION_EQUATIONS = "e"  # Enable VLM analysis for equations
PROCESS_OPTION_SKIP_KG = "!"  # Skip entity/relation extraction (no KG build)
PROCESS_OPTION_CHUNK_FIXED = "F"  # Fixed-length / separator chunking (default)
PROCESS_OPTION_CHUNK_RECURSIVE = "R"  # Recursive semantic chunking (currently aliased to F)
PROCESS_OPTION_CHUNK_HEADING = "S"  # Heading-driven semantic chunking

# The chunking-strategy characters, grouped so callers can detect which (if
# any) chunking mode a hint selects.
PROCESS_OPTION_CHUNK_CHARS = frozenset(
    (
        PROCESS_OPTION_CHUNK_FIXED,
        PROCESS_OPTION_CHUNK_RECURSIVE,
        PROCESS_OPTION_CHUNK_HEADING,
    )
)
# Every option character a filename hint may legally carry: the chunking
# chars plus the VLM toggles and the skip-KG marker.
SUPPORTED_PROCESS_OPTIONS = PROCESS_OPTION_CHUNK_CHARS | frozenset(
    (
        PROCESS_OPTION_IMAGES,
        PROCESS_OPTION_TABLES,
        PROCESS_OPTION_EQUATIONS,
        PROCESS_OPTION_SKIP_KG,
    )
)

DEFAULT_MAX_PARALLEL_ANALYZE = 2  # Multimodal analysis (VLM) concurrency

# Embedding configuration defaults
Expand Down
26 changes: 18 additions & 8 deletions lightrag/kg/json_doc_status_impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,23 +404,33 @@ async def get_doc_by_file_path(self, file_path: str) -> Union[dict[str, Any], No
async def get_doc_by_file_basename(
self, basename: str
) -> Union[tuple[str, dict[str, Any]], None]:
"""Find an existing record whose file_path basename matches.
"""Find an existing record whose canonical basename matches.

Compares against the stored file_path's basename so legacy records
that still hold a full path are matched the same way as new records
whose file_path is already a basename.
Compares against the stored ``canonical_basename`` field, falling
back to ``canonicalize_parser_hinted_basename(file_path)`` for
legacy records that pre-date the field. Inputs are likewise
canonicalized so callers can pass either ``abc.docx`` or
``abc.[native].docx``.
"""
if not basename:
return None
if self._storage_lock is None:
raise StorageNotInitializedError("JsonDocStatusStorage")

from lightrag.parser_routing import canonicalize_parser_hinted_basename

target = canonicalize_parser_hinted_basename(basename)
async with self._storage_lock:
for doc_id, doc_data in self._data.items():
stored_path = doc_data.get("file_path")
if not stored_path:
continue
if Path(str(stored_path)).name == basename:
stored_canonical = doc_data.get("canonical_basename")
if not stored_canonical:
stored_path = doc_data.get("file_path")
if not stored_path:
continue
stored_canonical = canonicalize_parser_hinted_basename(
Path(str(stored_path)).name
)
if stored_canonical == target:
return doc_id, doc_data
return None

Expand Down
1 change: 1 addition & 0 deletions lightrag/kg/shared_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -1289,6 +1289,7 @@ async def initialize_pipeline_status(workspace: str | None = None):
{
"autoscanned": False, # Auto-scan started
"busy": False, # Control concurrent processes
"scanning": False, # /documents/scan in progress (independent of busy)
"job_name": "-", # Current job name (indexing files/indexing texts)
"job_start": None, # Job start time
"docs": 0, # Total number of documents to be indexed
Expand Down
Loading
Loading