4 changes: 4 additions & 0 deletions lightrag/api/config.py
@@ -42,6 +42,7 @@
DEFAULT_OLLAMA_MODEL_TAG,
DEFAULT_RERANK_BINDING,
DEFAULT_ENTITY_TYPES,
DEFAULT_ENTITY_ATTRIBUTES,
)

# use the .env that is inside the current folder
@@ -474,6 +475,9 @@ def parse_args() -> argparse.Namespace:
args.cors_origins = get_env_value("CORS_ORIGINS", "*")
args.summary_language = get_env_value("SUMMARY_LANGUAGE", DEFAULT_SUMMARY_LANGUAGE)
args.entity_types = get_env_value("ENTITY_TYPES", DEFAULT_ENTITY_TYPES, list)
args.entity_attributes = get_env_value(
"ENTITY_ATTRIBUTES", DEFAULT_ENTITY_ATTRIBUTES, list
)
args.whitelist_paths = get_env_value("WHITELIST_PATHS", "/health,/api/*")

# For JWT Auth
1 change: 1 addition & 0 deletions lightrag/api/lightrag_server.py
@@ -1168,6 +1168,7 @@ async def server_rerank_func(
addon_params={
"language": args.summary_language,
"entity_types": args.entity_types,
"entity_attributes": args.entity_attributes,
},
ollama_server_infos=ollama_server_infos,
)
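The change above simply threads `entity_attributes` into `addon_params` next to the existing keys. For reference, a minimal sketch of the equivalent direct-library wiring, assuming the `LightRAG` constructor accepts `working_dir` and `addon_params` as this diff suggests (the attribute names below are hypothetical, and real usage would also supply model functions):

from lightrag import LightRAG

rag = LightRAG(
    working_dir="./rag_storage",
    addon_params={
        "language": "English",
        "entity_types": ["Person", "Organization", "Event"],
        # New in this PR: extra per-entity attributes for the extractor to infer.
        "entity_attributes": ["sentiment", "urgency", "confidence"],
    },
)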
5 changes: 5 additions & 0 deletions lightrag/constants.py
@@ -40,6 +40,11 @@
"NaturalObject",
]

# Default entity attributes: empty means no extra attribute field is extracted.
# Override with ENTITY_ATTRIBUTES env var, e.g.:
# ENTITY_ATTRIBUTES='["sentiment","urgency","business_impact","confidence"]'
DEFAULT_ENTITY_ATTRIBUTES: list[str] = []

# Separator for description, source_id, and relation-key fields (cannot be changed after data is inserted)
GRAPH_FIELD_SEP = "<SEP>"

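As the comment above notes, the feature is opt-in via the `ENTITY_ATTRIBUTES` environment variable. A quick sketch of how the documented value would decode, assuming `get_env_value(..., list)` JSON-decodes the raw string (that helper is not shown in this diff):

import json
import os

os.environ["ENTITY_ATTRIBUTES"] = '["sentiment", "urgency", "business_impact", "confidence"]'

# Rough equivalent of get_env_value("ENTITY_ATTRIBUTES", DEFAULT_ENTITY_ATTRIBUTES, list)
# under the assumption above.
entity_attributes = json.loads(os.environ["ENTITY_ATTRIBUTES"])
assert entity_attributes == ["sentiment", "urgency", "business_impact", "confidence"]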
97 changes: 94 additions & 3 deletions lightrag/operate.py
@@ -388,11 +388,14 @@ def _handle_single_entity_extraction(
chunk_key: str,
timestamp: int,
file_path: str = "unknown_source",
entity_attributes: list[str] | None = None,
):
if len(record_attributes) != 4 or "entity" not in record_attributes[0]:
# Accept 4 fields (no attributes) or 5 fields (with attributes JSON as 5th field).
expected_fields = 5 if entity_attributes else 4
if len(record_attributes) not in (4, 5) or "entity" not in record_attributes[0]:
if len(record_attributes) > 1 and "entity" in record_attributes[0]:
logger.warning(
f"{chunk_key}: LLM output format error; found {len(record_attributes)}/4 fields on ENTITY `{record_attributes[1]}` @ `{record_attributes[2] if len(record_attributes) > 2 else 'N/A'}`"
f"{chunk_key}: LLM output format error; found {len(record_attributes)}/{expected_fields} fields on ENTITY `{record_attributes[1]}` @ `{record_attributes[2] if len(record_attributes) > 2 else 'N/A'}`"
)
logger.debug(record_attributes)
return None
@@ -449,13 +452,36 @@ def _handle_single_entity_extraction(
)
return None

# Parse optional 5th field: compact JSON object with entity attributes.
# Produced only when entity_attributes is non-empty and the LLM chose to
# output them. Malformed JSON is logged and dropped so that extraction
# still succeeds for the entity itself.
attributes: dict | None = None
if len(record_attributes) >= 5 and record_attributes[4].strip():
raw_attrs = record_attributes[4].strip()
try:
parsed = json.loads(raw_attrs)
if isinstance(parsed, dict):
attributes = parsed
else:
logger.warning(
f"{chunk_key}: entity attributes field is not a JSON object "
f"for '{entity_name}': {raw_attrs[:120]}"
)
except json.JSONDecodeError:
logger.warning(
f"{chunk_key}: could not parse entity attributes JSON "
f"for '{entity_name}': {raw_attrs[:120]}"
)

return dict(
entity_name=entity_name,
entity_type=entity_type,
description=entity_description,
source_id=chunk_key,
file_path=file_path,
timestamp=timestamp,
attributes=attributes,
)

except ValueError as e:
@@ -941,6 +967,7 @@ async def _process_extraction_result(
file_path: str = "unknown_source",
tuple_delimiter: str = "<|#|>",
completion_delimiter: str = "<|COMPLETE|>",
entity_attributes: list[str] | None = None,
) -> tuple[dict, dict]:
"""Process a single extraction result (either initial or gleaning)
Args:
@@ -1023,7 +1050,8 @@

# Try to parse as entity
entity_data = _handle_single_entity_extraction(
record_attributes, chunk_key, timestamp, file_path
record_attributes, chunk_key, timestamp, file_path,
entity_attributes=entity_attributes,
)
if entity_data is not None:
truncated_name = _truncate_entity_identifier(
@@ -1124,6 +1152,7 @@ async def _update_entity_storage(
file_paths: list[str],
source_chunk_ids: list[str],
truncation_info: str = "",
attributes: dict | None = None,
):
try:
# Update entity in graph storage (critical path)
@@ -1138,6 +1167,11 @@
"created_at": int(time.time()),
"truncate": truncation_info,
}
# Store merged attributes when present. Stored as a JSON string so that
# all graph backends (NetworkX, Postgres, Neo4j) can round-trip it without
# schema changes. Consumers read it back with json.loads.
if attributes:
updated_entity_data["attributes"] = json.dumps(attributes)
await knowledge_graph_inst.upsert_node(entity_name, updated_entity_data)

# Update entity in vector database (equally critical)
@@ -1255,6 +1289,8 @@ async def _update_entity_storage(
entity_types = []
file_paths_list = []
seen_paths = set()
# Collect attributes from all entity instances; last non-null value per key wins.
merged_attributes: dict = {}

for entity_data in all_entity_data:
if entity_data.get("description"):
@@ -1266,6 +1302,11 @@
if file_path and file_path not in seen_paths:
file_paths_list.append(file_path)
seen_paths.add(file_path)
attrs = entity_data.get("attributes")
if isinstance(attrs, dict):
merged_attributes.update(
{k: v for k, v in attrs.items() if v is not None}
)

# Apply MAX_FILE_PATHS limit
max_file_paths = global_config.get("max_file_paths", DEFAULT_MAX_FILE_PATHS)
@@ -1327,6 +1368,7 @@
file_paths_list,
limited_chunk_ids,
truncation_info,
attributes=merged_attributes or None,
)

# Log rebuild completion with truncation info
@@ -1761,6 +1803,30 @@ async def _merge_nodes_then_upsert(
reverse=True,
)[0][0]

# Merge attributes across all new entity instances (last non-null value per key wins).
# Seed from the existing node so re-processing a document doesn't wipe old attrs.
merged_attributes: dict = {}
if already_node:
existing_attrs_raw = already_node.get("attributes")
if existing_attrs_raw:
try:
existing_attrs = (
json.loads(existing_attrs_raw)
if isinstance(existing_attrs_raw, str)
else existing_attrs_raw
)
if isinstance(existing_attrs, dict):
merged_attributes.update(existing_attrs)
except (json.JSONDecodeError, TypeError):
pass

for dp in nodes_data:
attrs = dp.get("attributes")
if isinstance(attrs, dict):
merged_attributes.update(
{k: v for k, v in attrs.items() if v is not None}
)

# 7. Deduplicate nodes by description, keeping first occurrence in the same document
unique_nodes = {}
for i, dp in enumerate(nodes_data, start=1):
@@ -1912,6 +1978,8 @@ async def _merge_nodes_then_upsert(
created_at=int(time.time()),
truncate=truncation_info,
)
if merged_attributes:
node_data["attributes"] = json.dumps(merged_attributes)
await knowledge_graph_inst.upsert_node(
entity_name,
node_data=node_data,
@@ -2905,13 +2973,32 @@ async def extract_entities(
entity_types = global_config["addon_params"].get(
"entity_types", DEFAULT_ENTITY_TYPES
)
entity_attributes: list[str] = global_config["addon_params"].get(
"entity_attributes", []
) or []

examples = "\n".join(PROMPTS["entity_extraction_examples"])

# Build the attributes instruction injected into the user prompt.
# Empty list → plain 4-field format (backward-compatible).
if entity_attributes:
attrs_quoted = ", ".join(f'"{a}"' for a in entity_attributes)
entity_attributes_instruction = (
f"For each entity append a 5th field: a compact single-line JSON object "
f"with keys [{attrs_quoted}]. Infer each value from the context; "
f"use null if not determinable. "
f"Numeric confidence must be 0.0–1.0. "
f"Example: {{\"sentiment\": \"negative\", \"urgency\": \"high\", \"confidence\": 0.85}}"
)
else:
entity_attributes_instruction = "No additional attributes are required."

example_context_base = dict(
tuple_delimiter=PROMPTS["DEFAULT_TUPLE_DELIMITER"],
completion_delimiter=PROMPTS["DEFAULT_COMPLETION_DELIMITER"],
entity_types=", ".join(entity_types),
entity_attributes=", ".join(entity_attributes),
entity_attributes_instruction=entity_attributes_instruction,
language=language,
)
# add the example format
@@ -2921,6 +3008,8 @@
tuple_delimiter=PROMPTS["DEFAULT_TUPLE_DELIMITER"],
completion_delimiter=PROMPTS["DEFAULT_COMPLETION_DELIMITER"],
entity_types=",".join(entity_types),
entity_attributes=", ".join(entity_attributes),
entity_attributes_instruction=entity_attributes_instruction,
examples=examples,
language=language,
)
@@ -2981,6 +3070,7 @@ async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]):
file_path,
tuple_delimiter=context_base["tuple_delimiter"],
completion_delimiter=context_base["completion_delimiter"],
entity_attributes=entity_attributes,
)

# Process additional gleaning results only once when entity_extract_max_gleaning is greater than zero.
@@ -3024,6 +3114,7 @@ async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]):
file_path,
tuple_delimiter=context_base["tuple_delimiter"],
completion_delimiter=context_base["completion_delimiter"],
entity_attributes=entity_attributes,
)

# Merge results - compare description lengths to choose better version
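Taken together, the operate.py changes add one parse step (the optional 5th field) and one merge step (last non-null value per key wins, seeded from the existing node). A standalone sketch of both, using the default `<|#|>` delimiter from this diff; this mirrors, but is not, the library code:

import json

# 1) Parse the optional 5th field of an entity record.
record = (
    'entity<|#|>AcmeCorp<|#|>Organization<|#|>Vendor named in the support ticket.'
    '<|#|>{"sentiment": "negative", "urgency": "high", "confidence": 0.85}'
)
fields = record.split("<|#|>")

attributes = None
if len(fields) >= 5 and fields[4].strip():
    try:
        parsed = json.loads(fields[4].strip())
        if isinstance(parsed, dict):
            attributes = parsed  # malformed JSON is dropped; the entity is still kept
    except json.JSONDecodeError:
        pass

# 2) Merge across instances, seeded from the node's stored JSON-string attributes.
merged = {}
existing_raw = '{"sentiment": "neutral", "urgency": "low"}'  # as stored on the node
existing = json.loads(existing_raw)
if isinstance(existing, dict):
    merged.update(existing)
for attrs in ({"sentiment": "negative", "urgency": None}, attributes or {}):
    merged.update({k: v for k, v in attrs.items() if v is not None})

print(merged)  # {'sentiment': 'negative', 'urgency': 'high', 'confidence': 0.85}

The merged dict is what `upsert_node` would receive, serialized with `json.dumps`, under the node's "attributes" key.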
4 changes: 4 additions & 0 deletions lightrag/prompt.py
@@ -20,6 +20,8 @@
* `entity_description`: Provide a concise yet comprehensive description of the entity's attributes and activities, based *solely* on the information present in the input text.
* **Output Format - Entities:** Output a total of 4 fields for each entity, delimited by `{tuple_delimiter}`, on a single line. The first field *must* be the literal string `entity`.
* Format: `entity{tuple_delimiter}entity_name{tuple_delimiter}entity_type{tuple_delimiter}entity_description`
* **Attributes (when requested):** If `{entity_attributes}` is non-empty, append a 5th field containing a compact JSON object with keys `{entity_attributes}`. Infer each value from the entity context; use `null` if not determinable. The JSON must be on a single line with no surrounding whitespace. Numeric confidence values must be in the range 0.0–1.0.
* Extended format: `entity{tuple_delimiter}entity_name{tuple_delimiter}entity_type{tuple_delimiter}entity_description{tuple_delimiter}{{"attr1": "val1", "attr2": "val2"}}`

2. **Relationship Extraction & Output:**
* **Identification:** Identify direct, clearly stated, and meaningful relationships between previously extracted entities.
@@ -32,6 +34,7 @@
* `relationship_description`: A concise explanation of the nature of the relationship between the source and target entities, providing a clear rationale for their connection.
* **Output Format - Relationships:** Output a total of 5 fields for each relationship, delimited by `{tuple_delimiter}`, on a single line. The first field *must* be the literal string `relation`.
* Format: `relation{tuple_delimiter}source_entity{tuple_delimiter}target_entity{tuple_delimiter}relationship_keywords{tuple_delimiter}relationship_description`
* Relationships never carry an attributes field — the 5-field format is unchanged regardless of `{entity_attributes}`.

3. **Delimiter Usage Protocol:**
* The `{tuple_delimiter}` is a complete, atomic marker and **must not be filled with content**. It serves strictly as a field separator.
@@ -68,6 +71,7 @@
2. **Output Content Only:** Output *only* the extracted list of entities and relationships. Do not include any introductory or concluding remarks, explanations, or additional text before or after the list.
3. **Completion Signal:** Output `{completion_delimiter}` as the final line after all relevant entities and relationships have been extracted and presented.
4. **Output Language:** Ensure the output language is {language}. Proper nouns (e.g., personal names, place names, organization names) must be kept in their original language and not translated.
5. **Entity Attributes:** {entity_attributes_instruction}

---Data to be Processed---
<Entity_types>
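With a non-empty `{entity_attributes}` such as `sentiment, urgency, confidence`, a well-formed completion under the default delimiters from this diff would look like the following (hypothetical content, not taken from the repo's example set):

entity<|#|>Jane Doe<|#|>Person<|#|>Customer who escalated the billing complaint.<|#|>{"sentiment": "negative", "urgency": "high", "confidence": 0.85}
relation<|#|>Jane Doe<|#|>AcmeCorp<|#|>complaint, escalation<|#|>Jane Doe escalated a billing complaint against AcmeCorp.
<|COMPLETE|>

Note that the relationship line keeps its unchanged 5-field format, while the entity line carries the extra single-line JSON object as its 5th field.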