-
Notifications
You must be signed in to change notification settings - Fork 260
fix(langchain): preserve LangGraph control flow traces #1632
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 2 commits
f5e5865
19a231e
ad2e408
fccbffc
12bc17b
9414be8
55bb373
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -135,6 +135,9 @@ | |
| self._updated_completion_start_time_memo: Set[UUID] = set() | ||
| self._propagation_context_manager: Optional[_AgnosticContextManager] = None | ||
| self._trace_context = trace_context | ||
| # LangGraph resumes as a fresh root callback run after interrupting, so we keep | ||
| # just enough trace context to stitch the resume back onto the original trace. | ||
| self._resume_trace_context: Optional[TraceContext] = None | ||
| self._child_to_parent_run_id_map: Dict[UUID, Optional[UUID]] = {} | ||
|
|
||
| self.last_trace_id: Optional[str] = None | ||
|
|
@@ -161,6 +164,44 @@ | |
|
|
||
| self._updated_completion_start_time_memo.add(run_id) | ||
|
|
||
def _consume_root_trace_context(self) -> Optional[TraceContext]:
    """Return the trace context a new root observation should be started with.

    Resolution order:

    1. An explicit ``self._trace_context`` is returned as-is and is never
       cleared by this method.
    2. If the current span (from ``trace.get_current_span()``) has a valid
       span context, ``None`` is returned and the pending resume context is
       left untouched, so the new observation attaches to that parent.
    3. Otherwise the pending ``self._resume_trace_context`` (which may be
       ``None``) is returned and cleared, so it is handed out at most once.
    """
    explicit_context = self._trace_context
    if explicit_context is not None:
        return explicit_context

    # Only reuse the pending resume context when this callback run has no
    # active parent span of its own. Nested callbacks should attach normally.
    if trace.get_current_span().get_span_context().is_valid:
        return None

    # Take the pending context and reset the slot in a single step.
    pending_context, self._resume_trace_context = self._resume_trace_context, None
    return pending_context
|
|
||
| def _clear_resume_trace_context(self) -> None: | ||
| self._resume_trace_context = None | ||
|
|
||
| def _persist_resume_trace_context(self, observation: Any) -> None: | ||
| if self._trace_context is not None: | ||
| return | ||
|
|
||
| self._resume_trace_context = { | ||
| "trace_id": observation.trace_id, | ||
| "parent_span_id": observation.id, | ||
| } | ||
|
hassiebp marked this conversation as resolved.
Outdated
|
||
|
|
||
def _get_error_level_and_status_message(
    self, error: BaseException
) -> tuple[Literal["DEFAULT", "ERROR"], str]:
    """Map an exception to the observation level and status message to record.

    Args:
        error: The exception passed to an ``on_*_error`` callback.

    Returns:
        ``("DEFAULT", message)`` when ``error`` is an instance of any type in
        ``CONTROL_FLOW_EXCEPTION_TYPES``, otherwise ``("ERROR", message)``.
        ``message`` is ``str(error)``, or the exception class name when
        ``str(error)`` is empty.
    """
    # str(error) is "" for exceptions raised without arguments; fall back to
    # the class name so the recorded status message is never blank.
    status_message = str(error) or type(error).__name__

    # LangGraph uses GraphBubbleUp subclasses for expected control flow such as
    # interrupts and handoffs, so they should stay visible without being errors.
    if any(isinstance(error, t) for t in CONTROL_FLOW_EXCEPTION_TYPES):
        return "DEFAULT", status_message

    return "ERROR", status_message
|
|
||
| def _get_observation_type_from_serialized( | ||
| self, serialized: Optional[Dict[str, Any]], callback_type: str, **kwargs: Any | ||
| ) -> Union[ | ||
|
|
@@ -256,13 +297,22 @@ | |
| observation = self._detach_observation(run_id) | ||
|
|
||
| if observation is not None: | ||
| level, status_message = self._get_error_level_and_status_message(error) | ||
| observation.update( | ||
| level="ERROR", | ||
| status_message=str(error), | ||
| level=cast( | ||
| Optional[Literal["DEBUG", "DEFAULT", "WARNING", "ERROR"]], | ||
| level, | ||
| ), | ||
| status_message=status_message, | ||
| input=kwargs.get("inputs"), | ||
| cost_details={"total": 0}, | ||
| ).end() | ||
|
|
||
| if parent_run_id is None and level == "DEFAULT": | ||
| self._persist_resume_trace_context(observation) | ||
| elif parent_run_id is None: | ||
| self._clear_resume_trace_context() | ||
|
|
||
|
Comment on lines
460
to
+478
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

🟡 In `on_chain_error`, `_persist_resume_trace_context` is correctly called before `observation.end()` (line 868), so the resume key is saved even if `end()` throws. In the three non-chain handlers (`on_retriever_error`, `on_tool_error`, `on_llm_error`), `observation.update().end()` is chained and `_persist_resume_trace_context` is called afterward; if `end()` throws, the `except` block is entered without saving the resume key, and the subsequent `finally: _reset(run_id)` destroys the entire `_RootRunState` including its `resume_key`, losing it permanently. Fix by moving `_persist_resume_trace_context` before the `.end()` call in these three handlers, mirroring `on_chain_error`.

Extended reasoning...

What the bug is and how it manifests: In `on_chain_error`, `observation.update()` and `observation.end()` are separate calls, with `_persist_resume_trace_context` invoked between them (line 868 before line 875). Even if `observation.end()` raises, the resume key has already been transferred into `_pending_resume_trace_contexts`. The three non-chain handlers (`on_retriever_error`, `on_tool_error`, `on_llm_error`) use `observation.update(...).end()` as a single chained expression, then call `_persist_resume_trace_context` afterward. If the chained `.end()` raises, Python immediately jumps to the `except` block and `_persist_resume_trace_context` is never reached.

The specific code path that triggers it: Three conditions must hold simultaneously: (1) the error handler is invoked for a root-level run (`parent_run_id is None`); (2) the exception is a LangGraph control-flow type, so `level == "DEFAULT"`; (3) `observation.update().end()` raises internally. When all three hold, the resume key stored in `root_run_state.resume_key` is never popped into `_pending_resume_trace_contexts`.
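Why existing code does not prevent it: The `finally: if parent_run_id is None: self._reset(run_id)` block then executes unconditionally. `_reset` calls `self._root_run_states.pop(run_state.root_run_id, None)`, which removes the entire `_RootRunState` object. Since `_persist_resume_trace_context` was never called, `root_run_state.resume_key` still holds the thread's resume key — but the object is now discarded. The key never reaches `_pending_resume_trace_contexts`.

Step-by-step proof (the original list was lost in extraction; the steps below restate the path described above):
1. A root-level run (`parent_run_id is None`) raises a LangGraph control-flow exception, and one of `on_retriever_error`, `on_tool_error`, or `on_llm_error` is invoked.
2. The handler computes `level == "DEFAULT"` for the exception.
3. The chained `observation.update(...).end()` raises inside `.end()`.
4. Control jumps to the `except` block, so `_persist_resume_trace_context` is never called.
5. The `finally` block calls `_reset(run_id)`, which pops the `_RootRunState` that still holds `resume_key`.
6. The resume key is gone and never reaches `_pending_resume_trace_contexts`.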
How to fix Split observation.update().end() into separate calls and move _persist_resume_trace_context / _clear_root_run_resume_key before the .end() call in all three non-chain handlers, mirroring on_chain_error. The observation.trace_id and observation.id fields are assigned at span creation time, not at end time, so calling persist before end() is safe. |
||
| except Exception as e: | ||
| langfuse_logger.exception(e) | ||
|
|
||
|
|
@@ -382,7 +432,7 @@ | |
| obs = self._get_parent_observation(parent_run_id) | ||
| if isinstance(obs, Langfuse): | ||
| span = obs.start_observation( | ||
| trace_context=self._trace_context, | ||
| trace_context=self._consume_root_trace_context(), | ||
| name=span_name, | ||
| as_type=observation_type, | ||
| metadata=span_metadata, | ||
|
|
@@ -586,6 +636,7 @@ | |
| ) | ||
|
|
||
| if parent_run_id is None: | ||
| self._clear_resume_trace_context() | ||
| self._exit_propagation_context() | ||
|
|
||
| span.end() | ||
|
|
@@ -611,10 +662,7 @@ | |
| ) -> None: | ||
| try: | ||
| self._log_debug_event("on_chain_error", run_id, parent_run_id, error=error) | ||
| if any(isinstance(error, t) for t in CONTROL_FLOW_EXCEPTION_TYPES): | ||
| level = None | ||
| else: | ||
| level = "ERROR" | ||
| level, status_message = self._get_error_level_and_status_message(error) | ||
|
|
||
| observation = self._detach_observation(run_id) | ||
|
|
||
|
|
@@ -624,12 +672,16 @@ | |
| Optional[Literal["DEBUG", "DEFAULT", "WARNING", "ERROR"]], | ||
| level, | ||
| ), | ||
| status_message=str(error) if level else None, | ||
| status_message=status_message, | ||
| input=kwargs.get("inputs"), | ||
| cost_details={"total": 0}, | ||
| ) | ||
|
|
||
| if parent_run_id is None: | ||
| if level == "DEFAULT": | ||
| self._persist_resume_trace_context(observation) | ||
| else: | ||
| self._clear_resume_trace_context() | ||
| self._exit_propagation_context() | ||
|
|
||
| observation.end() | ||
|
|
@@ -739,13 +791,24 @@ | |
| serialized, "tool", **kwargs | ||
| ) | ||
|
|
||
| span = self._get_parent_observation(parent_run_id).start_observation( | ||
| name=self.get_langchain_run_name(serialized, **kwargs), | ||
| as_type=observation_type, | ||
| input=input_str, | ||
| metadata=meta, | ||
| level="DEBUG" if tags and LANGSMITH_TAG_HIDDEN in tags else None, | ||
| ) | ||
| parent_observation = self._get_parent_observation(parent_run_id) | ||
| if isinstance(parent_observation, Langfuse): | ||
| span = parent_observation.start_observation( | ||
| trace_context=self._consume_root_trace_context(), | ||
| name=self.get_langchain_run_name(serialized, **kwargs), | ||
| as_type=observation_type, | ||
| input=input_str, | ||
| metadata=meta, | ||
| level="DEBUG" if tags and LANGSMITH_TAG_HIDDEN in tags else None, | ||
| ) | ||
| else: | ||
| span = parent_observation.start_observation( | ||
| name=self.get_langchain_run_name(serialized, **kwargs), | ||
| as_type=observation_type, | ||
| input=input_str, | ||
| metadata=meta, | ||
| level="DEBUG" if tags and LANGSMITH_TAG_HIDDEN in tags else None, | ||
| ) | ||
|
|
||
| self._attach_observation(run_id, span) | ||
|
|
||
|
claude[bot] marked this conversation as resolved.
|
||
|
|
@@ -780,16 +843,30 @@ | |
| observation_type = self._get_observation_type_from_serialized( | ||
| serialized, "retriever", **kwargs | ||
| ) | ||
| span = self._get_parent_observation(parent_run_id).start_observation( | ||
| name=span_name, | ||
| as_type=observation_type, | ||
| metadata=span_metadata, | ||
| input=query, | ||
| level=cast( | ||
| Optional[Literal["DEBUG", "DEFAULT", "WARNING", "ERROR"]], | ||
| span_level, | ||
| ), | ||
| ) | ||
| parent_observation = self._get_parent_observation(parent_run_id) | ||
| if isinstance(parent_observation, Langfuse): | ||
| span = parent_observation.start_observation( | ||
| trace_context=self._consume_root_trace_context(), | ||
| name=span_name, | ||
| as_type=observation_type, | ||
| metadata=span_metadata, | ||
| input=query, | ||
| level=cast( | ||
| Optional[Literal["DEBUG", "DEFAULT", "WARNING", "ERROR"]], | ||
| span_level, | ||
| ), | ||
| ) | ||
| else: | ||
| span = parent_observation.start_observation( | ||
| name=span_name, | ||
| as_type=observation_type, | ||
| metadata=span_metadata, | ||
| input=query, | ||
| level=cast( | ||
| Optional[Literal["DEBUG", "DEFAULT", "WARNING", "ERROR"]], | ||
| span_level, | ||
| ), | ||
| ) | ||
|
|
||
| self._attach_observation(run_id, span) | ||
|
|
||
|
|
@@ -811,6 +888,8 @@ | |
| observation = self._detach_observation(run_id) | ||
|
|
||
| if observation is not None: | ||
| if parent_run_id is None: | ||
| self._clear_resume_trace_context() | ||
| observation.update( | ||
| output=documents, | ||
| input=kwargs.get("inputs"), | ||
|
|
@@ -833,6 +912,8 @@ | |
| observation = self._detach_observation(run_id) | ||
|
|
||
| if observation is not None: | ||
| if parent_run_id is None: | ||
| self._clear_resume_trace_context() | ||
| observation.update( | ||
| output=output, | ||
| input=kwargs.get("inputs"), | ||
|
claude[bot] marked this conversation as resolved.
|
||
|
|
@@ -854,15 +935,24 @@ | |
| observation = self._detach_observation(run_id) | ||
|
|
||
| if observation is not None: | ||
| level, status_message = self._get_error_level_and_status_message(error) | ||
| observation.update( | ||
| status_message=str(error), | ||
| level="ERROR", | ||
| status_message=status_message, | ||
| level=cast( | ||
| Optional[Literal["DEBUG", "DEFAULT", "WARNING", "ERROR"]], | ||
| level, | ||
| ), | ||
| input=kwargs.get("inputs"), | ||
| cost_details={"total": 0}, | ||
| ).end() | ||
|
|
||
| if parent_run_id is None and level == "DEFAULT": | ||
| self._persist_resume_trace_context(observation) | ||
| elif parent_run_id is None: | ||
| self._clear_resume_trace_context() | ||
|
|
||
| except Exception as e: | ||
| langfuse_logger.exception(e) | ||
|
Check warning on line 955 in langfuse/langchain/CallbackHandler.py
|
||
|
claude[bot] marked this conversation as resolved.
|
||
|
|
||
| def __on_llm_action( | ||
| self, | ||
|
|
@@ -919,9 +1009,17 @@ | |
| "prompt": registered_prompt, | ||
| } | ||
|
|
||
| generation = self._get_parent_observation(parent_run_id).start_observation( | ||
| as_type="generation", **content | ||
| ) # type: ignore | ||
| parent_observation = self._get_parent_observation(parent_run_id) | ||
| if isinstance(parent_observation, Langfuse): | ||
| generation = parent_observation.start_observation( | ||
| trace_context=self._consume_root_trace_context(), | ||
| as_type="generation", | ||
| **content, | ||
| ) # type: ignore | ||
| else: | ||
| generation = parent_observation.start_observation( | ||
| as_type="generation", **content | ||
| ) # type: ignore | ||
| self._attach_observation(run_id, generation) | ||
|
|
||
| self.last_trace_id = self._runs[run_id].trace_id | ||
|
|
@@ -1034,6 +1132,7 @@ | |
| self._updated_completion_start_time_memo.discard(run_id) | ||
|
|
||
| if parent_run_id is None: | ||
| self._clear_resume_trace_context() | ||
| self._reset() | ||
|
hassiebp marked this conversation as resolved.
Outdated
|
||
|
|
||
| def on_llm_error( | ||
|
|
@@ -1050,13 +1149,22 @@ | |
| generation = self._detach_observation(run_id) | ||
|
|
||
| if generation is not None: | ||
| level, status_message = self._get_error_level_and_status_message(error) | ||
| generation.update( | ||
| status_message=str(error), | ||
| level="ERROR", | ||
| status_message=status_message, | ||
| level=cast( | ||
| Optional[Literal["DEBUG", "DEFAULT", "WARNING", "ERROR"]], | ||
| level, | ||
| ), | ||
| input=kwargs.get("inputs"), | ||
| cost_details={"total": 0}, | ||
| ).end() | ||
|
|
||
| if parent_run_id is None and level == "DEFAULT": | ||
| self._persist_resume_trace_context(generation) | ||
| elif parent_run_id is None: | ||
| self._clear_resume_trace_context() | ||
|
|
||
| except Exception as e: | ||
| langfuse_logger.exception(e) | ||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.