diff --git a/langfuse/_client/client.py b/langfuse/_client/client.py
index ae5eba235..d2faaba83 100644
--- a/langfuse/_client/client.py
+++ b/langfuse/_client/client.py
@@ -2858,56 +2858,56 @@ async def _process_experiment_item(
                 metadata=final_observation_metadata,
             )
 
-            # Run evaluators
-            evaluations = []
+        except Exception as e:
+            span.update(
+                output=f"Error: {str(e)}", level="ERROR", status_message=str(e)
+            )
+            raise e
 
-            for evaluator in evaluators:
-                try:
-                    eval_metadata: Optional[Dict[str, Any]] = None
+        # Run evaluators
+        evaluations = []
 
-                    if isinstance(item, dict):
-                        eval_metadata = item.get("metadata")
-                    elif hasattr(item, "metadata"):
-                        eval_metadata = item.metadata
+        for evaluator in evaluators:
+            try:
+                eval_metadata: Optional[Dict[str, Any]] = None
 
-                    eval_results = await _run_evaluator(
-                        evaluator,
-                        input=input_data,
-                        output=output,
-                        expected_output=expected_output,
-                        metadata=eval_metadata,
-                    )
-                    evaluations.extend(eval_results)
-
-                    # Store evaluations as scores
-                    for evaluation in eval_results:
-                        self.create_score(
-                            trace_id=trace_id,
-                            observation_id=span.id,
-                            name=evaluation.name,
-                            value=evaluation.value,  # type: ignore
-                            comment=evaluation.comment,
-                            metadata=evaluation.metadata,
-                            config_id=evaluation.config_id,
-                            data_type=evaluation.data_type,  # type: ignore
-                        )
+                if isinstance(item, dict):
+                    eval_metadata = item.get("metadata")
+                elif hasattr(item, "metadata"):
+                    eval_metadata = item.metadata
 
-                except Exception as e:
-                    langfuse_logger.error(f"Evaluator failed: {e}")
+                eval_results = await _run_evaluator(
+                    evaluator,
+                    input=input_data,
+                    output=output,
+                    expected_output=expected_output,
+                    metadata=eval_metadata,
+                )
+                evaluations.extend(eval_results)
+
+                # Store evaluations as scores
+                for evaluation in eval_results:
+                    self.create_score(
+                        trace_id=trace_id,
+                        observation_id=span.id,
+                        name=evaluation.name,
+                        value=evaluation.value,  # type: ignore
+                        comment=evaluation.comment,
+                        metadata=evaluation.metadata,
+                        config_id=evaluation.config_id,
+                        data_type=evaluation.data_type,  # type: ignore
+                    )
 
-            return ExperimentItemResult(
-                item=item,
-                output=output,
-                evaluations=evaluations,
-                trace_id=trace_id,
-                dataset_run_id=dataset_run_id,
-            )
+            except Exception as e:
+                langfuse_logger.error(f"Evaluator failed: {e}")
 
-        except Exception as e:
-            span.update(
-                output=f"Error: {str(e)}", level="ERROR", status_message=str(e)
-            )
-            raise e
+        return ExperimentItemResult(
+            item=item,
+            output=output,
+            evaluations=evaluations,
+            trace_id=trace_id,
+            dataset_run_id=dataset_run_id,
+        )
 
     def _create_experiment_run_name(
         self, *, name: Optional[str] = None, run_name: Optional[str] = None