|
1 | 1 | import datetime as dt |
2 | 2 | import logging |
3 | | -from .span import LangfuseSpan |
4 | | -from typing import TYPE_CHECKING, Any, Generator, List, Optional |
| 3 | +from typing import TYPE_CHECKING, Any, Dict, Generator, List, Optional |
5 | 4 |
|
6 | 5 | from opentelemetry.util._decorator import _agnosticcontextmanager |
7 | 6 |
|
| 7 | +from langfuse.experiment import ( |
| 8 | + EvaluatorFunction, |
| 9 | + ExperimentResult, |
| 10 | + RunEvaluatorFunction, |
| 11 | + TaskFunction, |
| 12 | +) |
8 | 13 | from langfuse.model import ( |
9 | 14 | CreateDatasetRunItemRequest, |
10 | 15 | Dataset, |
11 | 16 | DatasetItem, |
12 | 17 | DatasetStatus, |
13 | 18 | ) |
14 | 19 |
|
| 20 | +from .span import LangfuseSpan |
| 21 | + |
15 | 22 | if TYPE_CHECKING: |
16 | 23 | from langfuse._client.client import Langfuse |
17 | 24 |
|
@@ -181,3 +188,230 @@ def __init__(self, dataset: Dataset, items: List[DatasetItemClient]): |
181 | 188 | self.created_at = dataset.created_at |
182 | 189 | self.updated_at = dataset.updated_at |
183 | 190 | self.items = items |
| 191 | + self._langfuse: Optional["Langfuse"] = None |
| 192 | + |
| 193 | + def _get_langfuse_client(self) -> Optional["Langfuse"]: |
| 194 | + """Get the Langfuse client from the first item.""" |
| 195 | + if self._langfuse is None and self.items: |
| 196 | + self._langfuse = self.items[0].langfuse |
| 197 | + return self._langfuse |
| 198 | + |
| 199 | + def run_experiment( |
| 200 | + self, |
| 201 | + *, |
| 202 | + name: str, |
| 203 | + run_name: Optional[str] = None, |
| 204 | + description: Optional[str] = None, |
| 205 | + task: TaskFunction, |
| 206 | + evaluators: List[EvaluatorFunction] = [], |
| 207 | + run_evaluators: List[RunEvaluatorFunction] = [], |
| 208 | + max_concurrency: int = 50, |
| 209 | + metadata: Optional[Dict[str, Any]] = None, |
| 210 | + ) -> ExperimentResult: |
| 211 | + """Run an experiment on this Langfuse dataset with automatic tracking. |
| 212 | +
|
| 213 | + This is a convenience method that runs an experiment using all items in this |
| 214 | + dataset. It automatically creates a dataset run in Langfuse for tracking and |
| 215 | + comparison purposes, linking all experiment results to the dataset. |
| 216 | +
|
| 217 | + Key benefits of using dataset.run_experiment(): |
| 218 | + - Automatic dataset run creation and linking in Langfuse UI |
| 219 | + - Built-in experiment tracking and versioning |
| 220 | + - Easy comparison between different experiment runs |
| 221 | + - Direct access to dataset items with their metadata and expected outputs |
| 222 | + - Automatic URL generation for viewing results in Langfuse dashboard |
| 223 | +
|
| 224 | + Args: |
| 225 | + name: Human-readable name for the experiment. Used for identification in |
| 226 | + Langfuse and as the basis for the dataset run name. |
| 227 | + run_name: Optional exact name for the dataset run in Langfuse. If not |
| 228 | + provided, it defaults to the experiment name with an ISO timestamp |
| 229 | + appended. |
| 230 | + description: Optional description of the experiment's purpose, methodology, |
| 231 | + or what you're testing. Appears in the Langfuse UI for context. |
| 232 | + task: Function that processes each dataset item and returns output. |
| 233 | + The function will receive DatasetItem objects with .input, .expected_output, |
| 234 | + and .metadata attributes. Signature should be: task(*, item, **kwargs) -> Any |
| 235 | + evaluators: List of functions to evaluate each item's output individually. |
| 236 | + These will have access to the item's expected_output for comparison. |
| 237 | + run_evaluators: List of functions to evaluate the entire experiment run. |
| 238 | + Useful for computing aggregate statistics across all dataset items. |
| 239 | + max_concurrency: Maximum number of concurrent task executions (default: 50). |
| 240 | + Adjust based on API rate limits and system resources. |
| 241 | + metadata: Optional metadata to attach to the experiment run and all traces. |
| 242 | + Will be combined with individual item metadata. |
| 243 | +
|
| 244 | + Returns: |
| 245 | + ExperimentResult object containing: |
| 246 | + - name: The experiment name. |
| 247 | + - run_name: The experiment run name (equivalent to the dataset run name). |
| 248 | + - description: Optional experiment description. |
| 249 | + - item_results: Results for each dataset item with outputs and evaluations. |
| 250 | + - run_evaluations: Aggregate evaluation results for the entire run. |
| 251 | + - dataset_run_id: ID of the created dataset run in Langfuse. |
| 252 | + - dataset_run_url: Direct URL to view the experiment results in Langfuse UI. |
| 253 | +
|
| 254 | + The result object provides a format() method for human-readable output: |
| 255 | + ```python |
| 256 | + result = dataset.run_experiment(...) |
| 257 | + print(result.format()) # Summary view |
| 258 | + print(result.format(include_item_results=True)) # Detailed view |
| 259 | + ``` |
| 260 | +
|
| 261 | + Raises: |
| 262 | + ValueError: If the dataset has no items or no Langfuse client is available. |
| 263 | +
|
| 264 | + Examples: |
| 265 | + Basic dataset experiment: |
| 266 | + ```python |
| 267 | + dataset = langfuse.get_dataset("qa-evaluation-set") |
| 268 | +
|
| 269 | + def answer_questions(*, item, **kwargs): |
| 270 | + # item is a DatasetItem with .input, .expected_output, .metadata |
| 271 | + question = item.input |
| 272 | + return my_qa_system.answer(question) |
| 273 | +
|
| 274 | + def accuracy_evaluator(*, input, output, expected_output=None, **kwargs): |
| 275 | + if not expected_output: |
| 276 | + return {"name": "accuracy", "value": None, "comment": "No expected output"} |
| 277 | +
|
| 278 | + is_correct = output.strip().lower() == expected_output.strip().lower() |
| 279 | + return { |
| 280 | + "name": "accuracy", |
| 281 | + "value": 1.0 if is_correct else 0.0, |
| 282 | + "comment": "Correct" if is_correct else "Incorrect" |
| 283 | + } |
| 284 | +
|
| 285 | + result = dataset.run_experiment( |
| 286 | + name="QA System v2.0 Evaluation", |
| 287 | + description="Testing improved QA system on curated question set", |
| 288 | + task=answer_questions, |
| 289 | + evaluators=[accuracy_evaluator] |
| 290 | + ) |
| 291 | +
|
| 292 | + print(f"Evaluated {len(result.item_results)} questions") |
| 293 | + print(f"View detailed results: {result.dataset_run_url}") |
| 294 | + ``` |
| 295 | +
|
| 296 | + Advanced experiment with multiple evaluators and run-level analysis: |
| 297 | + ```python |
| 298 | + dataset = langfuse.get_dataset("content-generation-benchmark") |
| 299 | +
|
| 300 | + async def generate_content(*, item, **kwargs): |
| 301 | + prompt = item.input |
| 302 | + response = await openai_client.chat.completions.create( |
| 303 | + model="gpt-4", |
| 304 | + messages=[{"role": "user", "content": prompt}], |
| 305 | + temperature=0.7 |
| 306 | + ) |
| 307 | + return response.choices[0].message.content |
| 308 | +
|
| 309 | + def quality_evaluator(*, input, output, expected_output=None, metadata=None, **kwargs): |
| 310 | + # Use metadata for context-aware evaluation |
| 311 | + content_type = metadata.get("type", "general") if metadata else "general" |
| 312 | +
|
| 313 | + # Basic quality checks |
| 314 | + word_count = len(output.split()) |
| 315 | + min_words = {"blog": 300, "tweet": 10, "summary": 100}.get(content_type, 50) |
| 316 | +
|
| 317 | + return [ |
| 318 | + { |
| 319 | + "name": "word_count", |
| 320 | + "value": word_count, |
| 321 | + "comment": f"Generated {word_count} words" |
| 322 | + }, |
| 323 | + { |
| 324 | + "name": "meets_length_requirement", |
| 325 | + "value": word_count >= min_words, |
| 326 | + "comment": f"{'Meets' if word_count >= min_words else 'Below'} minimum {min_words} words for {content_type}" |
| 327 | + } |
| 328 | + ] |
| 329 | +
|
| 330 | + def content_diversity(*, item_results, **kwargs): |
| 331 | + # Analyze diversity across all generated content |
| 332 | + all_outputs = [result.output for result in item_results] |
| 333 | + unique_words = set() |
| 334 | + total_words = 0 |
| 335 | +
|
| 336 | + for output in all_outputs: |
| 337 | + words = output.lower().split() |
| 338 | + unique_words.update(words) |
| 339 | + total_words += len(words) |
| 340 | +
|
| 341 | + diversity_ratio = len(unique_words) / total_words if total_words > 0 else 0 |
| 342 | +
|
| 343 | + return { |
| 344 | + "name": "vocabulary_diversity", |
| 345 | + "value": diversity_ratio, |
| 346 | + "comment": f"Used {len(unique_words)} unique words out of {total_words} total ({diversity_ratio:.2%} diversity)" |
| 347 | + } |
| 348 | +
|
| 349 | + result = dataset.run_experiment( |
| 350 | + name="Content Generation Diversity Test", |
| 351 | + description="Evaluating content quality and vocabulary diversity across different content types", |
| 352 | + task=generate_content, |
| 353 | + evaluators=[quality_evaluator], |
| 354 | + run_evaluators=[content_diversity], |
| 355 | + max_concurrency=3, # Limit API calls |
| 356 | + metadata={"model": "gpt-4", "temperature": 0.7} |
| 357 | + ) |
| 358 | +
|
| 359 | + # Results are automatically linked to the dataset in Langfuse |
| 360 | + print(f"Experiment completed! View in Langfuse: {result.dataset_run_url}") |
| 361 | +
|
| 362 | + # Access individual results |
| 363 | + for i, item_result in enumerate(result.item_results): |
| 364 | + print(f"Item {i+1}: {item_result.evaluations}") |
| 365 | + ``` |
| 366 | +
|
| 367 | + Comparing different model versions: |
| 368 | + ```python |
| 369 | + # Run multiple experiments on the same dataset for comparison |
| 370 | + dataset = langfuse.get_dataset("model-benchmark") |
| 371 | +
|
| 372 | + # Experiment 1: GPT-4 |
| 373 | + result_gpt4 = dataset.run_experiment( |
| 374 | + name="GPT-4 Baseline", |
| 375 | + description="Baseline performance with GPT-4", |
| 376 | + task=lambda *, item, **kwargs: gpt4_model.generate(item.input), |
| 377 | + evaluators=[accuracy_evaluator, fluency_evaluator] |
| 378 | + ) |
| 379 | +
|
| 380 | + # Experiment 2: Custom model |
| 381 | + result_custom = dataset.run_experiment( |
| 382 | + name="Custom Model v1.2", |
| 383 | + description="Testing our fine-tuned model", |
| 384 | + task=lambda *, item, **kwargs: custom_model.generate(item.input), |
| 385 | + evaluators=[accuracy_evaluator, fluency_evaluator] |
| 386 | + ) |
| 387 | +
|
| 388 | + # Both experiments are now visible in Langfuse for easy comparison |
| 389 | + print("Compare results in Langfuse:") |
| 390 | + print(f"GPT-4: {result_gpt4.dataset_run_url}") |
| 391 | + print(f"Custom: {result_custom.dataset_run_url}") |
| 392 | + ``` |
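| | +
| | + Running with an async task (a minimal sketch; answer_async and |
| | + my_async_client are hypothetical stand-ins): |
| | + ```python |
| | + dataset = langfuse.get_dataset("qa-evaluation-set") |
| | +
| | + async def answer_async(*, item, **kwargs): |
| | + # Hypothetical async client call; run_experiment detects the async task |
| | + # and awaits it internally, so the method itself is not awaited. |
| | + return await my_async_client.answer(item.input) |
| | +
| | + result = dataset.run_experiment( |
| | + name="Async QA Run", |
| | + task=answer_async, |
| | + ) |
| | + print(result.format()) |
| | + ``` |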
| 393 | +
|
| 394 | + Note: |
| 395 | + - All experiment results are automatically tracked in Langfuse as dataset runs |
| 396 | + - Dataset items provide .input, .expected_output, and .metadata attributes |
| 397 | + - Results can be easily compared across different experiment runs in the UI |
| 398 | + - The dataset_run_url provides direct access to detailed results and analysis |
| 399 | + - Failed items are handled gracefully and logged without stopping the experiment |
| 400 | + - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.) |
| 401 | + - Async execution is handled automatically with smart event loop detection |
| 402 | + """ |
| 403 | + langfuse_client = self._get_langfuse_client() |
| 404 | + if not langfuse_client: |
| 405 | + raise ValueError("No Langfuse client available. Dataset items are empty.") |
| 406 | + |
| 407 | + return langfuse_client.run_experiment( |
| 408 | + name=name, |
| 409 | + run_name=run_name, |
| 410 | + description=description, |
| 411 | + data=self.items, |
| 412 | + task=task, |
| 413 | + evaluators=evaluators, |
| 414 | + run_evaluators=run_evaluators, |
| 415 | + max_concurrency=max_concurrency, |
| 416 | + metadata=metadata, |
| 417 | + ) |