Skip to content

Commit 8ee0576

Browse files
wochinge and claude committed
feat(ci): add RunnerContext and RegressionError for experiment GH action
Adds the SDK-side primitives consumed by the upcoming `langfuse/experiment-action` GitHub Action (LFE-9241):

- `RunnerContext` wraps `Langfuse.run_experiment` with action-injected defaults (data, dataset_version, name, run_name, metadata). Users can override any default on the call site; metadata is merged with user-supplied keys winning on collision.
- `RegressionError` lets users signal a CI gate failure and optionally pass structured `metric`/`value`/`threshold` fields so the action can render a callout in the PR comment.

Both live in a dedicated `langfuse/ci.py` module so the CI surface stays isolated from the general experiment API.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent cd9812c commit 8ee0576

3 files changed

Lines changed: 400 additions & 0 deletions

File tree

langfuse/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
EvaluatorStats,
99
MapperFunction,
1010
)
11+
from langfuse.ci import RegressionError, RunnerContext
1112
from langfuse.experiment import Evaluation
1213

1314
from ._client import client as _client_module
@@ -63,6 +64,8 @@
6364
"EvaluatorStats",
6465
"BatchEvaluationResumeToken",
6566
"BatchEvaluationResult",
67+
"RunnerContext",
68+
"RegressionError",
6669
"__version__",
6770
"is_default_export_span",
6871
"is_langfuse_span",

langfuse/ci.py

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
"""CI/CD helpers for running Langfuse experiments in GitHub Actions.
2+
3+
Designed to be used in conjunction with the ``langfuse/experiment-action``
4+
GitHub Action (https://github.com/langfuse/experiment-action). The action
5+
constructs a :class:`RunnerContext` pre-populated with dataset, run name, and
6+
GitHub-sourced metadata, then calls the user's ``experiment(context)``
7+
function.
8+
"""
9+
10+
from datetime import datetime
11+
from typing import TYPE_CHECKING, Dict, List, Optional
12+
13+
from langfuse.batch_evaluation import CompositeEvaluatorFunction
14+
from langfuse.experiment import (
15+
EvaluatorFunction,
16+
ExperimentData,
17+
ExperimentResult,
18+
RunEvaluatorFunction,
19+
TaskFunction,
20+
)
21+
22+
if TYPE_CHECKING:
23+
from langfuse._client.client import Langfuse
24+
25+
26+
class RunnerContext:
    """Wraps :meth:`Langfuse.run_experiment` with CI-injected defaults.

    Intended for use with the ``langfuse/experiment-action`` GitHub Action
    (https://github.com/langfuse/experiment-action). The action builds a
    ``RunnerContext`` before invoking the user's ``experiment(context)``
    function. Defaults set here (dataset, name, run name, metadata tags) are
    applied when the user omits them on the :meth:`run_experiment` call;
    users can override any default by passing the corresponding argument
    explicitly.
    """

    def __init__(
        self,
        *,
        client: "Langfuse",
        data: Optional[ExperimentData] = None,
        dataset_version: Optional[datetime] = None,
        name: Optional[str] = None,
        run_name: Optional[str] = None,
        metadata: Optional[Dict[str, str]] = None,
    ) -> None:
        """Build a ``RunnerContext`` populated with defaults for ``run_experiment``.

        Typically called by the ``langfuse/experiment-action`` GitHub Action,
        not by end users directly. Every field except ``client`` is optional:
        fields left as ``None`` simply mean the corresponding argument must be
        supplied on the :meth:`run_experiment` call.

        Args:
            client: Initialized Langfuse SDK client used to execute the
                experiment. The action creates this from the
                ``langfuse_public_key`` / ``langfuse_secret_key`` /
                ``langfuse_base_url`` inputs.
            data: Default dataset items to run the experiment on. Accepts
                either ``List[LocalExperimentItem]`` or ``List[DatasetItem]``.
                Injected by the action when ``dataset_name`` is configured.
                If ``None``, the user must pass ``data=`` to
                :meth:`run_experiment`.
            dataset_version: Optional pinned dataset version. Injected by the
                action when ``dataset_version`` is configured.
            name: Default human-readable experiment name (e.g. the action's
                ``experiment_name`` input). If ``None``, the user must pass
                ``name=`` to :meth:`run_experiment`.
            run_name: Default exact run name. The action typically derives
                this from the commit SHA / PR number so that reruns produce
                distinct runs in Langfuse.
            metadata: Default metadata attached to every experiment trace and
                the dataset run. The action injects GitHub-sourced tags (SHA,
                PR link, workflow run link, branch, GH user, etc.). Merged
                with any ``metadata`` passed to :meth:`run_experiment`, with
                user-supplied keys winning on collision.
        """
        self.client = client
        self.data = data
        self.dataset_version = dataset_version
        self.name = name
        self.run_name = run_name
        self.metadata = metadata

    def run_experiment(
        self,
        *,
        name: Optional[str] = None,
        run_name: Optional[str] = None,
        description: Optional[str] = None,
        data: Optional[ExperimentData] = None,
        task: TaskFunction,
        evaluators: Optional[List[EvaluatorFunction]] = None,
        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
        run_evaluators: Optional[List[RunEvaluatorFunction]] = None,
        max_concurrency: int = 50,
        metadata: Optional[Dict[str, str]] = None,
        _dataset_version: Optional[datetime] = None,
    ) -> ExperimentResult:
        """Run an experiment through ``self.client``, filling gaps from the context.

        Each argument that has a counterpart on the context (``name``,
        ``data``, ``run_name``, ``_dataset_version``) falls back to the value
        stored on the context when it is passed as ``None``. All remaining
        arguments are forwarded to ``self.client.run_experiment`` as given.

        Args:
            name: Experiment name; falls back to ``self.name``.
            run_name: Run name; falls back to ``self.run_name``. May end up
                ``None``, in which case ``None`` is forwarded to the client.
            description: Forwarded unchanged to the client.
            data: Experiment items; falls back to ``self.data``.
            task: Forwarded unchanged to the client.
            evaluators: Forwarded to the client. Omitting it (or passing
                ``None``) forwards a fresh empty list.
            composite_evaluator: Forwarded unchanged to the client.
            run_evaluators: Forwarded to the client. Omitting it (or passing
                ``None``) forwards a fresh empty list.
            max_concurrency: Forwarded unchanged to the client.
            metadata: Merged on top of ``self.metadata``; keys passed here
                win on collision. ``None`` is forwarded only when neither
                side supplies metadata.
            _dataset_version: Dataset version; falls back to
                ``self.dataset_version``.

        Returns:
            Whatever ``self.client.run_experiment`` returns.

        Raises:
            ValueError: If ``name`` or ``data`` is ``None`` both here and on
                the context.
        """
        resolved_name = name if name is not None else self.name
        if resolved_name is None:
            raise ValueError(
                "`name` must be provided either on the RunnerContext or the run_experiment call"
            )

        resolved_data = data if data is not None else self.data
        if resolved_data is None:
            raise ValueError(
                "`data` must be provided either on the RunnerContext or the run_experiment call"
            )

        resolved_run_name = run_name if run_name is not None else self.run_name
        resolved_dataset_version = (
            _dataset_version if _dataset_version is not None else self.dataset_version
        )

        # A new list per call: a module-lifetime default list would be shared
        # by every invocation and could be mutated downstream. Caller-supplied
        # lists are forwarded as the very same object.
        resolved_evaluators = evaluators if evaluators is not None else []
        resolved_run_evaluators = run_evaluators if run_evaluators is not None else []

        merged_metadata: Optional[Dict[str, str]]
        if self.metadata is None and metadata is None:
            merged_metadata = None
        else:
            # Later unpacking wins, so call-site keys override context keys.
            merged_metadata = {**(self.metadata or {}), **(metadata or {})}

        return self.client.run_experiment(
            name=resolved_name,
            run_name=resolved_run_name,
            description=description,
            data=resolved_data,
            task=task,
            evaluators=resolved_evaluators,
            composite_evaluator=composite_evaluator,
            run_evaluators=resolved_run_evaluators,
            max_concurrency=max_concurrency,
            metadata=merged_metadata,
            _dataset_version=resolved_dataset_version,
        )
137+
138+
139+
class RegressionError(Exception):
    """Raised by a user's ``experiment`` function to signal a CI gate failure.

    The GitHub action catches this exception and, when ``should_fail_on_error``
    is enabled, fails the workflow run and renders a callout in the PR comment
    using ``metric``/``value``/``threshold`` if supplied, otherwise ``str(exc)``.

    Attributes set on the instance: ``result``, ``metric``, ``value`` and
    ``threshold``, each holding exactly what was passed to the constructor.
    """

    def __init__(
        self,
        *,
        result: ExperimentResult,
        metric: Optional[str] = None,
        value: Optional[float] = None,
        threshold: Optional[float] = None,
        message: Optional[str] = None,
    ) -> None:
        """Store the structured fields and derive the exception message.

        The message is chosen in this order:

        1. ``message`` verbatim, when given.
        2. A sentence built from ``metric``, with ``value`` and ``threshold``
           appended only when they are not ``None``.
        3. The generic ``"Experiment regression detected"``.

        Args:
            result: Experiment result the regression was detected on.
            metric: Name of the metric that regressed.
            value: Observed value for ``metric``.
            threshold: Threshold that ``value`` was compared against.
            message: Explicit message overriding the generated one.
        """
        self.result = result
        self.metric = metric
        self.value = value
        self.threshold = threshold

        if message is not None:
            formatted = message
        elif metric is not None:
            # Only mention the parts that were actually provided so the text
            # never contains a literal "None". Checks use `is not None`, so a
            # legitimate 0 / 0.0 value or threshold is still rendered.
            formatted = f"Regression on `{metric}`"
            if value is not None:
                formatted += f": {value}"
            if threshold is not None:
                formatted += f" (threshold {threshold})"
        else:
            formatted = "Experiment regression detected"
        super().__init__(formatted)

0 commit comments

Comments
 (0)