-
Notifications
You must be signed in to change notification settings - Fork 262
Expand file tree
/
Copy pathci.py
More file actions
166 lines (147 loc) · 6.55 KB
/
ci.py
File metadata and controls
166 lines (147 loc) · 6.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
"""CI/CD helpers for running Langfuse experiments in GitHub Actions.
Designed to be used in conjunction with the ``langfuse/experiment-action``
GitHub Action (https://github.com/langfuse/experiment-action). The action
constructs a :class:`RunnerContext` pre-populated with dataset, run name, and
GitHub-sourced metadata, then calls the user's ``experiment(context)``
function.
"""
from datetime import datetime
from typing import TYPE_CHECKING, Dict, List, Optional
from langfuse.batch_evaluation import CompositeEvaluatorFunction
from langfuse.experiment import (
EvaluatorFunction,
ExperimentData,
ExperimentResult,
RunEvaluatorFunction,
TaskFunction,
)
if TYPE_CHECKING:
from langfuse._client.client import Langfuse
class RunnerContext:
"""Wraps :meth:`Langfuse.run_experiment` with CI-injected defaults.
Intended for use with the ``langfuse/experiment-action`` GitHub Action
(https://github.com/langfuse/experiment-action). The action builds a
``RunnerContext`` before invoking the user's ``experiment(context)``
function. Defaults set here (dataset, name, run name, metadata tags) are
applied when the user omits them on the :meth:`run_experiment` call;
users can override any default by passing the corresponding argument
explicitly.
"""
def __init__(
self,
*,
client: "Langfuse",
data: Optional[ExperimentData] = None,
dataset_version: Optional[datetime] = None,
name: Optional[str] = None,
run_name: Optional[str] = None,
metadata: Optional[Dict[str, str]] = None,
):
"""Build a ``RunnerContext`` populated with defaults for ``run_experiment``.
Typically called by the ``langfuse/experiment-action`` GitHub Action,
not by end users directly. Every field except ``client`` is optional:
fields left as ``None`` simply mean the corresponding argument must be
supplied on the :meth:`run_experiment` call.
Args:
client: Initialized Langfuse SDK client used to execute the
experiment. The action creates this from the
``langfuse_public_key`` / ``langfuse_secret_key`` /
``langfuse_base_url`` inputs.
data: Default dataset items to run the experiment on. Accepts
either ``List[LocalExperimentItem]`` or ``List[DatasetItem]``.
Injected by the action when ``dataset_name`` is configured.
If ``None``, the user must pass ``data=`` to
:meth:`run_experiment`.
dataset_version: Optional pinned dataset version. Injected by the
action when ``dataset_version`` is configured.
name: Default human-readable experiment name (e.g. the action's
``experiment_name`` input). If ``None``, the user must pass
``name=`` to :meth:`run_experiment`.
run_name: Default exact run name. The action typically derives
this from the commit SHA / PR number so that reruns produce
distinct runs in Langfuse.
metadata: Default metadata attached to every experiment trace and
the dataset run. The action injects GitHub-sourced tags (SHA,
PR link, workflow run link, branch, GH user, etc.). Merged
with any ``metadata`` passed to :meth:`run_experiment`, with
user-supplied keys winning on collision.
"""
self.client = client
self.data = data
self.dataset_version = dataset_version
self.name = name
self.run_name = run_name
self.metadata = metadata
def run_experiment(
self,
*,
name: Optional[str] = None,
run_name: Optional[str] = None,
description: Optional[str] = None,
data: Optional[ExperimentData] = None,
task: TaskFunction,
evaluators: List[EvaluatorFunction] = [],
composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
run_evaluators: List[RunEvaluatorFunction] = [],
max_concurrency: int = 50,
metadata: Optional[Dict[str, str]] = None,
_dataset_version: Optional[datetime] = None,
) -> ExperimentResult:
resolved_name = name if name is not None else self.name
if resolved_name is None:
raise ValueError(
"`name` must be provided either on the RunnerContext or the run_experiment call"
)
resolved_data = data if data is not None else self.data
if resolved_data is None:
raise ValueError(
"`data` must be provided either on the RunnerContext or the run_experiment call"
)
resolved_run_name = run_name if run_name is not None else self.run_name
resolved_dataset_version = (
_dataset_version if _dataset_version is not None else self.dataset_version
)
merged_metadata: Optional[Dict[str, str]]
if self.metadata is None and metadata is None:
merged_metadata = None
else:
merged_metadata = {**(self.metadata or {}), **(metadata or {})}
return self.client.run_experiment(
name=resolved_name,
run_name=resolved_run_name,
description=description,
data=resolved_data,
task=task,
evaluators=evaluators,
composite_evaluator=composite_evaluator,
run_evaluators=run_evaluators,
max_concurrency=max_concurrency,
metadata=merged_metadata,
_dataset_version=resolved_dataset_version,
)
class RegressionError(Exception):
"""Raised by a user's ``experiment`` function to signal a CI gate failure.
The GitHub action catches this exception and, when ``should_fail_on_error``
is enabled, fails the workflow run and renders a callout in the PR comment
using ``metric``/``value``/``threshold`` if supplied, otherwise ``str(exc)``.
"""
def __init__(
self,
*,
result: ExperimentResult,
metric: Optional[str] = None,
value: Optional[float] = None,
threshold: Optional[float] = None,
message: Optional[str] = None,
):
self.result = result
self.metric = metric
self.value = value
self.threshold = threshold
if message is not None:
formatted = message
elif metric is not None:
formatted = f"Regression on `{metric}`: {value} (threshold {threshold})"
else:
formatted = "Experiment regression detected"
super().__init__(formatted)