|
| 1 | +"""CI/CD helpers for running Langfuse experiments in GitHub Actions. |
| 2 | +
|
| 3 | +Designed to be used in conjunction with the ``langfuse/experiment-action`` |
| 4 | +GitHub Action (https://github.com/langfuse/experiment-action). The action |
| 5 | +constructs a :class:`RunnerContext` pre-populated with dataset, run name, and |
| 6 | +GitHub-sourced metadata, then calls the user's ``experiment(context)`` |
| 7 | +function. |
| 8 | +""" |
| 9 | + |
| 10 | +from datetime import datetime |
| 11 | +from typing import TYPE_CHECKING, Dict, List, Optional |
| 12 | + |
| 13 | +from langfuse.batch_evaluation import CompositeEvaluatorFunction |
| 14 | +from langfuse.experiment import ( |
| 15 | + EvaluatorFunction, |
| 16 | + ExperimentData, |
| 17 | + ExperimentResult, |
| 18 | + RunEvaluatorFunction, |
| 19 | + TaskFunction, |
| 20 | +) |
| 21 | + |
| 22 | +if TYPE_CHECKING: |
| 23 | + from langfuse._client.client import Langfuse |
| 24 | + |
| 25 | + |
class RunnerContext:
    """Wraps :meth:`Langfuse.run_experiment` with CI-injected defaults.

    Intended for use with the ``langfuse/experiment-action`` GitHub Action
    (https://github.com/langfuse/experiment-action). The action builds a
    ``RunnerContext`` before invoking the user's ``experiment(context)``
    function. Defaults set here (dataset, name, run name, metadata tags) are
    applied when the user omits them on the :meth:`run_experiment` call;
    users can override any default by passing the corresponding argument
    explicitly.
    """

    def __init__(
        self,
        *,
        client: "Langfuse",
        data: Optional[ExperimentData] = None,
        dataset_version: Optional[datetime] = None,
        name: Optional[str] = None,
        run_name: Optional[str] = None,
        metadata: Optional[Dict[str, str]] = None,
    ):
        """Build a ``RunnerContext`` populated with defaults for ``run_experiment``.

        Typically called by the ``langfuse/experiment-action`` GitHub Action,
        not by end users directly. Every field except ``client`` is optional.
        Only ``name`` and ``data`` are mandatory at call time: if either is
        left as ``None`` here, it must be supplied on the
        :meth:`run_experiment` call. The remaining fields are forwarded as
        ``None`` when neither the context nor the call provides them.

        Args:
            client: Initialized Langfuse SDK client used to execute the
                experiment. The action creates this from the
                ``langfuse_public_key`` / ``langfuse_secret_key`` /
                ``langfuse_base_url`` inputs.
            data: Default dataset items to run the experiment on. Accepts
                either ``List[LocalExperimentItem]`` or ``List[DatasetItem]``.
                Injected by the action when ``dataset_name`` is configured.
                If ``None``, the user must pass ``data=`` to
                :meth:`run_experiment`.
            dataset_version: Optional pinned dataset version. Injected by the
                action when ``dataset_version`` is configured.
            name: Default human-readable experiment name (e.g. the action's
                ``experiment_name`` input). If ``None``, the user must pass
                ``name=`` to :meth:`run_experiment`.
            run_name: Default exact run name. The action typically derives
                this from the commit SHA / PR number so that reruns produce
                distinct runs in Langfuse.
            metadata: Default metadata attached to every experiment trace and
                the dataset run. The action injects GitHub-sourced tags (SHA,
                PR link, workflow run link, branch, GH user, etc.). Merged
                with any ``metadata`` passed to :meth:`run_experiment`, with
                user-supplied keys winning on collision.
        """
        self.client = client
        self.data = data
        self.dataset_version = dataset_version
        self.name = name
        self.run_name = run_name
        self.metadata = metadata

    def run_experiment(
        self,
        *,
        name: Optional[str] = None,
        run_name: Optional[str] = None,
        description: Optional[str] = None,
        data: Optional[ExperimentData] = None,
        task: TaskFunction,
        evaluators: Optional[List[EvaluatorFunction]] = None,
        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
        run_evaluators: Optional[List[RunEvaluatorFunction]] = None,
        max_concurrency: int = 50,
        metadata: Optional[Dict[str, str]] = None,
        _dataset_version: Optional[datetime] = None,
    ) -> ExperimentResult:
        """Run an experiment through the wrapped client, filling in context defaults.

        Each of ``name``, ``run_name``, ``data`` and ``_dataset_version``
        falls back to the value stored on this context when the argument is
        ``None``. ``metadata`` is merged with the context metadata rather than
        replacing it. Everything else is forwarded to
        ``client.run_experiment`` unchanged.

        Args:
            name: Experiment name; falls back to the context's ``name``.
            run_name: Run name; falls back to the context's ``run_name``.
                May remain ``None``.
            description: Forwarded unchanged to the client.
            data: Experiment items; falls back to the context's ``data``.
                An explicitly passed empty list is respected (only ``None``
                triggers the fallback).
            task: Task function, forwarded unchanged to the client.
            evaluators: Item-level evaluators. ``None`` (the default) is
                forwarded as a fresh empty list.
            composite_evaluator: Forwarded unchanged to the client.
            run_evaluators: Run-level evaluators. ``None`` (the default) is
                forwarded as a fresh empty list.
            max_concurrency: Forwarded unchanged to the client.
            metadata: Per-call metadata. Merged over the context metadata,
                with keys given here winning on collision. If neither side
                provides metadata, ``None`` is forwarded.
            _dataset_version: Dataset version; falls back to the context's
                ``dataset_version``.

        Returns:
            Whatever ``client.run_experiment`` returns.

        Raises:
            ValueError: If ``name`` or ``data`` is provided neither on the
                context nor on this call.
        """
        resolved_name = name if name is not None else self.name
        if resolved_name is None:
            raise ValueError(
                "`name` must be provided either on the RunnerContext or the run_experiment call"
            )

        resolved_data = data if data is not None else self.data
        if resolved_data is None:
            raise ValueError(
                "`data` must be provided either on the RunnerContext or the run_experiment call"
            )

        resolved_run_name = run_name if run_name is not None else self.run_name
        resolved_dataset_version = (
            _dataset_version if _dataset_version is not None else self.dataset_version
        )

        # A new list per call: a shared mutable default would let any
        # downstream mutation leak from one experiment run into the next.
        resolved_evaluators = evaluators if evaluators is not None else []
        resolved_run_evaluators = run_evaluators if run_evaluators is not None else []

        merged_metadata: Optional[Dict[str, str]]
        if self.metadata is None and metadata is None:
            merged_metadata = None
        else:
            # Later unpacking wins, so per-call keys override context keys.
            merged_metadata = {**(self.metadata or {}), **(metadata or {})}

        return self.client.run_experiment(
            name=resolved_name,
            run_name=resolved_run_name,
            description=description,
            data=resolved_data,
            task=task,
            evaluators=resolved_evaluators,
            composite_evaluator=composite_evaluator,
            run_evaluators=resolved_run_evaluators,
            max_concurrency=max_concurrency,
            metadata=merged_metadata,
            _dataset_version=resolved_dataset_version,
        )
| 137 | + |
| 138 | + |
class RegressionError(Exception):
    """Raised by a user's ``experiment`` function to signal a CI gate failure.

    The GitHub action catches this exception and, when ``should_fail_on_error``
    is enabled, fails the workflow run and renders a callout in the PR comment
    using ``metric``/``value``/``threshold`` if supplied, otherwise ``str(exc)``.

    Attributes are set from the constructor arguments of the same name:
    ``result``, ``metric``, ``value`` and ``threshold``.
    """

    def __init__(
        self,
        *,
        result: ExperimentResult,
        metric: Optional[str] = None,
        value: Optional[float] = None,
        threshold: Optional[float] = None,
        message: Optional[str] = None,
    ):
        """Store the regression details and build the exception message.

        The message (``str(exc)``) is chosen in this order: an explicit
        ``message`` is used verbatim; otherwise, if ``metric`` is given, a
        message is generated from ``metric`` plus whichever of ``value`` and
        ``threshold`` were supplied; otherwise a generic message is used.

        Args:
            result: The experiment result that triggered the failure. Stored
                on the exception as ``result``.
            metric: Name of the metric that regressed.
            value: Observed value of the metric.
            threshold: Threshold the metric was compared against.
            message: Explicit message overriding the generated one.
        """
        self.result = result
        self.metric = metric
        self.value = value
        self.threshold = threshold
        if message is not None:
            formatted = message
        elif metric is not None:
            # Append only the parts that were supplied, so a missing value or
            # threshold never shows up as a literal "None" in the message.
            formatted = f"Regression on `{metric}`"
            if value is not None:
                formatted += f": {value}"
            if threshold is not None:
                formatted += f" (threshold {threshold})"
        else:
            formatted = "Experiment regression detected"
        super().__init__(formatted)
0 commit comments