diff --git a/pyproject.toml b/pyproject.toml index eec74b2ca7..be2ee7d0ef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -85,6 +85,7 @@ llm = [ "evaluate>=0.4.1", "transformers[torch]>=4.39.3", "sentence-transformers>=2.7.0", + "rouge-score>=0.1.2", "sqlvalidator>=0.0.20", "litellm>=1.74.3", "llama-index>=0.10", @@ -148,6 +149,7 @@ module = [ "transformers.*", "openai.*", "sentence_transformers.*", + "rouge_score.*", "scipy.*", "sklearn.*", "plotly.*", diff --git a/src/evidently/core/registries/metrics.py b/src/evidently/core/registries/metrics.py index 5319122dd3..aa738f4172 100644 --- a/src/evidently/core/registries/metrics.py +++ b/src/evidently/core/registries/metrics.py @@ -109,3 +109,5 @@ register_type_alias(Metric, "evidently.core.metric_types.DataframeMetric", "evidently:metric_v2:DataframeMetric") register_type_alias(Metric, "evidently.metrics.embeddings.EmbeddingsDrift", "evidently:metric_v2:EmbeddingsDrift") + +register_type_alias(Metric, "evidently.metrics.text_evals.RougeScoreMetric", "evidently:metric_v2:RougeScoreMetric") diff --git a/src/evidently/descriptors/__init__.py b/src/evidently/descriptors/__init__.py index c835e4a33c..b773fefc3e 100644 --- a/src/evidently/descriptors/__init__.py +++ b/src/evidently/descriptors/__init__.py @@ -32,6 +32,7 @@ from ._context_relevance import ContextRelevance from ._custom_descriptors import CustomColumnDescriptor +from ._rouge_score import RougeScore from ._custom_descriptors import CustomDescriptor from ._text_length import TextLength from .generated_descriptors import BeginsWith @@ -118,6 +119,7 @@ "OpenAI", "PIILLMEval", "RegExp", + "RougeScore", "SemanticSimilarity", "SentenceCount", "Sentiment", diff --git a/src/evidently/descriptors/_rouge_score.py b/src/evidently/descriptors/_rouge_score.py new file mode 100644 index 0000000000..cd5b7eb393 --- /dev/null +++ b/src/evidently/descriptors/_rouge_score.py @@ -0,0 +1,99 @@ +from typing import List +from typing import Optional +from typing import Union + +import pandas as pd + +from evidently.core.datasets import AnyDescriptorTest +from evidently.core.datasets import Dataset +from evidently.core.datasets import DatasetColumn +from evidently.core.datasets import Descriptor +from evidently.legacy.core import ColumnType +from evidently.legacy.options.base import Options + +VALID_ROUGE_TYPES = ("rouge1", "rouge2", "rougeL", "rougeLsum") +VALID_SCORE_TYPES = ("f", "precision", "recall") + + +class RougeScore(Descriptor): + """Compute ROUGE score between a prediction column and a reference column, row by row. + + ROUGE (Recall-Oriented Understudy for Gisting Evaluation) measures n-gram + overlap between generated text and reference text. Returns a numeric score + in [0, 1] for each row. + + Requires the ``rouge-score`` package (``pip install evidently[llm]``). + + Args: + prediction_column: Column containing generated/predicted text. + reference_column: Column containing the reference/ground-truth text. + rouge_type: ROUGE variant — ``'rouge1'`` (unigrams), ``'rouge2'`` (bigrams), + or ``'rougeL'`` (longest common subsequence). Default ``'rouge1'``. + score_type: Which score component to return — ``'f'`` (F1, default), + ``'precision'``, or ``'recall'``. + alias: Optional display name for the resulting column. + tests: Optional pass/fail conditions on the score. 
+ + Example:: + + from evidently.descriptors import RougeScore + + RougeScore("response", "ground_truth", rouge_type="rouge1", alias="ROUGE-1") + """ + + prediction_column: str + reference_column: str + rouge_type: str = "rouge1" + score_type: str = "f" + + def __init__( + self, + prediction_column: str, + reference_column: str, + rouge_type: str = "rouge1", + score_type: str = "f", + alias: Optional[str] = None, + tests: Optional[List[AnyDescriptorTest]] = None, + ): + if rouge_type not in VALID_ROUGE_TYPES: + raise ValueError(f"rouge_type must be one of {VALID_ROUGE_TYPES}, got '{rouge_type}'") + if score_type not in VALID_SCORE_TYPES: + raise ValueError(f"score_type must be one of {VALID_SCORE_TYPES}, got '{score_type}'") + self.prediction_column = prediction_column + self.reference_column = reference_column + self.rouge_type = rouge_type + self.score_type = score_type + default_alias = f"{rouge_type}_{score_type}({prediction_column},{reference_column})" + super().__init__(alias=alias or default_alias, tests=tests) + + def generate_data( + self, + dataset: Dataset, + options: Options, + ) -> Union[DatasetColumn, dict]: + from rouge_score import rouge_scorer + + scorer = rouge_scorer.RougeScorer([self.rouge_type], use_stemmer=False) + df = dataset.as_dataframe() + + scores = [] + for pred, ref in zip( + df[self.prediction_column].fillna("").tolist(), + df[self.reference_column].fillna("").tolist(), + ): + result = scorer.score(str(ref), str(pred)) + score_obj = result[self.rouge_type] + if self.score_type == "f": + scores.append(score_obj.fmeasure) + elif self.score_type == "precision": + scores.append(score_obj.precision) + else: + scores.append(score_obj.recall) + + return DatasetColumn( + type=ColumnType.Numerical, + data=pd.Series(scores, index=df.index), + ) + + def list_input_columns(self) -> Optional[List[str]]: + return [self.prediction_column, self.reference_column] diff --git a/src/evidently/legacy/features/_registry.py b/src/evidently/legacy/features/_registry.py index 9d3b4e6694..9c518741a2 100644 --- a/src/evidently/legacy/features/_registry.py +++ b/src/evidently/legacy/features/_registry.py @@ -21,6 +21,7 @@ register_type_alias(GeneratedFeatures, "evidently.legacy.features.non_letter_character_percentage_feature.NonLetterCharacterPercentage", "evidently:feature:NonLetterCharacterPercentage") register_type_alias(GeneratedFeatures, "evidently.legacy.features.openai_feature.OpenAIFeature", "evidently:feature:OpenAIFeature") register_type_alias(GeneratedFeatures, "evidently.legacy.features.regexp_feature.RegExp", "evidently:feature:RegExp") +register_type_alias(GeneratedFeatures, "evidently.legacy.features.rouge_score_feature.RougeScoreFeature", "evidently:feature:RougeScoreFeature") register_type_alias(GeneratedFeatures, "evidently.legacy.features.semantic_similarity_feature.SemanticSimilarityFeature", "evidently:feature:SemanticSimilarityFeature") register_type_alias(GeneratedFeatures, "evidently.legacy.features.sentence_count_feature.SentenceCount", "evidently:feature:SentenceCount") register_type_alias(GeneratedFeatures, "evidently.legacy.features.sentiment_feature.Sentiment", "evidently:feature:Sentiment") diff --git a/src/evidently/legacy/features/rouge_score_feature.py b/src/evidently/legacy/features/rouge_score_feature.py new file mode 100644 index 0000000000..5fdbf08971 --- /dev/null +++ b/src/evidently/legacy/features/rouge_score_feature.py @@ -0,0 +1,73 @@ +from typing import ClassVar +from typing import List + +import pandas as pd + +from 
evidently.legacy.base_metric import ColumnName +from evidently.legacy.core import ColumnType +from evidently.legacy.features.generated_features import GeneratedFeature +from evidently.legacy.utils.data_preprocessing import DataDefinition + +VALID_ROUGE_TYPES = ("rouge1", "rouge2", "rougeL", "rougeLsum") +VALID_SCORE_TYPES = ("f", "precision", "recall") + + +class RougeScoreFeature(GeneratedFeature): + """Compute ROUGE score between two text columns row by row. + + ROUGE (Recall-Oriented Understudy for Gisting Evaluation) measures + n-gram overlap between a generated text (prediction) and a reference text. + + Requires the ``rouge-score`` package (``pip install evidently[llm]``). + """ + + class Config: + type_alias = "evidently:feature:RougeScoreFeature" + + __feature_type__: ClassVar = ColumnType.Numerical + + columns: List[str] + """Two-element list: [prediction_column, reference_column].""" + + rouge_type: str = "rouge1" + """ROUGE variant to compute: 'rouge1', 'rouge2', 'rougeL', or 'rougeLsum'.""" + + score_type: str = "f" + """Which score to return: 'f' (F1), 'precision', or 'recall'.""" + + def generate_feature(self, data: pd.DataFrame, data_definition: DataDefinition) -> pd.DataFrame: + from rouge_score import rouge_scorer + + if self.rouge_type not in VALID_ROUGE_TYPES: + raise ValueError(f"rouge_type must be one of {VALID_ROUGE_TYPES}, got '{self.rouge_type}'") + if self.score_type not in VALID_SCORE_TYPES: + raise ValueError(f"score_type must be one of {VALID_SCORE_TYPES}, got '{self.score_type}'") + + scorer = rouge_scorer.RougeScorer([self.rouge_type], use_stemmer=False) + prediction_col, reference_col = self.columns[0], self.columns[1] + + scores = [] + for pred, ref in zip( + data[prediction_col].fillna("").tolist(), + data[reference_col].fillna("").tolist(), + ): + result = scorer.score(str(ref), str(pred)) + score_obj = result[self.rouge_type] + if self.score_type == "f": + scores.append(score_obj.fmeasure) + elif self.score_type == "precision": + scores.append(score_obj.precision) + else: + scores.append(score_obj.recall) + + return pd.DataFrame( + {self._feature_name(): pd.Series(scores, index=data.index)} + ) + + def _feature_name(self) -> str: + return "|".join(self.columns) + f"|{self.rouge_type}|{self.score_type}" + + def _as_column(self) -> "ColumnName": + pred, ref = self.columns[0], self.columns[1] + display = f"{self.rouge_type.upper()} ({self.score_type}) for {pred} vs {ref}" + return self._create_column(self._feature_name(), default_display_name=display) diff --git a/src/evidently/metrics/__init__.py b/src/evidently/metrics/__init__.py index 6b5f0aedec..2722c47948 100644 --- a/src/evidently/metrics/__init__.py +++ b/src/evidently/metrics/__init__.py @@ -109,6 +109,7 @@ from .regression import MeanError from .regression import R2Score from .row_test_summary import RowTestSummary +from .text_evals import RougeScoreMetric __all__ = [ "GroupBy", @@ -199,4 +200,6 @@ "ColumnCorrelationMatrix", # Embeddings "EmbeddingsDrift", + # Text evals + "RougeScoreMetric", ] diff --git a/src/evidently/metrics/text_evals.py b/src/evidently/metrics/text_evals.py new file mode 100644 index 0000000000..726830cc66 --- /dev/null +++ b/src/evidently/metrics/text_evals.py @@ -0,0 +1,186 @@ +"""Dataset-level text evaluation metrics. + +Provides aggregate metrics for evaluating text quality across a dataset, +including ROUGE score for summarisation and generation tasks. 
+""" + +from typing import List +from typing import Optional +from typing import Tuple + +import pandas as pd + +from evidently.core.datasets import Dataset +from evidently.core.datasets import DatasetColumn +from evidently.core.metric_types import BoundTest +from evidently.core.metric_types import SingleValue +from evidently.core.metric_types import SingleValueCalculation +from evidently.core.metric_types import SingleValueMetric +from evidently.core.report import Context +from evidently.legacy.core import ColumnType +from evidently.legacy.metric_results import HistogramData +from evidently.legacy.model.widget import BaseWidgetInfo +from evidently.legacy.options import ColorOptions +from evidently.legacy.renderers.html_widgets import WidgetSize +from evidently.legacy.renderers.html_widgets import counter +from evidently.legacy.renderers.html_widgets import plotly_figure +from evidently.legacy.renderers.html_widgets import CounterData +from evidently.legacy.utils.visualizations import get_distribution_for_column +from evidently.legacy.utils.visualizations import plot_distr_with_perc_button +from evidently.tests import Reference +from evidently.tests import eq + +VALID_ROUGE_TYPES = ("rouge1", "rouge2", "rougeL", "rougeLsum") +VALID_SCORE_TYPES = ("f", "precision", "recall") + + +def _compute_rouge_series( + df: pd.DataFrame, + prediction_column: str, + reference_column: str, + rouge_type: str, + score_type: str, +) -> pd.Series: + """Compute per-row ROUGE scores between prediction and reference columns.""" + from rouge_score import rouge_scorer + + scorer = rouge_scorer.RougeScorer([rouge_type], use_stemmer=False) + scores = [] + for pred, ref in zip( + df[prediction_column].fillna("").tolist(), + df[reference_column].fillna("").tolist(), + ): + result = scorer.score(str(ref), str(pred)) + score_obj = result[rouge_type] + if score_type == "f": + scores.append(score_obj.fmeasure) + elif score_type == "precision": + scores.append(score_obj.precision) + else: + scores.append(score_obj.recall) + return pd.Series(scores, index=df.index) + + +class RougeScoreMetric(SingleValueMetric): + """Mean ROUGE score across all rows for a prediction vs reference column pair. + + Computes ROUGE (Recall-Oriented Understudy for Gisting Evaluation) between + each row's prediction and reference text, then returns the dataset-level mean. + Also renders a histogram of the per-row score distribution. + + Requires the ``rouge-score`` package (``pip install evidently[llm]``). + + Args: + prediction_column: Column containing generated/predicted text. + reference_column: Column containing the reference/ground-truth text. + rouge_type: ROUGE variant — ``'rouge1'`` (unigrams), ``'rouge2'`` (bigrams), + or ``'rougeL'`` (longest common subsequence). Default ``'rouge1'``. + score_type: Which score component to return — ``'f'`` (F1, default), + ``'precision'``, or ``'recall'``. + tests: Optional pass/fail conditions on the mean score. 
+ + Example:: + + from evidently import Report + from evidently.metrics import RougeScoreMetric + + report = Report([ + RougeScoreMetric( + prediction_column="response", + reference_column="ground_truth", + rouge_type="rouge1", + ) + ]) + result = report.run(current_dataset, reference_dataset) + """ + + prediction_column: str + reference_column: str + rouge_type: str = "rouge1" + score_type: str = "f" + + def _default_tests_with_reference(self, context: "Context") -> List[BoundTest]: + return [eq(Reference(relative=0.1)).bind_single(self.get_fingerprint())] + + +class RougeScoreMetricCalculation(SingleValueCalculation[RougeScoreMetric]): + def calculate( + self, + context: "Context", + current_data: Dataset, + reference_data: Optional[Dataset], + ) -> Tuple[SingleValue, Optional[SingleValue]]: + m = self.metric + if m.rouge_type not in VALID_ROUGE_TYPES: + raise ValueError(f"rouge_type must be one of {VALID_ROUGE_TYPES}, got '{m.rouge_type}'") + if m.score_type not in VALID_SCORE_TYPES: + raise ValueError(f"score_type must be one of {VALID_SCORE_TYPES}, got '{m.score_type}'") + + cur_df = current_data.as_dataframe() + cur_scores = _compute_rouge_series( + cur_df, m.prediction_column, m.reference_column, m.rouge_type, m.score_type + ) + cur_mean = float(cur_scores.mean()) + + ref_mean: Optional[float] = None + ref_scores: Optional[pd.Series] = None + if reference_data is not None: + ref_df = reference_data.as_dataframe() + ref_scores = _compute_rouge_series( + ref_df, m.prediction_column, m.reference_column, m.rouge_type, m.score_type + ) + ref_mean = float(ref_scores.mean()) + + cur_col = DatasetColumn(type=ColumnType.Numerical, data=cur_scores) + ref_col = DatasetColumn(type=ColumnType.Numerical, data=ref_scores) if ref_scores is not None else None + + widget = self._build_widget(cur_mean, ref_mean, cur_col, ref_col) + + current_result = self.result(cur_mean) + current_result.widget = widget + + ref_result = self.result(ref_mean) if ref_mean is not None else None + return current_result, ref_result + + def _build_widget( + self, + cur_mean: float, + ref_mean: Optional[float], + cur_col: DatasetColumn, + ref_col: Optional[DatasetColumn], + ) -> List[BaseWidgetInfo]: + m = self.metric + title = self.display_name() + + counter_items = [CounterData.float(label="current", value=cur_mean, precision=3)] + if ref_mean is not None: + counter_items.append(CounterData.float(label="reference", value=ref_mean, precision=3)) + + distr_cur, distr_ref = get_distribution_for_column( + column_type=ColumnType.Numerical.value, + current=cur_col.data, + reference=ref_col.data if ref_col is not None else None, + ) + distr_fig = plot_distr_with_perc_button( + hist_curr=HistogramData.from_distribution(distr_cur), + hist_ref=HistogramData.from_distribution(distr_ref), + xaxis_name=f"{m.rouge_type.upper()} {m.score_type}", + yaxis_name="Count", + yaxis_name_perc="Percent", + same_color=False, + color_options=ColorOptions(), + subplots=False, + to_json=False, + current_name="current", + reference_name="reference", + ) + + return [ + counter(title=title, counters=counter_items), + plotly_figure(title=f"{title}: distribution", figure=distr_fig, size=WidgetSize.FULL), + ] + + def display_name(self) -> str: + m = self.metric + variant = m.rouge_type.upper() + return f"Mean {variant} ({m.score_type}) — '{m.prediction_column}' vs '{m.reference_column}'" diff --git a/tests/descriptors/test_rouge_score_descriptor.py b/tests/descriptors/test_rouge_score_descriptor.py new file mode 100644 index 0000000000..03007b170a --- 
/dev/null +++ b/tests/descriptors/test_rouge_score_descriptor.py @@ -0,0 +1,119 @@ +"""Tests for the RougeScore Descriptor.""" + +import pandas as pd +import pytest + +from evidently.core.datasets import Dataset +from evidently.core.datasets import DataDefinition +from evidently.descriptors import RougeScore + +rouge_score = pytest.importorskip("rouge_score", reason="rouge-score not installed; run pip install evidently[llm]") + + +def _make_dataset(data: dict) -> Dataset: + df = pd.DataFrame(data) + return Dataset.from_pandas(df, data_definition=DataDefinition()) + + +def test_rouge_score_descriptor_basic(): + """Descriptor should add a numeric column in [0, 1] to the dataset.""" + ds = _make_dataset( + { + "pred": ["the cat sat on the mat", "hello world"], + "ref": ["the cat sat on the mat", "goodbye moon"], + } + ) + desc = RougeScore("pred", "ref", rouge_type="rouge1") + result = desc.generate_data(ds, options=None) # type: ignore[arg-type] + + assert result.type.value == "num" + scores = result.data.tolist() + assert len(scores) == 2 + assert scores[0] == pytest.approx(1.0, abs=1e-6) + assert 0.0 <= scores[1] <= 1.0 + + +def test_rouge_score_descriptor_rouge2(): + ds = _make_dataset( + { + "output": ["the quick brown fox jumps over", "completely different words"], + "ground_truth": ["the quick brown fox jumps", "nothing in common here at all"], + } + ) + desc = RougeScore("output", "ground_truth", rouge_type="rouge2", score_type="f") + result = desc.generate_data(ds, options=None) # type: ignore[arg-type] + scores = result.data.tolist() + assert all(0.0 <= s <= 1.0 for s in scores) + # first pair has bigram overlap; should score > 0 + assert scores[0] > 0.0 + + +def test_rouge_score_descriptor_rougeL(): + ds = _make_dataset( + { + "pred": ["the cat sat on the mat"], + "ref": ["the cat sat on the mat"], + } + ) + desc = RougeScore("pred", "ref", rouge_type="rougeL", score_type="f") + result = desc.generate_data(ds, options=None) # type: ignore[arg-type] + assert result.data.iloc[0] == pytest.approx(1.0, abs=1e-6) + + +def test_rouge_score_descriptor_precision_and_recall(): + ds = _make_dataset( + { + "pred": ["the cat sat on the mat"], + "ref": ["the cat sat"], + } + ) + # Prediction is a superset → high recall on ref, lower precision + precision_desc = RougeScore("pred", "ref", rouge_type="rouge1", score_type="precision") + recall_desc = RougeScore("pred", "ref", rouge_type="rouge1", score_type="recall") + + precision_score = precision_desc.generate_data(ds, options=None).data.iloc[0] # type: ignore[arg-type] + recall_score = recall_desc.generate_data(ds, options=None).data.iloc[0] # type: ignore[arg-type] + + assert 0.0 <= precision_score <= 1.0 + assert 0.0 <= recall_score <= 1.0 + # All ref words are in pred → recall should be 1.0 + assert recall_score == pytest.approx(1.0, abs=1e-6) + + +def test_rouge_score_descriptor_nan_handling(): + ds = _make_dataset( + { + "pred": [None, "hello world"], + "ref": ["some text", None], + } + ) + desc = RougeScore("pred", "ref", rouge_type="rouge1") + result = desc.generate_data(ds, options=None) # type: ignore[arg-type] + assert len(result.data) == 2 + assert all(0.0 <= s <= 1.0 for s in result.data.tolist()) + + +def test_rouge_score_descriptor_alias(): + desc = RougeScore("pred", "ref", rouge_type="rouge1", alias="My ROUGE") + assert desc.alias == "My ROUGE" + + +def test_rouge_score_descriptor_default_alias(): + desc = RougeScore("pred", "ref", rouge_type="rouge1", score_type="f") + assert "rouge1" in desc.alias + assert "f" in desc.alias + + 
+def test_rouge_score_descriptor_list_input_columns(): + desc = RougeScore("prediction", "reference") + assert desc.list_input_columns() == ["prediction", "reference"] + + +def test_rouge_score_descriptor_invalid_rouge_type(): + with pytest.raises(ValueError, match="rouge_type"): + RougeScore("pred", "ref", rouge_type="rouge99") + + +def test_rouge_score_descriptor_invalid_score_type(): + with pytest.raises(ValueError, match="score_type"): + RougeScore("pred", "ref", score_type="unknown") diff --git a/tests/features/test_rouge_score_feature.py b/tests/features/test_rouge_score_feature.py new file mode 100644 index 0000000000..b1d9c24478 --- /dev/null +++ b/tests/features/test_rouge_score_feature.py @@ -0,0 +1,118 @@ +"""Tests for RougeScoreFeature (legacy feature layer).""" + +import pandas as pd +import pytest + +from evidently.legacy.features.rouge_score_feature import RougeScoreFeature +from evidently.legacy.pipeline.column_mapping import ColumnMapping +from evidently.legacy.utils.data_preprocessing import create_data_definition + +rouge_score = pytest.importorskip("rouge_score", reason="rouge-score not installed; run pip install evidently[llm]") + + +@pytest.mark.parametrize( + "prediction, reference, rouge_type, score_type, expected_range", + [ + # Identical texts → perfect score + ("the cat sat on the mat", "the cat sat on the mat", "rouge1", "f", (0.99, 1.01)), + # Completely different → near zero + ("apple orange banana", "car train airplane", "rouge1", "f", (0.0, 0.1)), + # Partial overlap — rouge1 F1 + ("the cat sat", "the cat sat on the mat", "rouge1", "f", (0.6, 0.9)), + # Bigram overlap — rouge2 stricter than rouge1 + ("the quick brown fox", "the quick brown fox jumps", "rouge2", "f", (0.5, 1.0)), + # rougeL on similar sentences + ("machine learning is powerful", "machine learning is great", "rougeL", "f", (0.5, 1.0)), + # score_type = precision + ("cat sat mat", "the cat sat on the mat", "rouge1", "precision", (0.5, 1.01)), + # score_type = recall + ("the cat sat on the mat", "the cat sat", "rouge1", "recall", (0.5, 1.01)), + ], +) +def test_rouge_score_feature_range(prediction, reference, rouge_type, score_type, expected_range): + feature = RougeScoreFeature( + columns=["prediction", "reference"], + rouge_type=rouge_type, + score_type=score_type, + ) + data = pd.DataFrame({"prediction": [prediction], "reference": [reference]}) + result = feature.generate_feature( + data=data, + data_definition=create_data_definition(None, data, ColumnMapping()), + ) + score = result[feature._feature_name()].iloc[0] + assert expected_range[0] <= score <= expected_range[1], ( + f"Expected score in {expected_range}, got {score:.4f} " + f"for rouge_type={rouge_type}, score_type={score_type}" + ) + + +def test_rouge_score_feature_empty_strings(): + """Empty strings should return 0, not raise an error.""" + feature = RougeScoreFeature(columns=["prediction", "reference"], rouge_type="rouge1", score_type="f") + data = pd.DataFrame({"prediction": [""], "reference": [""]}) + result = feature.generate_feature( + data=data, + data_definition=create_data_definition(None, data, ColumnMapping()), + ) + score = result[feature._feature_name()].iloc[0] + assert score == 0.0 + + +def test_rouge_score_feature_nan_handling(): + """NaN values should be treated as empty strings without raising.""" + feature = RougeScoreFeature(columns=["prediction", "reference"], rouge_type="rouge1", score_type="f") + data = pd.DataFrame({"prediction": [None, "hello world"], "reference": ["some text", None]}) + result = 
feature.generate_feature( + data=data, + data_definition=create_data_definition(None, data, ColumnMapping()), + ) + assert len(result) == 2 + assert all(0.0 <= v <= 1.0 for v in result[feature._feature_name()]) + + +def test_rouge_score_feature_batch(): + """Multiple rows should each get their own score.""" + feature = RougeScoreFeature(columns=["pred", "ref"], rouge_type="rouge1", score_type="f") + data = pd.DataFrame( + { + "pred": ["the cat sat", "hello world", "identical text"], + "ref": ["the cat sat on the mat", "goodbye universe", "identical text"], + } + ) + result = feature.generate_feature( + data=data, + data_definition=create_data_definition(None, data, ColumnMapping()), + ) + scores = result[feature._feature_name()].tolist() + assert len(scores) == 3 + # last row is identical → score 1.0 + assert scores[2] == pytest.approx(1.0, abs=1e-6) + # first two rows differ → lower scores + assert scores[0] < 1.0 + assert scores[1] < 0.5 + + +def test_rouge_score_feature_invalid_rouge_type(): + feature = RougeScoreFeature(columns=["pred", "ref"], rouge_type="rouge99", score_type="f") + data = pd.DataFrame({"pred": ["text"], "ref": ["text"]}) + with pytest.raises(ValueError, match="rouge_type"): + feature.generate_feature( + data=data, + data_definition=create_data_definition(None, data, ColumnMapping()), + ) + + +def test_rouge_score_feature_invalid_score_type(): + feature = RougeScoreFeature(columns=["pred", "ref"], rouge_type="rouge1", score_type="unknown") + data = pd.DataFrame({"pred": ["text"], "ref": ["text"]}) + with pytest.raises(ValueError, match="score_type"): + feature.generate_feature( + data=data, + data_definition=create_data_definition(None, data, ColumnMapping()), + ) + + +def test_rouge_score_feature_name(): + feature = RougeScoreFeature(columns=["pred", "ref"], rouge_type="rouge2", score_type="recall") + assert feature._feature_name() == "pred|ref|rouge2|recall" diff --git a/tests/metrics/test_rouge_score_metric.py b/tests/metrics/test_rouge_score_metric.py new file mode 100644 index 0000000000..5e4a733d40 --- /dev/null +++ b/tests/metrics/test_rouge_score_metric.py @@ -0,0 +1,132 @@ +"""Tests for RougeScoreMetric (dataset-level aggregate).""" + +import pandas as pd +import pytest + +from evidently.core.datasets import Dataset +from evidently.core.datasets import DataDefinition +from evidently.metrics.text_evals import RougeScoreMetric + +rouge_score = pytest.importorskip("rouge_score", reason="rouge-score not installed; run pip install evidently[llm]") + + +def _make_dataset(data: dict) -> Dataset: + df = pd.DataFrame(data) + return Dataset.from_pandas(df, data_definition=DataDefinition()) + + +SAMPLE_DATA = { + "pred": [ + "the cat sat on the mat", + "machine learning is fascinating", + "hello world", + "the quick brown fox", + ], + "ref": [ + "the cat sat on the mat", + "artificial intelligence is interesting", + "goodbye universe", + "the quick brown fox jumps over the lazy dog", + ], +} + + +def test_rouge_score_metric_returns_value_in_range(): + from evidently.metrics.text_evals import RougeScoreMetricCalculation + + metric = RougeScoreMetric(prediction_column="pred", reference_column="ref", rouge_type="rouge1") + ds = _make_dataset(SAMPLE_DATA) + calc = RougeScoreMetricCalculation(metric_id="test", metric=metric) + cur, ref = calc.calculate(context=None, current_data=ds, reference_data=None) # type: ignore[arg-type] + + assert 0.0 <= cur.value <= 1.0 + assert ref is None + + +def test_rouge_score_metric_with_reference(): + from evidently.metrics.text_evals import 
RougeScoreMetricCalculation + + metric = RougeScoreMetric(prediction_column="pred", reference_column="ref", rouge_type="rouge1") + current = _make_dataset(SAMPLE_DATA) + reference = _make_dataset( + { + "pred": ["identical text identical text", "completely different here"], + "ref": ["identical text identical text", "nothing matches at all"], + } + ) + calc = RougeScoreMetricCalculation(metric_id="test", metric=metric) + cur, ref = calc.calculate(context=None, current_data=current, reference_data=reference) # type: ignore[arg-type] + + assert 0.0 <= cur.value <= 1.0 + assert ref is not None + assert 0.0 <= ref.value <= 1.0 + + +def test_rouge_score_metric_identical_texts_score_one(): + """If all predictions exactly match references, mean ROUGE-1 should be 1.0.""" + from evidently.metrics.text_evals import RougeScoreMetricCalculation + + metric = RougeScoreMetric(prediction_column="pred", reference_column="ref", rouge_type="rouge1") + ds = _make_dataset( + { + "pred": ["the cat sat on the mat", "hello world"], + "ref": ["the cat sat on the mat", "hello world"], + } + ) + calc = RougeScoreMetricCalculation(metric_id="test", metric=metric) + cur, _ = calc.calculate(context=None, current_data=ds, reference_data=None) # type: ignore[arg-type] + assert cur.value == pytest.approx(1.0, abs=1e-6) + + +def test_rouge_score_metric_rouge2(): + from evidently.metrics.text_evals import RougeScoreMetricCalculation + + metric = RougeScoreMetric(prediction_column="pred", reference_column="ref", rouge_type="rouge2", score_type="f") + ds = _make_dataset(SAMPLE_DATA) + calc = RougeScoreMetricCalculation(metric_id="test", metric=metric) + cur, _ = calc.calculate(context=None, current_data=ds, reference_data=None) # type: ignore[arg-type] + assert 0.0 <= cur.value <= 1.0 + + +def test_rouge_score_metric_rougeL(): + from evidently.metrics.text_evals import RougeScoreMetricCalculation + + metric = RougeScoreMetric(prediction_column="pred", reference_column="ref", rouge_type="rougeL", score_type="f") + ds = _make_dataset(SAMPLE_DATA) + calc = RougeScoreMetricCalculation(metric_id="test", metric=metric) + cur, _ = calc.calculate(context=None, current_data=ds, reference_data=None) # type: ignore[arg-type] + assert 0.0 <= cur.value <= 1.0 + + +def test_rouge_score_metric_precision_and_recall(): + from evidently.metrics.text_evals import RougeScoreMetricCalculation + + ds = _make_dataset(SAMPLE_DATA) + for score_type in ("precision", "recall"): + metric = RougeScoreMetric( + prediction_column="pred", reference_column="ref", rouge_type="rouge1", score_type=score_type + ) + calc = RougeScoreMetricCalculation(metric_id="test", metric=metric) + cur, _ = calc.calculate(context=None, current_data=ds, reference_data=None) # type: ignore[arg-type] + assert 0.0 <= cur.value <= 1.0, f"score_type={score_type} gave {cur.value}" + + +def test_rouge_score_metric_invalid_rouge_type(): + from evidently.metrics.text_evals import RougeScoreMetricCalculation + + metric = RougeScoreMetric(prediction_column="pred", reference_column="ref", rouge_type="rouge99") + ds = _make_dataset(SAMPLE_DATA) + calc = RougeScoreMetricCalculation(metric_id="test", metric=metric) + with pytest.raises(ValueError, match="rouge_type"): + calc.calculate(context=None, current_data=ds, reference_data=None) # type: ignore[arg-type] + + +def test_rouge_score_metric_display_name(): + from evidently.metrics.text_evals import RougeScoreMetricCalculation + + metric = RougeScoreMetric(prediction_column="pred", reference_column="ref", rouge_type="rouge1", 
score_type="f") + calc = RougeScoreMetricCalculation(metric_id="test", metric=metric) + name = calc.display_name() + assert "ROUGE1" in name + assert "pred" in name + assert "ref" in name