2 changes: 2 additions & 0 deletions pyproject.toml
@@ -85,6 +85,7 @@ llm = [
"evaluate>=0.4.1",
"transformers[torch]>=4.39.3",
"sentence-transformers>=2.7.0",
"rouge-score>=0.1.2",
"sqlvalidator>=0.0.20",
"litellm>=1.74.3",
"llama-index>=0.10",
@@ -148,6 +149,7 @@ module = [
"transformers.*",
"openai.*",
"sentence_transformers.*",
"rouge_score.*",
"scipy.*",
"sklearn.*",
"plotly.*",
2 changes: 2 additions & 0 deletions src/evidently/core/registries/metrics.py
@@ -109,3 +109,5 @@
register_type_alias(Metric, "evidently.core.metric_types.DataframeMetric", "evidently:metric_v2:DataframeMetric")

register_type_alias(Metric, "evidently.metrics.embeddings.EmbeddingsDrift", "evidently:metric_v2:EmbeddingsDrift")

register_type_alias(Metric, "evidently.metrics.text_evals.RougeScoreMetric", "evidently:metric_v2:RougeScoreMetric")
2 changes: 2 additions & 0 deletions src/evidently/descriptors/__init__.py
@@ -32,6 +32,7 @@

from ._context_relevance import ContextRelevance
from ._custom_descriptors import CustomColumnDescriptor
from ._custom_descriptors import CustomDescriptor
from ._rouge_score import RougeScore
from ._text_length import TextLength
from .generated_descriptors import BeginsWith
@@ -118,6 +119,7 @@
"OpenAI",
"PIILLMEval",
"RegExp",
"RougeScore",
"SemanticSimilarity",
"SentenceCount",
"Sentiment",
99 changes: 99 additions & 0 deletions src/evidently/descriptors/_rouge_score.py
@@ -0,0 +1,99 @@
from typing import List
from typing import Optional
from typing import Union

import pandas as pd

from evidently.core.datasets import AnyDescriptorTest
from evidently.core.datasets import Dataset
from evidently.core.datasets import DatasetColumn
from evidently.core.datasets import Descriptor
from evidently.legacy.core import ColumnType
from evidently.legacy.options.base import Options

VALID_ROUGE_TYPES = ("rouge1", "rouge2", "rougeL", "rougeLsum")
VALID_SCORE_TYPES = ("f", "precision", "recall")


class RougeScore(Descriptor):
"""Compute ROUGE score between a prediction column and a reference column, row by row.

ROUGE (Recall-Oriented Understudy for Gisting Evaluation) measures n-gram
overlap between generated text and reference text. Returns a numeric score
in [0, 1] for each row.

Requires the ``rouge-score`` package (``pip install evidently[llm]``).

Args:
prediction_column: Column containing generated/predicted text.
reference_column: Column containing the reference/ground-truth text.
        rouge_type: ROUGE variant: ``'rouge1'`` (unigrams), ``'rouge2'`` (bigrams),
            ``'rougeL'`` (longest common subsequence), or ``'rougeLsum'``
            (LCS computed per sentence). Default ``'rouge1'``.
        score_type: Which score component to return: ``'f'`` (F1, default),
            ``'precision'``, or ``'recall'``.
alias: Optional display name for the resulting column.
tests: Optional pass/fail conditions on the score.

Example::

from evidently.descriptors import RougeScore

RougeScore("response", "ground_truth", rouge_type="rouge1", alias="ROUGE-1")
"""

prediction_column: str
reference_column: str
rouge_type: str = "rouge1"
score_type: str = "f"

def __init__(
self,
prediction_column: str,
reference_column: str,
rouge_type: str = "rouge1",
score_type: str = "f",
alias: Optional[str] = None,
tests: Optional[List[AnyDescriptorTest]] = None,
):
if rouge_type not in VALID_ROUGE_TYPES:
raise ValueError(f"rouge_type must be one of {VALID_ROUGE_TYPES}, got '{rouge_type}'")
if score_type not in VALID_SCORE_TYPES:
raise ValueError(f"score_type must be one of {VALID_SCORE_TYPES}, got '{score_type}'")
self.prediction_column = prediction_column
self.reference_column = reference_column
self.rouge_type = rouge_type
self.score_type = score_type
default_alias = f"{rouge_type}_{score_type}({prediction_column},{reference_column})"
super().__init__(alias=alias or default_alias, tests=tests)

def generate_data(
self,
dataset: Dataset,
options: Options,
) -> Union[DatasetColumn, dict]:
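        # Lazy import: rouge-score is an optional dependency shipped with the llm extra.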
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer([self.rouge_type], use_stemmer=False)
df = dataset.as_dataframe()

scores = []
for pred, ref in zip(
df[self.prediction_column].fillna("").tolist(),
df[self.reference_column].fillna("").tolist(),
):
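            # rouge_scorer.score expects (target, prediction), so the reference goes first.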
result = scorer.score(str(ref), str(pred))
score_obj = result[self.rouge_type]
if self.score_type == "f":
scores.append(score_obj.fmeasure)
elif self.score_type == "precision":
scores.append(score_obj.precision)
else:
scores.append(score_obj.recall)

return DatasetColumn(
type=ColumnType.Numerical,
data=pd.Series(scores, index=df.index),
)

def list_input_columns(self) -> Optional[List[str]]:
return [self.prediction_column, self.reference_column]
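For context when reviewing: a minimal usage sketch of the new descriptor, assuming the standard evidently v2 ``Dataset.from_pandas`` API; the dataframe and column names are illustrative, not part of this PR.

import pandas as pd

from evidently import DataDefinition
from evidently import Dataset
from evidently.descriptors import RougeScore

# Illustrative data: one generated response and its reference text.
df = pd.DataFrame(
    {
        "response": ["the cat sat on the mat"],
        "ground_truth": ["the cat is on the mat"],
    }
)

dataset = Dataset.from_pandas(
    df,
    data_definition=DataDefinition(text_columns=["response", "ground_truth"]),
    descriptors=[RougeScore("response", "ground_truth", rouge_type="rouge1", alias="ROUGE-1")],
)

# The descriptor appends a numeric "ROUGE-1" column with one score per row.
print(dataset.as_dataframe()["ROUGE-1"])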
1 change: 1 addition & 0 deletions src/evidently/legacy/features/_registry.py
@@ -21,6 +21,7 @@
register_type_alias(GeneratedFeatures, "evidently.legacy.features.non_letter_character_percentage_feature.NonLetterCharacterPercentage", "evidently:feature:NonLetterCharacterPercentage")
register_type_alias(GeneratedFeatures, "evidently.legacy.features.openai_feature.OpenAIFeature", "evidently:feature:OpenAIFeature")
register_type_alias(GeneratedFeatures, "evidently.legacy.features.regexp_feature.RegExp", "evidently:feature:RegExp")
register_type_alias(GeneratedFeatures, "evidently.legacy.features.rouge_score_feature.RougeScoreFeature", "evidently:feature:RougeScoreFeature")
register_type_alias(GeneratedFeatures, "evidently.legacy.features.semantic_similarity_feature.SemanticSimilarityFeature", "evidently:feature:SemanticSimilarityFeature")
register_type_alias(GeneratedFeatures, "evidently.legacy.features.sentence_count_feature.SentenceCount", "evidently:feature:SentenceCount")
register_type_alias(GeneratedFeatures, "evidently.legacy.features.sentiment_feature.Sentiment", "evidently:feature:Sentiment")
73 changes: 73 additions & 0 deletions src/evidently/legacy/features/rouge_score_feature.py
@@ -0,0 +1,73 @@
from typing import ClassVar
from typing import List

import pandas as pd

from evidently.legacy.base_metric import ColumnName
from evidently.legacy.core import ColumnType
from evidently.legacy.features.generated_features import GeneratedFeature
from evidently.legacy.utils.data_preprocessing import DataDefinition

VALID_ROUGE_TYPES = ("rouge1", "rouge2", "rougeL", "rougeLsum")
VALID_SCORE_TYPES = ("f", "precision", "recall")


class RougeScoreFeature(GeneratedFeature):
"""Compute ROUGE score between two text columns row by row.

ROUGE (Recall-Oriented Understudy for Gisting Evaluation) measures
n-gram overlap between a generated text (prediction) and a reference text.

Requires the ``rouge-score`` package (``pip install evidently[llm]``).
"""

class Config:
type_alias = "evidently:feature:RougeScoreFeature"

__feature_type__: ClassVar = ColumnType.Numerical

columns: List[str]
"""Two-element list: [prediction_column, reference_column]."""

rouge_type: str = "rouge1"
"""ROUGE variant to compute: 'rouge1', 'rouge2', 'rougeL', or 'rougeLsum'."""

score_type: str = "f"
"""Which score to return: 'f' (F1), 'precision', or 'recall'."""

def generate_feature(self, data: pd.DataFrame, data_definition: DataDefinition) -> pd.DataFrame:
from rouge_score import rouge_scorer

        if self.rouge_type not in VALID_ROUGE_TYPES:
            raise ValueError(f"rouge_type must be one of {VALID_ROUGE_TYPES}, got '{self.rouge_type}'")
        if self.score_type not in VALID_SCORE_TYPES:
            raise ValueError(f"score_type must be one of {VALID_SCORE_TYPES}, got '{self.score_type}'")
        if len(self.columns) != 2:
            raise ValueError(f"columns must be [prediction_column, reference_column], got {len(self.columns)} entries")

scorer = rouge_scorer.RougeScorer([self.rouge_type], use_stemmer=False)
prediction_col, reference_col = self.columns[0], self.columns[1]

scores = []
for pred, ref in zip(
data[prediction_col].fillna("").tolist(),
data[reference_col].fillna("").tolist(),
):
result = scorer.score(str(ref), str(pred))
score_obj = result[self.rouge_type]
if self.score_type == "f":
scores.append(score_obj.fmeasure)
elif self.score_type == "precision":
scores.append(score_obj.precision)
else:
scores.append(score_obj.recall)

return pd.DataFrame(
{self._feature_name(): pd.Series(scores, index=data.index)}
)

def _feature_name(self) -> str:
return "|".join(self.columns) + f"|{self.rouge_type}|{self.score_type}"

def _as_column(self) -> "ColumnName":
pred, ref = self.columns[0], self.columns[1]
display = f"{self.rouge_type.upper()} ({self.score_type}) for {pred} vs {ref}"
return self._create_column(self._feature_name(), default_display_name=display)
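For reference, a minimal sketch of the underlying ``rouge-score`` call that both the descriptor and this legacy feature wrap; note the (target, prediction) argument order, which is why the reference string is passed first in both implementations. The example strings are illustrative.

from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=False)

# score(target, prediction) returns a dict keyed by rouge type; each value
# is a Score tuple carrying precision, recall, and fmeasure.
result = scorer.score("the cat is on the mat", "the cat sat on the mat")
score = result["rouge1"]
print(score.precision, score.recall, score.fmeasure)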
3 changes: 3 additions & 0 deletions src/evidently/metrics/__init__.py
@@ -109,6 +109,7 @@
from .regression import MeanError
from .regression import R2Score
from .row_test_summary import RowTestSummary
from .text_evals import RougeScoreMetric

__all__ = [
"GroupBy",
@@ -199,4 +200,6 @@
"ColumnCorrelationMatrix",
# Embeddings
"EmbeddingsDrift",
# Text evals
"RougeScoreMetric",
]