Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/evidently/descriptors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
from .generated_descriptors import BERTScore
from .generated_descriptors import BiasLLMEval
from .generated_descriptors import BinaryClassificationLLMEval
from .generated_descriptors import CoherenceLLMEval
from .generated_descriptors import CompletenessLLMEval
from .generated_descriptors import ContainsLink
from .generated_descriptors import ContextQualityLLMEval
Expand All @@ -46,6 +47,7 @@
from .generated_descriptors import EndsWith
from .generated_descriptors import ExactMatch
from .generated_descriptors import FaithfulnessLLMEval
from .generated_descriptors import FluencyLLMEval
from .generated_descriptors import HuggingFace
from .generated_descriptors import HuggingFaceToxicity
from .generated_descriptors import IsValidJSON
Expand Down Expand Up @@ -84,6 +86,7 @@
"BeginsWith",
"BiasLLMEval",
"BinaryClassificationLLMEval",
"CoherenceLLMEval",
"ColumnTest",
"CompletenessLLMEval",
"Contains",
Expand All @@ -99,6 +102,7 @@
"ExactMatch",
"ExcludesWords",
"FaithfulnessLLMEval",
"FluencyLLMEval",
"HuggingFace",
"HuggingFaceToxicity",
"IncludesWords",
Expand Down
88 changes: 88 additions & 0 deletions src/evidently/descriptors/generated_descriptors.py
Original file line number Diff line number Diff line change
Expand Up @@ -797,6 +797,50 @@ def BinaryClassificationLLMEval(
return FeatureDescriptor(feature=feature, alias=alias, tests=tests)


def CoherenceLLMEval(
    column_name: str,
    provider: str = "openai",
    model: str = "gpt-4o-mini",
    additional_columns: Optional[Dict[str, str]] = None,
    include_category: Optional[bool] = None,
    include_score: Optional[bool] = None,
    include_reasoning: Optional[bool] = None,
    uncertainty: Optional[Uncertainty] = None,
    alias: Optional[str] = None,
    tests: Optional[List[Union["DescriptorTest", "GenericTest"]]] = None,
):
    """LLM-based check of how logically coherent and well-organized a text is.

    Reference-free: judges only the structure and internal consistency of the
    text itself, so no ground-truth or reference column is needed.

    Args:
    * `column_name`: Name of the text column to evaluate.
    * `provider`: LLM provider name (e.g., "openai", "anthropic").
    * `model`: Model name to use (e.g., "gpt-4o-mini").
    * `additional_columns`: Optional mapping of prompt variables to column names.
    * `include_category`: Whether to include category in output.
    * `include_score`: Whether to include score in output.
    * `include_reasoning`: Whether to include reasoning in output.
    * `uncertainty`: Optional uncertainty handling strategy.
    * `alias`: Optional alias for the descriptor.
    * `tests`: Optional list of tests to apply.
    """
    # Local import keeps the legacy module out of the package import path
    # until the descriptor is actually constructed.
    from evidently.legacy.descriptors.llm_judges import CoherenceLLMEval as CoherenceLLMEvalV1

    legacy_judge = CoherenceLLMEvalV1(
        provider=provider,
        model=model,
        additional_columns=additional_columns,
        include_category=include_category,
        include_score=include_score,
        include_reasoning=include_reasoning,
        uncertainty=uncertainty,
        display_name=alias,
    )
    return FeatureDescriptor(feature=legacy_judge.feature(column_name), alias=alias, tests=tests)


def CompletenessLLMEval(
column_name: str,
context: str,
Expand Down Expand Up @@ -1014,6 +1058,50 @@ def FaithfulnessLLMEval(
return FeatureDescriptor(feature=feature, alias=alias, tests=tests)


def FluencyLLMEval(
    column_name: str,
    provider: str = "openai",
    model: str = "gpt-4o-mini",
    additional_columns: Optional[Dict[str, str]] = None,
    include_category: Optional[bool] = None,
    include_score: Optional[bool] = None,
    include_reasoning: Optional[bool] = None,
    uncertainty: Optional[Uncertainty] = None,
    alias: Optional[str] = None,
    tests: Optional[List[Union["DescriptorTest", "GenericTest"]]] = None,
):
    """LLM-based check of the language quality (fluency) of a text column.

    Reference-free: judges grammar, naturalness, and readability of the text
    itself, so no reference output or ground-truth answer is required.

    Args:
    * `column_name`: Name of the text column to evaluate.
    * `provider`: LLM provider name (e.g., "openai", "anthropic").
    * `model`: Model name to use (e.g., "gpt-4o-mini").
    * `additional_columns`: Optional mapping of prompt variables to column names.
    * `include_category`: Whether to include category in output.
    * `include_score`: Whether to include score in output.
    * `include_reasoning`: Whether to include reasoning in output.
    * `uncertainty`: Optional uncertainty handling strategy.
    * `alias`: Optional alias for the descriptor.
    * `tests`: Optional list of tests to apply.
    """
    # Local import keeps the legacy module out of the package import path
    # until the descriptor is actually constructed.
    from evidently.legacy.descriptors.llm_judges import FluencyLLMEval as FluencyLLMEvalV1

    legacy_judge = FluencyLLMEvalV1(
        provider=provider,
        model=model,
        additional_columns=additional_columns,
        include_category=include_category,
        include_score=include_score,
        include_reasoning=include_reasoning,
        uncertainty=uncertainty,
        display_name=alias,
    )
    return FeatureDescriptor(feature=legacy_judge.feature(column_name), alias=alias, tests=tests)


def LLMEval(
column_name: str,
provider: str,
Expand Down
62 changes: 62 additions & 0 deletions src/evidently/legacy/descriptors/llm_judges.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,68 @@ def get_input_columns(self, column_name: str) -> Dict[str, str]:
return input_columns


class FluencyLLMEval(BinaryClassificationLLMEval):
    # Reference-free binary LLM judge: classifies a single text column as
    # FLUENT vs NOT_FLUENT based purely on language quality (grammar,
    # naturalness, readability). No ground-truth/reference column is used.
    class Config:
        # Stable alias used by evidently's polymorphic (de)serialization registry.
        type_alias = "evidently:descriptor:FluencyLLMEval"

    name: ClassVar = "Fluency"
    # Prompt template for the binary classification call. The criteria text is
    # sent to the LLM verbatim (after dedent/strip), so its wording is part of
    # the evaluator's behavior — do not edit it casually.
    template: ClassVar = BinaryClassificationPromptTemplate(
        criteria=textwrap.dedent(
            """
            A "FLUENT" response is written in clear, natural, grammatically correct language that reads easily and smoothly.
            It uses proper sentence structure, appropriate vocabulary, and flows naturally without awkward phrasing, excessive repetition,
            or confusing constructions.

            A "NOT_FLUENT" response contains significant grammatical errors, broken or incomplete sentences, highly unnatural phrasing,
            or is otherwise difficult to read and understand due to language quality issues — regardless of the accuracy of its content.
            """  # noqa: E501
        ).strip(),
        target_category="FLUENT",
        non_target_category="NOT_FLUENT",
        # NOTE(review): Uncertainty.UNKNOWN presumably routes unclassifiable
        # responses to an "unknown" label rather than forcing a category —
        # confirm against the Uncertainty enum.
        uncertainty=Uncertainty.UNKNOWN,
        include_reasoning=True,  # ask the model to justify its verdict by default
        pre_messages=[
            LLMMessage.system(
                "You are an impartial expert evaluator. You will be given a text. "
                "Your task is to evaluate the fluency of the text.",
            )
        ],
    )
    # Default provider/model; overridable at construction time.
    provider = "openai"
    model = "gpt-4o-mini"


class CoherenceLLMEval(BinaryClassificationLLMEval):
    # Reference-free binary LLM judge: classifies a single text column as
    # COHERENT vs INCOHERENT based on logical organization and internal
    # consistency. No ground-truth/reference column is used.
    class Config:
        # Stable alias used by evidently's polymorphic (de)serialization registry.
        type_alias = "evidently:descriptor:CoherenceLLMEval"

    name: ClassVar = "Coherence"
    # Prompt template for the binary classification call. The criteria text is
    # sent to the LLM verbatim (after dedent/strip), so its wording is part of
    # the evaluator's behavior — do not edit it casually.
    template: ClassVar = BinaryClassificationPromptTemplate(
        criteria=textwrap.dedent(
            """
            A "COHERENT" response presents ideas in a logically organized, consistent, and easy-to-follow manner.
            Its arguments or statements flow naturally from one to the next, and the overall structure makes sense.
            It does not contradict itself and stays on topic.

            An "INCOHERENT" response is one that is difficult to follow due to logical inconsistencies, abrupt topic changes,
            self-contradictions, or a disorganized structure — even if individual sentences are grammatically correct.
            """  # noqa: E501
        ).strip(),
        target_category="COHERENT",
        non_target_category="INCOHERENT",
        # NOTE(review): Uncertainty.UNKNOWN presumably routes unclassifiable
        # responses to an "unknown" label rather than forcing a category —
        # confirm against the Uncertainty enum.
        uncertainty=Uncertainty.UNKNOWN,
        include_reasoning=True,  # ask the model to justify its verdict by default
        pre_messages=[
            LLMMessage.system(
                "You are an impartial expert evaluator. You will be given a text. "
                "Your task is to evaluate the logical coherence and organization of the text.",
            )
        ],
    )
    # Default provider/model; overridable at construction time.
    provider = "openai"
    model = "gpt-4o-mini"


class MulticlassClassificationLLMEval(BaseLLMEval):
class Config:
type_alias = "evidently:descriptor:MulticlassClassificationLLMEval"
Expand Down
2 changes: 1 addition & 1 deletion src/evidently/llm/rag/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,4 +162,4 @@ def find_relevant_chunks(self, question: str, n_results: int = 3) -> List[Chunk]
n_results = min(n_results, len(self.chunks))
_, indexes = self.index.search(np.array([query_emb]), n_results)
relevant_chunks = [self.chunks[i] for i in indexes.reshape(-1)]
return relevant_chunks
return relevant_chunks
53 changes: 53 additions & 0 deletions tests/features/test_llm_judge.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,3 +168,56 @@ def test_run_snapshot_with_llm_judge():
}
]
}


def test_fluency_llm_eval():
    """FluencyLLMEval should run without a reference column and produce a 'category' column."""
    from evidently.legacy.descriptors.llm_judges import FluencyLLMEval as FluencyLLMEvalV1

    template = BinaryClassificationPromptTemplate(
        target_category="FLUENT",
        non_target_category="NOT_FLUENT",
    )
    judge = FluencyLLMEvalV1(provider="mock", model="", template=template).feature("text")

    frame = pd.DataFrame({"text": ["FLUENT", "NOT_FLUENT"]})
    definition = DataDefinition(columns={}, reference_present=False)
    features = judge.generate_features(frame, definition, Options())

    # MockLLMWrapper echoes first character of the input text captured by the regex
    assert "category" in features.columns
    assert len(features) == 2


def test_coherence_llm_eval():
    """CoherenceLLMEval should run without a reference column and produce a 'category' column."""
    from evidently.legacy.descriptors.llm_judges import CoherenceLLMEval as CoherenceLLMEvalV1

    template = BinaryClassificationPromptTemplate(
        target_category="COHERENT",
        non_target_category="INCOHERENT",
    )
    judge = CoherenceLLMEvalV1(provider="mock", model="", template=template).feature("text")

    frame = pd.DataFrame({"text": ["COHERENT", "INCOHERENT"]})
    definition = DataDefinition(columns={}, reference_present=False)
    features = judge.generate_features(frame, definition, Options())

    # MockLLMWrapper echoes first character of the input text captured by the regex
    assert "category" in features.columns
    assert len(features) == 2


def test_reference_free_evals_importable():
    """Both new descriptors should be importable from the public evidently.descriptors module."""
    from evidently.descriptors import CoherenceLLMEval
    from evidently.descriptors import FluencyLLMEval

    # Merely importing is the contract; the extra assertions document that the
    # public names are the descriptor factories, not accidental re-exports.
    assert callable(CoherenceLLMEval)
    assert callable(FluencyLLMEval)