diff --git a/src/evidently/descriptors/__init__.py b/src/evidently/descriptors/__init__.py index c835e4a33c..5ba750a5d3 100644 --- a/src/evidently/descriptors/__init__.py +++ b/src/evidently/descriptors/__init__.py @@ -38,6 +38,7 @@ from .generated_descriptors import BERTScore from .generated_descriptors import BiasLLMEval from .generated_descriptors import BinaryClassificationLLMEval +from .generated_descriptors import CoherenceLLMEval from .generated_descriptors import CompletenessLLMEval from .generated_descriptors import ContainsLink from .generated_descriptors import ContextQualityLLMEval @@ -46,6 +47,7 @@ from .generated_descriptors import EndsWith from .generated_descriptors import ExactMatch from .generated_descriptors import FaithfulnessLLMEval +from .generated_descriptors import FluencyLLMEval from .generated_descriptors import HuggingFace from .generated_descriptors import HuggingFaceToxicity from .generated_descriptors import IsValidJSON @@ -84,6 +86,7 @@ "BeginsWith", "BiasLLMEval", "BinaryClassificationLLMEval", + "CoherenceLLMEval", "ColumnTest", "CompletenessLLMEval", "Contains", @@ -99,6 +102,7 @@ "ExactMatch", "ExcludesWords", "FaithfulnessLLMEval", + "FluencyLLMEval", "HuggingFace", "HuggingFaceToxicity", "IncludesWords", diff --git a/src/evidently/descriptors/generated_descriptors.py b/src/evidently/descriptors/generated_descriptors.py index 3a528f22f3..82756718cc 100644 --- a/src/evidently/descriptors/generated_descriptors.py +++ b/src/evidently/descriptors/generated_descriptors.py @@ -797,6 +797,50 @@ def BinaryClassificationLLMEval( return FeatureDescriptor(feature=feature, alias=alias, tests=tests) +def CoherenceLLMEval( + column_name: str, + provider: str = "openai", + model: str = "gpt-4o-mini", + additional_columns: Optional[Dict[str, str]] = None, + include_category: Optional[bool] = None, + include_score: Optional[bool] = None, + include_reasoning: Optional[bool] = None, + uncertainty: Optional[Uncertainty] = None, + alias: Optional[str] = None, + tests: Optional[List[Union["DescriptorTest", "GenericTest"]]] = None, +): + """Evaluate logical coherence and organization of text using LLM. + + Checks whether the text is logically consistent and well-structured, without + requiring any reference output or ground-truth answer. + + Args: + * `column_name`: Name of the text column to evaluate. + * `provider`: LLM provider name (e.g., "openai", "anthropic"). + * `model`: Model name to use (e.g., "gpt-4o-mini"). + * `additional_columns`: Optional mapping of prompt variables to column names. + * `include_category`: Whether to include category in output. + * `include_score`: Whether to include score in output. + * `include_reasoning`: Whether to include reasoning in output. + * `uncertainty`: Optional uncertainty handling strategy. + * `alias`: Optional alias for the descriptor. + * `tests`: Optional list of tests to apply. 
+ """ + from evidently.legacy.descriptors.llm_judges import CoherenceLLMEval as CoherenceLLMEvalV1 + + feature = CoherenceLLMEvalV1( + provider=provider, + model=model, + additional_columns=additional_columns, + include_category=include_category, + include_score=include_score, + include_reasoning=include_reasoning, + uncertainty=uncertainty, + display_name=alias, + ).feature(column_name) + return FeatureDescriptor(feature=feature, alias=alias, tests=tests) + + def CompletenessLLMEval( column_name: str, context: str, @@ -1014,6 +1058,50 @@ def FaithfulnessLLMEval( return FeatureDescriptor(feature=feature, alias=alias, tests=tests) +def FluencyLLMEval( + column_name: str, + provider: str = "openai", + model: str = "gpt-4o-mini", + additional_columns: Optional[Dict[str, str]] = None, + include_category: Optional[bool] = None, + include_score: Optional[bool] = None, + include_reasoning: Optional[bool] = None, + uncertainty: Optional[Uncertainty] = None, + alias: Optional[str] = None, + tests: Optional[List[Union["DescriptorTest", "GenericTest"]]] = None, +): + """Evaluate fluency of text using LLM. + + Checks whether the text is grammatically correct, naturally written, + and easy to read — without requiring any reference output. + + Args: + * `column_name`: Name of the text column to evaluate. + * `provider`: LLM provider name (e.g., "openai", "anthropic"). + * `model`: Model name to use (e.g., "gpt-4o-mini"). + * `additional_columns`: Optional mapping of prompt variables to column names. + * `include_category`: Whether to include category in output. + * `include_score`: Whether to include score in output. + * `include_reasoning`: Whether to include reasoning in output. + * `uncertainty`: Optional uncertainty handling strategy. + * `alias`: Optional alias for the descriptor. + * `tests`: Optional list of tests to apply. + """ + from evidently.legacy.descriptors.llm_judges import FluencyLLMEval as FluencyLLMEvalV1 + + feature = FluencyLLMEvalV1( + provider=provider, + model=model, + additional_columns=additional_columns, + include_category=include_category, + include_score=include_score, + include_reasoning=include_reasoning, + uncertainty=uncertainty, + display_name=alias, + ).feature(column_name) + return FeatureDescriptor(feature=feature, alias=alias, tests=tests) + + def LLMEval( column_name: str, provider: str, diff --git a/src/evidently/legacy/descriptors/llm_judges.py b/src/evidently/legacy/descriptors/llm_judges.py index c3b1fbaea2..52d69b5ea6 100644 --- a/src/evidently/legacy/descriptors/llm_judges.py +++ b/src/evidently/legacy/descriptors/llm_judges.py @@ -400,6 +400,68 @@ def get_input_columns(self, column_name: str) -> Dict[str, str]: return input_columns +class FluencyLLMEval(BinaryClassificationLLMEval): + class Config: + type_alias = "evidently:descriptor:FluencyLLMEval" + + name: ClassVar = "Fluency" + template: ClassVar = BinaryClassificationPromptTemplate( + criteria=textwrap.dedent( + """ + A "FLUENT" response is written in clear, natural, grammatically correct language that reads easily and smoothly. + It uses proper sentence structure, appropriate vocabulary, and flows naturally without awkward phrasing, excessive repetition, + or confusing constructions. + + A "NOT_FLUENT" response contains significant grammatical errors, broken or incomplete sentences, highly unnatural phrasing, + or is otherwise difficult to read and understand due to language quality issues — regardless of the accuracy of its content. 
+ """ # noqa: E501 + ).strip(), + target_category="FLUENT", + non_target_category="NOT_FLUENT", + uncertainty=Uncertainty.UNKNOWN, + include_reasoning=True, + pre_messages=[ + LLMMessage.system( + "You are an impartial expert evaluator. You will be given a text. " + "Your task is to evaluate the fluency of the text.", + ) + ], + ) + provider = "openai" + model = "gpt-4o-mini" + + +class CoherenceLLMEval(BinaryClassificationLLMEval): + class Config: + type_alias = "evidently:descriptor:CoherenceLLMEval" + + name: ClassVar = "Coherence" + template: ClassVar = BinaryClassificationPromptTemplate( + criteria=textwrap.dedent( + """ + A "COHERENT" response presents ideas in a logically organized, consistent, and easy-to-follow manner. + Its arguments or statements flow naturally from one to the next, and the overall structure makes sense. + It does not contradict itself and stays on topic. + + An "INCOHERENT" response is one that is difficult to follow due to logical inconsistencies, abrupt topic changes, + self-contradictions, or a disorganized structure — even if individual sentences are grammatically correct. + """ # noqa: E501 + ).strip(), + target_category="COHERENT", + non_target_category="INCOHERENT", + uncertainty=Uncertainty.UNKNOWN, + include_reasoning=True, + pre_messages=[ + LLMMessage.system( + "You are an impartial expert evaluator. You will be given a text. " + "Your task is to evaluate the logical coherence and organization of the text.", + ) + ], + ) + provider = "openai" + model = "gpt-4o-mini" + + class MulticlassClassificationLLMEval(BaseLLMEval): class Config: type_alias = "evidently:descriptor:MulticlassClassificationLLMEval" diff --git a/src/evidently/llm/rag/index.py b/src/evidently/llm/rag/index.py index 4a2ed7568b..8fcd0cc4fc 100644 --- a/src/evidently/llm/rag/index.py +++ b/src/evidently/llm/rag/index.py @@ -162,4 +162,4 @@ def find_relevant_chunks(self, question: str, n_results: int = 3) -> List[Chunk] n_results = min(n_results, len(self.chunks)) _, indexes = self.index.search(np.array([query_emb]), n_results) relevant_chunks = [self.chunks[i] for i in indexes.reshape(-1)] - return relevant_chunks + return relevant_chunks \ No newline at end of file diff --git a/tests/features/test_llm_judge.py b/tests/features/test_llm_judge.py index cd02968a98..fe5f700236 100644 --- a/tests/features/test_llm_judge.py +++ b/tests/features/test_llm_judge.py @@ -168,3 +168,56 @@ def test_run_snapshot_with_llm_judge(): } ] } + + +def test_fluency_llm_eval(): + """FluencyLLMEval should run without a reference column and produce a 'category' column.""" + from evidently.legacy.descriptors.llm_judges import FluencyLLMEval as FluencyLLMEvalV1 + + fluency_eval = FluencyLLMEvalV1( + provider="mock", + model="", + template=BinaryClassificationPromptTemplate( + target_category="FLUENT", + non_target_category="NOT_FLUENT", + ), + ) + judge = fluency_eval.feature("text") + + data = pd.DataFrame({"text": ["FLUENT", "NOT_FLUENT"]}) + dd = DataDefinition(columns={}, reference_present=False) + fts = judge.generate_features(data, dd, Options()) + + # MockLLMWrapper echoes first character of the input text captured by the regex + assert "category" in fts.columns + assert len(fts) == 2 + + +def test_coherence_llm_eval(): + """CoherenceLLMEval should run without a reference column and produce a 'category' column.""" + from evidently.legacy.descriptors.llm_judges import CoherenceLLMEval as CoherenceLLMEvalV1 + + coherence_eval = CoherenceLLMEvalV1( + provider="mock", + model="", + 
template=BinaryClassificationPromptTemplate( + target_category="COHERENT", + non_target_category="INCOHERENT", + ), + ) + judge = coherence_eval.feature("text") + + data = pd.DataFrame({"text": ["COHERENT", "INCOHERENT"]}) + dd = DataDefinition(columns={}, reference_present=False) + fts = judge.generate_features(data, dd, Options()) + + # MockLLMWrapper echoes first character of the input text captured by the regex + assert "category" in fts.columns + assert len(fts) == 2 + + +def test_reference_free_evals_importable(): + """Both new descriptors should be importable from the public evidently.descriptors module.""" + from evidently.descriptors import CoherenceLLMEval # noqa: F401 + from evidently.descriptors import FluencyLLMEval # noqa: F401 +
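
Usage sketch (not part of the patch): a minimal example of how the two reference-free evaluators added above could be attached to a dataset through the public evidently.descriptors API. Only the FluencyLLMEval and CoherenceLLMEval call signatures come from this diff; the Dataset.from_pandas wiring, the DataDefinition(text_columns=...) argument, and the "response" column name are assumptions made for illustration.

import pandas as pd

from evidently import DataDefinition
from evidently import Dataset
from evidently.descriptors import CoherenceLLMEval
from evidently.descriptors import FluencyLLMEval

data = pd.DataFrame(
    {
        "response": [
            "The rollout has three steps: collect data, train the model, and monitor it in production.",
            "model monitor data rollout three it steps because however",
        ]
    }
)

# Assumed wiring: Dataset.from_pandas(..., descriptors=[...]) computes one column per descriptor.
dataset = Dataset.from_pandas(
    data,
    data_definition=DataDefinition(text_columns=["response"]),
    descriptors=[
        # Both judges read only the text column itself; no reference answer or context column is needed.
        FluencyLLMEval("response", alias="Fluency"),
        CoherenceLLMEval("response", include_reasoning=True, alias="Coherence"),
    ],
)
print(dataset.as_dataframe())

Both calls fall back to provider="openai" and model="gpt-4o-mini", the same defaults the new judge classes declare, so an OpenAI API key needs to be configured unless another provider is passed.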
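
Both legacy judge classes added to llm_judges.py follow one recipe: subclass BinaryClassificationLLMEval and supply a BinaryClassificationPromptTemplate with a target/non-target category pair. Purely as an illustration of that pattern, a hypothetical project-specific "conciseness" judge is sketched below; it is not part of this change, and it assumes it would sit next to the classes above in the same module, so textwrap, ClassVar, LLMMessage, Uncertainty, BinaryClassificationLLMEval, and BinaryClassificationPromptTemplate are already imported there.

# Hypothetical example only, mirroring FluencyLLMEval/CoherenceLLMEval above.
class ConcisenessLLMEval(BinaryClassificationLLMEval):
    class Config:
        type_alias = "evidently:descriptor:ConcisenessLLMEval"  # made-up alias, not registered by this diff

    name: ClassVar = "Conciseness"
    template: ClassVar = BinaryClassificationPromptTemplate(
        criteria=textwrap.dedent(
            """
            A "CONCISE" response makes its point without unnecessary repetition, filler, or padding.

            A "VERBOSE" response repeats itself or adds material that carries no extra information.
            """
        ).strip(),
        target_category="CONCISE",
        non_target_category="VERBOSE",
        uncertainty=Uncertainty.UNKNOWN,
        include_reasoning=True,
        pre_messages=[
            LLMMessage.system(
                "You are an impartial expert evaluator. You will be given a text. "
                "Your task is to evaluate how concise the text is.",
            )
        ],
    )
    provider = "openai"
    model = "gpt-4o-mini"

Exposing such a judge through the public API would also need a small wrapper in generated_descriptors.py and an export in descriptors/__init__.py, exactly as the first two files in this diff do for Fluency and Coherence.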