Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/evidently/descriptors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
from .generated_descriptors import BERTScore
from .generated_descriptors import BiasLLMEval
from .generated_descriptors import BinaryClassificationLLMEval
from .generated_descriptors import CoherenceLLMEval
from .generated_descriptors import CompletenessLLMEval
from .generated_descriptors import ContainsLink
from .generated_descriptors import ContextQualityLLMEval
Expand All @@ -46,6 +47,7 @@
from .generated_descriptors import EndsWith
from .generated_descriptors import ExactMatch
from .generated_descriptors import FaithfulnessLLMEval
from .generated_descriptors import FluencyLLMEval
from .generated_descriptors import HuggingFace
from .generated_descriptors import HuggingFaceToxicity
from .generated_descriptors import IsValidJSON
Expand Down Expand Up @@ -84,6 +86,7 @@
"BeginsWith",
"BiasLLMEval",
"BinaryClassificationLLMEval",
"CoherenceLLMEval",
"ColumnTest",
"CompletenessLLMEval",
"Contains",
Expand All @@ -99,6 +102,7 @@
"ExactMatch",
"ExcludesWords",
"FaithfulnessLLMEval",
"FluencyLLMEval",
"HuggingFace",
"HuggingFaceToxicity",
"IncludesWords",
Expand Down
88 changes: 88 additions & 0 deletions src/evidently/descriptors/generated_descriptors.py
Original file line number Diff line number Diff line change
Expand Up @@ -797,6 +797,50 @@ def BinaryClassificationLLMEval(
return FeatureDescriptor(feature=feature, alias=alias, tests=tests)


def CoherenceLLMEval(
    column_name: str,
    provider: str = "openai",
    model: str = "gpt-4o-mini",
    additional_columns: Optional[Dict[str, str]] = None,
    include_category: Optional[bool] = None,
    include_score: Optional[bool] = None,
    include_reasoning: Optional[bool] = None,
    uncertainty: Optional[Uncertainty] = None,
    alias: Optional[str] = None,
    tests: Optional[List[Union["DescriptorTest", "GenericTest"]]] = None,
):
    """LLM-based check of how logically coherent and well-organized a text is.

    Reference-free: judges only the structure and internal consistency of the
    text itself, so no ground-truth or reference column is needed.

    Args:
    * `column_name`: Name of the text column to evaluate.
    * `provider`: LLM provider name (e.g., "openai", "anthropic").
    * `model`: Model name to use (e.g., "gpt-4o-mini").
    * `additional_columns`: Optional mapping of prompt variables to column names.
    * `include_category`: Whether to include category in output.
    * `include_score`: Whether to include score in output.
    * `include_reasoning`: Whether to include reasoning in output.
    * `uncertainty`: Optional uncertainty handling strategy.
    * `alias`: Optional alias for the descriptor.
    * `tests`: Optional list of tests to apply.
    """
    # Local import keeps the legacy module out of the package import path
    # until the descriptor is actually constructed.
    from evidently.legacy.descriptors.llm_judges import CoherenceLLMEval as CoherenceLLMEvalV1

    legacy_judge = CoherenceLLMEvalV1(
        provider=provider,
        model=model,
        additional_columns=additional_columns,
        include_category=include_category,
        include_score=include_score,
        include_reasoning=include_reasoning,
        uncertainty=uncertainty,
        display_name=alias,
    )
    return FeatureDescriptor(feature=legacy_judge.feature(column_name), alias=alias, tests=tests)


def CompletenessLLMEval(
column_name: str,
context: str,
Expand Down Expand Up @@ -1014,6 +1058,50 @@ def FaithfulnessLLMEval(
return FeatureDescriptor(feature=feature, alias=alias, tests=tests)


def FluencyLLMEval(
    column_name: str,
    provider: str = "openai",
    model: str = "gpt-4o-mini",
    additional_columns: Optional[Dict[str, str]] = None,
    include_category: Optional[bool] = None,
    include_score: Optional[bool] = None,
    include_reasoning: Optional[bool] = None,
    uncertainty: Optional[Uncertainty] = None,
    alias: Optional[str] = None,
    tests: Optional[List[Union["DescriptorTest", "GenericTest"]]] = None,
):
    """LLM-based check of the language quality (fluency) of a text column.

    Reference-free: judges grammar, naturalness, and readability of the text
    itself, so no reference output or ground-truth answer is required.

    Args:
    * `column_name`: Name of the text column to evaluate.
    * `provider`: LLM provider name (e.g., "openai", "anthropic").
    * `model`: Model name to use (e.g., "gpt-4o-mini").
    * `additional_columns`: Optional mapping of prompt variables to column names.
    * `include_category`: Whether to include category in output.
    * `include_score`: Whether to include score in output.
    * `include_reasoning`: Whether to include reasoning in output.
    * `uncertainty`: Optional uncertainty handling strategy.
    * `alias`: Optional alias for the descriptor.
    * `tests`: Optional list of tests to apply.
    """
    # Local import keeps the legacy module out of the package import path
    # until the descriptor is actually constructed.
    from evidently.legacy.descriptors.llm_judges import FluencyLLMEval as FluencyLLMEvalV1

    legacy_judge = FluencyLLMEvalV1(
        provider=provider,
        model=model,
        additional_columns=additional_columns,
        include_category=include_category,
        include_score=include_score,
        include_reasoning=include_reasoning,
        uncertainty=uncertainty,
        display_name=alias,
    )
    return FeatureDescriptor(feature=legacy_judge.feature(column_name), alias=alias, tests=tests)


def LLMEval(
column_name: str,
provider: str,
Expand Down
62 changes: 62 additions & 0 deletions src/evidently/legacy/descriptors/llm_judges.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,68 @@ def get_input_columns(self, column_name: str) -> Dict[str, str]:
return input_columns


class FluencyLLMEval(BinaryClassificationLLMEval):
    # Reference-free binary LLM judge: classifies a single text column as
    # FLUENT vs NOT_FLUENT based purely on language quality (grammar,
    # naturalness, readability). No ground-truth/reference column is used.
    class Config:
        # Stable alias used by evidently's polymorphic (de)serialization registry.
        type_alias = "evidently:descriptor:FluencyLLMEval"

    name: ClassVar = "Fluency"
    # Prompt template for the binary classification call. The criteria text is
    # sent to the LLM verbatim (after dedent/strip), so its wording is part of
    # the evaluator's behavior — do not edit it casually.
    template: ClassVar = BinaryClassificationPromptTemplate(
        criteria=textwrap.dedent(
            """
            A "FLUENT" response is written in clear, natural, grammatically correct language that reads easily and smoothly.
            It uses proper sentence structure, appropriate vocabulary, and flows naturally without awkward phrasing, excessive repetition,
            or confusing constructions.

            A "NOT_FLUENT" response contains significant grammatical errors, broken or incomplete sentences, highly unnatural phrasing,
            or is otherwise difficult to read and understand due to language quality issues — regardless of the accuracy of its content.
            """  # noqa: E501
        ).strip(),
        target_category="FLUENT",
        non_target_category="NOT_FLUENT",
        # NOTE(review): Uncertainty.UNKNOWN presumably routes unclassifiable
        # responses to an "unknown" label rather than forcing a category —
        # confirm against the Uncertainty enum.
        uncertainty=Uncertainty.UNKNOWN,
        include_reasoning=True,  # ask the model to justify its verdict by default
        pre_messages=[
            LLMMessage.system(
                "You are an impartial expert evaluator. You will be given a text. "
                "Your task is to evaluate the fluency of the text.",
            )
        ],
    )
    # Default provider/model; overridable at construction time.
    provider = "openai"
    model = "gpt-4o-mini"


class CoherenceLLMEval(BinaryClassificationLLMEval):
    # Reference-free binary LLM judge: classifies a single text column as
    # COHERENT vs INCOHERENT based on logical organization and internal
    # consistency. No ground-truth/reference column is used.
    class Config:
        # Stable alias used by evidently's polymorphic (de)serialization registry.
        type_alias = "evidently:descriptor:CoherenceLLMEval"

    name: ClassVar = "Coherence"
    # Prompt template for the binary classification call. The criteria text is
    # sent to the LLM verbatim (after dedent/strip), so its wording is part of
    # the evaluator's behavior — do not edit it casually.
    template: ClassVar = BinaryClassificationPromptTemplate(
        criteria=textwrap.dedent(
            """
            A "COHERENT" response presents ideas in a logically organized, consistent, and easy-to-follow manner.
            Its arguments or statements flow naturally from one to the next, and the overall structure makes sense.
            It does not contradict itself and stays on topic.

            An "INCOHERENT" response is one that is difficult to follow due to logical inconsistencies, abrupt topic changes,
            self-contradictions, or a disorganized structure — even if individual sentences are grammatically correct.
            """  # noqa: E501
        ).strip(),
        target_category="COHERENT",
        non_target_category="INCOHERENT",
        # NOTE(review): Uncertainty.UNKNOWN presumably routes unclassifiable
        # responses to an "unknown" label rather than forcing a category —
        # confirm against the Uncertainty enum.
        uncertainty=Uncertainty.UNKNOWN,
        include_reasoning=True,  # ask the model to justify its verdict by default
        pre_messages=[
            LLMMessage.system(
                "You are an impartial expert evaluator. You will be given a text. "
                "Your task is to evaluate the logical coherence and organization of the text.",
            )
        ],
    )
    # Default provider/model; overridable at construction time.
    provider = "openai"
    model = "gpt-4o-mini"


class MulticlassClassificationLLMEval(BaseLLMEval):
class Config:
type_alias = "evidently:descriptor:MulticlassClassificationLLMEval"
Expand Down
2 changes: 1 addition & 1 deletion src/evidently/llm/rag/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,4 +162,4 @@ def find_relevant_chunks(self, question: str, n_results: int = 3) -> List[Chunk]
n_results = min(n_results, len(self.chunks))
_, indexes = self.index.search(np.array([query_emb]), n_results)
relevant_chunks = [self.chunks[i] for i in indexes.reshape(-1)]
return relevant_chunks
return relevant_chunks
53 changes: 53 additions & 0 deletions tests/features/test_llm_judge.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,3 +168,56 @@ def test_run_snapshot_with_llm_judge():
}
]
}


def test_fluency_llm_eval():
    """FluencyLLMEval should run without a reference column and produce a 'category' column."""
    from evidently.legacy.descriptors.llm_judges import FluencyLLMEval as FluencyLLMEvalV1

    template = BinaryClassificationPromptTemplate(
        target_category="FLUENT",
        non_target_category="NOT_FLUENT",
    )
    judge = FluencyLLMEvalV1(provider="mock", model="", template=template).feature("text")

    frame = pd.DataFrame({"text": ["FLUENT", "NOT_FLUENT"]})
    definition = DataDefinition(columns={}, reference_present=False)
    features = judge.generate_features(frame, definition, Options())

    # MockLLMWrapper echoes first character of the input text captured by the regex
    assert "category" in features.columns
    assert len(features) == 2


def test_coherence_llm_eval():
    """CoherenceLLMEval should run without a reference column and produce a 'category' column."""
    from evidently.legacy.descriptors.llm_judges import CoherenceLLMEval as CoherenceLLMEvalV1

    template = BinaryClassificationPromptTemplate(
        target_category="COHERENT",
        non_target_category="INCOHERENT",
    )
    judge = CoherenceLLMEvalV1(provider="mock", model="", template=template).feature("text")

    frame = pd.DataFrame({"text": ["COHERENT", "INCOHERENT"]})
    definition = DataDefinition(columns={}, reference_present=False)
    features = judge.generate_features(frame, definition, Options())

    # MockLLMWrapper echoes first character of the input text captured by the regex
    assert "category" in features.columns
    assert len(features) == 2


def test_reference_free_evals_importable():
    """Both new descriptors should be importable from the public evidently.descriptors module."""
    from evidently.descriptors import CoherenceLLMEval
    from evidently.descriptors import FluencyLLMEval

    # Merely importing is the contract; the extra assertions document that the
    # public names are the descriptor factories, not accidental re-exports.
    assert callable(CoherenceLLMEval)
    assert callable(FluencyLLMEval)