evidentlyai · mostaphaelansari · Feb 22, 2026 · Feb 23, 2026 · May 3, 2026 · Liraim
diff --git a/src/evidently/descriptors/__init__.py b/src/evidently/descriptors/__init__.py
@@ -38,6 +38,7 @@
 from .generated_descriptors import BERTScore
 from .generated_descriptors import BiasLLMEval
 from .generated_descriptors import BinaryClassificationLLMEval
+from .generated_descriptors import CoherenceLLMEval
 from .generated_descriptors import CompletenessLLMEval
 from .generated_descriptors import ContainsLink
 from .generated_descriptors import ContextQualityLLMEval
@@ -46,6 +47,7 @@
 from .generated_descriptors import EndsWith
 from .generated_descriptors import ExactMatch
 from .generated_descriptors import FaithfulnessLLMEval
+from .generated_descriptors import FluencyLLMEval
 from .generated_descriptors import HuggingFace
 from .generated_descriptors import HuggingFaceToxicity
 from .generated_descriptors import IsValidJSON
@@ -84,6 +86,7 @@
     "BeginsWith",
     "BiasLLMEval",
     "BinaryClassificationLLMEval",
+    "CoherenceLLMEval",
     "ColumnTest",
     "CompletenessLLMEval",
     "Contains",
@@ -99,6 +102,7 @@
     "ExactMatch",
     "ExcludesWords",
     "FaithfulnessLLMEval",
+    "FluencyLLMEval",
     "HuggingFace",
     "HuggingFaceToxicity",
     "IncludesWords",

diff --git a/src/evidently/descriptors/generated_descriptors.py b/src/evidently/descriptors/generated_descriptors.py
@@ -797,6 +797,50 @@ def BinaryClassificationLLMEval(
     return FeatureDescriptor(feature=feature, alias=alias, tests=tests)
 
 
+def CoherenceLLMEval(
+    column_name: str,
+    provider: str = "openai",
+    model: str = "gpt-4o-mini",
+    additional_columns: Optional[Dict[str, str]] = None,
+    include_category: Optional[bool] = None,
+    include_score: Optional[bool] = None,
+    include_reasoning: Optional[bool] = None,
+    uncertainty: Optional[Uncertainty] = None,
+    alias: Optional[str] = None,
+    tests: Optional[List[Union["DescriptorTest", "GenericTest"]]] = None,
+):
+    """Evaluate logical coherence and organization of text using an LLM judge.
+
+    Builds the legacy `CoherenceLLMEval` judge for `column_name` and returns it wrapped in a
+    `FeatureDescriptor`. No reference output or ground-truth answer column is required.
+
+    Args:
+    * `column_name`: Name of the text column to evaluate.
+    * `provider`: LLM provider name; defaults to "openai".
+    * `model`: Model name to use; defaults to "gpt-4o-mini".
+    * `additional_columns`: Optional mapping of prompt variables to column names.
+    * `include_category`: Whether to include category in output; `None` is forwarded unchanged to the judge.
+    * `include_score`: Whether to include score in output; `None` is forwarded unchanged to the judge.
+    * `include_reasoning`: Whether to include reasoning in output; `None` is forwarded unchanged to the judge.
+    * `uncertainty`: Optional uncertainty handling strategy, forwarded to the judge.
+    * `alias`: Optional alias; used both as the judge's `display_name` and as the descriptor alias.
+    * `tests`: Optional list of tests attached to the returned descriptor.
+    """
+    from evidently.legacy.descriptors.llm_judges import CoherenceLLMEval as CoherenceLLMEvalV1
+
+    feature = CoherenceLLMEvalV1(
+        provider=provider,
+        model=model,
+        additional_columns=additional_columns,
+        include_category=include_category,
+        include_score=include_score,
+        include_reasoning=include_reasoning,
+        uncertainty=uncertainty,
+        display_name=alias,
+    ).feature(column_name)
+    return FeatureDescriptor(feature=feature, alias=alias, tests=tests)
+
+
 def CompletenessLLMEval(
     column_name: str,
     context: str,
@@ -1014,6 +1058,50 @@ def FaithfulnessLLMEval(
     return FeatureDescriptor(feature=feature, alias=alias, tests=tests)
 
 
+def FluencyLLMEval(
+    column_name: str,
+    provider: str = "openai",
+    model: str = "gpt-4o-mini",
+    additional_columns: Optional[Dict[str, str]] = None,
+    include_category: Optional[bool] = None,
+    include_score: Optional[bool] = None,
+    include_reasoning: Optional[bool] = None,
+    uncertainty: Optional[Uncertainty] = None,
+    alias: Optional[str] = None,
+    tests: Optional[List[Union["DescriptorTest", "GenericTest"]]] = None,
+):
+    """Evaluate fluency of text using an LLM judge.
+
+    Builds the legacy `FluencyLLMEval` judge for `column_name` and returns it wrapped in a
+    `FeatureDescriptor`. No reference output or ground-truth answer column is required.
+
+    Args:
+    * `column_name`: Name of the text column to evaluate.
+    * `provider`: LLM provider name; defaults to "openai".
+    * `model`: Model name to use; defaults to "gpt-4o-mini".
+    * `additional_columns`: Optional mapping of prompt variables to column names.
+    * `include_category`: Whether to include category in output; `None` is forwarded unchanged to the judge.
+    * `include_score`: Whether to include score in output; `None` is forwarded unchanged to the judge.
+    * `include_reasoning`: Whether to include reasoning in output; `None` is forwarded unchanged to the judge.
+    * `uncertainty`: Optional uncertainty handling strategy, forwarded to the judge.
+    * `alias`: Optional alias; used both as the judge's `display_name` and as the descriptor alias.
+    * `tests`: Optional list of tests attached to the returned descriptor.
+    """
+    from evidently.legacy.descriptors.llm_judges import FluencyLLMEval as FluencyLLMEvalV1
+
+    feature = FluencyLLMEvalV1(
+        provider=provider,
+        model=model,
+        additional_columns=additional_columns,
+        include_category=include_category,
+        include_score=include_score,
+        include_reasoning=include_reasoning,
+        uncertainty=uncertainty,
+        display_name=alias,
+    ).feature(column_name)
+    return FeatureDescriptor(feature=feature, alias=alias, tests=tests)
+
+
 def LLMEval(
     column_name: str,
     provider: str,

diff --git a/src/evidently/legacy/descriptors/llm_judges.py b/src/evidently/legacy/descriptors/llm_judges.py
@@ -400,6 +400,68 @@ def get_input_columns(self, column_name: str) -> Dict[str, str]:
         return input_columns
 
 
+class FluencyLLMEval(BinaryClassificationLLMEval):  # binary LLM judge preset: "FLUENT" vs "NOT_FLUENT"
+    class Config:
+        type_alias = "evidently:descriptor:FluencyLLMEval"  # presumably the (de)serialization key — TODO confirm
+
+    name: ClassVar = "Fluency"
+    template: ClassVar = BinaryClassificationPromptTemplate(  # class-level prompt shared by all instances
+        criteria=textwrap.dedent(  # judges language quality only, not the accuracy of the content
+            """
+        A "FLUENT" response is written in clear, natural, grammatically correct language that reads easily and smoothly.
+        It uses proper sentence structure, appropriate vocabulary, and flows naturally without awkward phrasing, excessive repetition,
+        or confusing constructions.
+
+        A "NOT_FLUENT" response contains significant grammatical errors, broken or incomplete sentences, highly unnatural phrasing,
+        or is otherwise difficult to read and understand due to language quality issues — regardless of the accuracy of its content.
+        """  # noqa: E501
+        ).strip(),
+        target_category="FLUENT",
+        non_target_category="NOT_FLUENT",
+        uncertainty=Uncertainty.UNKNOWN,
+        include_reasoning=True,  # reasoning is enabled in the preset template
+        pre_messages=[
+            LLMMessage.system(
+                "You are an impartial expert evaluator. You will be given a text. "
+                "Your task is to evaluate the fluency of the text.",
+            )
+        ],
+    )
+    provider = "openai"  # class defaults; the constructor accepts `provider=` / `model=` overrides
+    model = "gpt-4o-mini"
+
+
+class CoherenceLLMEval(BinaryClassificationLLMEval):  # binary LLM judge preset: "COHERENT" vs "INCOHERENT"
+    class Config:
+        type_alias = "evidently:descriptor:CoherenceLLMEval"  # presumably the (de)serialization key — TODO confirm
+
+    name: ClassVar = "Coherence"
+    template: ClassVar = BinaryClassificationPromptTemplate(  # class-level prompt shared by all instances
+        criteria=textwrap.dedent(  # judges logical flow and structure, not sentence-level grammar
+            """
+        A "COHERENT" response presents ideas in a logically organized, consistent, and easy-to-follow manner.
+        Its arguments or statements flow naturally from one to the next, and the overall structure makes sense.
+        It does not contradict itself and stays on topic.
+
+        An "INCOHERENT" response is one that is difficult to follow due to logical inconsistencies, abrupt topic changes,
+        self-contradictions, or a disorganized structure — even if individual sentences are grammatically correct.
+        """  # noqa: E501
+        ).strip(),
+        target_category="COHERENT",
+        non_target_category="INCOHERENT",
+        uncertainty=Uncertainty.UNKNOWN,
+        include_reasoning=True,  # reasoning is enabled in the preset template
+        pre_messages=[
+            LLMMessage.system(
+                "You are an impartial expert evaluator. You will be given a text. "
+                "Your task is to evaluate the logical coherence and organization of the text.",
+            )
+        ],
+    )
+    provider = "openai"  # class defaults; the constructor accepts `provider=` / `model=` overrides
+    model = "gpt-4o-mini"
+
+
 class MulticlassClassificationLLMEval(BaseLLMEval):
     class Config:
         type_alias = "evidently:descriptor:MulticlassClassificationLLMEval"

diff --git a/src/evidently/llm/rag/index.py b/src/evidently/llm/rag/index.py
@@ -162,4 +162,4 @@ def find_relevant_chunks(self, question: str, n_results: int = 3) -> List[Chunk]
         n_results = min(n_results, len(self.chunks))
         _, indexes = self.index.search(np.array([query_emb]), n_results)
         relevant_chunks = [self.chunks[i] for i in indexes.reshape(-1)]
-        return relevant_chunks
+        return relevant_chunks
diff --git a/test_output.txt b/test_output.txt
@@ -0,0 +1,42 @@
+============================= test session starts =============================
+platform win32 -- Python 3.11.14, pytest-7.4.4, pluggy-1.6.0 -- C:\Users\mosta\anaconda3\envs\evidently\python.exe
+cachedir: .pytest_cache
+rootdir: C:\Users\mosta\Documents\GitHub\evidently
+configfile: pyproject.toml
+plugins: anyio-4.12.1, Faker-40.4.0, asyncio-0.23.7, mock-3.14.0
+asyncio: mode=Mode.STRICT
+collecting ... collected 10 items
+
+tests/features/test_llm_judge.py::test_parse_response[template0-results0] PASSED [ 10%]
+tests/features/test_llm_judge.py::test_parse_response[template1-results1] PASSED [ 20%]
+tests/features/test_llm_judge.py::test_parse_response[template2-results2] PASSED [ 30%]
+tests/features/test_llm_judge.py::test_parse_response[template3-results3] PASSED [ 40%]
+tests/features/test_llm_judge.py::test_llm_judge PASSED                  [ 50%]
+tests/features/test_llm_judge.py::test_multicol_llm_judge PASSED         [ 60%]
+tests/features/test_llm_judge.py::test_run_snapshot_with_llm_judge PASSED [ 70%]
+tests/features/test_llm_judge.py::test_fluency_llm_eval PASSED           [ 80%]
+tests/features/test_llm_judge.py::test_coherence_llm_eval PASSED         [ 90%]
+tests/features/test_llm_judge.py::test_reference_free_evals_importable PASSED [100%]
+
+============================== warnings summary ===============================
+src\evidently\legacy\tests\utils.py:183
+src\evidently\legacy\tests\utils.py:183
+  C:\Users\mosta\Documents\GitHub\evidently\src\evidently\legacy\tests\utils.py:183: DeprecationWarning: numpy.core is deprecated and has been renamed to numpy._core. The numpy._core namespace contains private NumPy internals and its use is discouraged, as NumPy internals can change without warning in any release. In practice, most real-world usage of numpy.core is to access functionality in the public NumPy API. If that is the case, use the public NumPy API. If not, you are using NumPy internals. If you would still like to access an internal attribute, use numpy._core.numeric.
+    np.core.numeric.ScalarType = np.core.numeric.ScalarType + (ApproxValue, ApproxValueNoDict)  # type: ignore[attr-defined]
+
+tests\conftest.py:12
+tests\conftest.py:12
+  C:\Users\mosta\Documents\GitHub\evidently\tests\conftest.py:12: DeprecationWarning: numpy.core is deprecated and has been renamed to numpy._core. The numpy._core namespace contains private NumPy internals and its use is discouraged, as NumPy internals can change without warning in any release. In practice, most real-world usage of numpy.core is to access functionality in the public NumPy API. If that is the case, use the public NumPy API. If not, you are using NumPy internals. If you would still like to access an internal attribute, use numpy._core.numeric.
+    np.core.numeric.ScalarType = np.core.numeric.ScalarType + (ApproxValue,)  # type: ignore[attr-defined]
+
+tests/features/test_llm_judge.py::test_llm_judge
+  tests\features\test_llm_judge.py:103: PytestWarning: The test <Function test_llm_judge> is marked with '@pytest.mark.asyncio' but it is not an async function. Please remove the asyncio mark. If the test is not marked explicitly, check for global marks applied via 'pytestmark'.
+    @pytest.mark.asyncio
+
+tests/features/test_llm_judge.py::test_multicol_llm_judge
+  tests\features\test_llm_judge.py:119: PytestWarning: The test <Function test_multicol_llm_judge> is marked with '@pytest.mark.asyncio' but it is not an async function. Please remove the asyncio mark. If the test is not marked explicitly, check for global marks applied via 'pytestmark'.
+    @pytest.mark.asyncio
+
+-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
+======================= 10 passed, 6 warnings in 0.09s ========================
+
diff --git a/tests/features/test_llm_judge.py b/tests/features/test_llm_judge.py
@@ -168,3 +168,56 @@ def test_run_snapshot_with_llm_judge():
             }
         ]
     }
+
+
+def test_fluency_llm_eval():
+    """FluencyLLMEval runs with reference_present=False and yields a 'category' column, one row per input."""
+    from evidently.legacy.descriptors.llm_judges import FluencyLLMEval as FluencyLLMEvalV1
+
+    fluency_eval = FluencyLLMEvalV1(
+        provider="mock",
+        model="",
+        template=BinaryClassificationPromptTemplate(  # NOTE(review): ClassVar on the class; verify override
+            target_category="FLUENT",
+            non_target_category="NOT_FLUENT",
+        ),
+    )
+    judge = fluency_eval.feature("text")
+
+    data = pd.DataFrame({"text": ["FLUENT", "NOT_FLUENT"]})
+    dd = DataDefinition(columns={}, reference_present=False)
+    fts = judge.generate_features(data, dd, Options())
+
+    # Schema-only check: labels from the "mock" provider (presumably registered in this module) are not asserted.
+    assert "category" in fts.columns
+    assert len(fts) == 2
+
+
+def test_coherence_llm_eval():
+    """CoherenceLLMEval runs with reference_present=False and yields a 'category' column, one row per input."""
+    from evidently.legacy.descriptors.llm_judges import CoherenceLLMEval as CoherenceLLMEvalV1
+
+    coherence_eval = CoherenceLLMEvalV1(
+        provider="mock",
+        model="",
+        template=BinaryClassificationPromptTemplate(  # NOTE(review): ClassVar on the class; verify override
+            target_category="COHERENT",
+            non_target_category="INCOHERENT",
+        ),
+    )
+    judge = coherence_eval.feature("text")
+
+    data = pd.DataFrame({"text": ["COHERENT", "INCOHERENT"]})
+    dd = DataDefinition(columns={}, reference_present=False)
+    fts = judge.generate_features(data, dd, Options())
+
+    # Schema-only check: labels from the "mock" provider (presumably registered in this module) are not asserted.
+    assert "category" in fts.columns
+    assert len(fts) == 2
+
+
+def test_reference_free_evals_importable():
+    """Smoke test: both new descriptors are importable from the public `evidently.descriptors` package."""
+    from evidently.descriptors import CoherenceLLMEval  # noqa: F401
+    from evidently.descriptors import FluencyLLMEval  # noqa: F401
+