From 869002cfc65a24864c2f5ead0ec4a82340e2b117 Mon Sep 17 00:00:00 2001
From: Ovtcharov <kovtchar@amd.com>
Date: Thu, 28 May 2026 19:51:45 -0700
Subject: [PATCH 1/4] test(eval): cover scorecard, audit, runner, and claude
 judge (#1151)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The eval toolchain at src/gaia/eval/ had only 3 test files covering
analyze_failures, iterations, and MCP reliability — the four core
modules (runner, scorecard, audit, claude judge) had zero unit tests.

Adds 91 tests across 4 new files:
- test_scorecard.py — build_scorecard aggregation, write_summary_md,
  write_junit_xml, status counting, score capping, performance rollup
- test_audit.py — AST constant extraction, agent persistence detection,
  tool-results-in-history pattern matching, run_audit recommendations
- test_runner.py — validate_scenario schema checks, recompute_turn_score
  weighting, _validate_turn_scores dimension completeness,
  _aggregate_performance, _compute_effective_timeout,
  find_scenarios filtering, build_scenario_prompt assembly,
  compare_scorecards regression detection, AgentEvalRunner init
- test_claude_judge.py — ClaudeClient init validation, cost calculation,
  get_completion, get_completion_with_usage, count_tokens (all mocked)
---
 tests/unit/eval/test_audit.py        | 177 ++++++++
 tests/unit/eval/test_claude_judge.py | 215 +++++++++
 tests/unit/eval/test_runner.py       | 640 +++++++++++++++++++++++++++
 tests/unit/eval/test_scorecard.py    | 325 ++++++++++++++
 4 files changed, 1357 insertions(+)
 create mode 100644 tests/unit/eval/test_audit.py
 create mode 100644 tests/unit/eval/test_claude_judge.py
 create mode 100644 tests/unit/eval/test_runner.py
 create mode 100644 tests/unit/eval/test_scorecard.py
diff --git a/tests/unit/eval/test_audit.py b/tests/unit/eval/test_audit.py
new file mode 100644
index 000000000..3892a50a0
--- /dev/null
+++ b/tests/unit/eval/test_audit.py
@@ -0,0 +1,177 @@
+# Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: MIT
+"""Unit tests for ``gaia.eval.audit``.
+
+Tests the deterministic architecture audit helpers that inspect
+``_chat_helpers.py`` for constants and patterns, plus the full
+``run_audit()`` rollup.
+"""
+
+import textwrap
+from pathlib import Path
+
+import pytest
+
+from gaia.eval.audit import (
+    audit_agent_persistence,
+    audit_chat_helpers,
+    audit_tool_results_in_history,
+    run_audit,
+)
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _write_helpers(tmp_path, source):
+    """Dedent *source*, save it as ``tmp_path/_chat_helpers.py``, return the Path."""
+    helpers_file = tmp_path / "_chat_helpers.py"
+    helpers_file.write_text(textwrap.dedent(source), encoding="utf-8")
+    return helpers_file
+
+
+# ---------------------------------------------------------------------------
+# audit_chat_helpers
+# ---------------------------------------------------------------------------
+
+
+class TestAuditChatHelpers:
+    """Tests for ``audit_chat_helpers`` constant extraction."""
+
+    def test_extracts_max_constants(self, tmp_path, monkeypatch):
+        src = """\
+            _MAX_HISTORY_PAIRS = 10
+            _MAX_MSG_CHARS = 2000
+            _OTHER = 42
+        """
+        # audit_chat_helpers() hardcodes its input path relative to GAIA_ROOT
+        # (<root>/src/gaia/ui/_chat_helpers.py), so recreate that layout
+        # under tmp_path and point GAIA_ROOT at it.
+        ui_dir = tmp_path / "src" / "gaia" / "ui"
+        ui_dir.mkdir(parents=True)
+        _write_helpers(ui_dir, src)
+        monkeypatch.setattr("gaia.eval.audit.GAIA_ROOT", tmp_path)
+
+        result = audit_chat_helpers()
+
+        assert result["_MAX_HISTORY_PAIRS"] == 10
+        assert result["_MAX_MSG_CHARS"] == 2000
+        # Names that do not match the audited constants must be left out.
+        assert "_OTHER" not in result
+
+    def test_returns_empty_on_missing_file(self, tmp_path, monkeypatch):
+        # tmp_path has no src/gaia/ui/_chat_helpers.py underneath it.
+        monkeypatch.setattr("gaia.eval.audit.GAIA_ROOT", tmp_path)
+        result = audit_chat_helpers()
+        assert result == {}
+
+
+# ---------------------------------------------------------------------------
+# audit_agent_persistence
+# ---------------------------------------------------------------------------
+
+
+class TestAuditAgentPersistence:
+    def test_stateless_per_message(self, tmp_path):
+        helpers = _write_helpers(tmp_path, 'agent = ChatAgent(model="test")\n')
+        assert audit_agent_persistence(helpers) == "stateless_per_message"
+
+    def test_unknown_when_no_chatagent(self, tmp_path):
+        helpers = _write_helpers(tmp_path, "x = 1\n")
+        assert audit_agent_persistence(helpers) == "unknown"
+
+    def test_unknown_on_missing_file(self, tmp_path):
+        assert audit_agent_persistence(tmp_path / "nonexistent.py") == "unknown"
+
+
+# ---------------------------------------------------------------------------
+# audit_tool_results_in_history
+# ---------------------------------------------------------------------------
+
+
+class TestAuditToolResultsInHistory:
+    def test_detects_pattern(self, tmp_path):
+        source = """\
+            agent_steps = agent.run()
+            messages.append({"role": "tool", "content": agent_steps})
+        """
+        helpers = _write_helpers(tmp_path, source)
+        assert audit_tool_results_in_history(helpers) is True
+
+    def test_false_when_missing_agent_steps(self, tmp_path):
+        # An append to ``messages`` alone, with no ``agent_steps``, must not match.
+        helpers = _write_helpers(tmp_path, 'messages.append({"role": "user"})\n')
+        assert audit_tool_results_in_history(helpers) is False
+
+    def test_false_on_missing_file(self, tmp_path):
+        assert audit_tool_results_in_history(tmp_path / "nope.py") is False
+
+
+# ---------------------------------------------------------------------------
+# run_audit
+# ---------------------------------------------------------------------------
+
+
+class TestRunAudit:
+    """End-to-end checks of ``run_audit`` against a fake ``_chat_helpers.py``."""
+
+    @staticmethod
+    def _install_helpers(tmp_path, monkeypatch, src):
+        """Create <tmp_path>/src/gaia/ui/_chat_helpers.py and patch GAIA_ROOT."""
+        ui_dir = tmp_path / "src" / "gaia" / "ui"
+        ui_dir.mkdir(parents=True)
+        _write_helpers(ui_dir, src)
+        monkeypatch.setattr("gaia.eval.audit.GAIA_ROOT", tmp_path)
+
+    def test_returns_audit_key(self, tmp_path, monkeypatch):
+        """The audited values are reported under ``architecture_audit``."""
+        # A minimal _chat_helpers.py that satisfies all checks.
+        src = """\
+            _MAX_HISTORY_PAIRS = 3
+            _MAX_MSG_CHARS = 500
+            agent = ChatAgent(model="test")
+            agent_steps = agent.run()
+            messages.append({"role": "tool", "content": agent_steps})
+        """
+        self._install_helpers(tmp_path, monkeypatch, src)
+
+        result = run_audit()
+        assert "architecture_audit" in result
+        audit = result["architecture_audit"]
+        assert audit["history_pairs"] == 3
+        assert audit["max_msg_chars"] == 500
+        assert audit["tool_results_in_history"] is True
+        assert audit["agent_persistence"] == "stateless_per_message"
+
+    def test_recommendations_on_low_history(self, tmp_path, monkeypatch):
+        """A low ``_MAX_HISTORY_PAIRS`` yields ``increase_history_pairs``."""
+        self._install_helpers(tmp_path, monkeypatch, "_MAX_HISTORY_PAIRS = 2\n")
+
+        result = run_audit()
+        recs = result["architecture_audit"]["recommendations"]
+        rec_ids = [r["id"] for r in recs]
+        assert "increase_history_pairs" in rec_ids
+
+    def test_blocked_scenarios_on_low_msg_chars(self, tmp_path, monkeypatch):
+        """A low ``_MAX_MSG_CHARS`` blocks ``cross_turn_file_recall``."""
+        self._install_helpers(tmp_path, monkeypatch, "_MAX_MSG_CHARS = 500\n")
+
+        result = run_audit()
+        blocked = result["architecture_audit"]["blocked_scenarios"]
+        assert any(b["scenario"] == "cross_turn_file_recall" for b in blocked)
+
+    def test_no_recommendations_when_values_sufficient(self, tmp_path, monkeypatch):
+        """Generous limits produce no recommendations and no blocked scenarios."""
+        src = """\
+            _MAX_HISTORY_PAIRS = 20
+            _MAX_MSG_CHARS = 5000
+            agent = ChatAgent(model="test")
+            agent_steps = agent.run()
+            messages.append({"role": "tool", "content": agent_steps})
+        """
+        self._install_helpers(tmp_path, monkeypatch, src)
+
+        result = run_audit()
+        assert result["architecture_audit"]["recommendations"] == []
+        assert result["architecture_audit"]["blocked_scenarios"] == []
diff --git a/tests/unit/eval/test_claude_judge.py b/tests/unit/eval/test_claude_judge.py
new file mode 100644
index 000000000..d1605d328
--- /dev/null
+++ b/tests/unit/eval/test_claude_judge.py
@@ -0,0 +1,215 @@
+# Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: MIT
+"""Unit tests for ``gaia.eval.claude`` (the Claude judge client).
+
+All Anthropic API calls are mocked — no real API key or network needed.
+Tests cover: init validation, cost calculation, get_completion,
+get_completion_with_usage, and count_tokens.
+"""
+
+from types import SimpleNamespace
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+# ---------------------------------------------------------------------------
+# Shared mocks — we need anthropic + bs4 importable before importing ClaudeClient
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture(autouse=True)
+def _patch_imports(monkeypatch):
+    """Placeholder autouse fixture; it currently patches nothing."""
+    # NOTE(review): no-op — tests and fixtures below patch gaia.eval.claude themselves.
+    pass
+
+
+def _make_mock_anthropic():
+    """Return a mock ``anthropic`` module whose ``Anthropic`` attribute is a mock."""
+    # Keyword arguments to MagicMock are set as attributes on the new mock.
+    fake_module = MagicMock(Anthropic=MagicMock())
+    return fake_module
+
+
+def _make_mock_bs4():
+    """Return a mock ``bs4`` module whose ``BeautifulSoup`` attribute is a mock."""
+    fake_module = MagicMock(BeautifulSoup=MagicMock())
+    return fake_module
+
+
+# ---------------------------------------------------------------------------
+# Initialization
+# ---------------------------------------------------------------------------
+
+
+class TestClaudeClientInit:
+    """Constructor checks: API key, optional dependencies, and the happy path."""
+
+    def test_raises_on_missing_api_key(self, monkeypatch):
+        monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False)
+        monkeypatch.setattr("gaia.eval.claude.anthropic", _make_mock_anthropic())
+        monkeypatch.setattr("gaia.eval.claude.BeautifulSoup", MagicMock())
+        from gaia.eval.claude import ClaudeClient
+
+        with pytest.raises(ValueError, match="ANTHROPIC_API_KEY"):
+            ClaudeClient()
+
+    def test_raises_on_missing_anthropic(self, monkeypatch):
+        # Pin the key so the outcome never depends on the caller's environment.
+        monkeypatch.setenv("ANTHROPIC_API_KEY", "test-key-123")
+        monkeypatch.setattr("gaia.eval.claude.anthropic", None)
+        monkeypatch.setattr("gaia.eval.claude.BeautifulSoup", MagicMock())
+        from gaia.eval.claude import ClaudeClient
+
+        with pytest.raises(ImportError, match="anthropic"):
+            ClaudeClient()
+
+    def test_raises_on_missing_bs4(self, monkeypatch):
+        monkeypatch.setattr("gaia.eval.claude.anthropic", _make_mock_anthropic())
+        monkeypatch.setattr("gaia.eval.claude.BeautifulSoup", None)
+        monkeypatch.setenv("ANTHROPIC_API_KEY", "test-key-123")
+        from gaia.eval.claude import ClaudeClient
+
+        with pytest.raises(ImportError, match="bs4"):
+            ClaudeClient()
+
+    def test_success_with_valid_env(self, monkeypatch):
+        monkeypatch.setenv("ANTHROPIC_API_KEY", "test-key-123")
+        mock_anthropic = _make_mock_anthropic()
+        monkeypatch.setattr("gaia.eval.claude.anthropic", mock_anthropic)
+        monkeypatch.setattr("gaia.eval.claude.BeautifulSoup", MagicMock())
+        from gaia.eval.claude import ClaudeClient
+
+        client = ClaudeClient(model="claude-sonnet-4-6")
+        assert client.model == "claude-sonnet-4-6"
+        assert client.api_key == "test-key-123"
+        mock_anthropic.Anthropic.assert_called_once()
+
+
+# ---------------------------------------------------------------------------
+# Cost calculation
+# ---------------------------------------------------------------------------
+
+
+class TestCalculateCost:
+    @pytest.fixture()
+    def client(self, monkeypatch):
+        monkeypatch.setenv("ANTHROPIC_API_KEY", "test-key")
+        monkeypatch.setattr("gaia.eval.claude.anthropic", _make_mock_anthropic())
+        monkeypatch.setattr("gaia.eval.claude.BeautifulSoup", MagicMock())
+        from gaia.eval.claude import ClaudeClient
+
+        return ClaudeClient(model="claude-sonnet-4-6")
+
+    def test_known_model_pricing(self, client):
+        # One million tokens each way at $3/MTok in and $15/MTok out.
+        costs = client.calculate_cost(1_000_000, 1_000_000)
+        assert costs["input_cost"] == pytest.approx(3.0, abs=1e-4)
+        assert costs["output_cost"] == pytest.approx(15.0, abs=1e-4)
+        assert costs["total_cost"] == pytest.approx(18.0, abs=1e-4)
+
+    def test_small_token_count(self, client):
+        costs = client.calculate_cost(100, 50)
+        assert costs["input_cost"] == pytest.approx(100 / 1_000_000 * 3.0, abs=1e-6)
+        assert costs["output_cost"] == pytest.approx(50 / 1_000_000 * 15.0, abs=1e-6)
+
+    def test_unknown_model_uses_default(self, monkeypatch):
+        monkeypatch.setenv("ANTHROPIC_API_KEY", "test-key")
+        monkeypatch.setattr("gaia.eval.claude.anthropic", _make_mock_anthropic())
+        monkeypatch.setattr("gaia.eval.claude.BeautifulSoup", MagicMock())
+        from gaia.eval.claude import ClaudeClient
+
+        # An unrecognised model name is priced at the $3/$15 default rates.
+        judge = ClaudeClient(model="claude-future-9000")
+        costs = judge.calculate_cost(1_000_000, 1_000_000)
+        assert costs["total_cost"] == pytest.approx(18.0, abs=1e-4)
+
+
+# ---------------------------------------------------------------------------
+# get_completion
+# ---------------------------------------------------------------------------
+
+
+class TestGetCompletion:
+    """``get_completion`` hands back SDK content and lets SDK errors propagate."""
+
+    @pytest.fixture()
+    def client(self, monkeypatch):
+        """Build a ``ClaudeClient`` against a mocked ``anthropic`` module."""
+        monkeypatch.setenv("ANTHROPIC_API_KEY", "test-key")
+        monkeypatch.setattr("gaia.eval.claude.anthropic", _make_mock_anthropic())
+        monkeypatch.setattr("gaia.eval.claude.BeautifulSoup", MagicMock())
+        from gaia.eval.claude import ClaudeClient
+
+        return ClaudeClient(model="claude-sonnet-4-6")
+
+    def test_returns_content(self, client):
+        sdk_create = client.client.messages.create
+        blocks = [SimpleNamespace(text="Hello world")]
+        sdk_create.return_value = SimpleNamespace(content=blocks)
+
+        assert client.get_completion("test prompt") == blocks
+        sdk_create.assert_called_once()
+
+    def test_propagates_api_error(self, client):
+        client.client.messages.create.side_effect = RuntimeError("API down")
+        with pytest.raises(RuntimeError, match="API down"):
+            client.get_completion("test")
+
+
+# ---------------------------------------------------------------------------
+# get_completion_with_usage
+# ---------------------------------------------------------------------------
+
+
+class TestGetCompletionWithUsage:
+    @pytest.fixture()
+    def client(self, monkeypatch):
+        monkeypatch.setenv("ANTHROPIC_API_KEY", "test-key")
+        monkeypatch.setattr("gaia.eval.claude.anthropic", _make_mock_anthropic())
+        monkeypatch.setattr("gaia.eval.claude.BeautifulSoup", MagicMock())
+        from gaia.eval.claude import ClaudeClient
+
+        return ClaudeClient(model="claude-sonnet-4-6")
+
+    def test_returns_usage_and_cost(self, client):
+        reply_blocks = [SimpleNamespace(text="response")]
+        token_usage = SimpleNamespace(input_tokens=500, output_tokens=200)
+        sdk_reply = SimpleNamespace(content=reply_blocks, usage=token_usage)
+        client.client.messages.create.return_value = sdk_reply
+
+        result = client.get_completion_with_usage("prompt")
+        assert result["content"] == sdk_reply.content
+        usage = result["usage"]
+        assert usage["input_tokens"] == 500
+        assert usage["output_tokens"] == 200
+        assert usage["total_tokens"] == 700
+        assert "cost" in result
+        assert result["cost"]["total_cost"] > 0
+
+
+# ---------------------------------------------------------------------------
+# count_tokens
+# ---------------------------------------------------------------------------
+
+
+class TestCountTokens:
+    @pytest.fixture()
+    def client(self, monkeypatch):
+        monkeypatch.setenv("ANTHROPIC_API_KEY", "test-key")
+        monkeypatch.setattr("gaia.eval.claude.anthropic", _make_mock_anthropic())
+        monkeypatch.setattr("gaia.eval.claude.BeautifulSoup", MagicMock())
+        from gaia.eval.claude import ClaudeClient
+
+        return ClaudeClient(model="claude-sonnet-4-6")
+
+    def test_delegates_to_sdk(self, client):
+        sdk_count_tokens = client.client.messages.count_tokens
+        sdk_count_tokens.return_value = SimpleNamespace(input_tokens=42)
+        counted = client.count_tokens("test prompt")
+
+        assert counted.input_tokens == 42
+        sdk_count_tokens.assert_called_once_with(
+            model="claude-sonnet-4-6",
+            messages=[{"role": "user", "content": "test prompt"}],
+        )
diff --git a/tests/unit/eval/test_runner.py b/tests/unit/eval/test_runner.py
new file mode 100644
index 000000000..3c4103f21
--- /dev/null
+++ b/tests/unit/eval/test_runner.py
@@ -0,0 +1,640 @@
+# Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: MIT
+"""Unit tests for ``gaia.eval.runner``.
+
+Tests cover:
+  - validate_scenario (schema validation)
+  - recompute_turn_score (weighted scoring)
+  - _validate_turn_scores (dimension completeness)
+  - _aggregate_performance (per-turn → scenario rollup)
+  - _compute_effective_timeout (per-scenario timeout scaling)
+  - find_scenarios (filtering by id/category/tags, with mocked YAML)
+  - build_scenario_prompt (prompt assembly)
+  - compare_scorecards (regression detection)
+  - AgentEvalRunner.__init__ (configuration)
+
+All file/network/subprocess calls are mocked — no real LLM or Agent UI needed.
+"""
+
+import json
+import textwrap
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from gaia.eval.runner import (
+    _SCORE_WEIGHTS,
+    _aggregate_performance,
+    _compute_effective_timeout,
+    _validate_turn_scores,
+    compare_scorecards,
+    recompute_turn_score,
+    validate_scenario,
+)
+
+# ---------------------------------------------------------------------------
+# validate_scenario
+# ---------------------------------------------------------------------------
+
+
+class TestValidateScenario:
+    """Schema checks enforced by ``validate_scenario``."""
+
+    def _valid_scenario(self, **overrides):
+        """Return a minimal valid scenario dict with *overrides* applied on top."""
+        only_turn = {
+            "turn": 1,
+            "objective": "Ask about X",
+            "ground_truth": {"answer": "Y"},
+        }
+        scenario = {
+            "id": "test_scenario",
+            "category": "general",
+            "persona": "casual_user",
+            "setup": {"index_documents": []},
+            "turns": [only_turn],
+            **overrides,
+        }
+        return scenario
+
+    def test_valid_scenario_passes(self, tmp_path):
+        """The baseline scenario validates without raising."""
+        scenario = self._valid_scenario()
+        validate_scenario(tmp_path / "test.yaml", scenario)  # should not raise
+
+    def test_missing_required_field(self, tmp_path):
+        """Dropping ``id`` is reported as a missing top-level field."""
+        scenario = self._valid_scenario()
+        scenario.pop("id")
+        with pytest.raises(ValueError, match="missing top-level field 'id'"):
+            validate_scenario(tmp_path / "test.yaml", scenario)
+
+    def test_missing_setup_index_documents(self, tmp_path):
+        """An empty ``setup`` mapping lacks ``index_documents``."""
+        scenario = self._valid_scenario(setup={})
+        with pytest.raises(ValueError, match="setup.index_documents is missing"):
+            validate_scenario(tmp_path / "test.yaml", scenario)
+
+    def test_empty_turns(self, tmp_path):
+        """A scenario with no turns is rejected."""
+        scenario = self._valid_scenario(turns=[])
+        with pytest.raises(ValueError, match="turns list is empty"):
+            validate_scenario(tmp_path / "test.yaml", scenario)
+
+    def test_duplicate_turn_numbers(self, tmp_path):
+        """Two turns sharing the number 1 are rejected."""
+        first = {
+            "turn": 1,
+            "objective": "X",
+            "ground_truth": {"answer": "A"},
+        }
+        second = {
+            "turn": 1,
+            "objective": "Y",
+            "ground_truth": {"answer": "B"},
+        }
+        scenario = self._valid_scenario(turns=[first, second])
+        with pytest.raises(ValueError, match="duplicate turn number"):
+            validate_scenario(tmp_path / "test.yaml", scenario)
+
+    def test_non_sequential_turns(self, tmp_path):
+        """Turn numbers 1 and 3 leave a gap and are rejected."""
+        first = {
+            "turn": 1,
+            "objective": "X",
+            "ground_truth": {"answer": "A"},
+        }
+        third = {
+            "turn": 3,
+            "objective": "Y",
+            "ground_truth": {"answer": "B"},
+        }
+        scenario = self._valid_scenario(turns=[first, third])
+        with pytest.raises(ValueError, match="sequential"):
+            validate_scenario(tmp_path / "test.yaml", scenario)
+
+    def test_turn_without_objective(self, tmp_path):
+        """A turn needs an ``objective``."""
+        turn = {"turn": 1, "ground_truth": {"answer": "A"}}
+        scenario = self._valid_scenario(turns=[turn])
+        with pytest.raises(ValueError, match="missing 'objective'"):
+            validate_scenario(tmp_path / "test.yaml", scenario)
+
+    def test_turn_without_ground_truth_or_criteria(self, tmp_path):
+        """A turn must carry ``ground_truth`` or ``success_criteria``."""
+        scenario = self._valid_scenario(turns=[{"turn": 1, "objective": "X"}])
+        with pytest.raises(ValueError, match="ground_truth.*success_criteria"):
+            validate_scenario(tmp_path / "test.yaml", scenario)
+
+    def test_success_criteria_as_string_is_valid(self, tmp_path):
+        """A plain-string ``success_criteria`` can stand in for ``ground_truth``."""
+        turn = {"turn": 1, "objective": "X", "success_criteria": "Agent says hello"}
+        scenario = self._valid_scenario(turns=[turn])
+        validate_scenario(tmp_path / "test.yaml", scenario)  # should not raise
+
+    def test_success_criteria_as_dict_rejected(self, tmp_path):
+        """``success_criteria`` given as a mapping is rejected."""
+        turn = {"turn": 1, "objective": "X", "success_criteria": {"key": "val"}}
+        scenario = self._valid_scenario(turns=[turn])
+        with pytest.raises(ValueError, match="success_criteria must be a string"):
+            validate_scenario(tmp_path / "test.yaml", scenario)
+
+    def test_persona_non_string_rejected(self, tmp_path):
+        """A numeric persona is rejected."""
+        scenario = self._valid_scenario(persona=42)
+        with pytest.raises(ValueError, match="persona must be a string"):
+            validate_scenario(tmp_path / "test.yaml", scenario)
+
+    def test_persona_empty_string_rejected(self, tmp_path):
+        """A whitespace-only persona is rejected as empty."""
+        scenario = self._valid_scenario(persona="  ")
+        with pytest.raises(ValueError, match="persona must be a non-empty string"):
+            validate_scenario(tmp_path / "test.yaml", scenario)
+
+    def test_custom_persona_accepted(self, tmp_path):
+        """An arbitrary persona name is accepted."""
+        scenario = self._valid_scenario(persona="my_custom_persona")
+        validate_scenario(tmp_path / "test.yaml", scenario)  # should not raise
+
+    def test_missing_path_in_index_documents(self, tmp_path):
+        """Each ``index_documents`` entry needs a ``path``."""
+        docs = [{"title": "doc1"}]
+        scenario = self._valid_scenario(setup={"index_documents": docs})
+        with pytest.raises(ValueError, match="missing 'path' field"):
+            validate_scenario(tmp_path / "test.yaml", scenario)
+
+
+# ---------------------------------------------------------------------------
+# recompute_turn_score
+# ---------------------------------------------------------------------------
+
+
+class TestRecomputeTurnScore:
+    def _full_scores(self, **overrides):
+        """Return 8.0 for every ``_SCORE_WEIGHTS`` key, with *overrides* applied."""
+        scores = {k: 8.0 for k in _SCORE_WEIGHTS}
+        scores.update(overrides)
+        return scores
+
+    def test_uniform_scores(self):
+        assert recompute_turn_score(self._full_scores()) == pytest.approx(8.0)
+
+    def test_missing_dimension_returns_minus_one(self):
+        scores = self._full_scores()
+        del scores["correctness"]
+        assert recompute_turn_score(scores) == -1.0
+
+    def test_non_numeric_dimension_returns_minus_one(self):
+        scores = self._full_scores(correctness="high")
+        assert recompute_turn_score(scores) == -1.0
+
+    def test_clamps_to_range(self):
+        scores = self._full_scores(correctness=15.0, personality=-5.0)
+        result = recompute_turn_score(scores)
+        # correctness clamped to 10, personality to 0
+        expected = (
+            10.0 * _SCORE_WEIGHTS["correctness"]
+            + 0.0 * _SCORE_WEIGHTS["personality"]
+            + sum(
+                8.0 * w
+                for k, w in _SCORE_WEIGHTS.items()
+                if k not in ("correctness", "personality")
+            )
+        )
+        assert result == pytest.approx(expected)
+
+    def test_weighted_correctly(self):
+        # Only ``correctness`` is non-zero, so the total is its weighted share.
+        scores = {k: 0.0 for k in _SCORE_WEIGHTS}
+        scores["correctness"] = 10.0
+        result = recompute_turn_score(scores)
+        assert result == pytest.approx(10.0 * _SCORE_WEIGHTS["correctness"])
+
+
+# ---------------------------------------------------------------------------
+# _validate_turn_scores
+# ---------------------------------------------------------------------------
+
+
+class TestValidateTurnScores:
+    """Warnings produced by ``_validate_turn_scores`` for scored turns."""
+
+    def test_no_warnings_when_all_complete(self):
+        """Every weighted dimension is present, so nothing is reported."""
+        complete_turn = {
+            "turn": 1,
+            "scores": dict.fromkeys(_SCORE_WEIGHTS, 8.0),
+            "overall_score": 8.0,
+        }
+        result = {"turns": [complete_turn]}
+        assert _validate_turn_scores(result) == []
+
+    def test_warns_on_missing_dimensions(self):
+        """A turn scored on ``correctness`` alone yields one warning for Turn 1."""
+        partial_turn = {
+            "turn": 1,
+            "scores": {"correctness": 8.0},  # missing other dimensions
+            "overall_score": 8.0,
+        }
+        result = {"turns": [partial_turn]}
+        warnings = _validate_turn_scores(result)
+        assert len(warnings) == 1
+        assert "Turn 1" in warnings[0]
+
+    def test_no_warning_when_no_overall_score(self):
+        """A turn whose ``overall_score`` is None produces no warning."""
+        unscored_turn = {"turn": 1, "scores": {}, "overall_score": None}
+        result = {"turns": [unscored_turn]}
+        assert _validate_turn_scores(result) == []
+
+
+# ---------------------------------------------------------------------------
+# _aggregate_performance
+# ---------------------------------------------------------------------------
+
+
+class TestAggregatePerformance:
+    """Rollup of per-turn ``performance`` into ``performance_summary``."""
+
+    def test_aggregates_from_turns(self):
+        """Averages rates, sums token counts, and keeps the flags of both turns."""
+        slow_turn = {
+            "tokens_per_second": 40.0,
+            "time_to_first_token": 1.0,
+            "input_tokens": 100,
+            "output_tokens": 200,
+            "flags": ["slow"],
+        }
+        fast_turn = {
+            "tokens_per_second": 60.0,
+            "time_to_first_token": 0.5,
+            "input_tokens": 150,
+            "output_tokens": 250,
+            "flags": ["ok"],
+        }
+        result = {"turns": [{"performance": slow_turn}, {"performance": fast_turn}]}
+
+        _aggregate_performance(result, "test-scenario")
+        summary = result["performance_summary"]
+
+        assert summary["avg_tokens_per_second"] == pytest.approx(50.0, abs=0.1)
+        assert summary["avg_time_to_first_token"] == pytest.approx(0.75, abs=0.001)
+        assert summary["total_input_tokens"] == 250
+        assert summary["total_output_tokens"] == 450
+        assert "slow" in summary["flags"]
+        assert "ok" in summary["flags"]
+
+    def test_none_when_no_perf_data(self):
+        """A turn whose ``performance`` is None yields no summary."""
+        result = {"turns": [{"performance": None}]}
+        _aggregate_performance(result, "s")
+
+        assert result["performance_summary"] is None
+
+    def test_handles_missing_performance_key(self):
+        """A turn without a ``performance`` key yields no summary."""
+        result = {"turns": [{"turn": 1}]}
+        _aggregate_performance(result, "s")
+
+        assert result["performance_summary"] is None
+
+    def test_skips_invalid_values(self):
+        """Invalid rates give None averages; the valid output count is kept."""
+        bad_perf = {
+            "tokens_per_second": -1,  # invalid
+            "time_to_first_token": 0,  # invalid
+            "input_tokens": "not_a_number",
+            "output_tokens": 100,
+        }
+        result = {"turns": [{"performance": bad_perf}]}
+
+        _aggregate_performance(result, "s")
+        summary = result["performance_summary"]
+
+        assert summary["avg_tokens_per_second"] is None
+        assert summary["avg_time_to_first_token"] is None
+        assert summary["total_output_tokens"] == 100
+
+
+# ---------------------------------------------------------------------------
+# _compute_effective_timeout
+# ---------------------------------------------------------------------------
+
+
+class TestComputeEffectiveTimeout:
+    """Bounds on the timeout derived by ``_compute_effective_timeout``."""
+
+    def test_base_timeout_when_no_turns_or_docs(self):
+        empty_scenario = {"turns": [], "setup": {"index_documents": []}}
+        timeout = _compute_effective_timeout(900, empty_scenario)
+        assert timeout >= 240  # at least startup overhead
+
+    def test_scales_with_turns_and_docs(self):
+        turns = [{"turn": 1}, {"turn": 2}]
+        docs = [{"path": "a.pdf"}, {"path": "b.pdf"}]
+        scenario = {"turns": turns, "setup": {"index_documents": docs}}
+        timeout = _compute_effective_timeout(900, scenario)
+        floor = 240 + 2 * 90 + 2 * 200  # startup + docs + turns
+        assert timeout >= floor
+
+    def test_capped_at_max(self):
+        # 100 turns and 100 documents, checked against the 7200 upper bound.
+        turns = [{"turn": i} for i in range(100)]
+        docs = [{"path": f"{i}.pdf"} for i in range(100)]
+        scenario = {"turns": turns, "setup": {"index_documents": docs}}
+        timeout = _compute_effective_timeout(900, scenario)
+        assert timeout <= 7200
+
+
+# ---------------------------------------------------------------------------
+# find_scenarios (with mocked filesystem)
+# ---------------------------------------------------------------------------
+
+
+class TestFindScenarios:
+    def _write_scenario(self, d, sid, category="general", tags=None):
+        """Dump a minimal scenario to ``d/<sid>.yaml`` and return that path."""
+        import yaml
+
+        scenario = {
+            "id": sid,
+            "category": category,
+            "persona": "casual_user",
+            "setup": {"index_documents": []},
+            "turns": [{"turn": 1, "objective": "X", "ground_truth": {"answer": "A"}}],
+            **({"tags": tags} if tags else {}),
+        }
+        scenario_file = d / f"{sid}.yaml"
+        scenario_file.write_text(yaml.dump(scenario), encoding="utf-8")
+        return scenario_file
+
+    def test_finds_by_category(self, tmp_path, monkeypatch):
+        # SCENARIOS_DIR -> tmp_path; USER_SCENARIOS_DIR -> a path that does not exist.
+        missing_dir = tmp_path / "no-exist"
+        monkeypatch.setattr("gaia.eval.runner.SCENARIOS_DIR", tmp_path)
+        monkeypatch.setattr("gaia.eval.runner.USER_SCENARIOS_DIR", missing_dir)
+        self._write_scenario(tmp_path, "s1", category="rag")
+        self._write_scenario(tmp_path, "s2", category="tool")
+
+        from gaia.eval.runner import find_scenarios
+
+        matches = find_scenarios(category="rag")
+        assert len(matches) == 1
+        assert matches[0][1]["id"] == "s1"
+
+    def test_finds_by_id(self, tmp_path, monkeypatch):
+        monkeypatch.setattr("gaia.eval.runner.SCENARIOS_DIR", tmp_path)
+        monkeypatch.setattr(
+            "gaia.eval.runner.USER_SCENARIOS_DIR", tmp_path / "no-exist"
+        )
+        self._write_scenario(tmp_path, "alpha")
+        self._write_scenario(tmp_path, "beta")
+
+        from gaia.eval.runner import find_scenarios
+
+        results = find_scenarios(scenario_id="beta")
+        assert len(results) == 1
+        assert results[0][1]["id"] == "beta"
+
+    def test_filters_by_tags(self, tmp_path, monkeypatch):
+        monkeypatch.setattr("gaia.eval.runner.SCENARIOS_DIR", tmp_path)
+        monkeypatch.setattr(
+            "gaia.eval.runner.USER_SCENARIOS_DIR", tmp_path / "no-exist"
+        )
+        self._write_scenario(tmp_path, "s1", tags=["v1", "regression"])
+        self._write_scenario(tmp_path, "s2", tags=["v2"])
+
+        from gaia.eval.runner import find_scenarios
+
+        results = find_scenarios(tags=["regression"])
+        assert len(results) == 1
+        assert results[0][1]["id"] == "s1"
+
+    def test_extra_dirs_override(self, tmp_path, monkeypatch):
+        builtin = tmp_path / "builtin"
+        builtin.mkdir()
+        extra = tmp_path / "extra"
+        extra.mkdir()
+        monkeypatch.setattr("gaia.eval.runner.SCENARIOS_DIR", builtin)
+        monkeypatch.setattr(
+            "gaia.eval.runner.USER_SCENARIOS_DIR", tmp_path / "no-exist"
+        )
+        self._write_scenario(builtin, "overlap", category="old")
+        self._write_scenario(extra, "overlap", category="new")
+
+        from gaia.eval.runner import find_scenarios
+
+        results = find_scenarios(extra_dirs=[str(extra)])
+        assert len(results) == 1
+        assert results[0][1]["category"] == "new"
+
+
+# ---------------------------------------------------------------------------
+# build_scenario_prompt
+# ---------------------------------------------------------------------------
+
+
+class TestBuildScenarioPrompt:
+    def test_includes_scenario_yaml(self, monkeypatch):
+        # Mock the prompt-file loaders
+        monkeypatch.setattr("gaia.eval.runner._load_simulator_content", lambda: "SIM")
+        monkeypatch.setattr("gaia.eval.runner._load_judge_turn_content", lambda: "TURN")
+        monkeypatch.setattr(
+            "gaia.eval.runner._load_judge_scenario_content", lambda: "SCENARIO"
+        )
+        from gaia.eval.runner import build_scenario_prompt
+
+        scenario = {"id": "test_s", "category": "rag", "turns": []}
+        prompt = build_scenario_prompt(
+            scenario, {"documents": []}, "http://localhost:4200"
+        )
+        assert "test_s" in prompt
+        assert "SIM" in prompt
+        assert "TURN" in prompt
+        assert "SCENARIO" in prompt
+        assert "http://localhost:4200" in prompt
+
+    def test_agent_type_injected(self, monkeypatch):
+        monkeypatch.setattr("gaia.eval.runner._load_simulator_content", lambda: "")
+        monkeypatch.setattr("gaia.eval.runner._load_judge_turn_content", lambda: "")
+        monkeypatch.setattr("gaia.eval.runner._load_judge_scenario_content", lambda: "")
+        from gaia.eval.runner import build_scenario_prompt
+
+        prompt = build_scenario_prompt(
+            {"id": "s", "turns": []},
+            {},
+            "http://localhost:4200",
+            agent_type="gaia-lite",
+        )
+        assert 'agent_type="gaia-lite"' in prompt
+
+
+# ---------------------------------------------------------------------------
+# compare_scorecards
+# ---------------------------------------------------------------------------
+
+
+class TestCompareScorecards:
+    def _write_scorecard(self, path, scenarios, summary_overrides=None):
+        summary = {
+            "total_scenarios": len(scenarios),
+            "passed": sum(1 for s in scenarios if s["status"] == "PASS"),
+            "failed": sum(1 for s in scenarios if s["status"] == "FAIL"),
+            "pass_rate": 0.0,
+            "judged_pass_rate": 0.0,
+            "avg_score": 0.0,
+        }
+        total = summary["total_scenarios"]
+        if total:
+            summary["pass_rate"] = summary["passed"] / total
+        if summary_overrides:
+            summary.update(summary_overrides)
+        data = {"summary": summary, "scenarios": scenarios}
+        Path(path).write_text(json.dumps(data), encoding="utf-8")
+
+    def test_detects_regression(self, tmp_path):
+        base = tmp_path / "base.json"
+        curr = tmp_path / "curr.json"
+        self._write_scorecard(
+            base,
+            [{"scenario_id": "s1", "status": "PASS", "overall_score": 8.0}],
+        )
+        self._write_scorecard(
+            curr,
+            [{"scenario_id": "s1", "status": "FAIL", "overall_score": 3.0}],
+        )
+        result = compare_scorecards(base, curr)
+        assert len(result["regressed"]) == 1
+        assert result["regressed"][0]["scenario_id"] == "s1"
+
+    def test_detects_improvement(self, tmp_path):
+        base = tmp_path / "base.json"
+        curr = tmp_path / "curr.json"
+        self._write_scorecard(
+            base,
+            [{"scenario_id": "s1", "status": "FAIL", "overall_score": 3.0}],
+        )
+        self._write_scorecard(
+            curr,
+            [{"scenario_id": "s1", "status": "PASS", "overall_score": 8.0}],
+        )
+        result = compare_scorecards(base, curr)
+        assert len(result["improved"]) == 1
+
+    def test_detects_score_regression(self, tmp_path):
+        base = tmp_path / "base.json"
+        curr = tmp_path / "curr.json"
+        self._write_scorecard(
+            base,
+            [{"scenario_id": "s1", "status": "PASS", "overall_score": 9.0}],
+        )
+        self._write_scorecard(
+            curr,
+            [{"scenario_id": "s1", "status": "PASS", "overall_score": 6.5}],
+        )
+        result = compare_scorecards(base, curr)
+        assert len(result["score_regressed"]) == 1
+
+    def test_only_in_baseline_and_current(self, tmp_path):
+        base = tmp_path / "base.json"
+        curr = tmp_path / "curr.json"
+        self._write_scorecard(
+            base,
+            [{"scenario_id": "old", "status": "PASS", "overall_score": 8.0}],
+        )
+        self._write_scorecard(
+            curr,
+            [{"scenario_id": "new", "status": "PASS", "overall_score": 8.0}],
+        )
+        result = compare_scorecards(base, curr)
+        assert "old" in result["only_in_baseline"]
+        assert "new" in result["only_in_current"]
+
+    def test_corpus_changed(self, tmp_path):
+        base = tmp_path / "base.json"
+        curr = tmp_path / "curr.json"
+        self._write_scorecard(
+            base,
+            [{"scenario_id": "s1", "status": "PASS", "overall_score": 8.0}],
+        )
+        self._write_scorecard(
+            curr,
+            [
+                {
+                    "scenario_id": "s1",
+                    "status": "SKIPPED_NO_DOCUMENT",
+                    "overall_score": None,
+                }
+            ],
+        )
+        result = compare_scorecards(base, curr)
+        assert len(result["corpus_changed"]) == 1
+
+    def test_missing_baseline_raises(self, tmp_path):
+        with pytest.raises(FileNotFoundError):
+            compare_scorecards(tmp_path / "nope.json", tmp_path / "also-nope.json")
+
+    def test_time_regression(self, tmp_path):
+        base = tmp_path / "base.json"
+        curr = tmp_path / "curr.json"
+        self._write_scorecard(
+            base,
+            [
+                {
+                    "scenario_id": "s1",
+                    "status": "PASS",
+                    "overall_score": 8.0,
+                    "elapsed_s": 30.0,
+                }
+            ],
+        )
+        self._write_scorecard(
+            curr,
+            [
+                {
+                    "scenario_id": "s1",
+                    "status": "PASS",
+                    "overall_score": 8.0,
+                    "elapsed_s": 120.0,
+                }
+            ],
+        )
+        result = compare_scorecards(base, curr)
+        assert len(result["time_regressed"]) == 1
+
+
+# ---------------------------------------------------------------------------
+# AgentEvalRunner.__init__
+# ---------------------------------------------------------------------------
+
+
+class TestAgentEvalRunnerInit:
+    def test_defaults(self):
+        from gaia.eval.runner import AgentEvalRunner
+
+        runner = AgentEvalRunner()
+        assert runner.backend_url == "http://localhost:4200"
+        assert runner.model == "claude-sonnet-4-6"
+        assert runner.budget == "2.00"
+        assert runner.timeout == 900
+
+    def test_custom_args(self, tmp_path):
+        from gaia.eval.runner import AgentEvalRunner
+
+        runner = AgentEvalRunner(
+            backend_url="http://custom:5000",
+            model="claude-opus-4",
+            budget_per_scenario="5.00",
+            timeout_per_scenario=1200,
+            results_dir=str(tmp_path),
+            tags=["regression"],
+            agent_type="gaia-lite",
+        )
+        assert runner.backend_url == "http://custom:5000"
+        assert runner.model == "claude-opus-4"
+        assert runner.budget == "5.00"
+        assert runner.timeout == 1200
+        assert runner.results_dir == tmp_path
+        assert runner.tags == ["regression"]
+        assert runner.agent_type == "gaia-lite"
diff --git a/tests/unit/eval/test_scorecard.py b/tests/unit/eval/test_scorecard.py
new file mode 100644
index 000000000..26972120d
--- /dev/null
+++ b/tests/unit/eval/test_scorecard.py
@@ -0,0 +1,325 @@
+# Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: MIT
+"""Unit tests for ``gaia.eval.scorecard``.
+
+Covers build_scorecard aggregation, write_summary_md rendering,
+write_junit_xml conversion, and edge cases (empty results, mixed statuses,
+performance data, unrecognized statuses).
+"""
+
+import xml.etree.ElementTree as ET
+
+import pytest
+
+from gaia.eval.scorecard import build_scorecard, write_junit_xml, write_summary_md
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+_MINIMAL_CONFIG = {"model": "test-model", "budget": "1.00"}
+
+
+def _result(
+    scenario_id,
+    status,
+    overall_score,
+    category="general",
+    cost_usd=0.0,
+    performance_summary=None,
+    root_cause=None,
+):
+    r = {
+        "scenario_id": scenario_id,
+        "status": status,
+        "overall_score": overall_score,
+        "category": category,
+        "turns": [],
+        "cost_estimate": {"turns": 1, "estimated_usd": cost_usd},
+    }
+    if performance_summary is not None:
+        r["performance_summary"] = performance_summary
+    if root_cause is not None:
+        r["root_cause"] = root_cause
+    return r
+
+
+# ---------------------------------------------------------------------------
+# build_scorecard — basic counts
+# ---------------------------------------------------------------------------
+
+
+class TestBuildScorecardCounts:
+    def test_all_pass(self):
+        results = [_result("a", "PASS", 8.0), _result("b", "PASS", 9.0)]
+        sc = build_scorecard("run-1", results, _MINIMAL_CONFIG)
+        s = sc["summary"]
+        assert s["total_scenarios"] == 2
+        assert s["passed"] == 2
+        assert s["failed"] == 0
+        assert s["pass_rate"] == 1.0
+        assert s["judged_pass_rate"] == 1.0
+
+    def test_mixed_statuses(self):
+        results = [
+            _result("a", "PASS", 8.0),
+            _result("b", "FAIL", 4.0),
+            _result("c", "BLOCKED_BY_ARCHITECTURE", 3.0),
+            _result("d", "TIMEOUT", None),
+            _result("e", "BUDGET_EXCEEDED", None),
+            _result("f", "INFRA_ERROR", None),
+            _result("g", "SETUP_ERROR", None),
+            _result("h", "SKIPPED_NO_DOCUMENT", None),
+        ]
+        sc = build_scorecard("run-2", results, _MINIMAL_CONFIG)
+        s = sc["summary"]
+        assert s["total_scenarios"] == 8
+        assert s["passed"] == 1
+        assert s["failed"] == 1
+        assert s["blocked"] == 1
+        assert s["timeout"] == 1
+        assert s["budget_exceeded"] == 1
+        assert s["infra_error"] == 2  # INFRA_ERROR + SETUP_ERROR
+        assert s["skipped"] == 1
+        assert s["errored"] == 0
+
+    def test_errored_for_unknown_status(self):
+        results = [_result("x", "SOMETHING_NEW", 5.0)]
+        sc = build_scorecard("run-3", results, _MINIMAL_CONFIG)
+        assert sc["summary"]["errored"] == 1
+        assert "warnings" in sc
+
+    def test_empty_results(self):
+        sc = build_scorecard("run-empty", [], _MINIMAL_CONFIG)
+        s = sc["summary"]
+        assert s["total_scenarios"] == 0
+        assert s["pass_rate"] == 0.0
+        assert s["avg_score"] == 0.0
+
+
+# ---------------------------------------------------------------------------
+# build_scorecard — avg_score
+# ---------------------------------------------------------------------------
+
+
+class TestBuildScorecardScoring:
+    def test_avg_score_excludes_infra(self):
+        """TIMEOUT/BUDGET_EXCEEDED/INFRA_ERROR must NOT dilute avg_score."""
+        results = [
+            _result("a", "PASS", 8.0),
+            _result("b", "TIMEOUT", None),
+        ]
+        sc = build_scorecard("run-s1", results, _MINIMAL_CONFIG)
+        assert sc["summary"]["avg_score"] == 8.0
+
+    def test_fail_scores_capped_at_5_99(self):
+        """FAIL scenarios with score >= 6 should be capped at 5.99 for averaging."""
+        results = [_result("a", "FAIL", 7.0)]
+        sc = build_scorecard("run-cap", results, _MINIMAL_CONFIG)
+        assert sc["summary"]["avg_score"] == 5.99
+
+    def test_pass_scores_not_capped(self):
+        results = [_result("a", "PASS", 9.5)]
+        sc = build_scorecard("run-nocap", results, _MINIMAL_CONFIG)
+        assert sc["summary"]["avg_score"] == 9.5
+
+    def test_judged_pass_rate(self):
+        """judged_pass_rate denominator is PASS + FAIL + BLOCKED only."""
+        results = [
+            _result("a", "PASS", 8.0),
+            _result("b", "FAIL", 3.0),
+            _result("c", "TIMEOUT", None),
+        ]
+        sc = build_scorecard("run-jpr", results, _MINIMAL_CONFIG)
+        assert sc["summary"]["judged_pass_rate"] == pytest.approx(0.5)
+
+
+# ---------------------------------------------------------------------------
+# build_scorecard — by_category
+# ---------------------------------------------------------------------------
+
+
+class TestBuildScorecardCategory:
+    def test_category_breakdown(self):
+        results = [
+            _result("a", "PASS", 9.0, category="rag"),
+            _result("b", "FAIL", 4.0, category="rag"),
+            _result("c", "PASS", 8.0, category="tool"),
+        ]
+        sc = build_scorecard("run-cat", results, _MINIMAL_CONFIG)
+        by_cat = sc["summary"]["by_category"]
+        assert "rag" in by_cat
+        assert "tool" in by_cat
+        assert by_cat["rag"]["passed"] == 1
+        assert by_cat["rag"]["failed"] == 1
+        assert by_cat["tool"]["passed"] == 1
+
+    def test_category_avg_score_caps_fail(self):
+        results = [_result("a", "FAIL", 7.5, category="q")]
+        sc = build_scorecard("run-catcap", results, _MINIMAL_CONFIG)
+        assert sc["summary"]["by_category"]["q"]["avg_score"] == 5.99
+
+
+# ---------------------------------------------------------------------------
+# build_scorecard — cost and performance
+# ---------------------------------------------------------------------------
+
+
+class TestBuildScorecardCostPerf:
+    def test_cost_aggregation(self):
+        results = [
+            _result("a", "PASS", 8.0, cost_usd=0.12),
+            _result("b", "PASS", 7.0, cost_usd=0.08),
+        ]
+        sc = build_scorecard("run-cost", results, _MINIMAL_CONFIG)
+        assert sc["cost"]["estimated_total_usd"] == pytest.approx(0.20, abs=1e-4)
+
+    def test_performance_aggregation(self):
+        perf = {
+            "avg_tokens_per_second": 40.0,
+            "avg_time_to_first_token": 1.5,
+            "total_input_tokens": 100,
+            "total_output_tokens": 200,
+            "flags": ["slow"],
+        }
+        results = [_result("a", "PASS", 8.0, performance_summary=perf)]
+        sc = build_scorecard("run-perf", results, _MINIMAL_CONFIG)
+        p = sc["performance"]
+        assert p["avg_tokens_per_second"] == 40.0
+        assert p["avg_time_to_first_token"] == 1.5
+        assert p["total_input_tokens"] == 100
+        assert p["total_output_tokens"] == 200
+        assert "slow" in p["flags"]
+
+    def test_no_performance_data(self):
+        results = [_result("a", "PASS", 8.0)]
+        sc = build_scorecard("run-noperf", results, _MINIMAL_CONFIG)
+        assert sc["performance"]["avg_tokens_per_second"] is None
+        assert sc["performance"]["scenarios_with_data"] == 0
+
+
+# ---------------------------------------------------------------------------
+# build_scorecard — metadata
+# ---------------------------------------------------------------------------
+
+
+class TestBuildScorecardMeta:
+    def test_run_id_and_config_preserved(self):
+        sc = build_scorecard("my-run", [_result("a", "PASS", 8.0)], _MINIMAL_CONFIG)
+        assert sc["run_id"] == "my-run"
+        assert sc["config"] == _MINIMAL_CONFIG
+        assert "timestamp" in sc
+
+    def test_scenarios_list_preserved(self):
+        results = [_result("a", "PASS", 8.0)]
+        sc = build_scorecard("r", results, _MINIMAL_CONFIG)
+        assert sc["scenarios"] is results
+
+
+# ---------------------------------------------------------------------------
+# write_summary_md
+# ---------------------------------------------------------------------------
+
+
+class TestWriteSummaryMd:
+    def test_contains_key_sections(self):
+        results = [
+            _result("a", "PASS", 8.0, category="rag"),
+            _result("b", "FAIL", 3.0, category="rag", root_cause="bad prompt"),
+        ]
+        sc = build_scorecard("run-md", results, _MINIMAL_CONFIG)
+        md = write_summary_md(sc)
+        assert "# GAIA Agent Eval" in md
+        assert "## Summary" in md
+        assert "## By Category" in md
+        assert "## Scenarios" in md
+        assert "bad prompt" in md
+
+    def test_performance_section_when_data_present(self):
+        perf = {
+            "avg_tokens_per_second": 50.0,
+            "avg_time_to_first_token": 0.8,
+            "total_input_tokens": 500,
+            "total_output_tokens": 300,
+            "flags": [],
+        }
+        results = [_result("a", "PASS", 8.0, performance_summary=perf)]
+        sc = build_scorecard("run-mdperf", results, _MINIMAL_CONFIG)
+        md = write_summary_md(sc)
+        assert "## Performance" in md
+        assert "50.0 tok/s" in md
+
+    def test_no_performance_section_when_no_data(self):
+        results = [_result("a", "PASS", 8.0)]
+        sc = build_scorecard("run-nop", results, _MINIMAL_CONFIG)
+        md = write_summary_md(sc)
+        assert "## Performance" not in md
+
+
+# ---------------------------------------------------------------------------
+# write_junit_xml
+# ---------------------------------------------------------------------------
+
+
+class TestWriteJunitXml:
+    def test_valid_xml(self):
+        results = [
+            _result("a", "PASS", 8.0, category="rag"),
+            _result("b", "FAIL", 4.0, category="rag"),
+        ]
+        sc = build_scorecard("run-xml", results, _MINIMAL_CONFIG)
+        xml_str = write_junit_xml(sc)
+        root = ET.fromstring(xml_str)
+        assert root.tag == "testsuites"
+
+    def test_pass_has_no_failure_element(self):
+        results = [_result("a", "PASS", 9.0, category="c1")]
+        sc = build_scorecard("run-xp", results, _MINIMAL_CONFIG)
+        xml_str = write_junit_xml(sc)
+        root = ET.fromstring(xml_str)
+        testcase = root.find(".//testcase[@name='a']")
+        assert testcase is not None
+        assert testcase.find("failure") is None
+        assert testcase.find("error") is None
+
+    def test_fail_has_failure_element(self):
+        results = [_result("b", "FAIL", 3.0, category="c1")]
+        sc = build_scorecard("run-xf", results, _MINIMAL_CONFIG)
+        xml_str = write_junit_xml(sc)
+        root = ET.fromstring(xml_str)
+        testcase = root.find(".//testcase[@name='b']")
+        assert testcase is not None
+        failure = testcase.find("failure")
+        assert failure is not None
+        assert failure.get("type") == "FAIL"
+
+    def test_timeout_has_error_element(self):
+        results = [_result("t", "TIMEOUT", None, category="c1")]
+        sc = build_scorecard("run-xt", results, _MINIMAL_CONFIG)
+        xml_str = write_junit_xml(sc)
+        root = ET.fromstring(xml_str)
+        testcase = root.find(".//testcase[@name='t']")
+        assert testcase is not None
+        assert testcase.find("error") is not None
+
+    def test_skipped_has_skipped_element(self):
+        results = [_result("s", "SKIPPED_NO_DOCUMENT", None, category="c1")]
+        sc = build_scorecard("run-xs", results, _MINIMAL_CONFIG)
+        xml_str = write_junit_xml(sc)
+        root = ET.fromstring(xml_str)
+        testcase = root.find(".//testcase[@name='s']")
+        assert testcase is not None
+        assert testcase.find("skipped") is not None
+
+    def test_category_testsuite(self):
+        results = [
+            _result("a", "PASS", 8.0, category="cat1"),
+            _result("b", "PASS", 7.0, category="cat2"),
+        ]
+        sc = build_scorecard("run-xc", results, _MINIMAL_CONFIG)
+        xml_str = write_junit_xml(sc)
+        root = ET.fromstring(xml_str)
+        suites = root.findall("testsuite")
+        suite_names = {s.get("name") for s in suites}
+        assert "cat1" in suite_names
+        assert "cat2" in suite_names

From 862d75543916a360a089c6bb475c716feeee7882 Mon Sep 17 00:00:00 2001
From: Ovtcharov <kovtchar@amd.com>
Date: Thu, 28 May 2026 19:59:23 -0700
Subject: [PATCH 2/4] test(eval): remove unused imports from eval test files

Drop unused Path (test_audit), patch (test_claude_judge),
textwrap/MagicMock/patch (test_runner), and a dead no-op
fixture + unused helper function in test_claude_judge.
---
 tests/unit/eval/test_audit.py        |  1 -
 tests/unit/eval/test_claude_judge.py | 15 +--------------
 tests/unit/eval/test_runner.py       |  2 --
 3 files changed, 1 insertion(+), 17 deletions(-)

diff --git a/tests/unit/eval/test_audit.py b/tests/unit/eval/test_audit.py
index 3892a50a0..39a576326 100644
--- a/tests/unit/eval/test_audit.py
+++ b/tests/unit/eval/test_audit.py
@@ -8,7 +8,6 @@
 """
 
 import textwrap
-from pathlib import Path
 
 import pytest
 
diff --git a/tests/unit/eval/test_claude_judge.py b/tests/unit/eval/test_claude_judge.py
index d1605d328..675da2147 100644
--- a/tests/unit/eval/test_claude_judge.py
+++ b/tests/unit/eval/test_claude_judge.py
@@ -8,7 +8,7 @@
 """
 
 from types import SimpleNamespace
-from unittest.mock import MagicMock, patch
+from unittest.mock import MagicMock
 
 import pytest
 
@@ -17,13 +17,6 @@
 # ---------------------------------------------------------------------------
 
 
-@pytest.fixture(autouse=True)
-def _patch_imports(monkeypatch):
-    """Ensure anthropic and bs4 are available as mocks at module level."""
-    # We don't want real imports — mock the modules if not installed.
-    pass
-
-
 def _make_mock_anthropic():
     """Build a mock anthropic module with an Anthropic client constructor."""
     mock_module = MagicMock()
@@ -31,12 +24,6 @@ def _make_mock_anthropic():
     return mock_module
 
 
-def _make_mock_bs4():
-    mock_module = MagicMock()
-    mock_module.BeautifulSoup = MagicMock()
-    return mock_module
-
-
 # ---------------------------------------------------------------------------
 # Initialization
 # ---------------------------------------------------------------------------
diff --git a/tests/unit/eval/test_runner.py b/tests/unit/eval/test_runner.py
index 3c4103f21..7bfea933d 100644
--- a/tests/unit/eval/test_runner.py
+++ b/tests/unit/eval/test_runner.py
@@ -17,9 +17,7 @@
 """
 
 import json
-import textwrap
 from pathlib import Path
-from unittest.mock import MagicMock, patch
 
 import pytest
 

From 099cf45156158c03941c54261e1836d5286a442a Mon Sep 17 00:00:00 2001
From: Ovtcharov <kovtchar@amd.com>
Date: Thu, 28 May 2026 23:42:52 -0700
Subject: [PATCH 3/4] test(eval): address PR review feedback

- Fix test_scales_with_turns_and_docs: use base_timeout=100 so the
  scaling formula (820) actually wins the max(), and assert == instead
  of >= to verify the exact computed value.
- Remove dead _write_helpers call in test_extracts_max_constants that
  wrote to the wrong path (returned p was unused).
- Move `import yaml` from inside _write_scenario to module-level.
---
 tests/unit/eval/test_audit.py  | 11 ++---------
 tests/unit/eval/test_runner.py |  9 ++++-----
 2 files changed, 6 insertions(+), 14 deletions(-)

diff --git a/tests/unit/eval/test_audit.py b/tests/unit/eval/test_audit.py
index 39a576326..6cc375ec2 100644
--- a/tests/unit/eval/test_audit.py
+++ b/tests/unit/eval/test_audit.py
@@ -42,16 +42,9 @@ def test_extracts_max_constants(self, tmp_path, monkeypatch):
             _MAX_MSG_CHARS = 2000
             _OTHER = 42
         """
-        p = _write_helpers(tmp_path, src)
         # Monkeypatch GAIA_ROOT so audit_chat_helpers reads our fake file
-        monkeypatch.setattr(
-            "gaia.eval.audit.GAIA_ROOT",
-            # Need a root where <root>/src/gaia/ui/_chat_helpers.py resolves to our file
-            # Easier: just monkeypatch the whole function's file path
-            tmp_path,
-        )
-        # Since audit_chat_helpers hardcodes the path, we need to
-        # create the expected directory structure.
+        monkeypatch.setattr("gaia.eval.audit.GAIA_ROOT", tmp_path)
+        # audit_chat_helpers hardcodes the path — create the expected directory structure.
         target = tmp_path / "src" / "gaia" / "ui"
         target.mkdir(parents=True)
         (target / "_chat_helpers.py").write_text(textwrap.dedent(src), encoding="utf-8")
diff --git a/tests/unit/eval/test_runner.py b/tests/unit/eval/test_runner.py
index 7bfea933d..706693afa 100644
--- a/tests/unit/eval/test_runner.py
+++ b/tests/unit/eval/test_runner.py
@@ -20,6 +20,7 @@
 from pathlib import Path
 
 import pytest
+import yaml
 
 from gaia.eval.runner import (
     _SCORE_WEIGHTS,
@@ -332,9 +333,9 @@ def test_scales_with_turns_and_docs(self):
             "turns": [{"turn": 1}, {"turn": 2}],
             "setup": {"index_documents": [{"path": "a.pdf"}, {"path": "b.pdf"}]},
         }
-        result = _compute_effective_timeout(900, scenario)
-        expected = 240 + 2 * 90 + 2 * 200  # startup + docs + turns
-        assert result >= expected
+        expected = 240 + 2 * 90 + 2 * 200  # startup + docs + turns = 820
+        result = _compute_effective_timeout(100, scenario)
+        assert result == expected
 
     def test_capped_at_max(self):
         scenario = {
@@ -352,8 +353,6 @@ def test_capped_at_max(self):
 
 class TestFindScenarios:
     def _write_scenario(self, d, sid, category="general", tags=None):
-        import yaml
-
         data = {
             "id": sid,
             "category": category,

From cbc5d75c4286028f472fb8cf43bdaf6baafd66f5 Mon Sep 17 00:00:00 2001
From: Ovtcharov <kovtchar@amd.com>
Date: Fri, 29 May 2026 08:49:00 -0700
Subject: [PATCH 4/4] fix(types): resolve additional mypy errors in factory,
 providers, governance

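Suppress no-any-return in factory.py (dynamic provider instantiation)
and union-attr on the non-streaming return of chat() in the
openai/claude providers, join a streamed chat() result into a str in
the claude provider's vision(), and cast status to CheckpointStatus in
checkpoint_bridge.py.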
---
 src/gaia/governance/checkpoint_bridge.py  | 4 +++-
 src/gaia/llm/factory.py                   | 2 +-
 src/gaia/llm/providers/claude.py          | 5 +++--
 src/gaia/llm/providers/openai_provider.py | 2 +-
 4 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/gaia/governance/checkpoint_bridge.py b/src/gaia/governance/checkpoint_bridge.py
index 9c7b5fd4b..9ae1a1b86 100644
--- a/src/gaia/governance/checkpoint_bridge.py
+++ b/src/gaia/governance/checkpoint_bridge.py
@@ -11,10 +11,12 @@
 
 from threading import Lock
+from typing import cast
 
 from .exceptions import CheckpointNotFoundError, InvalidResolutionError
 from .schemas import (
     CheckpointRecord,
     CheckpointResolution,
+    CheckpointStatus,
     GovernanceDecision,
     TransitionOutcome,
     WorkflowTransition,
@@ -90,7 +92,7 @@ def resolve_checkpoint(
                 checkpoint_id=current.checkpoint_id,
                 workflow_id=current.workflow_id,
                 transition_id=current.transition_id,
-                status=status,
+                status=cast(CheckpointStatus, status),
                 created_at=current.created_at,
                 decision_context={
                     **current.decision_context,
diff --git a/src/gaia/llm/factory.py b/src/gaia/llm/factory.py
index ede92b023..8757955fe 100644
--- a/src/gaia/llm/factory.py
+++ b/src/gaia/llm/factory.py
@@ -67,4 +67,4 @@ def create_client(
     module = importlib.import_module(module_path)
     provider_class = getattr(module, class_name)
 
-    return provider_class(**kwargs)
+    return provider_class(**kwargs)  # type: ignore[no-any-return]
diff --git a/src/gaia/llm/providers/claude.py b/src/gaia/llm/providers/claude.py
index 789feff45..56d35c02c 100644
--- a/src/gaia/llm/providers/claude.py
+++ b/src/gaia/llm/providers/claude.py
@@ -74,7 +74,7 @@ def chat(
         response = self._client.messages.create(**params)
         if stream:
             return self._handle_stream(response)
-        return response.content[0].text
+        return response.content[0].text  # type: ignore[union-attr]
 
     # embed() inherited from ABC - raises NotSupportedError
 
@@ -99,7 +99,8 @@ def vision(self, images: list[bytes], prompt: str, **kwargs) -> str:
                 ],
             }
         ]
-        return self.chat(messages, **kwargs)
+        result = self.chat(messages, **kwargs)
+        return result if isinstance(result, str) else "".join(result)
 
     # get_performance_stats() inherited from ABC - raises NotSupportedError
     # load_model() inherited from ABC - raises NotSupportedError
diff --git a/src/gaia/llm/providers/openai_provider.py b/src/gaia/llm/providers/openai_provider.py
index ab204153a..4142b0cb7 100644
--- a/src/gaia/llm/providers/openai_provider.py
+++ b/src/gaia/llm/providers/openai_provider.py
@@ -60,7 +60,7 @@ def chat(
         )
         if stream:
             return self._handle_stream(response)
-        return response.choices[0].message.content
+        return response.choices[0].message.content  # type: ignore[union-attr]
 
     def embed(
         self, texts: list[str], model: str = "text-embedding-3-small", **kwargs