From 869002cfc65a24864c2f5ead0ec4a82340e2b117 Mon Sep 17 00:00:00 2001
From: Ovtcharov
Date: Thu, 28 May 2026 19:51:45 -0700
Subject: [PATCH 1/4] test(eval): cover scorecard, audit, runner, and claude judge (#1151)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The eval toolchain at src/gaia/eval/ had only 3 test files covering
analyze_failures, iterations, and MCP reliability — the four core
modules (runner, scorecard, audit, claude judge) had zero unit tests.

Adds 91 tests across 4 new files:
- test_scorecard.py — build_scorecard aggregation, write_summary_md,
  write_junit_xml, status counting, score capping, performance rollup
- test_audit.py — AST constant extraction, agent persistence detection,
  tool-results-in-history pattern matching, run_audit recommendations
- test_runner.py — validate_scenario schema checks, recompute_turn_score
  weighting, _aggregate_performance, _compute_effective_timeout,
  find_scenarios filtering, build_scenario_prompt assembly,
  compare_scorecards regression detection, AgentEvalRunner init
- test_claude_judge.py — ClaudeClient init validation, cost calculation,
  get_completion, get_completion_with_usage, count_tokens (all mocked)
---
 tests/unit/eval/test_audit.py        | 177 ++++++++
 tests/unit/eval/test_claude_judge.py | 215 +++++++++
 tests/unit/eval/test_runner.py       | 640 +++++++++++++++++++++++++++
 tests/unit/eval/test_scorecard.py    | 325 ++++++++++++++
 4 files changed, 1357 insertions(+)
 create mode 100644 tests/unit/eval/test_audit.py
 create mode 100644 tests/unit/eval/test_claude_judge.py
 create mode 100644 tests/unit/eval/test_runner.py
 create mode 100644 tests/unit/eval/test_scorecard.py

diff --git a/tests/unit/eval/test_audit.py b/tests/unit/eval/test_audit.py
new file mode 100644
index 000000000..3892a50a0
--- /dev/null
+++ b/tests/unit/eval/test_audit.py
@@ -0,0 +1,177 @@
+# Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: MIT +"""Unit tests for ``gaia.eval.audit``. + +Tests the deterministic architecture audit helpers that inspect +``_chat_helpers.py`` for constants and patterns, plus the full +``run_audit()`` rollup. +""" + +import textwrap +from pathlib import Path + +import pytest + +from gaia.eval.audit import ( + audit_agent_persistence, + audit_chat_helpers, + audit_tool_results_in_history, + run_audit, +) + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _write_helpers(tmp_path, source): + """Write a fake _chat_helpers.py into tmp_path and return its Path.""" + p = tmp_path / "_chat_helpers.py" + p.write_text(textwrap.dedent(source), encoding="utf-8") + return p + + +# --------------------------------------------------------------------------- +# audit_chat_helpers +# --------------------------------------------------------------------------- + + +class TestAuditChatHelpers: + def test_extracts_max_constants(self, tmp_path, monkeypatch): + src = """\ + _MAX_HISTORY_PAIRS = 10 + _MAX_MSG_CHARS = 2000 + _OTHER = 42 + """ + p = _write_helpers(tmp_path, src) + # Monkeypatch GAIA_ROOT so audit_chat_helpers reads our fake file + monkeypatch.setattr( + "gaia.eval.audit.GAIA_ROOT", + # Need a root where /src/gaia/ui/_chat_helpers.py resolves to our file + # Easier: just monkeypatch the whole function's file path + tmp_path, + ) + # Since audit_chat_helpers hardcodes the path, we need to + # create the expected directory structure. 
+ target = tmp_path / "src" / "gaia" / "ui" + target.mkdir(parents=True) + (target / "_chat_helpers.py").write_text(textwrap.dedent(src), encoding="utf-8") + result = audit_chat_helpers() + assert result["_MAX_HISTORY_PAIRS"] == 10 + assert result["_MAX_MSG_CHARS"] == 2000 + assert "_OTHER" not in result + + def test_returns_empty_on_missing_file(self, tmp_path, monkeypatch): + monkeypatch.setattr("gaia.eval.audit.GAIA_ROOT", tmp_path) + result = audit_chat_helpers() + assert result == {} + + +# --------------------------------------------------------------------------- +# audit_agent_persistence +# --------------------------------------------------------------------------- + + +class TestAuditAgentPersistence: + def test_stateless_per_message(self, tmp_path): + p = _write_helpers(tmp_path, 'agent = ChatAgent(model="test")\n') + assert audit_agent_persistence(p) == "stateless_per_message" + + def test_unknown_when_no_chatagent(self, tmp_path): + p = _write_helpers(tmp_path, "x = 1\n") + assert audit_agent_persistence(p) == "unknown" + + def test_unknown_on_missing_file(self, tmp_path): + assert audit_agent_persistence(tmp_path / "nonexistent.py") == "unknown" + + +# --------------------------------------------------------------------------- +# audit_tool_results_in_history +# --------------------------------------------------------------------------- + + +class TestAuditToolResultsInHistory: + def test_detects_pattern(self, tmp_path): + src = """\ + agent_steps = agent.run() + messages.append({"role": "tool", "content": agent_steps}) + """ + p = _write_helpers(tmp_path, src) + assert audit_tool_results_in_history(p) is True + + def test_false_when_missing_agent_steps(self, tmp_path): + src = 'messages.append({"role": "user"})\n' + p = _write_helpers(tmp_path, src) + assert audit_tool_results_in_history(p) is False + + def test_false_on_missing_file(self, tmp_path): + assert audit_tool_results_in_history(tmp_path / "nope.py") is False + + +# 
--------------------------------------------------------------------------- +# run_audit +# --------------------------------------------------------------------------- + + +class TestRunAudit: + def test_returns_audit_key(self, tmp_path, monkeypatch): + # Create a minimal _chat_helpers.py that satisfies all checks + target = tmp_path / "src" / "gaia" / "ui" + target.mkdir(parents=True) + src = textwrap.dedent("""\ + _MAX_HISTORY_PAIRS = 3 + _MAX_MSG_CHARS = 500 + agent = ChatAgent(model="test") + agent_steps = agent.run() + messages.append({"role": "tool", "content": agent_steps}) + """) + (target / "_chat_helpers.py").write_text(src, encoding="utf-8") + monkeypatch.setattr("gaia.eval.audit.GAIA_ROOT", tmp_path) + + result = run_audit() + assert "architecture_audit" in result + audit = result["architecture_audit"] + assert audit["history_pairs"] == 3 + assert audit["max_msg_chars"] == 500 + assert audit["tool_results_in_history"] is True + assert audit["agent_persistence"] == "stateless_per_message" + + def test_recommendations_on_low_history(self, tmp_path, monkeypatch): + target = tmp_path / "src" / "gaia" / "ui" + target.mkdir(parents=True) + src = "_MAX_HISTORY_PAIRS = 2\n" + (target / "_chat_helpers.py").write_text(src, encoding="utf-8") + monkeypatch.setattr("gaia.eval.audit.GAIA_ROOT", tmp_path) + + result = run_audit() + recs = result["architecture_audit"]["recommendations"] + rec_ids = [r["id"] for r in recs] + assert "increase_history_pairs" in rec_ids + + def test_blocked_scenarios_on_low_msg_chars(self, tmp_path, monkeypatch): + target = tmp_path / "src" / "gaia" / "ui" + target.mkdir(parents=True) + src = "_MAX_MSG_CHARS = 500\n" + (target / "_chat_helpers.py").write_text(src, encoding="utf-8") + monkeypatch.setattr("gaia.eval.audit.GAIA_ROOT", tmp_path) + + result = run_audit() + blocked = result["architecture_audit"]["blocked_scenarios"] + assert any(b["scenario"] == "cross_turn_file_recall" for b in blocked) + + def 
test_no_recommendations_when_values_sufficient(self, tmp_path, monkeypatch): + target = tmp_path / "src" / "gaia" / "ui" + target.mkdir(parents=True) + src = textwrap.dedent("""\ + _MAX_HISTORY_PAIRS = 20 + _MAX_MSG_CHARS = 5000 + agent = ChatAgent(model="test") + agent_steps = agent.run() + messages.append({"role": "tool", "content": agent_steps}) + """) + (target / "_chat_helpers.py").write_text(src, encoding="utf-8") + monkeypatch.setattr("gaia.eval.audit.GAIA_ROOT", tmp_path) + + result = run_audit() + assert result["architecture_audit"]["recommendations"] == [] + assert result["architecture_audit"]["blocked_scenarios"] == [] diff --git a/tests/unit/eval/test_claude_judge.py b/tests/unit/eval/test_claude_judge.py new file mode 100644 index 000000000..d1605d328 --- /dev/null +++ b/tests/unit/eval/test_claude_judge.py @@ -0,0 +1,215 @@ +# Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved. +# SPDX-License-Identifier: MIT +"""Unit tests for ``gaia.eval.claude`` (the Claude judge client). + +All Anthropic API calls are mocked — no real API key or network needed. +Tests cover: init validation, cost calculation, get_completion, +get_completion_with_usage, and count_tokens. +""" + +from types import SimpleNamespace +from unittest.mock import MagicMock, patch + +import pytest + +# --------------------------------------------------------------------------- +# Shared mocks — we need anthropic + bs4 importable before importing ClaudeClient +# --------------------------------------------------------------------------- + + +@pytest.fixture(autouse=True) +def _patch_imports(monkeypatch): + """Ensure anthropic and bs4 are available as mocks at module level.""" + # We don't want real imports — mock the modules if not installed. 
+ pass + + +def _make_mock_anthropic(): + """Build a mock anthropic module with an Anthropic client constructor.""" + mock_module = MagicMock() + mock_module.Anthropic = MagicMock() + return mock_module + + +def _make_mock_bs4(): + mock_module = MagicMock() + mock_module.BeautifulSoup = MagicMock() + return mock_module + + +# --------------------------------------------------------------------------- +# Initialization +# --------------------------------------------------------------------------- + + +class TestClaudeClientInit: + def test_raises_on_missing_api_key(self, monkeypatch): + monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False) + monkeypatch.setattr("gaia.eval.claude.anthropic", _make_mock_anthropic()) + monkeypatch.setattr("gaia.eval.claude.BeautifulSoup", MagicMock()) + + from gaia.eval.claude import ClaudeClient + + with pytest.raises(ValueError, match="ANTHROPIC_API_KEY"): + ClaudeClient() + + def test_raises_on_missing_anthropic(self, monkeypatch): + monkeypatch.setattr("gaia.eval.claude.anthropic", None) + monkeypatch.setattr("gaia.eval.claude.BeautifulSoup", MagicMock()) + + from gaia.eval.claude import ClaudeClient + + with pytest.raises(ImportError, match="anthropic"): + ClaudeClient() + + def test_raises_on_missing_bs4(self, monkeypatch): + monkeypatch.setattr("gaia.eval.claude.anthropic", _make_mock_anthropic()) + monkeypatch.setattr("gaia.eval.claude.BeautifulSoup", None) + monkeypatch.setenv("ANTHROPIC_API_KEY", "test-key-123") + + from gaia.eval.claude import ClaudeClient + + with pytest.raises(ImportError, match="bs4"): + ClaudeClient() + + def test_success_with_valid_env(self, monkeypatch): + monkeypatch.setenv("ANTHROPIC_API_KEY", "test-key-123") + mock_anthropic = _make_mock_anthropic() + monkeypatch.setattr("gaia.eval.claude.anthropic", mock_anthropic) + monkeypatch.setattr("gaia.eval.claude.BeautifulSoup", MagicMock()) + + from gaia.eval.claude import ClaudeClient + + client = ClaudeClient(model="claude-sonnet-4-6") + assert 
client.model == "claude-sonnet-4-6" + assert client.api_key == "test-key-123" + mock_anthropic.Anthropic.assert_called_once() + + +# --------------------------------------------------------------------------- +# Cost calculation +# --------------------------------------------------------------------------- + + +class TestCalculateCost: + @pytest.fixture() + def client(self, monkeypatch): + monkeypatch.setenv("ANTHROPIC_API_KEY", "test-key") + monkeypatch.setattr("gaia.eval.claude.anthropic", _make_mock_anthropic()) + monkeypatch.setattr("gaia.eval.claude.BeautifulSoup", MagicMock()) + from gaia.eval.claude import ClaudeClient + + return ClaudeClient(model="claude-sonnet-4-6") + + def test_known_model_pricing(self, client): + cost = client.calculate_cost(1_000_000, 1_000_000) + # claude-sonnet-4-6: $3/MTok input, $15/MTok output + assert cost["input_cost"] == pytest.approx(3.0, abs=1e-4) + assert cost["output_cost"] == pytest.approx(15.0, abs=1e-4) + assert cost["total_cost"] == pytest.approx(18.0, abs=1e-4) + + def test_small_token_count(self, client): + cost = client.calculate_cost(100, 50) + assert cost["input_cost"] == pytest.approx(100 / 1_000_000 * 3.0, abs=1e-6) + assert cost["output_cost"] == pytest.approx(50 / 1_000_000 * 15.0, abs=1e-6) + + def test_unknown_model_uses_default(self, monkeypatch): + monkeypatch.setenv("ANTHROPIC_API_KEY", "test-key") + monkeypatch.setattr("gaia.eval.claude.anthropic", _make_mock_anthropic()) + monkeypatch.setattr("gaia.eval.claude.BeautifulSoup", MagicMock()) + from gaia.eval.claude import ClaudeClient + + c = ClaudeClient(model="claude-future-9000") + cost = c.calculate_cost(1_000_000, 1_000_000) + # Default pricing matches Sonnet: $3/$15 + assert cost["total_cost"] == pytest.approx(18.0, abs=1e-4) + + +# --------------------------------------------------------------------------- +# get_completion +# --------------------------------------------------------------------------- + + +class TestGetCompletion: + @pytest.fixture() 
+ def client(self, monkeypatch): + monkeypatch.setenv("ANTHROPIC_API_KEY", "test-key") + mock_anthropic = _make_mock_anthropic() + monkeypatch.setattr("gaia.eval.claude.anthropic", mock_anthropic) + monkeypatch.setattr("gaia.eval.claude.BeautifulSoup", MagicMock()) + from gaia.eval.claude import ClaudeClient + + c = ClaudeClient(model="claude-sonnet-4-6") + return c + + def test_returns_content(self, client): + mock_content = [SimpleNamespace(text="Hello world")] + client.client.messages.create.return_value = SimpleNamespace( + content=mock_content + ) + result = client.get_completion("test prompt") + assert result == mock_content + client.client.messages.create.assert_called_once() + + def test_propagates_api_error(self, client): + client.client.messages.create.side_effect = RuntimeError("API down") + with pytest.raises(RuntimeError, match="API down"): + client.get_completion("test") + + +# --------------------------------------------------------------------------- +# get_completion_with_usage +# --------------------------------------------------------------------------- + + +class TestGetCompletionWithUsage: + @pytest.fixture() + def client(self, monkeypatch): + monkeypatch.setenv("ANTHROPIC_API_KEY", "test-key") + monkeypatch.setattr("gaia.eval.claude.anthropic", _make_mock_anthropic()) + monkeypatch.setattr("gaia.eval.claude.BeautifulSoup", MagicMock()) + from gaia.eval.claude import ClaudeClient + + return ClaudeClient(model="claude-sonnet-4-6") + + def test_returns_usage_and_cost(self, client): + mock_msg = SimpleNamespace( + content=[SimpleNamespace(text="response")], + usage=SimpleNamespace(input_tokens=500, output_tokens=200), + ) + client.client.messages.create.return_value = mock_msg + + result = client.get_completion_with_usage("prompt") + assert result["content"] == mock_msg.content + assert result["usage"]["input_tokens"] == 500 + assert result["usage"]["output_tokens"] == 200 + assert result["usage"]["total_tokens"] == 700 + assert "cost" in result + 
assert result["cost"]["total_cost"] > 0 + + +# --------------------------------------------------------------------------- +# count_tokens +# --------------------------------------------------------------------------- + + +class TestCountTokens: + @pytest.fixture() + def client(self, monkeypatch): + monkeypatch.setenv("ANTHROPIC_API_KEY", "test-key") + monkeypatch.setattr("gaia.eval.claude.anthropic", _make_mock_anthropic()) + monkeypatch.setattr("gaia.eval.claude.BeautifulSoup", MagicMock()) + from gaia.eval.claude import ClaudeClient + + return ClaudeClient(model="claude-sonnet-4-6") + + def test_delegates_to_sdk(self, client): + client.client.messages.count_tokens.return_value = SimpleNamespace( + input_tokens=42 + ) + result = client.count_tokens("test prompt") + assert result.input_tokens == 42 + client.client.messages.count_tokens.assert_called_once_with( + model="claude-sonnet-4-6", + messages=[{"role": "user", "content": "test prompt"}], + ) diff --git a/tests/unit/eval/test_runner.py b/tests/unit/eval/test_runner.py new file mode 100644 index 000000000..3c4103f21 --- /dev/null +++ b/tests/unit/eval/test_runner.py @@ -0,0 +1,640 @@ +# Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved. +# SPDX-License-Identifier: MIT +"""Unit tests for ``gaia.eval.runner``. + +Tests cover: + - validate_scenario (schema validation) + - recompute_turn_score (weighted scoring) + - _validate_turn_scores (dimension completeness) + - _aggregate_performance (per-turn → scenario rollup) + - _compute_effective_timeout (per-scenario timeout scaling) + - find_scenarios (filtering by id/category/tags, with mocked YAML) + - build_scenario_prompt (prompt assembly) + - compare_scorecards (regression detection) + - AgentEvalRunner.__init__ (configuration) + +All file/network/subprocess calls are mocked — no real LLM or Agent UI needed. 
+""" + +import json +import textwrap +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from gaia.eval.runner import ( + _SCORE_WEIGHTS, + _aggregate_performance, + _compute_effective_timeout, + _validate_turn_scores, + compare_scorecards, + recompute_turn_score, + validate_scenario, +) + +# --------------------------------------------------------------------------- +# validate_scenario +# --------------------------------------------------------------------------- + + +class TestValidateScenario: + def _valid_scenario(self, **overrides): + base = { + "id": "test_scenario", + "category": "general", + "persona": "casual_user", + "setup": {"index_documents": []}, + "turns": [ + { + "turn": 1, + "objective": "Ask about X", + "ground_truth": {"answer": "Y"}, + } + ], + } + base.update(overrides) + return base + + def test_valid_scenario_passes(self, tmp_path): + data = self._valid_scenario() + validate_scenario(tmp_path / "test.yaml", data) # should not raise + + def test_missing_required_field(self, tmp_path): + data = self._valid_scenario() + del data["id"] + with pytest.raises(ValueError, match="missing top-level field 'id'"): + validate_scenario(tmp_path / "test.yaml", data) + + def test_missing_setup_index_documents(self, tmp_path): + data = self._valid_scenario(setup={}) + with pytest.raises(ValueError, match="setup.index_documents is missing"): + validate_scenario(tmp_path / "test.yaml", data) + + def test_empty_turns(self, tmp_path): + data = self._valid_scenario(turns=[]) + with pytest.raises(ValueError, match="turns list is empty"): + validate_scenario(tmp_path / "test.yaml", data) + + def test_duplicate_turn_numbers(self, tmp_path): + data = self._valid_scenario( + turns=[ + { + "turn": 1, + "objective": "X", + "ground_truth": {"answer": "A"}, + }, + { + "turn": 1, + "objective": "Y", + "ground_truth": {"answer": "B"}, + }, + ] + ) + with pytest.raises(ValueError, match="duplicate turn number"): + 
validate_scenario(tmp_path / "test.yaml", data) + + def test_non_sequential_turns(self, tmp_path): + data = self._valid_scenario( + turns=[ + { + "turn": 1, + "objective": "X", + "ground_truth": {"answer": "A"}, + }, + { + "turn": 3, + "objective": "Y", + "ground_truth": {"answer": "B"}, + }, + ] + ) + with pytest.raises(ValueError, match="sequential"): + validate_scenario(tmp_path / "test.yaml", data) + + def test_turn_without_objective(self, tmp_path): + data = self._valid_scenario( + turns=[{"turn": 1, "ground_truth": {"answer": "A"}}] + ) + with pytest.raises(ValueError, match="missing 'objective'"): + validate_scenario(tmp_path / "test.yaml", data) + + def test_turn_without_ground_truth_or_criteria(self, tmp_path): + data = self._valid_scenario(turns=[{"turn": 1, "objective": "X"}]) + with pytest.raises(ValueError, match="ground_truth.*success_criteria"): + validate_scenario(tmp_path / "test.yaml", data) + + def test_success_criteria_as_string_is_valid(self, tmp_path): + data = self._valid_scenario( + turns=[ + {"turn": 1, "objective": "X", "success_criteria": "Agent says hello"} + ] + ) + validate_scenario(tmp_path / "test.yaml", data) # should not raise + + def test_success_criteria_as_dict_rejected(self, tmp_path): + data = self._valid_scenario( + turns=[ + { + "turn": 1, + "objective": "X", + "success_criteria": {"key": "val"}, + } + ] + ) + with pytest.raises(ValueError, match="success_criteria must be a string"): + validate_scenario(tmp_path / "test.yaml", data) + + def test_persona_non_string_rejected(self, tmp_path): + data = self._valid_scenario(persona=42) + with pytest.raises(ValueError, match="persona must be a string"): + validate_scenario(tmp_path / "test.yaml", data) + + def test_persona_empty_string_rejected(self, tmp_path): + data = self._valid_scenario(persona=" ") + with pytest.raises(ValueError, match="persona must be a non-empty string"): + validate_scenario(tmp_path / "test.yaml", data) + + def test_custom_persona_accepted(self, 
tmp_path): + data = self._valid_scenario(persona="my_custom_persona") + validate_scenario(tmp_path / "test.yaml", data) # should not raise + + def test_missing_path_in_index_documents(self, tmp_path): + data = self._valid_scenario(setup={"index_documents": [{"title": "doc1"}]}) + with pytest.raises(ValueError, match="missing 'path' field"): + validate_scenario(tmp_path / "test.yaml", data) + + +# --------------------------------------------------------------------------- +# recompute_turn_score +# --------------------------------------------------------------------------- + + +class TestRecomputeTurnScore: + def _full_scores(self, **overrides): + scores = {k: 8.0 for k in _SCORE_WEIGHTS} + scores.update(overrides) + return scores + + def test_uniform_scores(self): + scores = {k: 8.0 for k in _SCORE_WEIGHTS} + assert recompute_turn_score(scores) == pytest.approx(8.0) + + def test_missing_dimension_returns_minus_one(self): + scores = {k: 8.0 for k in _SCORE_WEIGHTS} + del scores["correctness"] + assert recompute_turn_score(scores) == -1.0 + + def test_non_numeric_dimension_returns_minus_one(self): + scores = {k: 8.0 for k in _SCORE_WEIGHTS} + scores["correctness"] = "high" + assert recompute_turn_score(scores) == -1.0 + + def test_clamps_to_range(self): + scores = self._full_scores(correctness=15.0, personality=-5.0) + result = recompute_turn_score(scores) + # correctness clamped to 10, personality to 0 + expected = ( + 10.0 * _SCORE_WEIGHTS["correctness"] + + 0.0 * _SCORE_WEIGHTS["personality"] + + sum( + 8.0 * w + for k, w in _SCORE_WEIGHTS.items() + if k not in ("correctness", "personality") + ) + ) + assert result == pytest.approx(expected) + + def test_weighted_correctly(self): + scores = {k: 0.0 for k in _SCORE_WEIGHTS} + scores["correctness"] = 10.0 + result = recompute_turn_score(scores) + assert result == pytest.approx(10.0 * _SCORE_WEIGHTS["correctness"]) + + +# --------------------------------------------------------------------------- +# 
_validate_turn_scores +# --------------------------------------------------------------------------- + + +class TestValidateTurnScores: + def test_no_warnings_when_all_complete(self): + result = { + "turns": [ + { + "turn": 1, + "scores": {k: 8.0 for k in _SCORE_WEIGHTS}, + "overall_score": 8.0, + } + ] + } + assert _validate_turn_scores(result) == [] + + def test_warns_on_missing_dimensions(self): + result = { + "turns": [ + { + "turn": 1, + "scores": {"correctness": 8.0}, # missing other dimensions + "overall_score": 8.0, + } + ] + } + warnings = _validate_turn_scores(result) + assert len(warnings) == 1 + assert "Turn 1" in warnings[0] + + def test_no_warning_when_no_overall_score(self): + result = {"turns": [{"turn": 1, "scores": {}, "overall_score": None}]} + assert _validate_turn_scores(result) == [] + + +# --------------------------------------------------------------------------- +# _aggregate_performance +# --------------------------------------------------------------------------- + + +class TestAggregatePerformance: + def test_aggregates_from_turns(self): + result = { + "turns": [ + { + "performance": { + "tokens_per_second": 40.0, + "time_to_first_token": 1.0, + "input_tokens": 100, + "output_tokens": 200, + "flags": ["slow"], + } + }, + { + "performance": { + "tokens_per_second": 60.0, + "time_to_first_token": 0.5, + "input_tokens": 150, + "output_tokens": 250, + "flags": ["ok"], + } + }, + ] + } + _aggregate_performance(result, "test-scenario") + ps = result["performance_summary"] + assert ps["avg_tokens_per_second"] == pytest.approx(50.0, abs=0.1) + assert ps["avg_time_to_first_token"] == pytest.approx(0.75, abs=0.001) + assert ps["total_input_tokens"] == 250 + assert ps["total_output_tokens"] == 450 + assert "slow" in ps["flags"] + assert "ok" in ps["flags"] + + def test_none_when_no_perf_data(self): + result = {"turns": [{"performance": None}]} + _aggregate_performance(result, "s") + assert result["performance_summary"] is None + + def 
test_handles_missing_performance_key(self): + result = {"turns": [{"turn": 1}]} + _aggregate_performance(result, "s") + assert result["performance_summary"] is None + + def test_skips_invalid_values(self): + result = { + "turns": [ + { + "performance": { + "tokens_per_second": -1, # invalid + "time_to_first_token": 0, # invalid + "input_tokens": "not_a_number", + "output_tokens": 100, + } + } + ] + } + _aggregate_performance(result, "s") + ps = result["performance_summary"] + assert ps["avg_tokens_per_second"] is None + assert ps["avg_time_to_first_token"] is None + assert ps["total_output_tokens"] == 100 + + +# --------------------------------------------------------------------------- +# _compute_effective_timeout +# --------------------------------------------------------------------------- + + +class TestComputeEffectiveTimeout: + def test_base_timeout_when_no_turns_or_docs(self): + result = _compute_effective_timeout( + 900, {"turns": [], "setup": {"index_documents": []}} + ) + assert result >= 240 # at least startup overhead + + def test_scales_with_turns_and_docs(self): + scenario = { + "turns": [{"turn": 1}, {"turn": 2}], + "setup": {"index_documents": [{"path": "a.pdf"}, {"path": "b.pdf"}]}, + } + result = _compute_effective_timeout(900, scenario) + expected = 240 + 2 * 90 + 2 * 200 # startup + docs + turns + assert result >= expected + + def test_capped_at_max(self): + scenario = { + "turns": [{"turn": i} for i in range(100)], + "setup": {"index_documents": [{"path": f"{i}.pdf"} for i in range(100)]}, + } + result = _compute_effective_timeout(900, scenario) + assert result <= 7200 + + +# --------------------------------------------------------------------------- +# find_scenarios (with mocked filesystem) +# --------------------------------------------------------------------------- + + +class TestFindScenarios: + def _write_scenario(self, d, sid, category="general", tags=None): + import yaml + + data = { + "id": sid, + "category": category, + "persona": 
"casual_user", + "setup": {"index_documents": []}, + "turns": [{"turn": 1, "objective": "X", "ground_truth": {"answer": "A"}}], + } + if tags: + data["tags"] = tags + path = d / f"{sid}.yaml" + path.write_text(yaml.dump(data), encoding="utf-8") + return path + + def test_finds_by_category(self, tmp_path, monkeypatch): + monkeypatch.setattr("gaia.eval.runner.SCENARIOS_DIR", tmp_path) + monkeypatch.setattr( + "gaia.eval.runner.USER_SCENARIOS_DIR", tmp_path / "no-exist" + ) + self._write_scenario(tmp_path, "s1", category="rag") + self._write_scenario(tmp_path, "s2", category="tool") + + from gaia.eval.runner import find_scenarios + + results = find_scenarios(category="rag") + assert len(results) == 1 + assert results[0][1]["id"] == "s1" + + def test_finds_by_id(self, tmp_path, monkeypatch): + monkeypatch.setattr("gaia.eval.runner.SCENARIOS_DIR", tmp_path) + monkeypatch.setattr( + "gaia.eval.runner.USER_SCENARIOS_DIR", tmp_path / "no-exist" + ) + self._write_scenario(tmp_path, "alpha") + self._write_scenario(tmp_path, "beta") + + from gaia.eval.runner import find_scenarios + + results = find_scenarios(scenario_id="beta") + assert len(results) == 1 + assert results[0][1]["id"] == "beta" + + def test_filters_by_tags(self, tmp_path, monkeypatch): + monkeypatch.setattr("gaia.eval.runner.SCENARIOS_DIR", tmp_path) + monkeypatch.setattr( + "gaia.eval.runner.USER_SCENARIOS_DIR", tmp_path / "no-exist" + ) + self._write_scenario(tmp_path, "s1", tags=["v1", "regression"]) + self._write_scenario(tmp_path, "s2", tags=["v2"]) + + from gaia.eval.runner import find_scenarios + + results = find_scenarios(tags=["regression"]) + assert len(results) == 1 + assert results[0][1]["id"] == "s1" + + def test_extra_dirs_override(self, tmp_path, monkeypatch): + builtin = tmp_path / "builtin" + builtin.mkdir() + extra = tmp_path / "extra" + extra.mkdir() + monkeypatch.setattr("gaia.eval.runner.SCENARIOS_DIR", builtin) + monkeypatch.setattr( + "gaia.eval.runner.USER_SCENARIOS_DIR", tmp_path / 
"no-exist" + ) + self._write_scenario(builtin, "overlap", category="old") + self._write_scenario(extra, "overlap", category="new") + + from gaia.eval.runner import find_scenarios + + results = find_scenarios(extra_dirs=[str(extra)]) + assert len(results) == 1 + assert results[0][1]["category"] == "new" + + +# --------------------------------------------------------------------------- +# build_scenario_prompt +# --------------------------------------------------------------------------- + + +class TestBuildScenarioPrompt: + def test_includes_scenario_yaml(self, monkeypatch): + # Mock the prompt-file loaders + monkeypatch.setattr("gaia.eval.runner._load_simulator_content", lambda: "SIM") + monkeypatch.setattr("gaia.eval.runner._load_judge_turn_content", lambda: "TURN") + monkeypatch.setattr( + "gaia.eval.runner._load_judge_scenario_content", lambda: "SCENARIO" + ) + from gaia.eval.runner import build_scenario_prompt + + scenario = {"id": "test_s", "category": "rag", "turns": []} + prompt = build_scenario_prompt( + scenario, {"documents": []}, "http://localhost:4200" + ) + assert "test_s" in prompt + assert "SIM" in prompt + assert "TURN" in prompt + assert "SCENARIO" in prompt + assert "http://localhost:4200" in prompt + + def test_agent_type_injected(self, monkeypatch): + monkeypatch.setattr("gaia.eval.runner._load_simulator_content", lambda: "") + monkeypatch.setattr("gaia.eval.runner._load_judge_turn_content", lambda: "") + monkeypatch.setattr("gaia.eval.runner._load_judge_scenario_content", lambda: "") + from gaia.eval.runner import build_scenario_prompt + + prompt = build_scenario_prompt( + {"id": "s", "turns": []}, + {}, + "http://localhost:4200", + agent_type="gaia-lite", + ) + assert 'agent_type="gaia-lite"' in prompt + + +# --------------------------------------------------------------------------- +# compare_scorecards +# --------------------------------------------------------------------------- + + +class TestCompareScorecards: + def 
_write_scorecard(self, path, scenarios, summary_overrides=None): + summary = { + "total_scenarios": len(scenarios), + "passed": sum(1 for s in scenarios if s["status"] == "PASS"), + "failed": sum(1 for s in scenarios if s["status"] == "FAIL"), + "pass_rate": 0.0, + "judged_pass_rate": 0.0, + "avg_score": 0.0, + } + total = summary["total_scenarios"] + if total: + summary["pass_rate"] = summary["passed"] / total + if summary_overrides: + summary.update(summary_overrides) + data = {"summary": summary, "scenarios": scenarios} + Path(path).write_text(json.dumps(data), encoding="utf-8") + + def test_detects_regression(self, tmp_path): + base = tmp_path / "base.json" + curr = tmp_path / "curr.json" + self._write_scorecard( + base, + [{"scenario_id": "s1", "status": "PASS", "overall_score": 8.0}], + ) + self._write_scorecard( + curr, + [{"scenario_id": "s1", "status": "FAIL", "overall_score": 3.0}], + ) + result = compare_scorecards(base, curr) + assert len(result["regressed"]) == 1 + assert result["regressed"][0]["scenario_id"] == "s1" + + def test_detects_improvement(self, tmp_path): + base = tmp_path / "base.json" + curr = tmp_path / "curr.json" + self._write_scorecard( + base, + [{"scenario_id": "s1", "status": "FAIL", "overall_score": 3.0}], + ) + self._write_scorecard( + curr, + [{"scenario_id": "s1", "status": "PASS", "overall_score": 8.0}], + ) + result = compare_scorecards(base, curr) + assert len(result["improved"]) == 1 + + def test_detects_score_regression(self, tmp_path): + base = tmp_path / "base.json" + curr = tmp_path / "curr.json" + self._write_scorecard( + base, + [{"scenario_id": "s1", "status": "PASS", "overall_score": 9.0}], + ) + self._write_scorecard( + curr, + [{"scenario_id": "s1", "status": "PASS", "overall_score": 6.5}], + ) + result = compare_scorecards(base, curr) + assert len(result["score_regressed"]) == 1 + + def test_only_in_baseline_and_current(self, tmp_path): + base = tmp_path / "base.json" + curr = tmp_path / "curr.json" + 
self._write_scorecard( + base, + [{"scenario_id": "old", "status": "PASS", "overall_score": 8.0}], + ) + self._write_scorecard( + curr, + [{"scenario_id": "new", "status": "PASS", "overall_score": 8.0}], + ) + result = compare_scorecards(base, curr) + assert "old" in result["only_in_baseline"] + assert "new" in result["only_in_current"] + + def test_corpus_changed(self, tmp_path): + base = tmp_path / "base.json" + curr = tmp_path / "curr.json" + self._write_scorecard( + base, + [{"scenario_id": "s1", "status": "PASS", "overall_score": 8.0}], + ) + self._write_scorecard( + curr, + [ + { + "scenario_id": "s1", + "status": "SKIPPED_NO_DOCUMENT", + "overall_score": None, + } + ], + ) + result = compare_scorecards(base, curr) + assert len(result["corpus_changed"]) == 1 + + def test_missing_baseline_raises(self, tmp_path): + with pytest.raises(FileNotFoundError): + compare_scorecards(tmp_path / "nope.json", tmp_path / "also-nope.json") + + def test_time_regression(self, tmp_path): + base = tmp_path / "base.json" + curr = tmp_path / "curr.json" + self._write_scorecard( + base, + [ + { + "scenario_id": "s1", + "status": "PASS", + "overall_score": 8.0, + "elapsed_s": 30.0, + } + ], + ) + self._write_scorecard( + curr, + [ + { + "scenario_id": "s1", + "status": "PASS", + "overall_score": 8.0, + "elapsed_s": 120.0, + } + ], + ) + result = compare_scorecards(base, curr) + assert len(result["time_regressed"]) == 1 + + +# --------------------------------------------------------------------------- +# AgentEvalRunner.__init__ +# --------------------------------------------------------------------------- + + +class TestAgentEvalRunnerInit: + def test_defaults(self): + from gaia.eval.runner import AgentEvalRunner + + runner = AgentEvalRunner() + assert runner.backend_url == "http://localhost:4200" + assert runner.model == "claude-sonnet-4-6" + assert runner.budget == "2.00" + assert runner.timeout == 900 + + def test_custom_args(self, tmp_path): + from gaia.eval.runner import 
AgentEvalRunner + + runner = AgentEvalRunner( + backend_url="http://custom:5000", + model="claude-opus-4", + budget_per_scenario="5.00", + timeout_per_scenario=1200, + results_dir=str(tmp_path), + tags=["regression"], + agent_type="gaia-lite", + ) + assert runner.backend_url == "http://custom:5000" + assert runner.model == "claude-opus-4" + assert runner.budget == "5.00" + assert runner.timeout == 1200 + assert runner.results_dir == tmp_path + assert runner.tags == ["regression"] + assert runner.agent_type == "gaia-lite" diff --git a/tests/unit/eval/test_scorecard.py b/tests/unit/eval/test_scorecard.py new file mode 100644 index 000000000..26972120d --- /dev/null +++ b/tests/unit/eval/test_scorecard.py @@ -0,0 +1,325 @@ +# Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved. +# SPDX-License-Identifier: MIT +"""Unit tests for ``gaia.eval.scorecard``. + +Covers build_scorecard aggregation, write_summary_md rendering, +write_junit_xml conversion, and edge cases (empty results, mixed statuses, +performance data, unrecognized statuses). 
+""" + +import xml.etree.ElementTree as ET + +import pytest + +from gaia.eval.scorecard import build_scorecard, write_junit_xml, write_summary_md + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +_MINIMAL_CONFIG = {"model": "test-model", "budget": "1.00"} + + +def _result( + scenario_id, + status, + overall_score, + category="general", + cost_usd=0.0, + performance_summary=None, + root_cause=None, +): + r = { + "scenario_id": scenario_id, + "status": status, + "overall_score": overall_score, + "category": category, + "turns": [], + "cost_estimate": {"turns": 1, "estimated_usd": cost_usd}, + } + if performance_summary is not None: + r["performance_summary"] = performance_summary + if root_cause is not None: + r["root_cause"] = root_cause + return r + + +# --------------------------------------------------------------------------- +# build_scorecard — basic counts +# --------------------------------------------------------------------------- + + +class TestBuildScorecardCounts: + def test_all_pass(self): + results = [_result("a", "PASS", 8.0), _result("b", "PASS", 9.0)] + sc = build_scorecard("run-1", results, _MINIMAL_CONFIG) + s = sc["summary"] + assert s["total_scenarios"] == 2 + assert s["passed"] == 2 + assert s["failed"] == 0 + assert s["pass_rate"] == 1.0 + assert s["judged_pass_rate"] == 1.0 + + def test_mixed_statuses(self): + results = [ + _result("a", "PASS", 8.0), + _result("b", "FAIL", 4.0), + _result("c", "BLOCKED_BY_ARCHITECTURE", 3.0), + _result("d", "TIMEOUT", None), + _result("e", "BUDGET_EXCEEDED", None), + _result("f", "INFRA_ERROR", None), + _result("g", "SETUP_ERROR", None), + _result("h", "SKIPPED_NO_DOCUMENT", None), + ] + sc = build_scorecard("run-2", results, _MINIMAL_CONFIG) + s = sc["summary"] + assert s["total_scenarios"] == 8 + assert s["passed"] == 1 + assert s["failed"] == 1 + assert s["blocked"] == 1 + assert 
s["timeout"] == 1 + assert s["budget_exceeded"] == 1 + assert s["infra_error"] == 2 # INFRA_ERROR + SETUP_ERROR + assert s["skipped"] == 1 + assert s["errored"] == 0 + + def test_errored_for_unknown_status(self): + results = [_result("x", "SOMETHING_NEW", 5.0)] + sc = build_scorecard("run-3", results, _MINIMAL_CONFIG) + assert sc["summary"]["errored"] == 1 + assert "warnings" in sc + + def test_empty_results(self): + sc = build_scorecard("run-empty", [], _MINIMAL_CONFIG) + s = sc["summary"] + assert s["total_scenarios"] == 0 + assert s["pass_rate"] == 0.0 + assert s["avg_score"] == 0.0 + + +# --------------------------------------------------------------------------- +# build_scorecard — avg_score +# --------------------------------------------------------------------------- + + +class TestBuildScorecardScoring: + def test_avg_score_excludes_infra(self): + """TIMEOUT/BUDGET_EXCEEDED/INFRA_ERROR must NOT dilute avg_score.""" + results = [ + _result("a", "PASS", 8.0), + _result("b", "TIMEOUT", None), + ] + sc = build_scorecard("run-s1", results, _MINIMAL_CONFIG) + assert sc["summary"]["avg_score"] == 8.0 + + def test_fail_scores_capped_at_5_99(self): + """FAIL scenarios with score >= 6 should be capped at 5.99 for averaging.""" + results = [_result("a", "FAIL", 7.0)] + sc = build_scorecard("run-cap", results, _MINIMAL_CONFIG) + assert sc["summary"]["avg_score"] == 5.99 + + def test_pass_scores_not_capped(self): + results = [_result("a", "PASS", 9.5)] + sc = build_scorecard("run-nocap", results, _MINIMAL_CONFIG) + assert sc["summary"]["avg_score"] == 9.5 + + def test_judged_pass_rate(self): + """judged_pass_rate denominator is PASS + FAIL + BLOCKED only.""" + results = [ + _result("a", "PASS", 8.0), + _result("b", "FAIL", 3.0), + _result("c", "TIMEOUT", None), + ] + sc = build_scorecard("run-jpr", results, _MINIMAL_CONFIG) + assert sc["summary"]["judged_pass_rate"] == pytest.approx(0.5) + + +# --------------------------------------------------------------------------- 
+# build_scorecard — by_category +# --------------------------------------------------------------------------- + + +class TestBuildScorecardCategory: + def test_category_breakdown(self): + results = [ + _result("a", "PASS", 9.0, category="rag"), + _result("b", "FAIL", 4.0, category="rag"), + _result("c", "PASS", 8.0, category="tool"), + ] + sc = build_scorecard("run-cat", results, _MINIMAL_CONFIG) + by_cat = sc["summary"]["by_category"] + assert "rag" in by_cat + assert "tool" in by_cat + assert by_cat["rag"]["passed"] == 1 + assert by_cat["rag"]["failed"] == 1 + assert by_cat["tool"]["passed"] == 1 + + def test_category_avg_score_caps_fail(self): + results = [_result("a", "FAIL", 7.5, category="q")] + sc = build_scorecard("run-catcap", results, _MINIMAL_CONFIG) + assert sc["summary"]["by_category"]["q"]["avg_score"] == 5.99 + + +# --------------------------------------------------------------------------- +# build_scorecard — cost and performance +# --------------------------------------------------------------------------- + + +class TestBuildScorecardCostPerf: + def test_cost_aggregation(self): + results = [ + _result("a", "PASS", 8.0, cost_usd=0.12), + _result("b", "PASS", 7.0, cost_usd=0.08), + ] + sc = build_scorecard("run-cost", results, _MINIMAL_CONFIG) + assert sc["cost"]["estimated_total_usd"] == pytest.approx(0.20, abs=1e-4) + + def test_performance_aggregation(self): + perf = { + "avg_tokens_per_second": 40.0, + "avg_time_to_first_token": 1.5, + "total_input_tokens": 100, + "total_output_tokens": 200, + "flags": ["slow"], + } + results = [_result("a", "PASS", 8.0, performance_summary=perf)] + sc = build_scorecard("run-perf", results, _MINIMAL_CONFIG) + p = sc["performance"] + assert p["avg_tokens_per_second"] == 40.0 + assert p["avg_time_to_first_token"] == 1.5 + assert p["total_input_tokens"] == 100 + assert p["total_output_tokens"] == 200 + assert "slow" in p["flags"] + + def test_no_performance_data(self): + results = [_result("a", "PASS", 8.0)] + 
sc = build_scorecard("run-noperf", results, _MINIMAL_CONFIG) + assert sc["performance"]["avg_tokens_per_second"] is None + assert sc["performance"]["scenarios_with_data"] == 0 + + +# --------------------------------------------------------------------------- +# build_scorecard — metadata +# --------------------------------------------------------------------------- + + +class TestBuildScorecardMeta: + def test_run_id_and_config_preserved(self): + sc = build_scorecard("my-run", [_result("a", "PASS", 8.0)], _MINIMAL_CONFIG) + assert sc["run_id"] == "my-run" + assert sc["config"] == _MINIMAL_CONFIG + assert "timestamp" in sc + + def test_scenarios_list_preserved(self): + results = [_result("a", "PASS", 8.0)] + sc = build_scorecard("r", results, _MINIMAL_CONFIG) + assert sc["scenarios"] is results + + +# --------------------------------------------------------------------------- +# write_summary_md +# --------------------------------------------------------------------------- + + +class TestWriteSummaryMd: + def test_contains_key_sections(self): + results = [ + _result("a", "PASS", 8.0, category="rag"), + _result("b", "FAIL", 3.0, category="rag", root_cause="bad prompt"), + ] + sc = build_scorecard("run-md", results, _MINIMAL_CONFIG) + md = write_summary_md(sc) + assert "# GAIA Agent Eval" in md + assert "## Summary" in md + assert "## By Category" in md + assert "## Scenarios" in md + assert "bad prompt" in md + + def test_performance_section_when_data_present(self): + perf = { + "avg_tokens_per_second": 50.0, + "avg_time_to_first_token": 0.8, + "total_input_tokens": 500, + "total_output_tokens": 300, + "flags": [], + } + results = [_result("a", "PASS", 8.0, performance_summary=perf)] + sc = build_scorecard("run-mdperf", results, _MINIMAL_CONFIG) + md = write_summary_md(sc) + assert "## Performance" in md + assert "50.0 tok/s" in md + + def test_no_performance_section_when_no_data(self): + results = [_result("a", "PASS", 8.0)] + sc = build_scorecard("run-nop", 
results, _MINIMAL_CONFIG) + md = write_summary_md(sc) + assert "## Performance" not in md + + +# --------------------------------------------------------------------------- +# write_junit_xml +# --------------------------------------------------------------------------- + + +class TestWriteJunitXml: + def test_valid_xml(self): + results = [ + _result("a", "PASS", 8.0, category="rag"), + _result("b", "FAIL", 4.0, category="rag"), + ] + sc = build_scorecard("run-xml", results, _MINIMAL_CONFIG) + xml_str = write_junit_xml(sc) + root = ET.fromstring(xml_str) + assert root.tag == "testsuites" + + def test_pass_has_no_failure_element(self): + results = [_result("a", "PASS", 9.0, category="c1")] + sc = build_scorecard("run-xp", results, _MINIMAL_CONFIG) + xml_str = write_junit_xml(sc) + root = ET.fromstring(xml_str) + testcase = root.find(".//testcase[@name='a']") + assert testcase is not None + assert testcase.find("failure") is None + assert testcase.find("error") is None + + def test_fail_has_failure_element(self): + results = [_result("b", "FAIL", 3.0, category="c1")] + sc = build_scorecard("run-xf", results, _MINIMAL_CONFIG) + xml_str = write_junit_xml(sc) + root = ET.fromstring(xml_str) + testcase = root.find(".//testcase[@name='b']") + assert testcase is not None + failure = testcase.find("failure") + assert failure is not None + assert failure.get("type") == "FAIL" + + def test_timeout_has_error_element(self): + results = [_result("t", "TIMEOUT", None, category="c1")] + sc = build_scorecard("run-xt", results, _MINIMAL_CONFIG) + xml_str = write_junit_xml(sc) + root = ET.fromstring(xml_str) + testcase = root.find(".//testcase[@name='t']") + assert testcase is not None + assert testcase.find("error") is not None + + def test_skipped_has_skipped_element(self): + results = [_result("s", "SKIPPED_NO_DOCUMENT", None, category="c1")] + sc = build_scorecard("run-xs", results, _MINIMAL_CONFIG) + xml_str = write_junit_xml(sc) + root = ET.fromstring(xml_str) + testcase = 
root.find(".//testcase[@name='s']") + assert testcase is not None + assert testcase.find("skipped") is not None + + def test_category_testsuite(self): + results = [ + _result("a", "PASS", 8.0, category="cat1"), + _result("b", "PASS", 7.0, category="cat2"), + ] + sc = build_scorecard("run-xc", results, _MINIMAL_CONFIG) + xml_str = write_junit_xml(sc) + root = ET.fromstring(xml_str) + suites = root.findall("testsuite") + suite_names = {s.get("name") for s in suites} + assert "cat1" in suite_names + assert "cat2" in suite_names From 862d75543916a360a089c6bb475c716feeee7882 Mon Sep 17 00:00:00 2001 From: Ovtcharov Date: Thu, 28 May 2026 19:59:23 -0700 Subject: [PATCH 2/4] test(eval): remove unused imports from eval test files Drop unused Path (test_audit), patch (test_claude_judge), textwrap/MagicMock/patch (test_runner), and a dead no-op fixture + unused helper function in test_claude_judge. --- tests/unit/eval/test_audit.py | 1 - tests/unit/eval/test_claude_judge.py | 15 +-------------- tests/unit/eval/test_runner.py | 2 -- 3 files changed, 1 insertion(+), 17 deletions(-) diff --git a/tests/unit/eval/test_audit.py b/tests/unit/eval/test_audit.py index 3892a50a0..39a576326 100644 --- a/tests/unit/eval/test_audit.py +++ b/tests/unit/eval/test_audit.py @@ -8,7 +8,6 @@ """ import textwrap -from pathlib import Path import pytest diff --git a/tests/unit/eval/test_claude_judge.py b/tests/unit/eval/test_claude_judge.py index d1605d328..675da2147 100644 --- a/tests/unit/eval/test_claude_judge.py +++ b/tests/unit/eval/test_claude_judge.py @@ -8,7 +8,7 @@ """ from types import SimpleNamespace -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock import pytest @@ -17,13 +17,6 @@ # --------------------------------------------------------------------------- -@pytest.fixture(autouse=True) -def _patch_imports(monkeypatch): - """Ensure anthropic and bs4 are available as mocks at module level.""" - # We don't want real imports — mock the modules if not 
installed. - pass - - def _make_mock_anthropic(): """Build a mock anthropic module with an Anthropic client constructor.""" mock_module = MagicMock() @@ -31,12 +24,6 @@ def _make_mock_anthropic(): return mock_module -def _make_mock_bs4(): - mock_module = MagicMock() - mock_module.BeautifulSoup = MagicMock() - return mock_module - - # --------------------------------------------------------------------------- # Initialization # --------------------------------------------------------------------------- diff --git a/tests/unit/eval/test_runner.py b/tests/unit/eval/test_runner.py index 3c4103f21..7bfea933d 100644 --- a/tests/unit/eval/test_runner.py +++ b/tests/unit/eval/test_runner.py @@ -17,9 +17,7 @@ """ import json -import textwrap from pathlib import Path -from unittest.mock import MagicMock, patch import pytest From 099cf45156158c03941c54261e1836d5286a442a Mon Sep 17 00:00:00 2001 From: Ovtcharov Date: Thu, 28 May 2026 23:42:52 -0700 Subject: [PATCH 3/4] test(eval): address PR review feedback - Fix test_scales_with_turns_and_docs: use base_timeout=100 so the scaling formula (820) actually wins the max(), and assert == instead of >= to verify the exact computed value. - Remove dead _write_helpers call in test_extracts_max_constants that wrote to the wrong path (returned p was unused). - Move `import yaml` from inside _write_scenario to module-level. 
--- tests/unit/eval/test_audit.py | 11 ++--------- tests/unit/eval/test_runner.py | 9 ++++----- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/tests/unit/eval/test_audit.py b/tests/unit/eval/test_audit.py index 39a576326..6cc375ec2 100644 --- a/tests/unit/eval/test_audit.py +++ b/tests/unit/eval/test_audit.py @@ -42,16 +42,9 @@ def test_extracts_max_constants(self, tmp_path, monkeypatch): _MAX_MSG_CHARS = 2000 _OTHER = 42 """ - p = _write_helpers(tmp_path, src) # Monkeypatch GAIA_ROOT so audit_chat_helpers reads our fake file - monkeypatch.setattr( - "gaia.eval.audit.GAIA_ROOT", - # Need a root where /src/gaia/ui/_chat_helpers.py resolves to our file - # Easier: just monkeypatch the whole function's file path - tmp_path, - ) - # Since audit_chat_helpers hardcodes the path, we need to - # create the expected directory structure. + monkeypatch.setattr("gaia.eval.audit.GAIA_ROOT", tmp_path) + # audit_chat_helpers hardcodes the path — create the expected directory structure. 
target = tmp_path / "src" / "gaia" / "ui" target.mkdir(parents=True) (target / "_chat_helpers.py").write_text(textwrap.dedent(src), encoding="utf-8") diff --git a/tests/unit/eval/test_runner.py b/tests/unit/eval/test_runner.py index 7bfea933d..706693afa 100644 --- a/tests/unit/eval/test_runner.py +++ b/tests/unit/eval/test_runner.py @@ -20,6 +20,7 @@ from pathlib import Path import pytest +import yaml from gaia.eval.runner import ( _SCORE_WEIGHTS, @@ -332,9 +333,9 @@ def test_scales_with_turns_and_docs(self): "turns": [{"turn": 1}, {"turn": 2}], "setup": {"index_documents": [{"path": "a.pdf"}, {"path": "b.pdf"}]}, } - result = _compute_effective_timeout(900, scenario) - expected = 240 + 2 * 90 + 2 * 200 # startup + docs + turns - assert result >= expected + expected = 240 + 2 * 90 + 2 * 200 # startup + docs + turns = 820 + result = _compute_effective_timeout(100, scenario) + assert result == expected def test_capped_at_max(self): scenario = { @@ -352,8 +353,6 @@ def test_capped_at_max(self): class TestFindScenarios: def _write_scenario(self, d, sid, category="general", tags=None): - import yaml - data = { "id": sid, "category": category, From cbc5d75c4286028f472fb8cf43bdaf6baafd66f5 Mon Sep 17 00:00:00 2001 From: Ovtcharov Date: Fri, 29 May 2026 08:49:00 -0700 Subject: [PATCH 4/4] fix(types): resolve additional mypy errors in factory, providers, governance Suppress no-any-return in factory.py (dynamic provider instantiation), fix union-attr in openai/claude providers (stream response type), and cast CheckpointStatus in checkpoint_bridge.py. 
--- src/gaia/governance/checkpoint_bridge.py | 5 ++++- src/gaia/llm/factory.py | 2 +- src/gaia/llm/providers/claude.py | 5 +++-- src/gaia/llm/providers/openai_provider.py | 2 +- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/gaia/governance/checkpoint_bridge.py b/src/gaia/governance/checkpoint_bridge.py index 9c7b5fd4b..9ae1a1b86 100644 --- a/src/gaia/governance/checkpoint_bridge.py +++ b/src/gaia/governance/checkpoint_bridge.py @@ -11,10 +11,13 @@ from threading import Lock +from typing import cast + from .exceptions import CheckpointNotFoundError, InvalidResolutionError from .schemas import ( CheckpointRecord, CheckpointResolution, + CheckpointStatus, GovernanceDecision, TransitionOutcome, WorkflowTransition, @@ -90,7 +93,7 @@ def resolve_checkpoint( checkpoint_id=current.checkpoint_id, workflow_id=current.workflow_id, transition_id=current.transition_id, - status=status, + status=cast(CheckpointStatus, status), created_at=current.created_at, decision_context={ **current.decision_context, diff --git a/src/gaia/llm/factory.py b/src/gaia/llm/factory.py index ede92b023..8757955fe 100644 --- a/src/gaia/llm/factory.py +++ b/src/gaia/llm/factory.py @@ -67,4 +67,4 @@ def create_client( module = importlib.import_module(module_path) provider_class = getattr(module, class_name) - return provider_class(**kwargs) + return provider_class(**kwargs) # type: ignore[no-any-return] diff --git a/src/gaia/llm/providers/claude.py b/src/gaia/llm/providers/claude.py index 789feff45..56d35c02c 100644 --- a/src/gaia/llm/providers/claude.py +++ b/src/gaia/llm/providers/claude.py @@ -74,7 +74,7 @@ def chat( response = self._client.messages.create(**params) if stream: return self._handle_stream(response) - return response.content[0].text + return response.content[0].text # type: ignore[union-attr] # embed() inherited from ABC - raises NotSupportedError @@ -99,7 +99,8 @@ def vision(self, images: list[bytes], prompt: str, **kwargs) -> str: ], } ] - return 
self.chat(messages, **kwargs) + result = self.chat(messages, **kwargs) + return result if isinstance(result, str) else "".join(result) # get_performance_stats() inherited from ABC - raises NotSupportedError # load_model() inherited from ABC - raises NotSupportedError diff --git a/src/gaia/llm/providers/openai_provider.py b/src/gaia/llm/providers/openai_provider.py index ab204153a..4142b0cb7 100644 --- a/src/gaia/llm/providers/openai_provider.py +++ b/src/gaia/llm/providers/openai_provider.py @@ -60,7 +60,7 @@ def chat( ) if stream: return self._handle_stream(response) - return response.choices[0].message.content + return response.choices[0].message.content # type: ignore[union-attr] def embed( self, texts: list[str], model: str = "text-embedding-3-small", **kwargs