From 98bd99191bf7257e324e6daf5192c11d56bd5597 Mon Sep 17 00:00:00 2001 From: yurekami Date: Fri, 22 May 2026 21:15:40 +0800 Subject: [PATCH] docs: fix CONTRIBUTING.md template and usage guide typos; add tests - Fill in project name in CONTRIBUTING.md (was placeholder "__________") - Fix typos in docs/README.md ("followoing" -> "following", "Evaluation your" -> "Evaluating your") - Fix broken README link pointing to SWE-agent/ProgramBench#installation (now relative ../README.md#quickstart) - Add CLI smoke tests for info, blob, and top-level help - Add tests for count_testcases, EvaluationResult.summarize, _can_reprocess, and filter_instances --- CONTRIBUTING.md | 4 +- docs/README.md | 6 +- tests/test_cli.py | 39 +++++++++++ tests/test_eval_extras.py | 115 +++++++++++++++++++++++++++++++++ tests/test_instance_filters.py | 51 +++++++++++++++ 5 files changed, 210 insertions(+), 5 deletions(-) create mode 100644 tests/test_cli.py create mode 100644 tests/test_eval_extras.py create mode 100644 tests/test_instance_filters.py diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 18587a9..5ad38a3 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,4 +1,4 @@ -# Contributing to __________ +# Contributing to ProgramBench We want to make contributing to this project as easy and transparent as possible. @@ -35,5 +35,5 @@ outlined on that page and do not file a public issue. * ... ## License -By contributing to __________, you agree that your contributions will be licensed +By contributing to ProgramBench, you agree that your contributions will be licensed under the LICENSE file in the root directory of this source tree. diff --git a/docs/README.md b/docs/README.md index a0b1af6..0b501f4 100644 --- a/docs/README.md +++ b/docs/README.md @@ -8,7 +8,7 @@ ## Inference Please use the images with tag `task_cleanroom` from `https://hub.docker.com/orgs/programbench/repositories`. -E.g., to solve the task `ffmpeg__ffmpeg.360a402`, use the followoing image: +E.g., to solve the task `ffmpeg__ffmpeg.360a402`, use the following image: ``` https://hub.docker.com/repository/docker/programbench/ffmpeg_1776_ffmpeg.360a402/tags/task_cleanroom/ @@ -41,9 +41,9 @@ We expect to release our baseline system in `mini-swe-agent` this week. ## Evaluation -Evaluation your agent run is the main function performed by the `ProgramBench` repository. +Evaluating your agent run is the main function performed by the `ProgramBench` repository. -After following the installation instructions from the [README](https://github.com/SWE-agent/ProgramBench#installation), you can run the evaluation with: +After following the installation instructions from the [README](../README.md#quickstart), you can run the evaluation with: ``` uv run programbench eval /path/to/my-amazing-agent-run diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..594bd9d --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,39 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +"""Smoke tests for CLI subcommands.""" + +from typer.testing import CliRunner + +from programbench.cli.main import app + +runner = CliRunner() + + +def test_top_level_help(): + result = runner.invoke(app, ["--help"]) + assert result.exit_code == 0 + assert "eval" in result.output + assert "blob" in result.output + assert "info" in result.output + + +def test_info_help(): + result = runner.invoke(app, ["info", "--help"]) + assert result.exit_code == 0 + assert "run-dir" in result.output.lower() or "run_dir" in result.output.lower() + + +def test_blob_help(): + result = runner.invoke(app, ["blob", "--help"]) + assert result.exit_code == 0 + assert "sync" in result.output + + +def test_blob_sync_help(): + result = runner.invoke(app, ["blob", "sync", "--help"]) + assert result.exit_code == 0 + assert "instance" in result.output.lower() diff --git a/tests/test_eval_extras.py b/tests/test_eval_extras.py new file mode 100644 index 0000000..8a009e8 --- /dev/null +++ b/tests/test_eval_extras.py @@ -0,0 +1,115 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +"""Tests for eval functions not covered by test_eval.py.""" + +import pytest + +from programbench.eval.eval import ( + EvaluationResult, + TestBranchError, + TestResult, + count_testcases, +) +from programbench.eval.eval_batch import _can_reprocess + + +JUNIT_XML_THREE_CASES = """\ + + + + + + + + +""" + + +class TestCountTestcases: + @pytest.mark.parametrize( + ("xml", "expected"), + [ + ("", 0), + (" \n ", 0), + ("", 0), + (JUNIT_XML_THREE_CASES, 3), + ], + ) + def test_counts(self, xml, expected): + assert count_testcases(xml) == expected + + +class TestEvaluationResultSummarize: + def test_clean_run(self): + result = EvaluationResult( + test_results=[ + TestResult(name="t1", branch="b1", status="passed", extra={}), + TestResult(name="t2", branch="b1", status="passed", extra={}), + ], + solution_branch="submission", + ) + s = result.summarize() + assert "100" in s + assert "2/2" in s + assert "submission" in s + + def test_with_error_code(self): + result = EvaluationResult(error_code="compile_failed", error_details="gcc not found") + s = result.summarize() + assert "compile_failed" in s + assert "gcc not found" in s + + def test_with_branch_errors(self): + result = EvaluationResult( + test_results=[TestResult(name="t1", branch="b1", status="passed", extra={})], + test_branch_errors={"b2": [TestBranchError(error_code="timeout", error_details="")]}, + ) + assert "b2" in result.summarize() + + def test_with_system_errors(self): + result = EvaluationResult( + test_results=[TestResult(name="t1", branch="b1", status="system_error", extra={})], + ) + assert "system_errors=1" in result.summarize() + + def test_with_warnings(self): + result = EvaluationResult(warnings=["something unexpected"]) + assert "warnings=1" in result.summarize() + + +class TestCanReprocess: + def test_error_code_is_reprocessable(self): + assert _can_reprocess(EvaluationResult(error_code="compile_failed")) + + def test_all_branches_tagged_in_log(self): + result = EvaluationResult( + test_branches=["b1", "b2"], + log=[ + {"step": "results_read", "branch": "b1", "returncode": 0, "output": ""}, + {"step": "results_read", "branch": "b2", "returncode": 0, "output": ""}, + ], + ) + assert _can_reprocess(result) + + def test_missing_branch_in_log_not_reprocessable(self): + result = EvaluationResult( + test_branches=["b1", "b2"], + log=[ + {"step": "results_read", "branch": "b1", "returncode": 0, "output": ""}, + ], + ) + assert not _can_reprocess(result) + + def test_branch_with_error_excluded_from_check(self): + result = EvaluationResult( + test_branches=["b1", "b2"], + test_branch_errors={"b2": [TestBranchError(error_code="fail", error_details="")]}, + log=[ + {"step": "results_read", "branch": "b1", "returncode": 0, "output": ""}, + ], + ) + assert _can_reprocess(result) diff --git a/tests/test_instance_filters.py b/tests/test_instance_filters.py new file mode 100644 index 0000000..41f5071 --- /dev/null +++ b/tests/test_instance_filters.py @@ -0,0 +1,51 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from programbench.utils.instance_filters import filter_instances + + +def _inst(iid: str, branches: dict | None = None) -> dict: + return {"instance_id": iid, "branches": branches or {}} + + +INSTANCES = [ + _inst("alpha__foo.abc", {"b1": {"tests": ["t1"]}}), + _inst("beta__bar.def", {"b2": {"tests": ["t2"]}}), + _inst("gamma__baz.ghi"), +] + + +class TestFilterInstances: + def test_no_filters_returns_all(self): + assert filter_instances(INSTANCES) == INSTANCES + + def test_regex_filter(self): + assert [i["instance_id"] for i in filter_instances(INSTANCES, filter_spec="alpha.*")] == ["alpha__foo.abc"] + + def test_regex_filter_no_match(self): + assert filter_instances(INSTANCES, filter_spec="nonexistent") == [] + + def test_slice_spec(self): + assert [i["instance_id"] for i in filter_instances(INSTANCES, slice_spec="0:2")] == [ + "alpha__foo.abc", + "beta__bar.def", + ] + + def test_slice_from_end(self): + assert [i["instance_id"] for i in filter_instances(INSTANCES, slice_spec="-1:")] == ["gamma__baz.ghi"] + + def test_has_test_branch(self): + result = filter_instances(INSTANCES, has_test_branch=True) + assert [i["instance_id"] for i in result] == ["alpha__foo.abc", "beta__bar.def"] + + def test_filter_and_slice_combined(self): + result = filter_instances(INSTANCES, filter_spec="(alpha|beta).*", slice_spec="0:1") + assert [i["instance_id"] for i in result] == ["alpha__foo.abc"] + + def test_shuffle_is_deterministic(self): + r1 = [i["instance_id"] for i in filter_instances(INSTANCES, shuffle=True)] + r2 = [i["instance_id"] for i in filter_instances(INSTANCES, shuffle=True)] + assert r1 == r2