diff --git a/benchmarks/evoclaw/README.md b/benchmarks/evoclaw/README.md new file mode 100644 index 00000000..c7202a8e --- /dev/null +++ b/benchmarks/evoclaw/README.md @@ -0,0 +1,20 @@ +# EvoClaw + +This benchmark entrypoint runs OpenHands against EvoClaw repositories through the +standard OpenHands benchmarks SDK path: + +1. discover EvoClaw repo directories from `--data-root`, +2. build/start an OpenHands agent-server workspace from each EvoClaw base image, +3. upload the EvoClaw task queue and SRS files into the workspace, +4. run `Agent`/`Conversation` with the normal fake-user evaluation loop, +5. emit the resulting git patch and conversation trajectory. + +```bash +uv run evoclaw-infer .llm_config/example.json \ + --data-root /path/to/EvoClaw-data \ + --repos navidrome \ + --n-limit 1 +``` + +This is currently an inference harness. It intentionally does not reimplement +EvoClaw's milestone DAG grader inside this repo. diff --git a/benchmarks/evoclaw/__init__.py b/benchmarks/evoclaw/__init__.py new file mode 100644 index 00000000..5a8477f6 --- /dev/null +++ b/benchmarks/evoclaw/__init__.py @@ -0,0 +1 @@ +"""EvoClaw benchmark integration.""" diff --git a/benchmarks/evoclaw/config.py b/benchmarks/evoclaw/config.py new file mode 100644 index 00000000..63afa3ed --- /dev/null +++ b/benchmarks/evoclaw/config.py @@ -0,0 +1,14 @@ +"""Defaults for EvoClaw inference.""" + +INFER_DEFAULTS = { + "dataset": "evoclaw", + "split": "test", + "max_iterations": 3000, + "instance_timeout": 18000, + "num_workers": 1, + "n_critic_runs": 1, + "workspace": "docker", + "enable_condenser": True, + "condenser_max_size": 100, + "condenser_keep_first": 4, +} diff --git a/benchmarks/evoclaw/prompts/default.j2 b/benchmarks/evoclaw/prompts/default.j2 new file mode 100644 index 00000000..ed9ff435 --- /dev/null +++ b/benchmarks/evoclaw/prompts/default.j2 @@ -0,0 +1,9 @@ +We need to modify the repository in /testbed to complete the EvoClaw task queue. 
+ +Task queue: +{{ task_queue_path }} + +Requirements files are available under: +{{ srs_dir }} + +For each listed milestone, read its SRS file, implement the requested behavior in /testbed, and run the relevant tests when practical. If all listed milestones are complete, use the finish tool. diff --git a/benchmarks/evoclaw/run_infer.py b/benchmarks/evoclaw/run_infer.py new file mode 100644 index 00000000..b5743eed --- /dev/null +++ b/benchmarks/evoclaw/run_infer.py @@ -0,0 +1,443 @@ +from __future__ import annotations + +import csv +import json +import os +import shlex +from pathlib import Path +from typing import List + +from jinja2 import Environment, FileSystemLoader + +from benchmarks.evoclaw.config import INFER_DEFAULTS +from benchmarks.utils.agent_context import create_agent_context +from benchmarks.utils.args_parser import add_prompt_path_argument, get_parser +from benchmarks.utils.console_logging import summarize_instance +from benchmarks.utils.conversation import build_event_persistence_callback +from benchmarks.utils.critics import create_critic +from benchmarks.utils.evaluation import Evaluation +from benchmarks.utils.evaluation_utils import ( + construct_eval_output_dir, + get_default_on_result_writer, +) +from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response +from benchmarks.utils.litellm_proxy import build_eval_llm +from benchmarks.utils.llm_config import load_llm_config +from benchmarks.utils.models import ( + EvalInstance, + EvalMetadata, + EvalOutput, + ToolPresetType, +) +from openhands.sdk import Agent, Conversation, Tool, get_logger +from openhands.sdk.context.condenser import LLMSummarizingCondenser +from openhands.sdk.workspace import RemoteWorkspace +from openhands.tools.delegate import DelegateTool +from openhands.tools.preset.default import get_default_tools +from openhands.workspace import DockerDevWorkspace + + +logger = get_logger(__name__) + + +def get_tools_for_preset( + preset: ToolPresetType, + 
enable_browser: bool = False, +) -> list[Tool]: + if preset == "gemini": + from openhands.tools.preset.gemini import get_gemini_tools + + return get_gemini_tools(enable_browser=enable_browser) + if preset == "gpt5": + from openhands.tools.preset.gpt5 import get_gpt5_tools + + return get_gpt5_tools(enable_browser=enable_browser) + if preset == "planning": + from openhands.tools.preset.planning import get_planning_tools + + return get_planning_tools() + + return get_default_tools(enable_browser=enable_browser) + + +def _repo_image_name(repo_name: str) -> str: + return f"{repo_name.lower()}/base:latest" + + +def _read_selected_milestones(repo_root: Path) -> list[str]: + selected = repo_root / "selected_milestone_ids.txt" + if selected.exists(): + return [ + line.strip() for line in selected.read_text().splitlines() if line.strip() + ] + + milestones = repo_root / "milestones.csv" + if milestones.exists(): + with milestones.open(newline="", encoding="utf-8") as f: + reader = csv.DictReader(f) + fieldnames = reader.fieldnames or [] + key = "milestone_id" if "milestone_id" in fieldnames else fieldnames[0] + return [row[key].strip() for row in reader if row.get(key, "").strip()] + + dependencies = repo_root / "dependencies.csv" + if dependencies.exists(): + ids: set[str] = set() + with dependencies.open(newline="", encoding="utf-8") as f: + reader = csv.DictReader(f) + for row in reader: + for value in row.values(): + value = (value or "").strip() + if value: + ids.add(value) + return sorted(ids) + + return [] + + +def _write_workspace_file( + workspace: RemoteWorkspace, + local_path: Path, + remote_path: str, +) -> None: + result = workspace.file_upload(local_path, remote_path) + if not result.success: + raise RuntimeError(f"Failed to upload {local_path} to {remote_path}: {result}") + + +def _ensure_workspace_writable(workspace: RemoteWorkspace, repo_root: Path) -> None: + with (repo_root / "metadata.json").open(encoding="utf-8") as f: + metadata = json.load(f) + + 
chown_parts = [ + f"sudo chown $(id -u):$(id -g) {shlex.quote(workspace.working_dir)}", + ( + f"sudo find {shlex.quote(workspace.working_dir)} -maxdepth 1 " + "-type f -exec chown $(id -u):$(id -g) {} +" + ), + f"git config --global --add safe.directory {shlex.quote(workspace.working_dir)}", + ] + for src_dir in metadata.get("repo_src_dirs") or []: + relative = str(src_dir).strip().strip("/") + if not relative or "*" in relative: + continue + remote_path = f"{workspace.working_dir}/{relative}" + chown_parts.append( + f"if [ -e {shlex.quote(remote_path)} ]; then " + f"sudo find {shlex.quote(remote_path)} " + r"\( -name .git -o -name node_modules -o -name .venv " + r"-o -name __pycache__ \) -prune -o " + "-exec chown $(id -u):$(id -g) {} +; fi" + ) + + result = workspace.execute_command( + " && ".join(chown_parts), + timeout=120, + ) + if result.exit_code != 0: + raise RuntimeError( + f"Failed to make {workspace.working_dir} writable: {result.stderr}" + ) + + +def _capture_git_patch(workspace: RemoteWorkspace, repo_dir: str) -> str: + """Capture tracked changes plus newly-created files as one git patch.""" + intent_result = workspace.execute_command( + f"GIT_OPTIONAL_LOCKS=0 git -C {repo_dir} add -N .", + timeout=120, + ) + if intent_result.exit_code != 0: + raise RuntimeError(f"git add -N failed: {intent_result.stderr}") + + diff_result = workspace.execute_command( + f"GIT_OPTIONAL_LOCKS=0 git -C {repo_dir} --no-pager diff --no-color --binary", + timeout=120, + ) + if diff_result.exit_code != 0: + raise RuntimeError(f"git diff failed: {diff_result.stderr}") + return diff_result.stdout + + +def _render_instruction( + prompt_path: str, + task_queue_path: str, + srs_dir: str, +) -> str: + prompts_dir = os.path.dirname(prompt_path) + template_name = os.path.basename(prompt_path) + env = Environment(loader=FileSystemLoader(prompts_dir)) + return env.get_template(template_name).render( + task_queue_path=task_queue_path, + srs_dir=srs_dir, + ) + + +class 
EvoClawEvaluation(Evaluation): + def prepare_instances(self) -> List[EvalInstance]: + assert self.metadata.details is not None + data_root = Path(self.metadata.details["data_root"]).expanduser().resolve() + selected_repos = self.metadata.details.get("selected_repos") + + instances: list[EvalInstance] = [] + for repo_root in sorted(data_root.iterdir()): + if not repo_root.is_dir() or not (repo_root / "metadata.json").exists(): + continue + if selected_repos and not any( + selected in repo_root.name for selected in selected_repos + ): + continue + + milestone_ids = _read_selected_milestones(repo_root) + instances.append( + EvalInstance( + id=repo_root.name, + data={ + "repo_root": str(repo_root), + "image": _repo_image_name(repo_root.name), + "milestone_ids": milestone_ids, + }, + ) + ) + + if self.metadata.eval_limit: + instances = instances[: self.metadata.eval_limit] + logger.info("Prepared %d EvoClaw instances", len(instances)) + return instances + + def prepare_workspace( + self, + instance: EvalInstance, + resource_factor: int = 1, + forward_env: list[str] | None = None, + ) -> RemoteWorkspace: + return DockerDevWorkspace( + base_image=instance.data["image"], + target="source", + working_dir="/testbed", + forward_env=forward_env or [], + ) + + def _upload_task_materials( + self, + instance: EvalInstance, + workspace: RemoteWorkspace, + ) -> dict[str, str]: + repo_root = Path(instance.data["repo_root"]) + material_dir = Path(self.metadata.eval_output_dir) / "evoclaw_materials" + material_dir.mkdir(parents=True, exist_ok=True) + instance_dir = material_dir / instance.id + instance_dir.mkdir(parents=True, exist_ok=True) + + remote_root = "/tmp/evoclaw" + remote_srs_dir = f"{remote_root}/srs" + remote_task_queue = f"{remote_root}/TASK_QUEUE.md" + _ensure_workspace_writable(workspace, repo_root) + mkdir_result = workspace.execute_command( + f"mkdir -p {remote_srs_dir}", timeout=30 + ) + if mkdir_result.exit_code != 0: + raise RuntimeError( + f"Failed to create 
{remote_srs_dir}: {mkdir_result.stderr}" + ) + + queue_lines = [ + "# EvoClaw Task Queue", + "", + "Implement the following milestones in /testbed:", + "", + ] + for milestone_id in instance.data["milestone_ids"]: + srs_path = repo_root / "srs" / milestone_id / "SRS.md" + if not srs_path.exists(): + logger.warning("Missing SRS for %s: %s", milestone_id, srs_path) + continue + local_srs = instance_dir / f"{milestone_id}_SRS.md" + local_srs.write_text(srs_path.read_text(encoding="utf-8"), encoding="utf-8") + remote_srs = f"{remote_srs_dir}/{milestone_id}_SRS.md" + _write_workspace_file(workspace, local_srs, remote_srs) + queue_lines.append(f"- {milestone_id}: {remote_srs}") + + local_queue = instance_dir / "TASK_QUEUE.md" + local_queue.write_text("\n".join(queue_lines) + "\n", encoding="utf-8") + _write_workspace_file(workspace, local_queue, remote_task_queue) + + return { + "task_queue_path": remote_task_queue, + "srs_dir": remote_srs_dir, + } + + def evaluate_instance( + self, + instance: EvalInstance, + workspace: RemoteWorkspace, + ) -> EvalOutput: + agent_llm = build_eval_llm(self.metadata.llm) + tools = get_tools_for_preset( + preset=self.metadata.tool_preset, + enable_browser=False, + ) + if self.metadata.enable_delegation: + tools.append(Tool(name=DelegateTool.name)) + + condenser = None + if self.metadata.enable_condenser: + condenser = LLMSummarizingCondenser( + llm=build_eval_llm(self.metadata.llm, usage_id="condenser"), + max_size=self.metadata.condenser_max_size, + keep_first=self.metadata.condenser_keep_first, + ) + + agent = Agent( + llm=agent_llm, + tools=tools, + system_prompt_kwargs={"cli_mode": True}, + condenser=condenser, + agent_context=create_agent_context(), + ) + + persist_callback = build_event_persistence_callback( + run_id=self.metadata.eval_output_dir, + instance_id=instance.id, + attempt=self.current_attempt, + ) + + conversation = Conversation( + agent=agent, + workspace=workspace, + callbacks=[persist_callback], + 
max_iteration_per_run=self.metadata.max_iterations, + delete_on_close=True, + ) + + paths = self._upload_task_materials(instance, workspace) + assert self.metadata.prompt_path is not None + instruction = _render_instruction( + prompt_path=self.metadata.prompt_path, + task_queue_path=paths["task_queue_path"], + srs_dir=paths["srs_dir"], + ) + + repo_dir = shlex.quote(workspace.working_dir) + status_result = workspace.execute_command( + f"GIT_OPTIONAL_LOCKS=0 git -C {repo_dir} status --short", + timeout=120, + ) + if status_result.exit_code != 0: + raise RuntimeError(f"git status failed: {status_result.stderr}") + if status_result.stdout.strip(): + logger.warning( + "Workspace %s starts with existing changes:\n%s", + workspace.working_dir, + status_result.stdout, + ) + conversation.send_message(instruction) + run_error = None + try: + run_conversation_with_fake_user_response(conversation) + except Exception as exc: + run_error = str(exc) + logger.exception("Conversation run failed for %s", instance.id) + + git_patch = _capture_git_patch(workspace, repo_dir) + + summarize_instance( + instance_id=instance.id, + conversation=conversation, + git_patch=git_patch, + logger=logger, + ) + + return EvalOutput( + instance_id=instance.id, + attempt=self.current_attempt, + test_result={"git_patch": git_patch}, + instruction=instruction, + error=run_error, + history=list(conversation.state.events), + metrics=conversation.conversation_stats.get_combined_metrics(), + instance=instance.data, + ) + + +def main() -> None: + parser = get_parser() + add_prompt_path_argument(parser, __file__) + parser.add_argument( + "--data-root", + required=True, + help="Path to EvoClaw-data containing repo directories with metadata.json.", + ) + parser.add_argument( + "--repos", + nargs="+", + default=None, + help="Optional repo-name substring filters, e.g. 
--repos navidrome ripgrep.", + ) + parser.add_argument( + "--instance-timeout", + type=int, + default=INFER_DEFAULTS["instance_timeout"], + help="Maximum wall-clock seconds per instance (default: 18000 = 5 hours).", + ) + parser.set_defaults(**INFER_DEFAULTS) + args = parser.parse_args() + + llm = load_llm_config(args.llm_config_path) + selected_repos = args.repos + if args.select: + selected_from_file = [ + line.strip() + for line in Path(args.select).read_text().splitlines() + if line.strip() + ] + selected_repos = (selected_repos or []) + selected_from_file + + structured_output_dir = construct_eval_output_dir( + base_dir=args.output_dir, + dataset_name="evoclaw", + model_name=llm.model, + max_iterations=args.max_iterations, + eval_note=args.note, + ) + + enable_condenser = args.enable_condenser + if args.disable_condenser: + enable_condenser = False + + metadata = EvalMetadata( + llm=llm, + dataset=args.dataset, + dataset_split=args.split, + max_iterations=args.max_iterations, + eval_output_dir=structured_output_dir, + details={ + "data_root": str(Path(args.data_root).expanduser().resolve()), + "selected_repos": selected_repos, + }, + prompt_path=args.prompt_path, + eval_limit=args.n_limit, + n_critic_runs=args.n_critic_runs, + critic=create_critic(args), + selected_instances_file=args.select, + max_retries=args.max_retries, + workspace_type=args.workspace, + tool_preset=args.tool_preset, + enable_delegation=args.enable_delegation, + agent_type=args.agent_type, + enable_condenser=enable_condenser, + condenser_max_size=args.condenser_max_size, + condenser_keep_first=args.condenser_keep_first, + ) + + evaluator = EvoClawEvaluation( + metadata=metadata, + num_workers=args.num_workers, + instance_timeout=args.instance_timeout, + ) + evaluator.run(on_result=get_default_on_result_writer(evaluator.output_path)) + + logger.info("EvoClaw inference completed") + print(json.dumps({"output_json": str(evaluator.output_path)})) + + +if __name__ == "__main__": + main() diff 
--git a/pyproject.toml b/pyproject.toml index 34ecaf33..d8c54e0f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,6 +69,7 @@ swebenchmultimodal-eval = "benchmarks.swebenchmultimodal.eval_infer:main" swebenchmultilingual-infer = "benchmarks.swebenchmultilingual.run_infer:main" swebenchmultilingual-eval = "benchmarks.swebenchmultilingual.eval_infer:main" swefficiency-infer = "benchmarks.swefficiency.run_infer:main" +evoclaw-infer = "benchmarks.evoclaw.run_infer:main" terminalbench-infer = "benchmarks.terminalbench.run_infer:main" terminalbench-eval = "benchmarks.terminalbench.eval_infer:main" skillsbench-infer = "benchmarks.skillsbench.run_infer:main" @@ -90,7 +91,7 @@ build-backend = "setuptools.build_meta" [tool.setuptools.packages.find] where = ["."] -include = ["benchmarks"] +include = ["benchmarks", "benchmarks.*"] [tool.setuptools] # Install the top-level sitecustomize module so Python auto-loads our Modal logging patch. diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 6199693f..3adf0483 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -360,6 +360,15 @@ def _get_test_instance_for_benchmark(benchmark_name: str) -> EvalInstance: "problem_statement": "Test problem for swesmith", }, ) + elif benchmark_name == "evoclaw": + return EvalInstance( + id="test-instance-1", + data={ + "repo_root": "test/repo", + "image": "test/base:latest", + "milestone_ids": ["M1"], + }, + ) else: # Generic instance for unknown benchmarks return EvalInstance( @@ -538,6 +547,24 @@ def _create_metadata_for_benchmark(benchmark_name: str, llm: LLM) -> EvalMetadat prompt_path=prompt_path, critic=PassCritic(), ) + elif benchmark_name == "evoclaw": + prompt_path = str( + Path(__file__).parent.parent + / "benchmarks" + / "evoclaw" + / "prompts" + / "default.j2" + ) + return EvalMetadata( + llm=llm, + max_iterations=5, + eval_output_dir="/tmp/eval_output", + dataset="evoclaw", + dataset_split="test", + details={"data_root": "test/data", "selected_repos": 
None}, + prompt_path=prompt_path, + critic=PassCritic(), + ) else: # Generic metadata for unknown benchmarks return EvalMetadata( @@ -658,6 +685,16 @@ def test_benchmark_metrics_collection( ), ): result = evaluation.evaluate_instance(instance, mock_workspace) + elif benchmark_name == "evoclaw": + with patch.object( + evaluation, + "_upload_task_materials", + return_value={ + "task_queue_path": "/e2e_workspace/TASK_QUEUE.md", + "srs_dir": "/e2e_workspace/srs", + }, + ): + result = evaluation.evaluate_instance(instance, mock_workspace) else: result = evaluation.evaluate_instance(instance, mock_workspace)