From eb745ce7909eaa0a983297af1fea07f37390b0ab Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 7 May 2026 22:48:08 +0000 Subject: [PATCH 1/6] Add EvoClaw benchmark inference --- benchmarks/evoclaw/README.md | 20 ++ benchmarks/evoclaw/__init__.py | 1 + benchmarks/evoclaw/config.py | 13 + benchmarks/evoclaw/prompts/default.j2 | 9 + benchmarks/evoclaw/run_infer.py | 362 ++++++++++++++++++++++++++ pyproject.toml | 3 +- 6 files changed, 407 insertions(+), 1 deletion(-) create mode 100644 benchmarks/evoclaw/README.md create mode 100644 benchmarks/evoclaw/__init__.py create mode 100644 benchmarks/evoclaw/config.py create mode 100644 benchmarks/evoclaw/prompts/default.j2 create mode 100644 benchmarks/evoclaw/run_infer.py diff --git a/benchmarks/evoclaw/README.md b/benchmarks/evoclaw/README.md new file mode 100644 index 000000000..c7202a8e4 --- /dev/null +++ b/benchmarks/evoclaw/README.md @@ -0,0 +1,20 @@ +# EvoClaw + +This benchmark entrypoint runs OpenHands against EvoClaw repositories through the +standard OpenHands benchmarks SDK path: + +1. discover EvoClaw repo directories from `--data-root`, +2. build/start an OpenHands agent-server workspace from each EvoClaw base image, +3. upload the EvoClaw task queue and SRS files into the workspace, +4. run `Agent`/`Conversation` with the normal fake-user evaluation loop, +5. emit the resulting git patch and conversation trajectory. + +```bash +uv run evoclaw-infer .llm_config/example.json \ + --data-root /path/to/EvoClaw-data \ + --repos navidrome \ + --n-limit 1 +``` + +This is currently an inference harness. It intentionally does not reimplement +EvoClaw's milestone DAG grader inside this repo. 
diff --git a/benchmarks/evoclaw/__init__.py b/benchmarks/evoclaw/__init__.py new file mode 100644 index 000000000..5a8477f6f --- /dev/null +++ b/benchmarks/evoclaw/__init__.py @@ -0,0 +1 @@ +"""EvoClaw benchmark integration.""" diff --git a/benchmarks/evoclaw/config.py b/benchmarks/evoclaw/config.py new file mode 100644 index 000000000..460eac91e --- /dev/null +++ b/benchmarks/evoclaw/config.py @@ -0,0 +1,13 @@ +"""Defaults for EvoClaw inference.""" + +INFER_DEFAULTS = { + "dataset": "evoclaw", + "split": "test", + "max_iterations": 3000, + "num_workers": 1, + "n_critic_runs": 1, + "workspace": "docker", + "enable_condenser": True, + "condenser_max_size": 100, + "condenser_keep_first": 4, +} diff --git a/benchmarks/evoclaw/prompts/default.j2 b/benchmarks/evoclaw/prompts/default.j2 new file mode 100644 index 000000000..ed9ff435c --- /dev/null +++ b/benchmarks/evoclaw/prompts/default.j2 @@ -0,0 +1,9 @@ +We need to modify the repository in /testbed to complete the EvoClaw task queue. + +Task queue: +{{ task_queue_path }} + +Requirements files are available under: +{{ srs_dir }} + +For each listed milestone, read its SRS file, implement the requested behavior in /testbed, and run the relevant tests when practical. If all listed milestones are complete, use the finish tool. 
diff --git a/benchmarks/evoclaw/run_infer.py b/benchmarks/evoclaw/run_infer.py new file mode 100644 index 000000000..5af8fe760 --- /dev/null +++ b/benchmarks/evoclaw/run_infer.py @@ -0,0 +1,362 @@ +from __future__ import annotations + +import csv +import json +import os +from pathlib import Path +from typing import List + +from jinja2 import Environment, FileSystemLoader + +from benchmarks.evoclaw.config import INFER_DEFAULTS +from benchmarks.utils.agent_context import create_agent_context +from benchmarks.utils.args_parser import add_prompt_path_argument, get_parser +from benchmarks.utils.console_logging import summarize_instance +from benchmarks.utils.conversation import build_event_persistence_callback +from benchmarks.utils.critics import create_critic +from benchmarks.utils.evaluation import Evaluation +from benchmarks.utils.evaluation_utils import ( + construct_eval_output_dir, + get_default_on_result_writer, +) +from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response +from benchmarks.utils.litellm_proxy import build_eval_llm +from benchmarks.utils.llm_config import load_llm_config +from benchmarks.utils.models import ( + EvalInstance, + EvalMetadata, + EvalOutput, + ToolPresetType, +) +from openhands.sdk import Agent, Conversation, Tool, get_logger +from openhands.sdk.context.condenser import LLMSummarizingCondenser +from openhands.sdk.workspace import RemoteWorkspace +from openhands.tools.delegate import DelegateTool +from openhands.workspace import DockerDevWorkspace + + +logger = get_logger(__name__) + + +def get_tools_for_preset( + preset: ToolPresetType, + enable_browser: bool = False, +) -> list[Tool]: + if preset == "gemini": + from openhands.tools.preset.gemini import get_gemini_tools + + return get_gemini_tools(enable_browser=enable_browser) + if preset == "gpt5": + from openhands.tools.preset.gpt5 import get_gpt5_tools + + return get_gpt5_tools(enable_browser=enable_browser) + if preset == "planning": + from 
openhands.tools.preset.planning import get_planning_tools + + return get_planning_tools() + + from openhands.tools.preset.default import get_default_tools + + return get_default_tools(enable_browser=enable_browser) + + +def _repo_image_name(repo_name: str) -> str: + return f"{repo_name.lower()}/base:latest" + + +def _read_selected_milestones(repo_root: Path) -> list[str]: + selected = repo_root / "selected_milestone_ids.txt" + if selected.exists(): + return [line.strip() for line in selected.read_text().splitlines() if line.strip()] + + milestones = repo_root / "milestones.csv" + if milestones.exists(): + with milestones.open(newline="", encoding="utf-8") as f: + reader = csv.DictReader(f) + fieldnames = reader.fieldnames or [] + key = "milestone_id" if "milestone_id" in fieldnames else fieldnames[0] + return [row[key].strip() for row in reader if row.get(key, "").strip()] + + dependencies = repo_root / "dependencies.csv" + if dependencies.exists(): + ids: set[str] = set() + with dependencies.open(newline="", encoding="utf-8") as f: + reader = csv.DictReader(f) + for row in reader: + for value in row.values(): + value = (value or "").strip() + if value: + ids.add(value) + return sorted(ids) + + return [] + + +def _write_workspace_file( + workspace: RemoteWorkspace, + local_path: Path, + remote_path: str, +) -> None: + result = workspace.file_upload(local_path, remote_path) + if not result.success: + raise RuntimeError(f"Failed to upload {local_path} to {remote_path}: {result}") + + +def _render_instruction( + prompt_path: str, + task_queue_path: str, + srs_dir: str, +) -> str: + prompts_dir = os.path.dirname(prompt_path) + template_name = os.path.basename(prompt_path) + env = Environment(loader=FileSystemLoader(prompts_dir)) + return env.get_template(template_name).render( + task_queue_path=task_queue_path, + srs_dir=srs_dir, + ) + + +class EvoClawEvaluation(Evaluation): + def prepare_instances(self) -> List[EvalInstance]: + assert self.metadata.details is not None 
+ data_root = Path(self.metadata.details["data_root"]).expanduser().resolve() + selected_repos = self.metadata.details.get("selected_repos") + + instances: list[EvalInstance] = [] + for repo_root in sorted(data_root.iterdir()): + if not repo_root.is_dir() or not (repo_root / "metadata.json").exists(): + continue + if selected_repos and not any( + selected in repo_root.name for selected in selected_repos + ): + continue + + milestone_ids = _read_selected_milestones(repo_root) + instances.append( + EvalInstance( + id=repo_root.name, + data={ + "repo_root": str(repo_root), + "image": _repo_image_name(repo_root.name), + "milestone_ids": milestone_ids, + }, + ) + ) + + if self.metadata.eval_limit: + instances = instances[: self.metadata.eval_limit] + logger.info("Prepared %d EvoClaw instances", len(instances)) + return instances + + def prepare_workspace( + self, + instance: EvalInstance, + resource_factor: int = 1, + forward_env: list[str] | None = None, + ) -> RemoteWorkspace: + return DockerDevWorkspace( + base_image=instance.data["image"], + target="source", + working_dir="/testbed", + forward_env=forward_env or [], + ) + + def _upload_task_materials( + self, + instance: EvalInstance, + workspace: RemoteWorkspace, + ) -> dict[str, str]: + repo_root = Path(instance.data["repo_root"]) + material_dir = Path(self.metadata.eval_output_dir) / "evoclaw_materials" + material_dir.mkdir(parents=True, exist_ok=True) + instance_dir = material_dir / instance.id + instance_dir.mkdir(parents=True, exist_ok=True) + + remote_srs_dir = "/e2e_workspace/srs" + remote_task_queue = "/e2e_workspace/TASK_QUEUE.md" + workspace.execute_command("mkdir -p /e2e_workspace/srs", timeout=30) + + queue_lines = [ + "# EvoClaw Task Queue", + "", + "Implement the following milestones in /testbed:", + "", + ] + for milestone_id in instance.data["milestone_ids"]: + srs_path = repo_root / "srs" / milestone_id / "SRS.md" + if not srs_path.exists(): + logger.warning("Missing SRS for %s: %s", milestone_id, 
srs_path) + continue + local_srs = instance_dir / f"{milestone_id}_SRS.md" + local_srs.write_text(srs_path.read_text(encoding="utf-8"), encoding="utf-8") + remote_srs = f"{remote_srs_dir}/{milestone_id}_SRS.md" + _write_workspace_file(workspace, local_srs, remote_srs) + queue_lines.append(f"- {milestone_id}: {remote_srs}") + + local_queue = instance_dir / "TASK_QUEUE.md" + local_queue.write_text("\n".join(queue_lines) + "\n", encoding="utf-8") + _write_workspace_file(workspace, local_queue, remote_task_queue) + + return { + "task_queue_path": remote_task_queue, + "srs_dir": remote_srs_dir, + } + + def evaluate_instance( + self, + instance: EvalInstance, + workspace: RemoteWorkspace, + ) -> EvalOutput: + agent_llm = build_eval_llm(self.metadata.llm) + tools = get_tools_for_preset( + preset=self.metadata.tool_preset, + enable_browser=False, + ) + if self.metadata.enable_delegation: + tools.append(Tool(name=DelegateTool.name)) + + condenser = None + if self.metadata.enable_condenser: + condenser = LLMSummarizingCondenser( + llm=build_eval_llm(self.metadata.llm, usage_id="condenser"), + max_size=self.metadata.condenser_max_size, + keep_first=self.metadata.condenser_keep_first, + ) + + agent = Agent( + llm=agent_llm, + tools=tools, + system_prompt_kwargs={"cli_mode": True}, + condenser=condenser, + agent_context=create_agent_context(), + ) + + persist_callback = build_event_persistence_callback( + run_id=self.metadata.eval_output_dir, + instance_id=instance.id, + attempt=self.current_attempt, + ) + + conversation = Conversation( + agent=agent, + workspace=workspace, + callbacks=[persist_callback], + max_iteration_per_run=self.metadata.max_iterations, + delete_on_close=True, + ) + + paths = self._upload_task_materials(instance, workspace) + assert self.metadata.prompt_path is not None + instruction = _render_instruction( + prompt_path=self.metadata.prompt_path, + task_queue_path=paths["task_queue_path"], + srs_dir=paths["srs_dir"], + ) + + workspace.execute_command("cd 
/testbed && git reset --hard", timeout=120) + conversation.send_message(instruction) + run_conversation_with_fake_user_response(conversation) + + diff_result = workspace.execute_command( + "cd /testbed && git --no-pager diff --no-color", + timeout=120, + ) + if diff_result.exit_code != 0: + raise RuntimeError(f"git diff failed: {diff_result.stderr}") + git_patch = diff_result.stdout + + summarize_instance( + instance_id=instance.id, + conversation=conversation, + git_patch=git_patch, + logger=logger, + ) + + return EvalOutput( + instance_id=instance.id, + attempt=self.current_attempt, + test_result={"git_patch": git_patch}, + instruction=instruction, + error=None, + history=list(conversation.state.events), + metrics=conversation.conversation_stats.get_combined_metrics(), + instance=instance.data, + ) + + +def main() -> None: + parser = get_parser() + add_prompt_path_argument(parser, __file__) + parser.add_argument( + "--data-root", + required=True, + help="Path to EvoClaw-data containing repo directories with metadata.json.", + ) + parser.add_argument( + "--repos", + nargs="+", + default=None, + help="Optional repo-name substring filters, e.g. 
--repos navidrome ripgrep.", + ) + parser.set_defaults(**INFER_DEFAULTS) + args = parser.parse_args() + + llm = load_llm_config(args.llm_config_path) + selected_repos = args.repos + if args.select: + selected_from_file = [ + line.strip() + for line in Path(args.select).read_text().splitlines() + if line.strip() + ] + selected_repos = (selected_repos or []) + selected_from_file + + structured_output_dir = construct_eval_output_dir( + base_dir=args.output_dir, + dataset_name="evoclaw", + model_name=llm.model, + max_iterations=args.max_iterations, + eval_note=args.note, + ) + + enable_condenser = args.enable_condenser + if args.disable_condenser: + enable_condenser = False + + metadata = EvalMetadata( + llm=llm, + dataset=args.dataset, + dataset_split=args.split, + max_iterations=args.max_iterations, + eval_output_dir=structured_output_dir, + details={ + "data_root": str(Path(args.data_root).expanduser().resolve()), + "selected_repos": selected_repos, + }, + prompt_path=args.prompt_path, + eval_limit=args.n_limit, + n_critic_runs=args.n_critic_runs, + critic=create_critic(args), + selected_instances_file=args.select, + max_retries=args.max_retries, + workspace_type=args.workspace, + tool_preset=args.tool_preset, + enable_delegation=args.enable_delegation, + agent_type=args.agent_type, + enable_condenser=enable_condenser, + condenser_max_size=args.condenser_max_size, + condenser_keep_first=args.condenser_keep_first, + ) + + evaluator = EvoClawEvaluation( + metadata=metadata, + num_workers=args.num_workers, + ) + evaluator.run(on_result=get_default_on_result_writer(evaluator.output_path)) + + logger.info("EvoClaw inference completed") + print(json.dumps({"output_json": str(evaluator.output_path)})) + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index 34ecaf333..d8c54e0f0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,6 +69,7 @@ swebenchmultimodal-eval = "benchmarks.swebenchmultimodal.eval_infer:main" 
swebenchmultilingual-infer = "benchmarks.swebenchmultilingual.run_infer:main" swebenchmultilingual-eval = "benchmarks.swebenchmultilingual.eval_infer:main" swefficiency-infer = "benchmarks.swefficiency.run_infer:main" +evoclaw-infer = "benchmarks.evoclaw.run_infer:main" terminalbench-infer = "benchmarks.terminalbench.run_infer:main" terminalbench-eval = "benchmarks.terminalbench.eval_infer:main" skillsbench-infer = "benchmarks.skillsbench.run_infer:main" @@ -90,7 +91,7 @@ build-backend = "setuptools.build_meta" [tool.setuptools.packages.find] where = ["."] -include = ["benchmarks"] +include = ["benchmarks", "benchmarks.*"] [tool.setuptools] # Install the top-level sitecustomize module so Python auto-loads our Modal logging patch. From fe3d86a97e5fc8acf7b928eb70da51b451e9a071 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Fri, 8 May 2026 04:45:55 +0000 Subject: [PATCH 2/6] Fix EvoClaw benchmark metrics test --- benchmarks/evoclaw/run_infer.py | 7 ++++--- tests/test_metrics.py | 37 +++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 3 deletions(-) diff --git a/benchmarks/evoclaw/run_infer.py b/benchmarks/evoclaw/run_infer.py index 5af8fe760..109e38e1a 100644 --- a/benchmarks/evoclaw/run_infer.py +++ b/benchmarks/evoclaw/run_infer.py @@ -32,6 +32,7 @@ from openhands.sdk.context.condenser import LLMSummarizingCondenser from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.delegate import DelegateTool +from openhands.tools.preset.default import get_default_tools from openhands.workspace import DockerDevWorkspace @@ -55,8 +56,6 @@ def get_tools_for_preset( return get_planning_tools() - from openhands.tools.preset.default import get_default_tools - return get_default_tools(enable_browser=enable_browser) @@ -67,7 +66,9 @@ def _repo_image_name(repo_name: str) -> str: def _read_selected_milestones(repo_root: Path) -> list[str]: selected = repo_root / "selected_milestone_ids.txt" if selected.exists(): - return [line.strip() for line in 
selected.read_text().splitlines() if line.strip()] + return [ + line.strip() for line in selected.read_text().splitlines() if line.strip() + ] milestones = repo_root / "milestones.csv" if milestones.exists(): diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 6199693f3..3adf04835 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -360,6 +360,15 @@ def _get_test_instance_for_benchmark(benchmark_name: str) -> EvalInstance: "problem_statement": "Test problem for swesmith", }, ) + elif benchmark_name == "evoclaw": + return EvalInstance( + id="test-instance-1", + data={ + "repo_root": "test/repo", + "image": "test/base:latest", + "milestone_ids": ["M1"], + }, + ) else: # Generic instance for unknown benchmarks return EvalInstance( @@ -538,6 +547,24 @@ def _create_metadata_for_benchmark(benchmark_name: str, llm: LLM) -> EvalMetadat prompt_path=prompt_path, critic=PassCritic(), ) + elif benchmark_name == "evoclaw": + prompt_path = str( + Path(__file__).parent.parent + / "benchmarks" + / "evoclaw" + / "prompts" + / "default.j2" + ) + return EvalMetadata( + llm=llm, + max_iterations=5, + eval_output_dir="/tmp/eval_output", + dataset="evoclaw", + dataset_split="test", + details={"data_root": "test/data", "selected_repos": None}, + prompt_path=prompt_path, + critic=PassCritic(), + ) else: # Generic metadata for unknown benchmarks return EvalMetadata( @@ -658,6 +685,16 @@ def test_benchmark_metrics_collection( ), ): result = evaluation.evaluate_instance(instance, mock_workspace) + elif benchmark_name == "evoclaw": + with patch.object( + evaluation, + "_upload_task_materials", + return_value={ + "task_queue_path": "/e2e_workspace/TASK_QUEUE.md", + "srs_dir": "/e2e_workspace/srs", + }, + ): + result = evaluation.evaluate_instance(instance, mock_workspace) else: result = evaluation.evaluate_instance(instance, mock_workspace) From 3c5dfea628c859182581d4d7c19837fafe0c85d5 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Fri, 8 May 2026 05:29:54 +0000 
Subject: [PATCH 3/6] Harden EvoClaw SDK workspace setup --- benchmarks/evoclaw/run_infer.py | 121 ++++++++++++++++++++++++++++++-- 1 file changed, 114 insertions(+), 7 deletions(-) diff --git a/benchmarks/evoclaw/run_infer.py b/benchmarks/evoclaw/run_infer.py index 109e38e1a..f363ff2b9 100644 --- a/benchmarks/evoclaw/run_infer.py +++ b/benchmarks/evoclaw/run_infer.py @@ -3,6 +3,8 @@ import csv import json import os +import re +import shlex from pathlib import Path from typing import List @@ -103,6 +105,82 @@ def _write_workspace_file( raise RuntimeError(f"Failed to upload {local_path} to {remote_path}: {result}") +def _affected_writable_paths(repo_root: Path, milestone_ids: list[str]) -> list[str]: + paths: set[str] = set() + for milestone_id in milestone_ids: + srs_path = repo_root / "srs" / milestone_id / "SRS.md" + if not srs_path.exists(): + continue + + in_affected_section = False + for line in srs_path.read_text(encoding="utf-8").splitlines(): + if "Affected Modules" in line: + in_affected_section = True + continue + if in_affected_section and line.startswith("##"): + break + if not in_affected_section: + continue + + for match in re.findall(r"`([^`]+)`", line): + relative = match.strip().lstrip("/") + if relative.startswith("testbed/"): + relative = relative[len("testbed/") :] + if not relative or "*" in relative: + continue + parts = Path(relative).parts + if not parts: + continue + if len(parts) == 1: + paths.add(parts[0]) + elif parts[0] == "ui" and len(parts) > 2: + paths.add(str(Path(parts[0]) / parts[1])) + else: + paths.add(parts[0]) + + return sorted(paths) + + +def _ensure_workspace_writable( + workspace: RemoteWorkspace, + repo_root: Path, + milestone_ids: list[str], +) -> None: + with (repo_root / "metadata.json").open(encoding="utf-8") as f: + metadata = json.load(f) + + src_dirs = _affected_writable_paths(repo_root, milestone_ids) + if not src_dirs: + src_dirs = metadata.get("repo_src_dirs") or [] + + chown_parts = [ + f"sudo chown $(id -u):$(id 
-g) {shlex.quote(workspace.working_dir)}", + ( + f"sudo find {shlex.quote(workspace.working_dir)} -maxdepth 1 " + "-type f -exec chown $(id -u):$(id -g) {} +" + ), + f"git config --global --add safe.directory {shlex.quote(workspace.working_dir)}", + ] + for src_dir in src_dirs: + relative = str(src_dir).strip().strip("/") + if not relative or "*" in relative: + continue + remote_path = f"{workspace.working_dir}/{relative}" + chown_parts.append( + f"if [ -e {shlex.quote(remote_path)} ]; then " + f"sudo chown -R $(id -u):$(id -g) {shlex.quote(remote_path)}; fi" + ) + + result = workspace.execute_command( + " && ".join(chown_parts), + timeout=120, + ) + if result.exit_code != 0: + raise RuntimeError( + f"Failed to make {workspace.working_dir} writable: {result.stderr}" + ) + + def _render_instruction( prompt_path: str, task_queue_path: str, @@ -173,9 +251,21 @@ def _upload_task_materials( instance_dir = material_dir / instance.id instance_dir.mkdir(parents=True, exist_ok=True) - remote_srs_dir = "/e2e_workspace/srs" - remote_task_queue = "/e2e_workspace/TASK_QUEUE.md" - workspace.execute_command("mkdir -p /e2e_workspace/srs", timeout=30) + remote_root = "/tmp/evoclaw" + remote_srs_dir = f"{remote_root}/srs" + remote_task_queue = f"{remote_root}/TASK_QUEUE.md" + _ensure_workspace_writable( + workspace, + repo_root, + instance.data["milestone_ids"], + ) + mkdir_result = workspace.execute_command( + f"mkdir -p {remote_srs_dir}", timeout=30 + ) + if mkdir_result.exit_code != 0: + raise RuntimeError( + f"Failed to create {remote_srs_dir}: {mkdir_result.stderr}" + ) queue_lines = [ "# EvoClaw Task Queue", @@ -254,12 +344,29 @@ def evaluate_instance( srs_dir=paths["srs_dir"], ) - workspace.execute_command("cd /testbed && git reset --hard", timeout=120) + repo_dir = shlex.quote(workspace.working_dir) + status_result = workspace.execute_command( + f"GIT_OPTIONAL_LOCKS=0 git -C {repo_dir} status --short", + timeout=120, + ) + if status_result.exit_code != 0: + raise 
RuntimeError(f"git status failed: {status_result.stderr}") + if status_result.stdout.strip(): + logger.warning( + "Workspace %s starts with existing changes:\n%s", + workspace.working_dir, + status_result.stdout, + ) conversation.send_message(instruction) - run_conversation_with_fake_user_response(conversation) + run_error = None + try: + run_conversation_with_fake_user_response(conversation) + except Exception as exc: + run_error = str(exc) + logger.exception("Conversation run failed for %s", instance.id) diff_result = workspace.execute_command( - "cd /testbed && git --no-pager diff --no-color", + f"GIT_OPTIONAL_LOCKS=0 git -C {repo_dir} --no-pager diff --no-color", timeout=120, ) if diff_result.exit_code != 0: @@ -278,7 +385,7 @@ def evaluate_instance( attempt=self.current_attempt, test_result={"git_patch": git_patch}, instruction=instruction, - error=None, + error=run_error, history=list(conversation.state.events), metrics=conversation.conversation_stats.get_combined_metrics(), instance=instance.data, From ec8233185ca613a618aa13bd081d51c831a65f07 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Fri, 8 May 2026 05:32:35 +0000 Subject: [PATCH 4/6] Fix EvoClaw writable path typing --- benchmarks/evoclaw/run_infer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/evoclaw/run_infer.py b/benchmarks/evoclaw/run_infer.py index f363ff2b9..3c6cfc32c 100644 --- a/benchmarks/evoclaw/run_infer.py +++ b/benchmarks/evoclaw/run_infer.py @@ -128,7 +128,7 @@ def _affected_writable_paths(repo_root: Path, milestone_ids: list[str]) -> list[ relative = relative[len("testbed/") :] if not relative or "*" in relative: continue - parts = Path(relative).parts + parts = [part for part in relative.split("/") if part] if not parts: continue if len(parts) == 1: From 4bdbe8b30148fee9728bd0347c088b4a9f72496e Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Fri, 8 May 2026 16:44:15 +0000 Subject: [PATCH 5/6] Simplify EvoClaw workspace permissions --- 
benchmarks/evoclaw/run_infer.py | 60 ++++----------------------------- 1 file changed, 7 insertions(+), 53 deletions(-) diff --git a/benchmarks/evoclaw/run_infer.py b/benchmarks/evoclaw/run_infer.py index 3c6cfc32c..3ec696d2a 100644 --- a/benchmarks/evoclaw/run_infer.py +++ b/benchmarks/evoclaw/run_infer.py @@ -3,7 +3,6 @@ import csv import json import os -import re import shlex from pathlib import Path from typing import List @@ -105,54 +104,10 @@ def _write_workspace_file( raise RuntimeError(f"Failed to upload {local_path} to {remote_path}: {result}") -def _affected_writable_paths(repo_root: Path, milestone_ids: list[str]) -> list[str]: - paths: set[str] = set() - for milestone_id in milestone_ids: - srs_path = repo_root / "srs" / milestone_id / "SRS.md" - if not srs_path.exists(): - continue - - in_affected_section = False - for line in srs_path.read_text(encoding="utf-8").splitlines(): - if "Affected Modules" in line: - in_affected_section = True - continue - if in_affected_section and line.startswith("##"): - break - if not in_affected_section: - continue - - for match in re.findall(r"`([^`]+)`", line): - relative = match.strip().lstrip("/") - if relative.startswith("testbed/"): - relative = relative[len("testbed/") :] - if not relative or "*" in relative: - continue - parts = [part for part in relative.split("/") if part] - if not parts: - continue - if len(parts) == 1: - paths.add(parts[0]) - elif parts[0] == "ui" and len(parts) > 2: - paths.add(str(Path(parts[0]) / parts[1])) - else: - paths.add(parts[0]) - - return sorted(paths) - - -def _ensure_workspace_writable( - workspace: RemoteWorkspace, - repo_root: Path, - milestone_ids: list[str], -) -> None: +def _ensure_workspace_writable(workspace: RemoteWorkspace, repo_root: Path) -> None: with (repo_root / "metadata.json").open(encoding="utf-8") as f: metadata = json.load(f) - src_dirs = _affected_writable_paths(repo_root, milestone_ids) - if not src_dirs: - src_dirs = metadata.get("repo_src_dirs") or [] - 
chown_parts = [ f"sudo chown $(id -u):$(id -g) {shlex.quote(workspace.working_dir)}", ( @@ -161,14 +116,17 @@ def _ensure_workspace_writable( ), f"git config --global --add safe.directory {shlex.quote(workspace.working_dir)}", ] - for src_dir in src_dirs: + for src_dir in metadata.get("repo_src_dirs") or []: relative = str(src_dir).strip().strip("/") if not relative or "*" in relative: continue remote_path = f"{workspace.working_dir}/{relative}" chown_parts.append( f"if [ -e {shlex.quote(remote_path)} ]; then " - f"sudo chown -R $(id -u):$(id -g) {shlex.quote(remote_path)}; fi" + f"sudo find {shlex.quote(remote_path)} " + r"\( -name .git -o -name node_modules -o -name .venv " + r"-o -name __pycache__ \) -prune -o " + "-exec chown $(id -u):$(id -g) {} +; fi" ) result = workspace.execute_command( @@ -254,11 +212,7 @@ def _upload_task_materials( remote_root = "/tmp/evoclaw" remote_srs_dir = f"{remote_root}/srs" remote_task_queue = f"{remote_root}/TASK_QUEUE.md" - _ensure_workspace_writable( - workspace, - repo_root, - instance.data["milestone_ids"], - ) + _ensure_workspace_writable(workspace, repo_root) mkdir_result = workspace.execute_command( f"mkdir -p {remote_srs_dir}", timeout=30 ) From 2438da52086f618d553fb34cb31cd201492779f5 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Fri, 8 May 2026 20:38:21 +0000 Subject: [PATCH 6/6] Capture EvoClaw new files in patches --- benchmarks/evoclaw/config.py | 1 + benchmarks/evoclaw/run_infer.py | 33 ++++++++++++++++++++++++++------- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/benchmarks/evoclaw/config.py b/benchmarks/evoclaw/config.py index 460eac91e..63afa3ed0 100644 --- a/benchmarks/evoclaw/config.py +++ b/benchmarks/evoclaw/config.py @@ -4,6 +4,7 @@ "dataset": "evoclaw", "split": "test", "max_iterations": 3000, + "instance_timeout": 18000, "num_workers": 1, "n_critic_runs": 1, "workspace": "docker", diff --git a/benchmarks/evoclaw/run_infer.py b/benchmarks/evoclaw/run_infer.py index 3ec696d2a..b5743eed1 
100644 --- a/benchmarks/evoclaw/run_infer.py +++ b/benchmarks/evoclaw/run_infer.py @@ -139,6 +139,24 @@ def _ensure_workspace_writable(workspace: RemoteWorkspace, repo_root: Path) -> N ) +def _capture_git_patch(workspace: RemoteWorkspace, repo_dir: str) -> str: + """Capture tracked changes plus newly-created files as one git patch.""" + intent_result = workspace.execute_command( + f"GIT_OPTIONAL_LOCKS=0 git -C {repo_dir} add -N .", + timeout=120, + ) + if intent_result.exit_code != 0: + raise RuntimeError(f"git add -N failed: {intent_result.stderr}") + + diff_result = workspace.execute_command( + f"GIT_OPTIONAL_LOCKS=0 git -C {repo_dir} --no-pager diff --no-color --binary", + timeout=120, + ) + if diff_result.exit_code != 0: + raise RuntimeError(f"git diff failed: {diff_result.stderr}") + return diff_result.stdout + + def _render_instruction( prompt_path: str, task_queue_path: str, @@ -319,13 +337,7 @@ def evaluate_instance( run_error = str(exc) logger.exception("Conversation run failed for %s", instance.id) - diff_result = workspace.execute_command( - f"GIT_OPTIONAL_LOCKS=0 git -C {repo_dir} --no-pager diff --no-color", - timeout=120, - ) - if diff_result.exit_code != 0: - raise RuntimeError(f"git diff failed: {diff_result.stderr}") - git_patch = diff_result.stdout + git_patch = _capture_git_patch(workspace, repo_dir) summarize_instance( instance_id=instance.id, @@ -360,6 +372,12 @@ def main() -> None: default=None, help="Optional repo-name substring filters, e.g. 
--repos navidrome ripgrep.", ) + parser.add_argument( + "--instance-timeout", + type=int, + default=INFER_DEFAULTS["instance_timeout"], + help="Maximum wall-clock seconds per instance (default: 18000 = 5 hours).", + ) parser.set_defaults(**INFER_DEFAULTS) args = parser.parse_args() @@ -413,6 +431,7 @@ def main() -> None: evaluator = EvoClawEvaluation( metadata=metadata, num_workers=args.num_workers, + instance_timeout=args.instance_timeout, ) evaluator.run(on_result=get_default_on_result_writer(evaluator.output_path))