diff --git a/benchmarks/swebench/README.md b/benchmarks/swebench/README.md index da0b2600..546f3a27 100644 --- a/benchmarks/swebench/README.md +++ b/benchmarks/swebench/README.md @@ -186,7 +186,7 @@ uv run swebench-infer .llm_config/sonnet-4-5.json \ ### Apptainer Workspace for HPC Clusters -#### Step 1: Build and push images using a separate machine with Docker support +#### Option 1: Pre-build and push images using a separate machine with Docker support ```bash uv run python -m benchmarks.swebench.build_images \ @@ -199,7 +199,20 @@ uv run python -m benchmarks.swebench.build_images \ The wrapper layer (`docutils<0.21`, `roman`) is applied in-place for allowlisted repos during this build pipeline (currently `sphinx-doc`). -#### Step 2: Run on HPC with Apptainer +#### Option 2: Build local Apptainer sandboxes on the HPC machine + +If a pre-built agent-server image is missing from the registry, Apptainer mode +falls back to building a local sandbox from the official SWE-Bench image and the +checked-out OpenHands SDK submodule. This does not require a Docker daemon. + +```bash +export OPENHANDS_APPTAINER_BUILD_ROOT=/scratch/$USER/swebench-apptainer-agent-images +``` + +Set `OPENHANDS_APPTAINER_FORCE_BUILD=1` to rebuild a local sandbox even when a +matching registry image exists. + +#### Run on HPC with Apptainer **Optionally**, you can override the default location where Apptainer cache is saved using the below environment variables: @@ -215,7 +228,9 @@ uv run swebench-infer path/to/llm_config.json \ --workspace apptainer ``` -In `apptainer` mode, SWE-Bench uses pre-built registry images as-is and does not run local Docker builds. +In `apptainer` mode, SWE-Bench first tries to use pre-built registry images. If +the expected registry tag is unavailable, it builds a local Apptainer sandbox +instead. ## Evaluation diff --git a/benchmarks/swebench/apptainer_build.py b/benchmarks/swebench/apptainer_build.py new file mode 100644 index 00000000..cb3cb606 --- /dev/null +++ b/benchmarks/swebench/apptainer_build.py @@ -0,0 +1,312 @@ +"""Local Apptainer builds for SWE-bench agent-server images.""" + +from __future__ import annotations + +import os +import shutil +import subprocess +from pathlib import Path + +from benchmarks.swebench import constants +from benchmarks.swebench.build_base_images import dockerfile_content_hash +from benchmarks.utils.build_utils import BuildOutput, _get_sdk_submodule_info +from openhands.sdk import get_logger + + +logger = get_logger(__name__) + +DEFAULT_APPTAINER_BUILD_ROOT = ( + Path.home() / ".cache" / "openhands" / "swebench-apptainer-agent-images" +) +SUPPORTED_APPTAINER_TARGETS = {constants.BUILD_TARGET_SOURCE_MINIMAL} + + +def _repo_root() -> Path: + return Path(__file__).resolve().parents[2] + + +def _sdk_root() -> Path: + return _repo_root() / "vendor" / "software-agent-sdk" + + +def _sanitize_filename(value: str) -> str: + return "".join(c if c.isalnum() or c in "._-" else "_" for c in value) + + +def _build_root() -> Path: + return Path( + os.getenv("OPENHANDS_APPTAINER_BUILD_ROOT", str(DEFAULT_APPTAINER_BUILD_ROOT)) + ).expanduser() + + +def _force_build_enabled() -> bool: + return os.getenv("OPENHANDS_APPTAINER_FORCE_BUILD", "").lower() in { + "1", + "true", + "yes", + } + + +def apptainer_agent_image_path( + custom_tag: str, + target: constants.TargetType = constants.DEFAULT_BUILD_TARGET, +) -> Path: + """Return the local Apptainer sandbox path for a SWE-bench agent image.""" + _, git_sha, _ = _get_sdk_submodule_info() + sdk_short_sha = git_sha[:7] if git_sha != "unknown" else "unknown" + content_hash = dockerfile_content_hash() + name = _sanitize_filename(f"{sdk_short_sha}-{content_hash}-{custom_tag}-{target}") + return _build_root() / f"{name}.sandbox" + + +def _package_install_script() -> str: + """Return package setup shell matching the minimal Docker target.""" + return r""" +export DEBIAN_FRONTEND=noninteractive +if command -v apt-get >/dev/null 2>&1; then + apt-get -o Acquire::Retries=5 update + apt-get -o Acquire::Retries=5 install -y --no-install-recommends \ + bash ca-certificates curl wget sudo apt-utils git jq tmux tar \ + build-essential coreutils util-linux procps findutils grep sed \ + apt-transport-https gnupg lsb-release xz-utils + rm -rf /var/lib/apt/lists/* +elif command -v apk >/dev/null 2>&1; then + apk add --no-cache \ + bash ca-certificates curl wget sudo git jq tmux tar build-base \ + coreutils util-linux procps findutils grep sed gnupg shadow xz +elif command -v microdnf >/dev/null 2>&1; then + microdnf install -y \ + bash ca-certificates curl wget sudo git jq tmux tar make gcc gcc-c++ \ + coreutils util-linux procps-ng findutils grep sed shadow-utils \ + gnupg2 xz + microdnf clean all +elif command -v dnf >/dev/null 2>&1; then + dnf install -y \ + bash ca-certificates curl wget sudo git jq tmux tar make gcc gcc-c++ \ + coreutils util-linux procps-ng findutils grep sed shadow-utils \ + gnupg2 xz + dnf clean all +elif command -v yum >/dev/null 2>&1; then + yum install -y \ + bash ca-certificates curl wget sudo git jq tmux tar make gcc gcc-c++ \ + coreutils util-linux procps-ng findutils grep sed shadow-utils \ + gnupg2 xz + yum clean all +elif command -v zypper >/dev/null 2>&1; then + zypper --non-interactive install --no-recommends \ + bash ca-certificates curl wget sudo git jq tmux tar make gcc gcc-c++ \ + coreutils util-linux procps findutils grep sed shadow gpg2 xz + zypper clean --all +else + echo "Unsupported base image: no known package manager found" >&2 + exit 1 +fi +""" + + +def _wrap_swebench_deps_script() -> str: + """Return optional Sphinx dependency wrapper shell.""" + return r""" +if command -v conda >/dev/null 2>&1; then + conda run -n testbed pip install --no-deps --force-reinstall 'docutils<0.21' 'roman' \ + || (source /opt/miniconda3/bin/activate testbed && pip install --no-deps --force-reinstall 'docutils<0.21' 'roman') +elif [ -x /opt/miniconda3/bin/conda ]; then + /opt/miniconda3/bin/conda run -n testbed pip install --no-deps --force-reinstall 'docutils<0.21' 'roman' \ + || (source /opt/miniconda3/bin/activate testbed && pip install --no-deps --force-reinstall 'docutils<0.21' 'roman') +fi +if command -v pip >/dev/null 2>&1; then + pip install --no-deps --force-reinstall 'docutils<0.21' 'roman' +fi +""" + + +def _definition_file_content( + base_image: str, + git_sha: str, + git_ref: str, + wrap_swebench_deps: bool, + uv_path: Path, + uvx_path: Path | None, +) -> str: + sdk_root = _sdk_root() + wrap_script = _wrap_swebench_deps_script() if wrap_swebench_deps else "" + uvx_files = f" {uvx_path} /usr/local/bin/uvx\n" if uvx_path else "" + return f"""Bootstrap: docker +From: {base_image} + +%files + {uv_path} /usr/local/bin/uv +{uvx_files}\ + {sdk_root / "pyproject.toml"} /agent-server/pyproject.toml + {sdk_root / "uv.lock"} /agent-server/uv.lock + {sdk_root / "README.md"} /agent-server/README.md + {sdk_root / "LICENSE"} /agent-server/LICENSE + {sdk_root / "openhands-sdk"} /agent-server/openhands-sdk + {sdk_root / "openhands-tools"} /agent-server/openhands-tools + {sdk_root / "openhands-workspace"} /agent-server/openhands-workspace + {sdk_root / "openhands-agent-server"} /agent-server/openhands-agent-server + +%post + set -eux + {_package_install_script()} + + USERNAME=openhands + UID=10001 + GID=10001 + grep -Eq "^[^:]*:[^:]*:${{GID}}:" /etc/group || groupadd -g "${{GID}}" "${{USERNAME}}" + grep -Eq "^${{USERNAME}}:" /etc/passwd || useradd -m -u "${{UID}}" -g "${{GID}}" -s /bin/bash "${{USERNAME}}" + usermod -aG sudo "${{USERNAME}}" 2>/dev/null || true + echo "${{USERNAME}} ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers + mkdir -p /workspace/project /agent-server/uv-managed-python + chown -R "${{USERNAME}}:${{USERNAME}}" /workspace /agent-server + + chmod 0755 /usr/local/bin/uv + if [ -e /usr/local/bin/uvx ]; then chmod 0755 /usr/local/bin/uvx; fi + + su "${{USERNAME}}" -s /bin/bash -c 'cd /agent-server && \\ + export HOME=/home/openhands && \\ + export UV_PROJECT_ENVIRONMENT=/agent-server/.venv && \\ + export UV_PYTHON_INSTALL_DIR=/agent-server/uv-managed-python && \\ + uv python install 3.13 && \\ + uv venv --python-preference only-managed --python 3.13 .venv && \\ + uv sync --frozen --no-editable --managed-python --extra boto3 && \\ + uv pip install --python /agent-server/.venv/bin/python "transformers>=4.56.0,<5" && \\ + readlink -f .venv/bin/python | grep -q "^/agent-server/uv-managed-python/"' + + {wrap_script} + +%environment + export LC_ALL=C.UTF-8 + export LANG=C.UTF-8 + export OH_ENABLE_VNC=false + export LOG_JSON=true + export OPENHANDS_BUILD_GIT_SHA={git_sha} + export OPENHANDS_BUILD_GIT_REF={git_ref} + +%runscript + export LC_ALL=C.UTF-8 + export LANG=C.UTF-8 + export OH_ENABLE_VNC=false + export LOG_JSON=true + export OPENHANDS_BUILD_GIT_SHA={git_sha} + export OPENHANDS_BUILD_GIT_REF={git_ref} + exec /agent-server/.venv/bin/python -m openhands.agent_server "$@" +""" + + +def build_apptainer_agent_image( + base_image: str, + custom_tag: str, + target: constants.TargetType = constants.DEFAULT_BUILD_TARGET, + wrap_swebench_deps: bool = False, +) -> BuildOutput: + """Build a local Apptainer agent-server sandbox from a SWE-bench base image.""" + if target not in SUPPORTED_APPTAINER_TARGETS: + return BuildOutput( + base_image=base_image, + tags=[], + error=( + f"Apptainer local builds currently support " + f"{sorted(SUPPORTED_APPTAINER_TARGETS)}, got {target!r}" + ), + ) + + if shutil.which("apptainer") is None: + return BuildOutput( + base_image=base_image, + tags=[], + error="Apptainer is not available on PATH", + ) + uv_bin = shutil.which("uv") + if uv_bin is None: + return BuildOutput( + base_image=base_image, + tags=[], + error="uv is not available on PATH", + ) + uvx_bin = shutil.which("uvx") + + sandbox = apptainer_agent_image_path(custom_tag, target) + if sandbox.exists() and not _force_build_enabled(): + logger.info("Using existing Apptainer agent sandbox %s", sandbox) + return BuildOutput(base_image=base_image, tags=[str(sandbox)], error=None) + + build_root = _build_root() + log_dir = build_root / "logs" + log_dir.mkdir(parents=True, exist_ok=True) + build_root.mkdir(parents=True, exist_ok=True) + + tmp_sandbox = sandbox.with_suffix(".tmp") + if tmp_sandbox.exists(): + shutil.rmtree(tmp_sandbox) + if sandbox.exists(): + shutil.rmtree(sandbox) + + git_ref, git_sha, _ = _get_sdk_submodule_info() + definition = build_root / f"{sandbox.name}.def" + definition.write_text( + _definition_file_content( + base_image=base_image, + git_sha=git_sha, + git_ref=git_ref, + wrap_swebench_deps=wrap_swebench_deps, + uv_path=Path(uv_bin).resolve(), + uvx_path=Path(uvx_bin).resolve() if uvx_bin else None, + ) + ) + + log_path = log_dir / f"{sandbox.name}.log" + cmd = ["apptainer", "build", "--sandbox", str(tmp_sandbox), str(definition)] + logger.info("Building Apptainer agent sandbox: %s", " ".join(cmd)) + env = os.environ.copy() + if "APPTAINER_CACHEDIR" not in env: + env["APPTAINER_CACHEDIR"] = str(build_root / "cache") + for key in ("APPTAINER_CACHEDIR", "APPTAINER_TMPDIR"): + if env.get(key): + Path(env[key]).expanduser().mkdir(parents=True, exist_ok=True) + proc = subprocess.run( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + env=env, + ) + log_path.write_text(proc.stdout) + if proc.returncode != 0: + shutil.rmtree(tmp_sandbox, ignore_errors=True) + return BuildOutput( + base_image=base_image, + tags=[], + error=f"Apptainer build failed with exit code {proc.returncode}", + log_path=str(log_path), + ) + + tmp_sandbox.rename(sandbox) + logger.info("Built Apptainer agent sandbox %s", sandbox) + return BuildOutput( + base_image=base_image, + tags=[str(sandbox)], + error=None, + log_path=str(log_path), + ) + + +def ensure_apptainer_agent_image( + base_image: str, + custom_tag: str, + target: constants.TargetType = constants.DEFAULT_BUILD_TARGET, + wrap_swebench_deps: bool = False, +) -> Path: + """Build or reuse a local Apptainer agent-server sandbox.""" + output = build_apptainer_agent_image( + base_image=base_image, + custom_tag=custom_tag, + target=target, + wrap_swebench_deps=wrap_swebench_deps, + ) + logger.info("Apptainer image build output: %s", output) + if output.error is not None: + raise RuntimeError(f"Apptainer image build failed: {output.error}") + if not output.tags: + raise RuntimeError("Apptainer image build produced no image path") + return Path(output.tags[0]) diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py index 33f80237..3351a629 100644 --- a/benchmarks/swebench/run_infer.py +++ b/benchmarks/swebench/run_infer.py @@ -5,6 +5,7 @@ from jinja2 import Environment, FileSystemLoader from benchmarks.swebench import constants +from benchmarks.swebench.apptainer_build import ensure_apptainer_agent_image from benchmarks.swebench.build_images import ( extract_custom_tag, get_official_docker_image, @@ -108,6 +109,22 @@ def should_wrap_instance(self, instance: EvalInstance) -> bool: def get_source_repo_path(self, instance: EvalInstance) -> str: return "/testbed" + def get_apptainer_extra_bind_mounts(self) -> list[str]: + """Return host paths that must be visible inside Apptainer agent servers.""" + bind_mounts: list[str] = [] + custom_tokenizer = self.metadata.llm.custom_tokenizer + if custom_tokenizer: + tokenizer_path = os.path.abspath(os.path.expanduser(custom_tokenizer)) + if os.path.exists(tokenizer_path): + bind_mounts.append(f"{tokenizer_path}:{tokenizer_path}:ro") + else: + logger.warning( + "custom_tokenizer path %s does not exist on host; " + "not adding an Apptainer bind mount", + custom_tokenizer, + ) + return bind_mounts + def prepare_instances(self) -> List[EvalInstance]: logger.info("Setting up SWE-bench evaluation data") @@ -181,27 +198,46 @@ def prepare_workspace( forward_env=forward_env or [], ) elif self.metadata.workspace_type == "apptainer": - if not remote_image_exists(agent_server_image): - raise RuntimeError( - f"Agent server image {agent_server_image} does not exist in container registry, " - "make sure to build, push it, and make it public accessible before using apptainer workspace." + force_local_build = os.getenv( + "OPENHANDS_APPTAINER_FORCE_BUILD", "" + ).lower() in {"1", "true", "yes"} + if not force_local_build and remote_image_exists(agent_server_image): + logger.info( + f"Using apptainer workspace with pre-built image {agent_server_image} " + f"(tag prefix: {get_phased_image_tag_prefix()})" ) + if wrap_needed: + logger.info( + "Using pre-built wrapped apptainer image for wrapped repo" + ) - logger.info( - f"Using apptainer workspace with pre-built image {agent_server_image} " - f"(tag prefix: {get_phased_image_tag_prefix()})" - ) - if wrap_needed: + workspace = ApptainerWorkspace( + server_image=agent_server_image, + working_dir="/workspace", + forward_env=forward_env or [], + extra_bind_mounts=self.get_apptainer_extra_bind_mounts(), + cache_dir=os.getenv("APPTAINER_CACHEDIR", None), + ) + else: logger.info( - "Skipping local wrap for apptainer workspace; expecting image to be pre-wrapped in registry" + "Agent server image %s is not available in the registry; " + "building a local Apptainer sandbox from %s", + agent_server_image, + official_docker_image, + ) + local_agent_image = ensure_apptainer_agent_image( + base_image=official_docker_image, + custom_tag=custom_tag, + target=build_target, + wrap_swebench_deps=wrap_needed, + ) + workspace = ApptainerWorkspace( + sif_file=str(local_agent_image), + working_dir="/workspace", + forward_env=forward_env or [], + extra_bind_mounts=self.get_apptainer_extra_bind_mounts(), + cache_dir=os.getenv("APPTAINER_CACHEDIR", None), ) - - workspace = ApptainerWorkspace( - server_image=agent_server_image, - working_dir="/workspace", - forward_env=forward_env or [], - cache_dir=os.getenv("APPTAINER_CACHEDIR", None), - ) elif self.metadata.workspace_type == "remote": runtime_api_key = os.getenv("RUNTIME_API_KEY") if not runtime_api_key: @@ -272,9 +308,23 @@ def evaluate_instance( tools.append(Tool(name=TaskToolSet.name)) condenser = None if self.metadata.enable_condenser: + condenser_llm = build_eval_llm( + self.metadata.llm, + usage_id="condenser", + ) + if self.metadata.condenser_max_output_tokens is not None: + condenser_llm = condenser_llm.model_copy( + deep=True, + update={ + "max_output_tokens": ( + self.metadata.condenser_max_output_tokens + ), + }, + ) condenser = LLMSummarizingCondenser( - llm=build_eval_llm(self.metadata.llm, usage_id="condenser"), + llm=condenser_llm, max_size=self.metadata.condenser_max_size, + max_tokens=self.metadata.condenser_max_tokens, keep_first=self.metadata.condenser_keep_first, ) # Load public skills (respects EXTENSIONS_REF env var) @@ -440,6 +490,8 @@ def main() -> None: agent_type=args.agent_type, enable_condenser=enable_condenser, condenser_max_size=args.condenser_max_size, + condenser_max_tokens=args.condenser_max_tokens, + condenser_max_output_tokens=args.condenser_max_output_tokens, condenser_keep_first=args.condenser_keep_first, ) diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py index 4df17499..1fce4dc1 100644 --- a/benchmarks/utils/args_parser.py +++ b/benchmarks/utils/args_parser.py @@ -124,6 +124,16 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: type=int, help="Maximum number of events before the condenser activates", ) + parser.add_argument( + "--condenser-max-tokens", + type=int, + help="Maximum number of prompt tokens before the condenser activates", + ) + parser.add_argument( + "--condenser-max-output-tokens", + type=int, + help="Maximum output tokens for LLM-generated condenser summaries", + ) parser.add_argument( "--condenser-keep-first", type=int, diff --git a/benchmarks/utils/models.py b/benchmarks/utils/models.py index 9dd47138..dbaec19b 100644 --- a/benchmarks/utils/models.py +++ b/benchmarks/utils/models.py @@ -87,6 +87,16 @@ class EvalMetadata(BaseModel): ge=1, description="Maximum number of events before the condenser activates", ) + condenser_max_tokens: int | None = Field( + default=None, + ge=1, + description="Maximum number of prompt tokens before the condenser activates", + ) + condenser_max_output_tokens: int | None = Field( + default=None, + ge=1, + description="Maximum output tokens for LLM-generated condenser summaries", + ) condenser_keep_first: int = Field( default=2, ge=0, diff --git a/tests/test_condenser_config.py b/tests/test_condenser_config.py index f14c5a3a..43348db1 100644 --- a/tests/test_condenser_config.py +++ b/tests/test_condenser_config.py @@ -112,10 +112,14 @@ def test_eval_metadata_accepts_condenser_params(): critic=PassCritic(), enable_condenser=True, condenser_max_size=100, + condenser_max_tokens=12345, + condenser_max_output_tokens=512, condenser_keep_first=5, ) assert metadata.enable_condenser is True assert metadata.condenser_max_size == 100 + assert metadata.condenser_max_tokens == 12345 + assert metadata.condenser_max_output_tokens == 512 assert metadata.condenser_keep_first == 5 @@ -132,6 +136,8 @@ def test_eval_metadata_condenser_defaults(): # Should use default values defined in EvalMetadata assert metadata.enable_condenser is True assert metadata.condenser_max_size == 240 + assert metadata.condenser_max_tokens is None + assert metadata.condenser_max_output_tokens is None assert metadata.condenser_keep_first == 2 @@ -143,6 +149,8 @@ def test_args_parser_has_condenser_args(): assert hasattr(args, "enable_condenser") assert hasattr(args, "disable_condenser") assert hasattr(args, "condenser_max_size") + assert hasattr(args, "condenser_max_tokens") + assert hasattr(args, "condenser_max_output_tokens") assert hasattr(args, "condenser_keep_first") @@ -168,7 +176,18 @@ def test_condenser_size_args(): """Test that condenser size arguments can be set.""" parser = get_parser(add_llm_config=False) args = parser.parse_args( - ["--condenser-max-size", "120", "--condenser-keep-first", "10"] + [ + "--condenser-max-size", + "120", + "--condenser-max-tokens", + "28000", + "--condenser-max-output-tokens", + "1024", + "--condenser-keep-first", + "10", + ] ) assert args.condenser_max_size == 120 + assert args.condenser_max_tokens == 28000 + assert args.condenser_max_output_tokens == 1024 assert args.condenser_keep_first == 10 diff --git a/tests/test_swebench_apptainer_build.py b/tests/test_swebench_apptainer_build.py new file mode 100644 index 00000000..58f41555 --- /dev/null +++ b/tests/test_swebench_apptainer_build.py @@ -0,0 +1,129 @@ +"""Tests for SWE-bench Apptainer image build fallback.""" + +from pathlib import Path +from types import SimpleNamespace +from typing import Any + +from benchmarks.swebench import apptainer_build, run_infer as swebench_run_infer +from benchmarks.utils.models import EvalInstance + + +class FakeApptainerWorkspace: + """Capture ApptainerWorkspace constructor arguments.""" + + def __init__(self, **kwargs: Any) -> None: + self.kwargs = kwargs + + +def _evaluation(): + metadata = SimpleNamespace( + workspace_type="apptainer", + agent_type="default", + env_setup_commands=[], + llm=SimpleNamespace(custom_tokenizer=None), + ) + evaluation = object.__new__(swebench_run_infer.SWEBenchEvaluation) + object.__setattr__(evaluation, "metadata", metadata) + return evaluation + + +def test_unsupported_apptainer_build_target_returns_error(): + output = apptainer_build.build_apptainer_agent_image( + base_image="docker.io/swebench/example:latest", + custom_tag="example", + target="binary", + ) + + assert output.tags == [] + assert output.error is not None + assert "source-minimal" in output.error + + +def test_apptainer_definition_installs_transformers_for_token_counting(): + definition = apptainer_build._definition_file_content( + base_image="docker.io/swebench/example:latest", + git_sha="abc123", + git_ref="main", + wrap_swebench_deps=False, + uv_path=Path("/usr/local/bin/uv"), + uvx_path=None, + ) + + assert 'uv pip install --python /agent-server/.venv/bin/python "transformers' in ( + definition + ) + + +def test_apptainer_workspace_uses_registry_image_when_available(monkeypatch): + monkeypatch.setattr(swebench_run_infer, "remote_image_exists", lambda image: True) + monkeypatch.setattr( + swebench_run_infer, + "ApptainerWorkspace", + FakeApptainerWorkspace, + ) + + workspace = _evaluation().prepare_workspace( + EvalInstance(id="django__django-12345", data={}) + ) + + assert isinstance(workspace, FakeApptainerWorkspace) + assert "server_image" in workspace.kwargs + assert "sif_file" not in workspace.kwargs + assert workspace.kwargs["extra_bind_mounts"] == [] + + +def test_apptainer_workspace_binds_existing_custom_tokenizer(monkeypatch, tmp_path): + tokenizer_dir = tmp_path / "tokenizer" + tokenizer_dir.mkdir() + evaluation = _evaluation() + evaluation.metadata.llm.custom_tokenizer = str(tokenizer_dir) + + monkeypatch.setattr(swebench_run_infer, "remote_image_exists", lambda image: True) + monkeypatch.setattr( + swebench_run_infer, + "ApptainerWorkspace", + FakeApptainerWorkspace, + ) + + workspace = evaluation.prepare_workspace( + EvalInstance(id="django__django-12345", data={}) + ) + + assert isinstance(workspace, FakeApptainerWorkspace) + assert workspace.kwargs["extra_bind_mounts"] == [ + f"{tokenizer_dir}:{tokenizer_dir}:ro" + ] + + +def test_apptainer_workspace_builds_local_sandbox_when_registry_image_missing( + monkeypatch, +): + built = {} + + def fake_build(**kwargs): + built.update(kwargs) + return Path("/tmp/local-agent.sandbox") + + monkeypatch.setattr(swebench_run_infer, "remote_image_exists", lambda image: False) + monkeypatch.setattr( + swebench_run_infer, + "ensure_apptainer_agent_image", + fake_build, + ) + monkeypatch.setattr( + swebench_run_infer, + "ApptainerWorkspace", + FakeApptainerWorkspace, + ) + + workspace = _evaluation().prepare_workspace( + EvalInstance(id="django__django-12345", data={}) + ) + + assert isinstance(workspace, FakeApptainerWorkspace) + assert workspace.kwargs["sif_file"] == "/tmp/local-agent.sandbox" + assert "server_image" not in workspace.kwargs + assert workspace.kwargs["extra_bind_mounts"] == [] + assert built["base_image"].startswith("docker.io/swebench/") + assert built["custom_tag"] == "sweb.eval.x86_64.django_1776_django-12345" + assert built["target"] == "source-minimal" diff --git a/uv.lock b/uv.lock index 3c0cf1ef..cecb4070 100644 --- a/uv.lock +++ b/uv.lock @@ -2573,13 +2573,14 @@ wheels = [ [[package]] name = "openhands-agent-server" -version = "1.24.0" +version = "1.27.0" source = { editable = "vendor/software-agent-sdk/openhands-agent-server" } dependencies = [ { name = "aiosqlite" }, { name = "alembic" }, { name = "docker" }, { name = "fastapi" }, + { name = "openai" }, { name = "openhands-sdk" }, { name = "pydantic" }, { name = "sqlalchemy" }, @@ -2594,6 +2595,7 @@ requires-dist = [ { name = "alembic", specifier = ">=1.13" }, { name = "docker", specifier = ">=7.1,<8" }, { name = "fastapi", specifier = ">=0.104" }, + { name = "openai", specifier = ">=2.33.0,<3" }, { name = "openhands-sdk", editable = "vendor/software-agent-sdk/openhands-sdk" }, { name = "pydantic", specifier = ">=2" }, { name = "sqlalchemy", specifier = ">=2" }, @@ -2719,7 +2721,7 @@ dev = [ [[package]] name = "openhands-sdk" -version = "1.24.0" +version = "1.27.0" source = { editable = "vendor/software-agent-sdk/openhands-sdk" } dependencies = [ { name = "agent-client-protocol" }, @@ -2771,7 +2773,7 @@ provides-extras = ["boto3"] [[package]] name = "openhands-tools" -version = "1.24.0" +version = "1.27.0" source = { editable = "vendor/software-agent-sdk/openhands-tools" } dependencies = [ { name = "binaryornot" }, @@ -2802,7 +2804,7 @@ requires-dist = [ [[package]] name = "openhands-workspace" -version = "1.24.0" +version = "1.27.0" source = { editable = "vendor/software-agent-sdk/openhands-workspace" } dependencies = [ { name = "openhands-agent-server" }, diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index c950fdb0..43376f18 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit c950fdb08abea040eebd0bb3d5ff63db293b9125 +Subproject commit 43376f1868ffd702746080714a59c16d3f69ec12