From 6150713fc247b41c222aedb175961f6f8ae4a5d4 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Mon, 2 Mar 2026 17:28:25 -0300 Subject: [PATCH 01/11] Centralize IMAGE_TAG_PREFIX for Docker image tags All benchmark runners previously hardcoded SDK_SHORT_SHA for Docker image tags, with each file duplicating an os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA) override pattern. This centralizes the tag prefix into a single constant. - New IMAGE_TAG_PREFIX = os.getenv("IMAGE_TAG_PREFIX", SDK_SHORT_SHA) in version.py - All runners import and use IMAGE_TAG_PREFIX directly with no per-file env var fallback - EVAL_AGENT_SERVER_IMAGE in constants.py is now overridable via OPENHANDS_EVAL_AGENT_SERVER_IMAGE env var - modal_patches.py updated to propagate IMAGE_TAG_PREFIX (replaces SDK_SHA/SDK_SHORT_SHA pair) Extracted from #455. Co-Authored-By: Claude Opus 4.6 --- benchmarks/commit0/run_infer.py | 9 +++---- benchmarks/gaia/run_infer.py | 9 +++---- benchmarks/multiswebench/run_infer.py | 9 +++---- benchmarks/swebench/run_infer.py | 9 +++---- benchmarks/swebenchmultimodal/run_infer.py | 9 +++---- benchmarks/swefficiency/run_infer.py | 19 +++++-------- benchmarks/swtbench/run_infer.py | 9 +++---- benchmarks/utils/constants.py | 9 ++++++- benchmarks/utils/modal_patches.py | 31 ++++++++-------------- benchmarks/utils/version.py | 11 ++++++++ 10 files changed, 61 insertions(+), 63 deletions(-) diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py index 36af180a5..e138334c0 100644 --- a/benchmarks/commit0/run_infer.py +++ b/benchmarks/commit0/run_infer.py @@ -31,7 +31,7 @@ EvalMetadata, EvalOutput, ) -from benchmarks.utils.version import SDK_SHORT_SHA +from benchmarks.utils.version import IMAGE_TAG_PREFIX from openhands.sdk import Agent, Conversation, Tool, get_logger from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.delegate import DelegateTool @@ -191,7 +191,7 @@ def prepare_workspace( custom_tag = extract_custom_tag(base_docker_image) 
suffix = f"-{build_target}" if build_target != "binary" else "" agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}" + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" ) workspace = create_docker_workspace( agent_server_image=agent_server_image, @@ -206,11 +206,10 @@ def prepare_workspace( "RUNTIME_API_KEY environment variable is not set for remote workspace" ) - sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA) custom_tag = extract_custom_tag(base_docker_image) suffix = f"-{build_target}" if build_target != "binary" else "" agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}" + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" ) if not image_exists(agent_server_image): @@ -221,7 +220,7 @@ def prepare_workspace( logger.info( f"Using remote workspace with image {agent_server_image} " - f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})" + f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})" ) startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600")) workspace = APIRemoteWorkspace( diff --git a/benchmarks/gaia/run_infer.py b/benchmarks/gaia/run_infer.py index fa0317a8e..13178fae1 100644 --- a/benchmarks/gaia/run_infer.py +++ b/benchmarks/gaia/run_infer.py @@ -30,7 +30,7 @@ from benchmarks.utils.image_utils import create_docker_workspace, image_exists from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import EvalInstance, EvalMetadata, EvalOutput -from benchmarks.utils.version import SDK_SHORT_SHA +from benchmarks.utils.version import IMAGE_TAG_PREFIX from openhands.sdk import ( Agent, Conversation, @@ -157,7 +157,7 @@ def prepare_workspace( if self.metadata.workspace_type == "docker": agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-gaia-binary" + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-gaia-binary" ) workspace = 
create_docker_workspace( agent_server_image=agent_server_image, @@ -177,9 +177,8 @@ def prepare_workspace( "RUNTIME_API_KEY environment variable is not set for remote workspace" ) - sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA) agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-gaia-binary" + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-gaia-binary" ) if not image_exists(agent_server_image): @@ -190,7 +189,7 @@ def prepare_workspace( logger.info( f"Using remote workspace with GAIA image {agent_server_image} " - f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})" + f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})" ) startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600")) workspace = APIRemoteWorkspace( diff --git a/benchmarks/multiswebench/run_infer.py b/benchmarks/multiswebench/run_infer.py index ec5137a2d..cffd8a6e0 100644 --- a/benchmarks/multiswebench/run_infer.py +++ b/benchmarks/multiswebench/run_infer.py @@ -33,7 +33,7 @@ EvalMetadata, EvalOutput, ) -from benchmarks.utils.version import SDK_SHORT_SHA +from benchmarks.utils.version import IMAGE_TAG_PREFIX from openhands.sdk import Agent, Conversation, Tool, get_logger from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.delegate import DelegateTool @@ -210,7 +210,7 @@ def prepare_workspace( if self.metadata.workspace_type == "docker": agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}" + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" ) ensure_local_image( agent_server_image=agent_server_image, @@ -225,14 +225,13 @@ def prepare_workspace( ) elif self.metadata.workspace_type == "remote": runtime_api_key = os.getenv("RUNTIME_API_KEY") - sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA) if not runtime_api_key: raise ValueError( "RUNTIME_API_KEY environment variable is not set for remote workspace" ) agent_server_image = ( - 
f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}" + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" ) if not image_exists(agent_server_image): raise RuntimeError( @@ -241,7 +240,7 @@ def prepare_workspace( ) logger.info( f"Using remote workspace with image {agent_server_image} " - f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})" + f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})" ) startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600")) workspace = APIRemoteWorkspace( diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py index 231de93e9..134c12848 100644 --- a/benchmarks/swebench/run_infer.py +++ b/benchmarks/swebench/run_infer.py @@ -34,7 +34,7 @@ EvalOutput, ToolPresetType, ) -from benchmarks.utils.version import SDK_SHORT_SHA +from benchmarks.utils.version import IMAGE_TAG_PREFIX from openhands.sdk import Agent, Conversation, Tool, get_logger from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.delegate import DelegateTool @@ -154,7 +154,7 @@ def prepare_workspace( f"-{build_target}" if build_target != constants.BUILD_TARGET_BINARY else "" ) base_agent_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}" + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" ) wrap_needed = should_wrap_instance_id(instance.id) agent_server_image = base_agent_image @@ -186,14 +186,13 @@ def prepare_workspace( ) elif self.metadata.workspace_type == "remote": runtime_api_key = os.getenv("RUNTIME_API_KEY") - sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA) if not runtime_api_key: raise ValueError( "RUNTIME_API_KEY environment variable is not set for remote workspace" ) agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}" + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" ) if not image_exists(agent_server_image): raise RuntimeError( @@ 
-202,7 +201,7 @@ def prepare_workspace( ) logger.info( f"Using remote workspace with image {agent_server_image} " - f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})" + f"(image tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})" ) startup_timeout = float( os.getenv( diff --git a/benchmarks/swebenchmultimodal/run_infer.py b/benchmarks/swebenchmultimodal/run_infer.py index 6a1b533a7..defd2ba9b 100644 --- a/benchmarks/swebenchmultimodal/run_infer.py +++ b/benchmarks/swebenchmultimodal/run_infer.py @@ -31,7 +31,7 @@ EvalMetadata, EvalOutput, ) -from benchmarks.utils.version import SDK_SHORT_SHA +from benchmarks.utils.version import IMAGE_TAG_PREFIX from openhands.sdk import ( Agent, Conversation, @@ -163,7 +163,7 @@ def prepare_workspace( if self.metadata.workspace_type == "docker": agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}" + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" ) ensure_local_image( agent_server_image=agent_server_image, @@ -178,14 +178,13 @@ def prepare_workspace( ) elif self.metadata.workspace_type == "remote": runtime_api_key = os.getenv("RUNTIME_API_KEY") - sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA) if not runtime_api_key: raise ValueError( "RUNTIME_API_KEY environment variable is not set for remote workspace" ) agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}" + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" ) if not image_exists(agent_server_image): raise RuntimeError( @@ -194,7 +193,7 @@ def prepare_workspace( ) logger.info( f"Using remote workspace with image {agent_server_image} " - f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})" + f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})" ) startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600")) workspace = APIRemoteWorkspace( diff --git 
a/benchmarks/swefficiency/run_infer.py b/benchmarks/swefficiency/run_infer.py index cf5a902d2..7e007133f 100644 --- a/benchmarks/swefficiency/run_infer.py +++ b/benchmarks/swefficiency/run_infer.py @@ -27,7 +27,7 @@ EvalMetadata, EvalOutput, ) -from benchmarks.utils.version import SDK_SHORT_SHA +from benchmarks.utils.version import IMAGE_TAG_PREFIX from openhands.sdk import LLM, Agent, Conversation, get_logger from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.preset.default import get_default_tools @@ -201,7 +201,7 @@ def prepare_workspace( # Build agent server image tag suffix = f"-{build_target}" if build_target != "binary" else "" agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}" + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" ) logger.info(f"Base image: {base_docker_image}") @@ -238,25 +238,20 @@ def prepare_workspace( elif self.metadata.workspace_type == "remote": runtime_api_key = os.getenv("RUNTIME_API_KEY") - sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA) if not runtime_api_key: raise ValueError( "RUNTIME_API_KEY environment variable is not set for remote workspace" ) - # For remote, use SDK_SHORT_SHA from env if available - remote_agent_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}" - ) - if not image_exists(remote_agent_image): + if not image_exists(agent_server_image): raise RuntimeError( - f"Agent server image {remote_agent_image} does not exist in container registry, " + f"Agent server image {agent_server_image} does not exist in container registry, " "make sure to build, push it, and make it public accessible before using remote workspace." 
) logger.info( - f"Using remote workspace with image {remote_agent_image} " - f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})" + f"Using remote workspace with image {agent_server_image} " + f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})" ) workspace = APIRemoteWorkspace( @@ -264,7 +259,7 @@ def prepare_workspace( "RUNTIME_API_URL", "https://runtime.eval.all-hands.dev" ), runtime_api_key=runtime_api_key, - server_image=remote_agent_image, + server_image=agent_server_image, target_type="source", forward_env=forward_env or [], resource_factor=resource_factor, diff --git a/benchmarks/swtbench/run_infer.py b/benchmarks/swtbench/run_infer.py index 36b78f265..5fdafbfb0 100644 --- a/benchmarks/swtbench/run_infer.py +++ b/benchmarks/swtbench/run_infer.py @@ -25,7 +25,7 @@ EvalMetadata, EvalOutput, ) -from benchmarks.utils.version import SDK_SHORT_SHA +from benchmarks.utils.version import IMAGE_TAG_PREFIX from openhands.agent_server.docker.build import _base_slug from openhands.sdk import Agent, Conversation, Tool, __version__, get_logger from openhands.sdk.workspace import RemoteWorkspace @@ -169,7 +169,7 @@ def prepare_workspace( if self.metadata.workspace_type == "docker": agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}" + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" ) workspace = create_docker_workspace( agent_server_image=agent_server_image, @@ -179,14 +179,13 @@ def prepare_workspace( ) elif self.metadata.workspace_type == "remote": runtime_api_key = os.getenv("RUNTIME_API_KEY") - sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA) if not runtime_api_key: raise ValueError( "RUNTIME_API_KEY environment variable is not set for remote workspace" ) agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}" + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" ) if not image_exists(agent_server_image): raise 
RuntimeError( @@ -195,7 +194,7 @@ def prepare_workspace( ) logger.info( f"Using remote workspace with image {agent_server_image} " - f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})" + f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})" ) startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600")) workspace = APIRemoteWorkspace( diff --git a/benchmarks/utils/constants.py b/benchmarks/utils/constants.py index 882f1b636..85ecad22b 100644 --- a/benchmarks/utils/constants.py +++ b/benchmarks/utils/constants.py @@ -1,5 +1,12 @@ +import os + + OUTPUT_FILENAME = "output.jsonl" -EVAL_AGENT_SERVER_IMAGE = "ghcr.io/openhands/eval-agent-server" + +# Image name for agent server (can be overridden via env var) +EVAL_AGENT_SERVER_IMAGE = os.getenv( + "OPENHANDS_EVAL_AGENT_SERVER_IMAGE", "ghcr.io/openhands/eval-agent-server" +) # Model identifier used in swebench-style prediction entries. # The swebench harness uses this value to create log directory structures diff --git a/benchmarks/utils/modal_patches.py b/benchmarks/utils/modal_patches.py index 2a614388a..e1299e200 100644 --- a/benchmarks/utils/modal_patches.py +++ b/benchmarks/utils/modal_patches.py @@ -34,17 +34,17 @@ def emit(message: str) -> None: return emit -def _get_sdk_short_sha() -> str: +def _get_image_tag_prefix() -> str: """ - Resolve SDK short SHA from the benchmarks repo when available, otherwise - fall back to environment variables for the Modal function image. + Resolve the image tag prefix from the benchmarks repo when available, + otherwise fall back to environment variables for the Modal function image. 
""" try: - from benchmarks.utils.version import SDK_SHORT_SHA as version_sdk_short_sha + from benchmarks.utils.version import IMAGE_TAG_PREFIX - return version_sdk_short_sha + return IMAGE_TAG_PREFIX except Exception: - return os.getenv("SDK_SHORT_SHA", "").strip() or "unknown" + return os.getenv("IMAGE_TAG_PREFIX", "").strip() or "unknown" def _get_agent_server_image_repo() -> str: @@ -77,10 +77,10 @@ def _build_prebuilt_image_tag(test_spec) -> str: if not instance_id: raise RuntimeError("TestSpec missing instance_id; cannot select Modal image") - sdk_short_sha = _get_sdk_short_sha() - if sdk_short_sha in ("", "unknown", None): + image_tag_prefix = _get_image_tag_prefix() + if image_tag_prefix in ("", "unknown", None): raise RuntimeError( - "SDK short SHA is unavailable. Set SDK_SHORT_SHA or ensure the " + "Image tag prefix is unavailable. Set IMAGE_TAG_PREFIX or ensure the " "benchmarks repository has an initialized SDK submodule." ) @@ -88,7 +88,7 @@ def _build_prebuilt_image_tag(test_spec) -> str: suffix = f"-{target}" if target and target != "binary" else "" custom_tag = _get_custom_tag_from_instance_id(instance_id) agent_repo = _get_agent_server_image_repo() - return f"{agent_repo}:{sdk_short_sha}-{custom_tag}{suffix}" + return f"{agent_repo}:{image_tag_prefix}-{custom_tag}{suffix}" def _patch_modal_sklearn_install_flag() -> None: @@ -497,16 +497,7 @@ def _inject_modal_sitecustomize() -> None: ) env_vars = {"PYTHONPATH": "/root"} - try: - from benchmarks.utils.version import SDK_SHA, SDK_SHORT_SHA - - env_vars["SDK_SHA"] = SDK_SHA - env_vars["SDK_SHORT_SHA"] = SDK_SHORT_SHA - except Exception: - sdk_sha_env = os.getenv("SDK_SHA") - if sdk_sha_env: - env_vars["SDK_SHA"] = sdk_sha_env - env_vars["SDK_SHORT_SHA"] = _get_sdk_short_sha() + env_vars["IMAGE_TAG_PREFIX"] = _get_image_tag_prefix() env_vars["EVAL_AGENT_SERVER_IMAGE"] = _get_agent_server_image_repo() env_vars["SWEBENCH_IMAGE_TARGET"] = _get_build_target() diff --git a/benchmarks/utils/version.py 
b/benchmarks/utils/version.py index 951c65925..df8a86a74 100644 --- a/benchmarks/utils/version.py +++ b/benchmarks/utils/version.py @@ -1,3 +1,4 @@ +import os import subprocess from pathlib import Path @@ -25,3 +26,13 @@ def get_sdk_sha() -> str: SDK_SHA = get_sdk_sha() SDK_SHORT_SHA = SDK_SHA[:7] + +# Centralized image tag prefix used by all benchmark runners. +# +# Docker image tags follow the format: <prefix>-<custom_tag>-<target> +# e.g. "abc1234-sweb.eval.x86_64.django_1776_django-12155-binary" +# +# By default this is the SDK submodule short SHA. Set the IMAGE_TAG_PREFIX +# environment variable to override (e.g. when using pre-built images from +# a different SDK revision or a CI-provided tag). +IMAGE_TAG_PREFIX = os.getenv("IMAGE_TAG_PREFIX", SDK_SHORT_SHA) From 1d3f778908e5adb182a9aac1d01056d24b4d5e22 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Tue, 3 Mar 2026 06:36:14 -0300 Subject: [PATCH 02/11] Add backward compatibility for SDK_SHORT_SHA env var Address review feedback: the env var rename from SDK_SHORT_SHA to IMAGE_TAG_PREFIX is a breaking change for existing users. 
- version.py: fall back to SDK_SHORT_SHA env var when IMAGE_TAG_PREFIX is not set, emitting a DeprecationWarning - modal_patches.py: continue propagating SDK_SHORT_SHA to modal execution environments alongside IMAGE_TAG_PREFIX - modal_patches.py: _get_image_tag_prefix() fallback also checks SDK_SHORT_SHA when IMAGE_TAG_PREFIX is unset Co-Authored-By: Claude Opus 4.6 --- benchmarks/utils/modal_patches.py | 12 ++++++++++-- benchmarks/utils/version.py | 20 +++++++++++++++++++- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/benchmarks/utils/modal_patches.py b/benchmarks/utils/modal_patches.py index e1299e200..58135fc62 100644 --- a/benchmarks/utils/modal_patches.py +++ b/benchmarks/utils/modal_patches.py @@ -44,7 +44,11 @@ def _get_image_tag_prefix() -> str: return IMAGE_TAG_PREFIX except Exception: - return os.getenv("IMAGE_TAG_PREFIX", "").strip() or "unknown" + return ( + os.getenv("IMAGE_TAG_PREFIX", "").strip() + or os.getenv("SDK_SHORT_SHA", "").strip() # deprecated fallback + or "unknown" + ) def _get_agent_server_image_repo() -> str: @@ -497,7 +501,11 @@ def _inject_modal_sitecustomize() -> None: ) env_vars = {"PYTHONPATH": "/root"} - env_vars["IMAGE_TAG_PREFIX"] = _get_image_tag_prefix() + image_tag_prefix = _get_image_tag_prefix() + env_vars["IMAGE_TAG_PREFIX"] = image_tag_prefix + # Backward compatibility: propagate SDK_SHORT_SHA so that any code + # running inside Modal that reads this env var continues to work. + env_vars["SDK_SHORT_SHA"] = image_tag_prefix env_vars["EVAL_AGENT_SERVER_IMAGE"] = _get_agent_server_image_repo() env_vars["SWEBENCH_IMAGE_TARGET"] = _get_build_target() diff --git a/benchmarks/utils/version.py b/benchmarks/utils/version.py index df8a86a74..96fdefdac 100644 --- a/benchmarks/utils/version.py +++ b/benchmarks/utils/version.py @@ -1,5 +1,6 @@ import os import subprocess +import warnings from pathlib import Path @@ -35,4 +36,21 @@ def get_sdk_sha() -> str: # By default this is the SDK submodule short SHA. 
Set the IMAGE_TAG_PREFIX # environment variable to override (e.g. when using pre-built images from # a different SDK revision or a CI-provided tag). -IMAGE_TAG_PREFIX = os.getenv("IMAGE_TAG_PREFIX", SDK_SHORT_SHA) +# +# Backward compatibility: SDK_SHORT_SHA env var is still honored as a +# fallback when IMAGE_TAG_PREFIX is not set, but is deprecated. +_image_tag_prefix_env = os.getenv("IMAGE_TAG_PREFIX") +_sdk_short_sha_env = os.getenv("SDK_SHORT_SHA") + +if _image_tag_prefix_env is not None: + IMAGE_TAG_PREFIX = _image_tag_prefix_env +elif _sdk_short_sha_env is not None: + warnings.warn( + "SDK_SHORT_SHA env var is deprecated for overriding image tags. " + "Use IMAGE_TAG_PREFIX instead.", + DeprecationWarning, + stacklevel=1, + ) + IMAGE_TAG_PREFIX = _sdk_short_sha_env +else: + IMAGE_TAG_PREFIX = SDK_SHORT_SHA From 35f459a1cf360edad83e7ee343466201dde2cfaf Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Tue, 3 Mar 2026 06:37:11 -0300 Subject: [PATCH 03/11] Revert "Add backward compatibility for SDK_SHORT_SHA env var" This reverts commit 1d3f778908e5adb182a9aac1d01056d24b4d5e22. 
--- benchmarks/utils/modal_patches.py | 12 ++---------- benchmarks/utils/version.py | 20 +------------------- 2 files changed, 3 insertions(+), 29 deletions(-) diff --git a/benchmarks/utils/modal_patches.py b/benchmarks/utils/modal_patches.py index 58135fc62..e1299e200 100644 --- a/benchmarks/utils/modal_patches.py +++ b/benchmarks/utils/modal_patches.py @@ -44,11 +44,7 @@ def _get_image_tag_prefix() -> str: return IMAGE_TAG_PREFIX except Exception: - return ( - os.getenv("IMAGE_TAG_PREFIX", "").strip() - or os.getenv("SDK_SHORT_SHA", "").strip() # deprecated fallback - or "unknown" - ) + return os.getenv("IMAGE_TAG_PREFIX", "").strip() or "unknown" def _get_agent_server_image_repo() -> str: @@ -501,11 +497,7 @@ def _inject_modal_sitecustomize() -> None: ) env_vars = {"PYTHONPATH": "/root"} - image_tag_prefix = _get_image_tag_prefix() - env_vars["IMAGE_TAG_PREFIX"] = image_tag_prefix - # Backward compatibility: propagate SDK_SHORT_SHA so that any code - # running inside Modal that reads this env var continues to work. - env_vars["SDK_SHORT_SHA"] = image_tag_prefix + env_vars["IMAGE_TAG_PREFIX"] = _get_image_tag_prefix() env_vars["EVAL_AGENT_SERVER_IMAGE"] = _get_agent_server_image_repo() env_vars["SWEBENCH_IMAGE_TARGET"] = _get_build_target() diff --git a/benchmarks/utils/version.py b/benchmarks/utils/version.py index 96fdefdac..df8a86a74 100644 --- a/benchmarks/utils/version.py +++ b/benchmarks/utils/version.py @@ -1,6 +1,5 @@ import os import subprocess -import warnings from pathlib import Path @@ -36,21 +35,4 @@ def get_sdk_sha() -> str: # By default this is the SDK submodule short SHA. Set the IMAGE_TAG_PREFIX # environment variable to override (e.g. when using pre-built images from # a different SDK revision or a CI-provided tag). -# -# Backward compatibility: SDK_SHORT_SHA env var is still honored as a -# fallback when IMAGE_TAG_PREFIX is not set, but is deprecated. 
-_image_tag_prefix_env = os.getenv("IMAGE_TAG_PREFIX") -_sdk_short_sha_env = os.getenv("SDK_SHORT_SHA") - -if _image_tag_prefix_env is not None: - IMAGE_TAG_PREFIX = _image_tag_prefix_env -elif _sdk_short_sha_env is not None: - warnings.warn( - "SDK_SHORT_SHA env var is deprecated for overriding image tags. " - "Use IMAGE_TAG_PREFIX instead.", - DeprecationWarning, - stacklevel=1, - ) - IMAGE_TAG_PREFIX = _sdk_short_sha_env -else: - IMAGE_TAG_PREFIX = SDK_SHORT_SHA +IMAGE_TAG_PREFIX = os.getenv("IMAGE_TAG_PREFIX", SDK_SHORT_SHA) From 80b38b2d4a5560118d1a8c88a0c8a5127f58fc9e Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Tue, 3 Mar 2026 10:02:43 -0300 Subject: [PATCH 04/11] Add runtime deprecation warning for SDK_SHORT_SHA env var - version.py: fall back to SDK_SHORT_SHA env var when IMAGE_TAG_PREFIX is not set, emitting a DeprecationWarning so users know to migrate - modal_patches.py: _get_image_tag_prefix() fallback also checks SDK_SHORT_SHA when IMAGE_TAG_PREFIX is unset Co-Authored-By: Claude Opus 4.6 --- benchmarks/utils/modal_patches.py | 6 +++++- benchmarks/utils/version.py | 10 +++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/benchmarks/utils/modal_patches.py b/benchmarks/utils/modal_patches.py index e1299e200..7fddfc725 100644 --- a/benchmarks/utils/modal_patches.py +++ b/benchmarks/utils/modal_patches.py @@ -44,7 +44,11 @@ def _get_image_tag_prefix() -> str: return IMAGE_TAG_PREFIX except Exception: - return os.getenv("IMAGE_TAG_PREFIX", "").strip() or "unknown" + return ( + os.getenv("IMAGE_TAG_PREFIX", "").strip() + or os.getenv("SDK_SHORT_SHA", "").strip() # deprecated fallback + or "unknown" + ) def _get_agent_server_image_repo() -> str: diff --git a/benchmarks/utils/version.py b/benchmarks/utils/version.py index df8a86a74..0df78692e 100644 --- a/benchmarks/utils/version.py +++ b/benchmarks/utils/version.py @@ -1,5 +1,6 @@ import os import subprocess +import warnings from pathlib import Path @@ -35,4 +36,11 @@ def 
get_sdk_sha() -> str: # By default this is the SDK submodule short SHA. Set the IMAGE_TAG_PREFIX # environment variable to override (e.g. when using pre-built images from # a different SDK revision or a CI-provided tag). -IMAGE_TAG_PREFIX = os.getenv("IMAGE_TAG_PREFIX", SDK_SHORT_SHA) +IMAGE_TAG_PREFIX = os.getenv("IMAGE_TAG_PREFIX") or os.getenv("SDK_SHORT_SHA") or SDK_SHORT_SHA +if os.getenv("SDK_SHORT_SHA") and not os.getenv("IMAGE_TAG_PREFIX"): + warnings.warn( + "SDK_SHORT_SHA env var is deprecated, use IMAGE_TAG_PREFIX instead. " + "Support for SDK_SHORT_SHA will be removed in a future version.", + DeprecationWarning, + stacklevel=2, + ) From 85950e6a1f3266cb802f0ed3817fb08efdcd9851 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Tue, 3 Mar 2026 10:38:47 -0300 Subject: [PATCH 05/11] Revert "Add runtime deprecation warning for SDK_SHORT_SHA env var" This reverts commit 80b38b2d4a5560118d1a8c88a0c8a5127f58fc9e. --- benchmarks/utils/modal_patches.py | 6 +----- benchmarks/utils/version.py | 10 +--------- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/benchmarks/utils/modal_patches.py b/benchmarks/utils/modal_patches.py index 7fddfc725..e1299e200 100644 --- a/benchmarks/utils/modal_patches.py +++ b/benchmarks/utils/modal_patches.py @@ -44,11 +44,7 @@ def _get_image_tag_prefix() -> str: return IMAGE_TAG_PREFIX except Exception: - return ( - os.getenv("IMAGE_TAG_PREFIX", "").strip() - or os.getenv("SDK_SHORT_SHA", "").strip() # deprecated fallback - or "unknown" - ) + return os.getenv("IMAGE_TAG_PREFIX", "").strip() or "unknown" def _get_agent_server_image_repo() -> str: diff --git a/benchmarks/utils/version.py b/benchmarks/utils/version.py index 0df78692e..df8a86a74 100644 --- a/benchmarks/utils/version.py +++ b/benchmarks/utils/version.py @@ -1,6 +1,5 @@ import os import subprocess -import warnings from pathlib import Path @@ -36,11 +35,4 @@ def get_sdk_sha() -> str: # By default this is the SDK submodule short SHA. 
Set the IMAGE_TAG_PREFIX # environment variable to override (e.g. when using pre-built images from # a different SDK revision or a CI-provided tag). -IMAGE_TAG_PREFIX = os.getenv("IMAGE_TAG_PREFIX") or os.getenv("SDK_SHORT_SHA") or SDK_SHORT_SHA -if os.getenv("SDK_SHORT_SHA") and not os.getenv("IMAGE_TAG_PREFIX"): - warnings.warn( - "SDK_SHORT_SHA env var is deprecated, use IMAGE_TAG_PREFIX instead. " - "Support for SDK_SHORT_SHA will be removed in a future version.", - DeprecationWarning, - stacklevel=2, - ) +IMAGE_TAG_PREFIX = os.getenv("IMAGE_TAG_PREFIX", SDK_SHORT_SHA) From 472efcb77be5dde98a87d7ae2165f410adb1107b Mon Sep 17 00:00:00 2001 From: simonrosenberg <157206163+simonrosenberg@users.noreply.github.com> Date: Tue, 3 Mar 2026 10:50:51 -0300 Subject: [PATCH 06/11] Update benchmarks/utils/version.py Co-authored-by: OpenHands Bot --- benchmarks/utils/version.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/benchmarks/utils/version.py b/benchmarks/utils/version.py index df8a86a74..c6205381b 100644 --- a/benchmarks/utils/version.py +++ b/benchmarks/utils/version.py @@ -35,4 +35,14 @@ def get_sdk_sha() -> str: # By default this is the SDK submodule short SHA. Set the IMAGE_TAG_PREFIX # environment variable to override (e.g. when using pre-built images from # a different SDK revision or a CI-provided tag). -IMAGE_TAG_PREFIX = os.getenv("IMAGE_TAG_PREFIX", SDK_SHORT_SHA) +# Check for deprecated env var and warn users +_deprecated_sdk_short_sha = os.getenv("SDK_SHORT_SHA") +if _deprecated_sdk_short_sha is not None: + import warnings + warnings.warn( + "SDK_SHORT_SHA environment variable is deprecated. 
Use IMAGE_TAG_PREFIX instead.", + DeprecationWarning, + stacklevel=2, + ) + +IMAGE_TAG_PREFIX = os.getenv("IMAGE_TAG_PREFIX") or _deprecated_sdk_short_sha or SDK_SHORT_SHA From 44ff927a9a11f279966234a467f6eccc792d4df6 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Tue, 3 Mar 2026 10:52:39 -0300 Subject: [PATCH 07/11] Fix inconsistent log wording and add IMAGE_TAG_PREFIX tests - swebench/run_infer.py: change "image tag prefix:" to "tag prefix:" for consistency with all other benchmark runners - tests/test_version.py: add unit tests for IMAGE_TAG_PREFIX resolution covering default, env var override, deprecated SDK_SHORT_SHA fallback, and precedence when both env vars are set Co-Authored-By: Claude Opus 4.6 --- benchmarks/swebench/run_infer.py | 2 +- tests/test_version.py | 51 ++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 tests/test_version.py diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py index a65a1cb59..065c3aecd 100644 --- a/benchmarks/swebench/run_infer.py +++ b/benchmarks/swebench/run_infer.py @@ -200,7 +200,7 @@ def prepare_workspace( ) logger.info( f"Using remote workspace with image {agent_server_image} " - f"(image tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})" + f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})" ) startup_timeout = float( os.getenv( diff --git a/tests/test_version.py b/tests/test_version.py new file mode 100644 index 000000000..4415cafa1 --- /dev/null +++ b/tests/test_version.py @@ -0,0 +1,51 @@ +"""Tests for benchmarks.utils.version IMAGE_TAG_PREFIX resolution.""" + +import importlib +import os +from unittest.mock import patch + +import pytest + + +def _reload_version(**env_overrides): + """Reload version module with custom environment variables.""" + import benchmarks.utils.version as version_mod + + with patch.dict(os.environ, env_overrides, clear=False): + # Remove env vars not in overrides so we test 
clean state + for key in ("IMAGE_TAG_PREFIX", "SDK_SHORT_SHA"): + if key not in env_overrides: + os.environ.pop(key, None) + importlib.reload(version_mod) + return version_mod + + +class TestImageTagPrefix: + def teardown_method(self): + """Restore version module to default state after each test.""" + import benchmarks.utils.version as version_mod + + for key in ("IMAGE_TAG_PREFIX", "SDK_SHORT_SHA"): + os.environ.pop(key, None) + importlib.reload(version_mod) + + def test_default_uses_sdk_short_sha(self): + """When no env vars are set, IMAGE_TAG_PREFIX defaults to SDK_SHORT_SHA.""" + mod = _reload_version() + assert mod.IMAGE_TAG_PREFIX == mod.SDK_SHORT_SHA + + def test_image_tag_prefix_env_override(self): + """IMAGE_TAG_PREFIX env var overrides the default.""" + mod = _reload_version(IMAGE_TAG_PREFIX="custom-tag") + assert mod.IMAGE_TAG_PREFIX == "custom-tag" + + def test_deprecated_sdk_short_sha_env_fallback(self): + """SDK_SHORT_SHA env var is honored with a deprecation warning.""" + with pytest.warns(DeprecationWarning, match="SDK_SHORT_SHA"): + mod = _reload_version(SDK_SHORT_SHA="legacy-tag") + assert mod.IMAGE_TAG_PREFIX == "legacy-tag" + + def test_image_tag_prefix_takes_precedence_over_sdk_short_sha(self): + """IMAGE_TAG_PREFIX env var wins over deprecated SDK_SHORT_SHA.""" + mod = _reload_version(IMAGE_TAG_PREFIX="new-tag", SDK_SHORT_SHA="old-tag") + assert mod.IMAGE_TAG_PREFIX == "new-tag" From bc76472821394f3ac1649310173bb23703898e26 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Tue, 3 Mar 2026 11:01:54 -0300 Subject: [PATCH 08/11] Update uv.lock for SDK submodule v1.11.5 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Regenerate lock file to match the current vendor/software-agent-sdk submodule version (1.10.0 → 1.11.5). 
Co-Authored-By: Claude Opus 4.6 --- uv.lock | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/uv.lock b/uv.lock index ba3be42c6..db834cb47 100644 --- a/uv.lock +++ b/uv.lock @@ -16,6 +16,18 @@ members = [ "openhands-workspace", ] +[[package]] +name = "agent-client-protocol" +version = "0.8.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pydantic" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/1b/7b/7cdac86db388809d9e3bc58cac88cc7dfa49b7615b98fab304a828cd7f8a/agent_client_protocol-0.8.1.tar.gz", hash = "sha256:1bbf15663bf51f64942597f638e32a6284c5da918055d9672d3510e965143dbd", size = 68866, upload-time = "2026-02-13T15:34:54.567Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4b/f3/219eeca0ad4a20843d4b9eaac5532f87018b9d25730a62a16f54f6c52d1a/agent_client_protocol-0.8.1-py3-none-any.whl", hash = "sha256:9421a11fd435b4831660272d169c3812d553bb7247049c138c3ca127e4b8af8e", size = 54529, upload-time = "2026-02-13T15:34:53.344Z" }, +] + [[package]] name = "aiofiles" version = "24.1.0" @@ -2286,7 +2298,7 @@ wheels = [ [[package]] name = "openhands-agent-server" -version = "1.10.0" +version = "1.11.5" source = { editable = "vendor/software-agent-sdk/openhands-agent-server" } dependencies = [ { name = "aiosqlite" }, @@ -2426,9 +2438,10 @@ dev = [ [[package]] name = "openhands-sdk" -version = "1.10.0" +version = "1.11.5" source = { editable = "vendor/software-agent-sdk/openhands-sdk" } dependencies = [ + { name = "agent-client-protocol" }, { name = "deprecation" }, { name = "fastmcp" }, { name = "filelock" }, @@ -2449,6 +2462,7 @@ boto3 = [ [package.metadata] requires-dist = [ + { name = "agent-client-protocol", specifier = ">=0.8.1" }, { name = "boto3", marker = "extra == 'boto3'", specifier = ">=1.35.0" }, { name = "deprecation", specifier = ">=2.1.0" }, { name = "fastmcp", specifier = ">=2.11.3" }, @@ -2466,7 +2480,7 @@ provides-extras = ["boto3"] [[package]] name 
= "openhands-tools" -version = "1.10.0" +version = "1.11.5" source = { editable = "vendor/software-agent-sdk/openhands-tools" } dependencies = [ { name = "bashlex" }, @@ -2495,7 +2509,7 @@ requires-dist = [ [[package]] name = "openhands-workspace" -version = "1.10.0" +version = "1.11.5" source = { editable = "vendor/software-agent-sdk/openhands-workspace" } dependencies = [ { name = "openhands-agent-server" }, From 616233cba5d85227d1bc0afd30131c04f546b0cf Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Tue, 3 Mar 2026 11:07:10 -0300 Subject: [PATCH 09/11] Fix ruff format in version.py Co-Authored-By: Claude Opus 4.6 --- benchmarks/utils/version.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/benchmarks/utils/version.py b/benchmarks/utils/version.py index c6205381b..ebc52d0a8 100644 --- a/benchmarks/utils/version.py +++ b/benchmarks/utils/version.py @@ -39,10 +39,13 @@ def get_sdk_sha() -> str: _deprecated_sdk_short_sha = os.getenv("SDK_SHORT_SHA") if _deprecated_sdk_short_sha is not None: import warnings + warnings.warn( "SDK_SHORT_SHA environment variable is deprecated. 
Use IMAGE_TAG_PREFIX instead.", DeprecationWarning, stacklevel=2, ) -IMAGE_TAG_PREFIX = os.getenv("IMAGE_TAG_PREFIX") or _deprecated_sdk_short_sha or SDK_SHORT_SHA +IMAGE_TAG_PREFIX = ( + os.getenv("IMAGE_TAG_PREFIX") or _deprecated_sdk_short_sha or SDK_SHORT_SHA +) From 280428e0c8d4feea960b7e6fd98be618a390c2cf Mon Sep 17 00:00:00 2001 From: simonrosenberg <157206163+simonrosenberg@users.noreply.github.com> Date: Tue, 3 Mar 2026 11:12:38 -0300 Subject: [PATCH 10/11] Update benchmarks/utils/modal_patches.py Co-authored-by: OpenHands Bot --- benchmarks/utils/modal_patches.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/benchmarks/utils/modal_patches.py b/benchmarks/utils/modal_patches.py index e1299e200..f2522c82d 100644 --- a/benchmarks/utils/modal_patches.py +++ b/benchmarks/utils/modal_patches.py @@ -497,7 +497,9 @@ def _inject_modal_sitecustomize() -> None: ) env_vars = {"PYTHONPATH": "/root"} - env_vars["IMAGE_TAG_PREFIX"] = _get_image_tag_prefix() +env_vars["IMAGE_TAG_PREFIX"] = _get_image_tag_prefix() +# Backward compatibility - remove in next major version +env_vars["SDK_SHORT_SHA"] = env_vars["IMAGE_TAG_PREFIX"] env_vars["EVAL_AGENT_SERVER_IMAGE"] = _get_agent_server_image_repo() env_vars["SWEBENCH_IMAGE_TARGET"] = _get_build_target() From da2dad2bd4fd0db510ac460d398669ed2f04eb03 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Tue, 3 Mar 2026 11:18:45 -0300 Subject: [PATCH 11/11] Fix indentation in modal_patches.py for IMAGE_TAG_PREFIX env vars Three lines were accidentally left-aligned at column 0 instead of being indented inside the _inject_modal_sitecustomize() function body. 
Co-Authored-By: Claude Opus 4.6 --- benchmarks/utils/modal_patches.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/utils/modal_patches.py b/benchmarks/utils/modal_patches.py index f2522c82d..515878249 100644 --- a/benchmarks/utils/modal_patches.py +++ b/benchmarks/utils/modal_patches.py @@ -497,9 +497,9 @@ def _inject_modal_sitecustomize() -> None: ) env_vars = {"PYTHONPATH": "/root"} -env_vars["IMAGE_TAG_PREFIX"] = _get_image_tag_prefix() -# Backward compatibility - remove in next major version -env_vars["SDK_SHORT_SHA"] = env_vars["IMAGE_TAG_PREFIX"] + env_vars["IMAGE_TAG_PREFIX"] = _get_image_tag_prefix() + # Backward compatibility - remove in next major version + env_vars["SDK_SHORT_SHA"] = env_vars["IMAGE_TAG_PREFIX"] env_vars["EVAL_AGENT_SERVER_IMAGE"] = _get_agent_server_image_repo() env_vars["SWEBENCH_IMAGE_TARGET"] = _get_build_target()