cooperbench · akhatua2 · May 26, 2026 · May 26, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.0.17] - 2026-05-25
+
+### Fixed
+
+- **Auto-proxy now forces non-streaming upstream — fixes `API Error: Content block not found` aborts mid-task for `claude_code` against vLLM-served Qwen3.x.** `cooperbench._proxy.managed_litellm` previously spawned LiteLLM with inline `--model` flags, which left the `stream` parameter passed through to the upstream unchanged. vLLM 0.19.0's `qwen3_coder` / `qwen3_xml` streaming tool-call extractors intermittently forward `content_block_delta` events without first emitting a matching `content_block_start` for the synthesized `tool_use` block; claude-code's stream parser then raises `Content block not found` and the agent loop aborts. The fix switches to a temporary YAML config file (written as JSON, which LiteLLM reads as YAML) that sets `litellm_params.stream: false`, so LiteLLM buffers the full upstream response and re-emits well-formed Anthropic SSE (with proper `content_block_start` → `content_block_delta` → `content_block_stop` ordering) to claude-code. Empirically, on Qwen3.5-9B at 128k against a 4-pair dspy_task batch: 4/6 agents Submitted with 8 occurrences of `Content block not found` (streaming upstream) → **8/8 Submitted with 0 errors** (non-streaming upstream); patch sizes 30/102/72/76/70/48/186/47 lines, real multi-turn iteration (up to 35 steps). Confirmed end-to-end through the auto-proxy with a fresh `cooperbench run --openai-base-url ...`: 0 errors, both agents produced sensible diffs. Tracked upstream as [vllm-project/vllm#39056](https://github.com/vllm-project/vllm/issues/39056).
+
 ## [0.0.16] - 2026-05-25
 
 ### Fixed

diff --git a/src/cooperbench/__about__.py b/src/cooperbench/__about__.py
@@ -1,3 +1,3 @@
 """Version information for CooperBench."""
 
-__version__ = "0.0.16"
+__version__ = "0.0.17"
diff --git a/src/cooperbench/_proxy.py b/src/cooperbench/_proxy.py
@@ -18,6 +18,7 @@
 import socket
 import subprocess
 import sys
+import tempfile
 import time
 import urllib.error
 import urllib.request
@@ -86,49 +87,100 @@ def managed_litellm(
 
     port = _find_free_port()
     base_url = f"http://localhost:{port}"
-    # LiteLLM's ``openai/<model>`` provider prefix tells it to forward as
-    # OpenAI-format to ``api_base``.  ``--drop_params`` makes it tolerant
-    # of provider-specific kwargs that the upstream doesn't accept.
-    # The upstream API key is passed via ``OPENAI_API_KEY`` in the child
-    # env (LiteLLM CLI has no inline ``--api_key`` flag).
-    cmd = [
-        litellm_bin,
-        "--model",
-        f"openai/{openai_model}",
-        "--api_base",
-        openai_base_url,
-        "--host",
-        "127.0.0.1",
-        "--port",
-        str(port),
-        "--request_timeout",
-        str(int(request_timeout)),
-        "--drop_params",
-    ]
-    child_env = {**os.environ, "OPENAI_API_KEY": api_key}
-
-    logger.info("Spawning LiteLLM proxy on %s -> %s (%s)", base_url, openai_base_url, openai_model)
-    proc = subprocess.Popen(
-        cmd,
-        env=child_env,
-        stdout=sys.stderr,
-        stderr=sys.stderr,
-        # New process group so a Ctrl-C on the parent doesn't double-kill
-        # the proxy mid-tear-down.
-        start_new_session=True,
-    )
 
+    # Why a config file instead of inline ``--model`` flags: we need to set
+    # ``stream: false`` on the upstream call so LiteLLM buffers the full
+    # response from vLLM and then re-emits it as Anthropic SSE to the
+    # client.  The ``litellm`` CLI has no inline flag for that.
+    #
+    # Why force non-streaming upstream: vLLM's streaming tool-call
+    # extractors (qwen3_coder, qwen3_xml as of 0.19.0) intermittently
+    # forward ``content_block_delta`` events without first emitting a
+    # ``content_block_start`` for the synthesized tool_use block.
+    # claude-code's stream parser then raises ``API Error: Content block
+    # not found`` and the agent loop aborts mid-task.  Empirically, a
+    # 4-pair batch on Qwen3.5-9B at 128k went from 4/6 agents Submitted +
+    # 8 occurrences of ``Content block not found`` (streaming upstream)
+    # to 8/8 Submitted + 0 errors (non-streaming upstream).  Tracking
+    # upstream as vllm-project/vllm#39056.
+    #
+    # ``drop_params`` is set at the ``litellm_settings`` level so it applies
+    # to every request: provider-specific kwargs that the upstream would
+    # otherwise reject are dropped instead of forwarded.
+    config = {
+        "model_list": [
+            {
+                "model_name": openai_model,
+                "litellm_params": {
+                    "model": f"openai/{openai_model}",
+                    "api_base": openai_base_url,
+                    "api_key": api_key,
+                    "stream": False,
+                },
+            }
+        ],
+        "litellm_settings": {
+            "request_timeout": int(request_timeout),
+            "drop_params": True,
+        },
+        "general_settings": {
+            "master_key": auth_token,
+        },
+    }
+    config_fd, config_path = tempfile.mkstemp(prefix="cb-litellm-", suffix=".yaml")
     try:
-        deadline = time.monotonic() + PROXY_STARTUP_TIMEOUT_SECONDS
-        _wait_for_health(base_url, deadline)
-        logger.info("LiteLLM proxy healthy on %s", base_url)
-        yield base_url, auth_token
+        with os.fdopen(config_fd, "w") as f:
+            # PyYAML isn't a runtime dep here, but LiteLLM happily reads
+            # JSON as YAML (valid JSON is also valid YAML).
+            import json as _json
+
+            _json.dump(config, f)
+
+        cmd = [
+            litellm_bin,
+            "--config",
+            config_path,
+            "--host",
+            "127.0.0.1",
+            "--port",
+            str(port),
+        ]
+        # OPENAI_API_KEY is still set in case LiteLLM consults env for
+        # something the config doesn't cover (request-time auth, etc).
+        child_env = {**os.environ, "OPENAI_API_KEY": api_key}
+
+        logger.info(
+            "Spawning LiteLLM proxy on %s -> %s (%s, stream=false upstream)",
+            base_url,
+            openai_base_url,
+            openai_model,
+        )
+        proc = subprocess.Popen(
+            cmd,
+            env=child_env,
+            stdout=sys.stderr,
+            stderr=sys.stderr,
+            # New process group so a Ctrl-C on the parent doesn't
+            # double-kill the proxy mid-tear-down.
+            start_new_session=True,
+        )
+
+        try:
+            deadline = time.monotonic() + PROXY_STARTUP_TIMEOUT_SECONDS
+            _wait_for_health(base_url, deadline)
+            logger.info("LiteLLM proxy healthy on %s", base_url)
+            yield base_url, auth_token
+        finally:
+            if proc.poll() is None:
+                try:
+                    proc.send_signal(signal.SIGTERM)
+                    proc.wait(timeout=10)
+                except subprocess.TimeoutExpired:
+                    logger.warning("LiteLLM did not exit on SIGTERM; killing")
+                    proc.kill()
+                    proc.wait(timeout=5)
     finally:
-        if proc.poll() is None:
-            try:
-                proc.send_signal(signal.SIGTERM)
-                proc.wait(timeout=10)
-            except subprocess.TimeoutExpired:
-                logger.warning("LiteLLM did not exit on SIGTERM; killing")
-                proc.kill()
-                proc.wait(timeout=5)
+        try:
+            os.unlink(config_path)
+        except OSError:
+            pass