Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [0.0.17] - 2026-05-25

### Fixed

- **Auto-proxy now forces non-streaming upstream — fixes `API Error: Content block not found` aborts mid-task for `claude_code` against vLLM-served Qwen3.x.** `cooperbench._proxy.managed_litellm` previously spawned LiteLLM with inline `--model` flags, which passed the request's `stream` parameter through to the upstream unchanged. vLLM 0.19.0's `qwen3_coder` / `qwen3_xml` streaming tool-call extractors intermittently forward `content_block_delta` events without first emitting a matching `content_block_start` for the synthesized `tool_use` block; claude-code's stream parser then raises `Content block not found` and the agent loop aborts. The fix switches to a temp YAML config that sets `litellm_params.stream: false`, so LiteLLM buffers the full upstream response and re-emits well-formed Anthropic SSE (with proper `content_block_start` → `content_block_delta` → `content_block_stop` ordering) to claude-code. Empirically on Qwen3.5-9B at 128k against a 4-pair dspy_task batch: 4/6 agents Submitted with 8 occurrences of `Content block not found` (streaming upstream) → **8/8 Submitted with 0 errors** (non-streaming upstream); patch sizes 30/102/72/76/70/48/186/47 lines, real multi-turn iteration (up to 35 steps). Confirmed end-to-end through the auto-proxy with a fresh `cooperbench run --openai-base-url ...`: 0 errors, both agents produced sensible diffs. Tracking upstream as [vllm-project/vllm#39056](https://github.com/vllm-project/vllm/issues/39056).

## [0.0.16] - 2026-05-25

### Fixed
Expand Down
2 changes: 1 addition & 1 deletion src/cooperbench/__about__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"""Version information for CooperBench."""

__version__ = "0.0.16"
__version__ = "0.0.17"
138 changes: 95 additions & 43 deletions src/cooperbench/_proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import socket
import subprocess
import sys
import tempfile
import time
import urllib.error
import urllib.request
Expand Down Expand Up @@ -86,49 +87,100 @@ def managed_litellm(

port = _find_free_port()
base_url = f"http://localhost:{port}"
# LiteLLM's ``openai/<model>`` provider prefix tells it to forward as
# OpenAI-format to ``api_base``. ``--drop_params`` makes it tolerant
# of provider-specific kwargs that the upstream doesn't accept.
# The upstream API key is passed via ``OPENAI_API_KEY`` in the child
# env (LiteLLM CLI has no inline ``--api_key`` flag).
cmd = [
litellm_bin,
"--model",
f"openai/{openai_model}",
"--api_base",
openai_base_url,
"--host",
"127.0.0.1",
"--port",
str(port),
"--request_timeout",
str(int(request_timeout)),
"--drop_params",
]
child_env = {**os.environ, "OPENAI_API_KEY": api_key}

logger.info("Spawning LiteLLM proxy on %s -> %s (%s)", base_url, openai_base_url, openai_model)
proc = subprocess.Popen(
cmd,
env=child_env,
stdout=sys.stderr,
stderr=sys.stderr,
# New process group so a Ctrl-C on the parent doesn't double-kill
# the proxy mid-tear-down.
start_new_session=True,
)

# Why a config file instead of inline ``--model`` flags: we need to set
# ``stream: false`` on the upstream call so LiteLLM buffers the full
# response from vLLM and then re-emits it as Anthropic SSE to the
# client. The ``litellm`` CLI has no inline flag for that.
#
# Why force non-streaming upstream: vLLM's streaming tool-call
# extractors (qwen3_coder, qwen3_xml as of 0.19.0) intermittently
# forward ``content_block_delta`` events without first emitting a
# ``content_block_start`` for the synthesized tool_use block.
# claude-code's stream parser then raises ``API Error: Content block
# not found`` and the agent loop aborts mid-task. Empirically, a
# 4-pair batch on Qwen3.5-9B at 128k went from 4/6 agents Submitted +
# 8 occurrences of ``Content block not found`` (streaming upstream)
# to 8/8 Submitted + 0 errors (non-streaming upstream). Tracking
# upstream as vllm-project/vllm#39056.
#
# ``drop_params`` is set at the ``litellm_settings`` level so it applies
# to every request, dropping provider-specific kwargs the upstream
# would otherwise reject.
config = {
"model_list": [
{
"model_name": openai_model,
"litellm_params": {
"model": f"openai/{openai_model}",
"api_base": openai_base_url,
"api_key": api_key,
"stream": False,
},
}
],
"litellm_settings": {
"request_timeout": int(request_timeout),
"drop_params": True,
},
"general_settings": {
"master_key": auth_token,
},
}
config_fd, config_path = tempfile.mkstemp(prefix="cb-litellm-", suffix=".yaml")
try:
deadline = time.monotonic() + PROXY_STARTUP_TIMEOUT_SECONDS
_wait_for_health(base_url, deadline)
logger.info("LiteLLM proxy healthy on %s", base_url)
yield base_url, auth_token
with os.fdopen(config_fd, "w") as f:
# PyYAML isn't a runtime dep here, but LiteLLM happily reads
# JSON as YAML (JSON is a subset of YAML 1.2).
import json as _json

_json.dump(config, f)

cmd = [
litellm_bin,
"--config",
config_path,
"--host",
"127.0.0.1",
"--port",
str(port),
]
# OPENAI_API_KEY is still set in case LiteLLM consults env for
# something the config doesn't cover (request-time auth, etc).
child_env = {**os.environ, "OPENAI_API_KEY": api_key}

logger.info(
"Spawning LiteLLM proxy on %s -> %s (%s, stream=false upstream)",
base_url,
openai_base_url,
openai_model,
)
proc = subprocess.Popen(
cmd,
env=child_env,
stdout=sys.stderr,
stderr=sys.stderr,
# New process group so a Ctrl-C on the parent doesn't
# double-kill the proxy mid-tear-down.
start_new_session=True,
)

try:
deadline = time.monotonic() + PROXY_STARTUP_TIMEOUT_SECONDS
_wait_for_health(base_url, deadline)
logger.info("LiteLLM proxy healthy on %s", base_url)
yield base_url, auth_token
finally:
if proc.poll() is None:
try:
proc.send_signal(signal.SIGTERM)
proc.wait(timeout=10)
except subprocess.TimeoutExpired:
logger.warning("LiteLLM did not exit on SIGTERM; killing")
proc.kill()
proc.wait(timeout=5)
finally:
if proc.poll() is None:
try:
proc.send_signal(signal.SIGTERM)
proc.wait(timeout=10)
except subprocess.TimeoutExpired:
logger.warning("LiteLLM did not exit on SIGTERM; killing")
proc.kill()
proc.wait(timeout=5)
try:
os.unlink(config_path)
except OSError:
pass
Loading