Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 30 additions & 3 deletions benchmarks/commit0/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,9 @@
get_default_on_result_writer,
)
from benchmarks.utils.image_utils import create_docker_workspace, remote_image_exists
from benchmarks.utils.intelligent_routing import classify_and_route
from benchmarks.utils.litellm_proxy import build_eval_llm
from benchmarks.utils.llm_config import load_llm_config
from benchmarks.utils.llm_config import load_llm_config, maybe_load_router_spec
from benchmarks.utils.models import (
EvalInstance,
EvalMetadata,
Expand Down Expand Up @@ -385,7 +386,24 @@ def evaluate_instance(
if is_acp_agent(self.metadata.agent_type):
agent = build_acp_agent(self.metadata.agent_type, self.metadata.llm.model)
else:
agent_llm = build_eval_llm(self.metadata.llm)
primary_llm = self.metadata.llm
if self.metadata.routing is not None:
decision = classify_and_route(
benchmark="commit0",
instance_data=instance.data,
router=self.metadata.routing,
)
logger.info(
"intelligent-routing instance=%s category=%s model=%s "
"vision_fallback=%s raw=%r",
instance.id,
decision.category,
decision.chosen_model_id,
decision.forced_vision_fallback,
decision.raw_classifier_output[:120],
)
primary_llm = decision.chosen_llm
agent_llm = build_eval_llm(primary_llm)
tools = get_tools_for_preset(
self.metadata.tool_preset, enable_browser=False
)
Expand All @@ -394,7 +412,7 @@ def evaluate_instance(
condenser = None
if self.metadata.enable_condenser:
condenser = LLMSummarizingCondenser(
llm=build_eval_llm(self.metadata.llm, usage_id="condenser"),
llm=build_eval_llm(primary_llm, usage_id="condenser"),
max_size=self.metadata.condenser_max_size,
keep_first=self.metadata.condenser_keep_first,
)
Expand Down Expand Up @@ -638,6 +656,14 @@ def main() -> None:
raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}")

llm = load_llm_config(args.llm_config_path)
routing_spec = maybe_load_router_spec(args.llm_config_path)
if routing_spec is not None:
logger.info(
"Using intelligent routing: classifier=%s tiers=%s fallback=%s",
routing_spec.classifier_llm.model,
sorted(routing_spec.tiers.keys()),
routing_spec.fallback_model_id,
)
logger.info("Using LLM config: %s", llm.model_dump_json(indent=2))

dataset_description = (
Expand All @@ -658,6 +684,7 @@ def main() -> None:

metadata = EvalMetadata(
llm=llm,
routing=routing_spec,
dataset=args.dataset,
dataset_split=args.split,
max_iterations=args.max_iterations,
Expand Down
33 changes: 30 additions & 3 deletions benchmarks/gaia/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,9 @@
)
from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
from benchmarks.utils.image_utils import create_docker_workspace, remote_image_exists
from benchmarks.utils.intelligent_routing import classify_and_route
from benchmarks.utils.litellm_proxy import build_eval_llm
from benchmarks.utils.llm_config import load_llm_config
from benchmarks.utils.llm_config import load_llm_config, maybe_load_router_spec
from benchmarks.utils.models import EvalInstance, EvalMetadata, EvalOutput
from benchmarks.utils.tool_presets import get_tools_for_preset
from benchmarks.utils.version import IMAGE_TAG_PREFIX
Expand Down Expand Up @@ -325,7 +326,24 @@ def evaluate_instance(
if is_acp_agent(self.metadata.agent_type):
agent = build_acp_agent(self.metadata.agent_type, self.metadata.llm.model)
else:
agent_llm = build_eval_llm(self.metadata.llm)
primary_llm = self.metadata.llm
if self.metadata.routing is not None:
decision = classify_and_route(
benchmark="gaia",
instance_data=instance.data,
router=self.metadata.routing,
)
logger.info(
"intelligent-routing instance=%s category=%s model=%s "
"vision_fallback=%s raw=%r",
instance.id,
decision.category,
decision.chosen_model_id,
decision.forced_vision_fallback,
decision.raw_classifier_output[:120],
)
primary_llm = decision.chosen_llm
agent_llm = build_eval_llm(primary_llm)
tools = get_tools_for_preset(self.metadata.tool_preset, enable_browser=True)
if self.metadata.enable_delegation:
tools.append(Tool(name=TaskToolSet.name))
Expand All @@ -334,7 +352,7 @@ def evaluate_instance(
condenser = None
if self.metadata.enable_condenser:
condenser = LLMSummarizingCondenser(
llm=build_eval_llm(self.metadata.llm, usage_id="condenser"),
llm=build_eval_llm(primary_llm, usage_id="condenser"),
max_size=self.metadata.condenser_max_size,
keep_first=self.metadata.condenser_keep_first,
)
Expand Down Expand Up @@ -618,6 +636,14 @@ def main() -> None:
raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}")

llm = load_llm_config(args.llm_config_path)
routing_spec = maybe_load_router_spec(args.llm_config_path)
if routing_spec is not None:
logger.info(
"Using intelligent routing: classifier=%s tiers=%s fallback=%s",
routing_spec.classifier_llm.model,
sorted(routing_spec.tiers.keys()),
routing_spec.fallback_model_id,
)
logger.info("Using LLM config: %s", llm.model_dump_json(indent=2))

# Construct dataset description
Expand All @@ -635,6 +661,7 @@ def main() -> None:
# Create metadata
metadata = EvalMetadata(
llm=llm,
routing=routing_spec,
dataset=args.dataset,
dataset_split=args.split,
max_iterations=args.max_iterations,
Expand Down
36 changes: 32 additions & 4 deletions benchmarks/swebench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,9 @@
)
from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
from benchmarks.utils.image_utils import remote_image_exists
from benchmarks.utils.intelligent_routing import classify_and_route
from benchmarks.utils.litellm_proxy import build_eval_llm
from benchmarks.utils.llm_config import load_llm_config
from benchmarks.utils.llm_config import load_llm_config, maybe_load_router_spec
from benchmarks.utils.models import (
EvalInstance,
EvalMetadata,
Expand Down Expand Up @@ -262,7 +263,24 @@ def evaluate_instance(
if is_acp_agent(self.metadata.agent_type):
agent = build_acp_agent(self.metadata.agent_type, self.metadata.llm.model)
else:
agent_llm = build_eval_llm(self.metadata.llm)
primary_llm = self.metadata.llm
if self.metadata.routing is not None:
decision = classify_and_route(
benchmark="swebench",
instance_data=instance.data,
router=self.metadata.routing,
)
logger.info(
"intelligent-routing instance=%s category=%s model=%s "
"vision_fallback=%s raw=%r",
instance.id,
decision.category,
decision.chosen_model_id,
decision.forced_vision_fallback,
decision.raw_classifier_output[:120],
)
primary_llm = decision.chosen_llm
agent_llm = build_eval_llm(primary_llm)
tools = get_tools_for_preset(
preset=self.metadata.tool_preset,
# Disable browser tools in CLI mode
Expand All @@ -273,7 +291,7 @@ def evaluate_instance(
condenser = None
if self.metadata.enable_condenser:
condenser = LLMSummarizingCondenser(
llm=build_eval_llm(self.metadata.llm, usage_id="condenser"),
llm=build_eval_llm(primary_llm, usage_id="condenser"),
max_size=self.metadata.condenser_max_size,
keep_first=self.metadata.condenser_keep_first,
)
Expand Down Expand Up @@ -395,7 +413,16 @@ def main() -> None:
raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}")

llm = load_llm_config(args.llm_config_path)
logger.info("Using LLM config: %s", llm.model_dump_json(indent=2))
routing_spec = maybe_load_router_spec(args.llm_config_path)
if routing_spec is not None:
logger.info(
"Using intelligent routing: classifier=%s tiers=%s fallback=%s",
routing_spec.classifier_llm.model,
sorted(routing_spec.tiers.keys()),
routing_spec.fallback_model_id,
)
else:
logger.info("Using LLM config: %s", llm.model_dump_json(indent=2))

dataset_description = (
args.dataset.replace("/", "__") + "-" + args.split.replace("/", "__")
Expand All @@ -422,6 +449,7 @@ def main() -> None:

metadata = EvalMetadata(
llm=llm,
routing=routing_spec,
dataset=args.dataset,
dataset_split=args.split,
max_iterations=args.max_iterations,
Expand Down
33 changes: 30 additions & 3 deletions benchmarks/swebenchmultimodal/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,9 @@
)
from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
from benchmarks.utils.image_utils import remote_image_exists
from benchmarks.utils.intelligent_routing import classify_and_route
from benchmarks.utils.litellm_proxy import build_eval_llm
from benchmarks.utils.llm_config import load_llm_config
from benchmarks.utils.llm_config import load_llm_config, maybe_load_router_spec
from benchmarks.utils.models import (
EvalInstance,
EvalMetadata,
Expand Down Expand Up @@ -242,7 +243,24 @@ def evaluate_instance(
if is_acp_agent(self.metadata.agent_type):
agent = build_acp_agent(self.metadata.agent_type, self.metadata.llm.model)
else:
agent_llm = build_eval_llm(self.metadata.llm)
primary_llm = self.metadata.llm
if self.metadata.routing is not None:
decision = classify_and_route(
benchmark="swebenchmultimodal",
instance_data=instance.data,
router=self.metadata.routing,
)
logger.info(
"intelligent-routing instance=%s category=%s model=%s "
"vision_fallback=%s raw=%r",
instance.id,
decision.category,
decision.chosen_model_id,
decision.forced_vision_fallback,
decision.raw_classifier_output[:120],
)
primary_llm = decision.chosen_llm
agent_llm = build_eval_llm(primary_llm)
tools = get_tools_for_preset(
self.metadata.tool_preset,
# Enable browser tools for frontend development tasks
Expand All @@ -253,7 +271,7 @@ def evaluate_instance(
condenser = None
if self.metadata.enable_condenser:
condenser = LLMSummarizingCondenser(
llm=build_eval_llm(self.metadata.llm, usage_id="condenser"),
llm=build_eval_llm(primary_llm, usage_id="condenser"),
max_size=self.metadata.condenser_max_size,
keep_first=self.metadata.condenser_keep_first,
)
Expand Down Expand Up @@ -441,6 +459,14 @@ def main() -> None:
raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}")

llm = load_llm_config(args.llm_config_path)
routing_spec = maybe_load_router_spec(args.llm_config_path)
if routing_spec is not None:
logger.info(
"Using intelligent routing: classifier=%s tiers=%s fallback=%s",
routing_spec.classifier_llm.model,
sorted(routing_spec.tiers.keys()),
routing_spec.fallback_model_id,
)
logger.info("Using LLM config: %s", llm.model_dump_json(indent=2))

dataset_description = (
Expand Down Expand Up @@ -468,6 +494,7 @@ def main() -> None:

metadata = EvalMetadata(
llm=llm,
routing=routing_spec,
dataset=args.dataset,
dataset_split=args.split,
max_iterations=args.max_iterations,
Expand Down
33 changes: 30 additions & 3 deletions benchmarks/swtbench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,9 @@
create_docker_workspace,
remote_image_exists,
)
from benchmarks.utils.intelligent_routing import classify_and_route
from benchmarks.utils.litellm_proxy import build_eval_llm
from benchmarks.utils.llm_config import load_llm_config
from benchmarks.utils.llm_config import load_llm_config, maybe_load_router_spec
from benchmarks.utils.models import (
EvalInstance,
EvalMetadata,
Expand Down Expand Up @@ -251,7 +252,24 @@ def evaluate_instance(
if is_acp_agent(self.metadata.agent_type):
agent = build_acp_agent(self.metadata.agent_type, self.metadata.llm.model)
else:
agent_llm = build_eval_llm(self.metadata.llm)
primary_llm = self.metadata.llm
if self.metadata.routing is not None:
decision = classify_and_route(
benchmark="swtbench",
instance_data=instance.data,
router=self.metadata.routing,
)
logger.info(
"intelligent-routing instance=%s category=%s model=%s "
"vision_fallback=%s raw=%r",
instance.id,
decision.category,
decision.chosen_model_id,
decision.forced_vision_fallback,
decision.raw_classifier_output[:120],
)
primary_llm = decision.chosen_llm
agent_llm = build_eval_llm(primary_llm)
tools = get_tools_for_preset(
self.metadata.tool_preset,
# Disable browser tools in CLI mode
Expand All @@ -262,7 +280,7 @@ def evaluate_instance(
condenser = None
if self.metadata.enable_condenser:
condenser = LLMSummarizingCondenser(
llm=build_eval_llm(self.metadata.llm, usage_id="condenser"),
llm=build_eval_llm(primary_llm, usage_id="condenser"),
max_size=self.metadata.condenser_max_size,
keep_first=self.metadata.condenser_keep_first,
)
Expand Down Expand Up @@ -382,6 +400,14 @@ def main() -> None:
raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}")

llm = load_llm_config(args.llm_config_path)
routing_spec = maybe_load_router_spec(args.llm_config_path)
if routing_spec is not None:
logger.info(
"Using intelligent routing: classifier=%s tiers=%s fallback=%s",
routing_spec.classifier_llm.model,
sorted(routing_spec.tiers.keys()),
routing_spec.fallback_model_id,
)
logger.info("Using LLM config: %s", llm.model_dump_json(indent=2))

dataset_description = (
Expand All @@ -408,6 +434,7 @@ def main() -> None:

metadata = EvalMetadata(
llm=llm,
routing=routing_spec,
dataset=args.dataset,
dataset_split=args.split,
max_iterations=args.max_iterations,
Expand Down
Loading
Loading