Commits (30):
c6b2b0a  update dependencies to support Qwen 3.5 (ErlisLushtaku, Mar 31, 2026)
1f4bae8  slurmpilot scripts (ErlisLushtaku, Apr 1, 2026)
25b0355  update dep versions (ErlisLushtaku, Apr 1, 2026)
ab065fd  fix support for VLLM (ErlisLushtaku, Apr 6, 2026)
ef1c92c  remove qwen35 smoke launcher (ErlisLushtaku, Apr 6, 2026)
32f2e7e  use json schema structured outputs, tighten vllm range (ErlisLushtaku, Apr 7, 2026)
5f2edf0  fix formatting (ErlisLushtaku, Apr 7, 2026)
cffb6dd  Fix Qwen3.5 with mt-bench (ErlisLushtaku, Apr 7, 2026)
ac243aa  use latest vllm with thinking tokens limits and thinking field in the… (ErlisLushtaku, Apr 13, 2026)
41298a4  Merge remote-tracking branch 'origin/main' into erlislushtaku/fix/sup… (ErlisLushtaku, Apr 14, 2026)
319050d  thinking token handling improvements, mt-bench improvements, use mt-b… (ErlisLushtaku, Apr 14, 2026)
cb7ada5  Revert to free form generation, and use thinking token budget with re… (ErlisLushtaku, Apr 15, 2026)
84faa05  revert unnecessary changes and relics from earlier trials (ErlisLushtaku, Apr 17, 2026)
c063f3d  delete slurmpilot script (ErlisLushtaku, Apr 17, 2026)
ec7fc95  Revert comment removal (ErlisLushtaku, Apr 17, 2026)
20ca9a5  simplify and revert unnecessary changes (ErlisLushtaku, Apr 17, 2026)
217dc8d  Support Skywork (ErlisLushtaku, Apr 17, 2026)
8087c15  Add judge input character truncation and model length configurations … (ErlisLushtaku, Apr 20, 2026)
91d67ef  add llmcompressor dev dependency for quantization (ErlisLushtaku, Apr 20, 2026)
5e8efc9  Update baseline handling for Arena-Hard datasets (ErlisLushtaku, Apr 21, 2026)
2af4714  Add m-arenahard-v2.0 (ErlisLushtaku, Apr 21, 2026)
da6818e  add default baseline for mt-bench (ErlisLushtaku, Apr 21, 2026)
891c417  handle prohibited content errors for gemini in openrouter (ErlisLushtaku, Apr 21, 2026)
fb36154  update system prompt with alpaca eval version, fix mismatch for expec… (ErlisLushtaku, Apr 22, 2026)
f33f191  roll back to the default system prompt (ErlisLushtaku, Apr 28, 2026)
e21639e  update dependencies for qwen3.5 and gemma4 runs (ErlisLushtaku, Apr 28, 2026)
157d939  Merge origin/main into support-qwen-3.5 (ErlisLushtaku, Apr 28, 2026)
41d925e  Improve pairwise benchmark run controls and accounting (ErlisLushtaku, Apr 29, 2026)
16dc5e1  Clean up judge argument handling (ErlisLushtaku, Apr 29, 2026)
5411ff8  Add default score-based verdict mode for fastchat (ErlisLushtaku, Apr 29, 2026)
52 changes: 34 additions & 18 deletions judgearena/evaluate.py
@@ -17,6 +17,7 @@
do_inference,
download_hf,
read_df,
truncate,
)


@@ -51,6 +52,18 @@ def get_regexp_match(self, s: str, regex: str, group_index: int = 1):
return float(m.group(group_index).strip(" "))


_PAIR_SCORE_MIN = 0
_PAIR_SCORE_MAX = 10


def build_pair_score_output_choices() -> list[str]:
return [
f"score_A: {a}\nscore_B: {b}"
for a in range(_PAIR_SCORE_MIN, _PAIR_SCORE_MAX + 1)
for b in range(_PAIR_SCORE_MIN, _PAIR_SCORE_MAX + 1)
]


_COMPLETION_LABEL_SINGLE = "Answer"
_COMPLETION_LABEL_MULTI_TURN = "Conversation with User"
_EXPLANATION_SUFFIX = ", first starts with an explanation of your judgement"
@@ -302,27 +315,30 @@ def annotate_battles(
prompt_template = ChatPromptTemplate.from_messages(
[("system", system_prompt), ("user", user_prompt_template)]
)

def truncate(s: str, max_len: int | None = None):
if not isinstance(s, str):
return ""
if max_len is not None:
return s[:max_len]
else:
return s

inputs = prompt_template.batch(
[
truncated_completion_count = 0
input_payloads = []
for user_prompt, completion_A, completion_B in zip(
instructions, completions_A, completions_B, strict=True
):
truncated_completion_A = truncate(completion_A, max_len=truncate_input_chars)
truncated_completion_B = truncate(completion_B, max_len=truncate_input_chars)
truncated_completion_count += int(truncated_completion_A != completion_A)
truncated_completion_count += int(truncated_completion_B != completion_B)
input_payloads.append(
{
"user_prompt": user_prompt,
"completion_A": truncate(completion_A, max_len=truncate_input_chars),
"completion_B": truncate(completion_B, max_len=truncate_input_chars),
"completion_A": truncated_completion_A,
"completion_B": truncated_completion_B,
}
for user_prompt, completion_A, completion_B in zip(
instructions, completions_A, completions_B, strict=True
)
]
)
)
if truncated_completion_count:
print(
"Warning: truncated "
f"{truncated_completion_count} judge completions to "
f"{truncate_input_chars} characters before evaluation."
)
inputs = prompt_template.batch(input_payloads)

print(f"Start LLM judge annotation ({len(inputs)} annotations).")
judge_completions = do_inference(
chat_model=judge_chat_model,
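
A minimal usage sketch of the new helper (not part of the diff; it only assumes the import path shown above):

from judgearena.evaluate import build_pair_score_output_choices

# The helper enumerates every integer (score_A, score_B) pair in [0, 10],
# giving the judge a closed set of 121 valid outputs.
choices = build_pair_score_output_choices()
assert len(choices) == 121
print(choices[0])   # "score_A: 0\nscore_B: 0"
print(choices[-1])  # "score_A: 10\nscore_B: 10"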
44 changes: 39 additions & 5 deletions judgearena/generate_and_evaluate.py
@@ -12,7 +12,11 @@

import pandas as pd

from judgearena.evaluate import judge_and_parse_prefs, resolve_judge_prompts
from judgearena.evaluate import (
build_pair_score_output_choices,
judge_and_parse_prefs,
resolve_judge_prompts,
)
from judgearena.generate import generate_base, generate_instructions
from judgearena.instruction_dataset import load_instructions
from judgearena.mt_bench.mt_bench_utils import run_mt_bench
@@ -30,16 +34,40 @@
def try_load_dataset_completions(
dataset: str, model: str, n_instructions: int | None
) -> pd.DataFrame | None:
"""Try loading pre-existing completions from the dataset.
"""Try loading pre-existing completions from the dataset or a local file.

Some datasets (e.g. alpaca-eval) ship with completions for well-known
models such as ``gpt4_1106_preview``. When ``model`` matches a column in
``model_outputs/{dataset}.csv.zip``, those completions are returned
directly so that no model instantiation / generation is needed.

``model`` may also be a local dataframe path. Local files must contain
``instruction_index`` and ``output`` columns.

Returns a DataFrame with columns ``completion`` and ``instruction_index``,
or ``None`` when no pre-existing completions are found.
"""
local_path = Path(model)
if local_path.exists():
print(f"Loading completions from local path '{local_path}'.")
df_outputs = read_df(local_path)
required_columns = {"instruction_index", "output"}
missing_columns = required_columns.difference(df_outputs.columns)
if missing_columns:
missing_columns_list = ", ".join(sorted(missing_columns))
raise ValueError(
f"Local completion file '{local_path}' is missing required columns: "
f"{missing_columns_list}."
)

df_outputs = df_outputs.loc[:, ["instruction_index", "output"]].rename(
columns={"output": "completion"}
)
df_outputs.loc[:, "completion"] = df_outputs.loc[:, "completion"].fillna("")
if n_instructions is not None:
df_outputs = df_outputs.head(n_instructions)
return df_outputs

local_path_tables = data_root / "tables"
download_hf(name=dataset, local_path=local_path_tables)
output_path = local_path_tables / "model_outputs" / f"{dataset}.csv.zip"
@@ -337,7 +365,7 @@ def main(args: CliArgs):
)
if dataset_completions_A is not None:
completions_A = dataset_completions_A.set_index("instruction_index").loc[
:, "completion"
instructions.index, "completion"
]
else:
completions_A = cache_function_dataframe(
@@ -356,7 +384,7 @@
)
if dataset_completions_B is not None:
completions_B = dataset_completions_B.set_index("instruction_index").loc[
:, "completion"
instructions.index, "completion"
]
else:
completions_B = cache_function_dataframe(
@@ -377,12 +405,18 @@
print(completions_B.values[0])
print(f"Evaluating completions with judge {args.judge_model}.")

judge_model_kwargs = dict(args.engine_kwargs)
if not args.provide_explanation and args.judge_model.split("/")[0] == "VLLM":
judge_model_kwargs["structured_outputs_choice"] = (
build_pair_score_output_choices()
)

judge_chat_model = make_model(
model=args.judge_model,
max_tokens=args.max_out_tokens_judge,
max_model_len=args.max_model_len,
chat_template=args.chat_template,
**args.engine_kwargs,
**judge_model_kwargs,
)

name = f"{args.dataset}-{args.model_A}-{args.model_B}-{args.judge_model}"
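
For illustration, a local completions file satisfying the new loader contract could be produced like this (hypothetical file name; the loader only requires the two columns named in the docstring):

import pandas as pd

# try_load_dataset_completions treats any existing path as a local dataframe;
# it must contain "instruction_index" and "output" columns.
pd.DataFrame(
    {
        "instruction_index": ["a", "b"],
        "output": ["Answer A", "Answer B"],
    }
).to_csv("my_model_outputs.csv", index=False)

# Passing model_B="my_model_outputs.csv" then skips generation for model B,
# and completions are aligned to instructions.index rather than row order.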
23 changes: 18 additions & 5 deletions judgearena/utils.py
@@ -202,6 +202,7 @@ def __init__(
**vllm_kwargs,
):
from vllm import LLM, SamplingParams
from vllm.sampling_params import StructuredOutputsParams

self.model_path = model
self.max_tokens = max_tokens
@@ -230,13 +231,19 @@ def __init__(
RuntimeWarning,
stacklevel=2,
)
self._sampling_params_kwargs = {
"max_tokens": max_tokens,
"temperature": float(vllm_kwargs.pop("temperature", 0.6)),
"top_p": float(vllm_kwargs.pop("top_p", 0.95)),
}
structured_outputs_choice = vllm_kwargs.pop("structured_outputs_choice", None)
if structured_outputs_choice is not None:
self._sampling_params_kwargs["structured_outputs"] = (
StructuredOutputsParams(choice=structured_outputs_choice)
)
self.sampling_params = SamplingParams(**self._sampling_params_kwargs)

self.llm = LLM(model=model, trust_remote_code=True, **vllm_kwargs)
self.sampling_params = SamplingParams(
max_tokens=max_tokens,
temperature=0.6,
top_p=0.95,
)

# Resolve chat template:
# 1. Explicit override always wins → use chat() with that template
@@ -262,6 +269,12 @@
self._use_generate = False
print(f"ChatVLLM: using tokenizer's chat template for '{model}'")

def set_temperature(self, temperature: float) -> None:
from vllm import SamplingParams

self._sampling_params_kwargs["temperature"] = float(temperature)
self.sampling_params = SamplingParams(**self._sampling_params_kwargs)

def _to_messages(self, input_item) -> list[dict]:
"""Convert LangChain prompt input to OpenAI-style messages."""
# Map LangChain message types to OpenAI roles
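
As a sanity check of the wiring above, the sampling parameters the wrapper builds reduce to roughly this sketch (assuming a vLLM release where vllm.sampling_params.StructuredOutputsParams exists, as the diff's import path implies; the literal values are illustrative):

from vllm import SamplingParams
from vllm.sampling_params import StructuredOutputsParams

# Mirrors ChatVLLM's defaults; structured_outputs is only set when a
# structured_outputs_choice kwarg is passed through the engine kwargs.
params = SamplingParams(
    max_tokens=2048,
    temperature=0.6,
    top_p=0.95,
    structured_outputs=StructuredOutputsParams(
        choice=["score_A: 0\nscore_B: 0", "score_A: 10\nscore_B: 10"]
    ),
)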
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -81,5 +81,6 @@ quote-style = "double"
indent-style = "space"

[project.optional-dependencies]
vllm = ["vllm==0.10.2", "transformers>=4.55.2,<5.0.0"]
# vLLM on PyPI pins transformers<5; optional extra matches that so `uv lock` can resolve.
vllm = ["vllm>=0.17.0,<1.0.0", "transformers>=4.56.0,<5.0.0"]
Collaborator:
vllm>=0.17.0,<1.0.0 is a very wide range. A few concerns:

  • Was this tested with a prebuilt wheel or built from source? Building vLLM from source on cluster nodes often fails due to CUDA kernel compilation issues.
  • Is the StructuredOutputsParams import path (vllm.sampling_params) stable across this entire range? It may have been introduced in 0.17 and could move; for example, StructuredOutputsParams was slightly different in vllm==0.11.0. A tighter, more stable version range seems safer.
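
A defensive import would make this concern explicit (hypothetical guard, not in the PR):

try:
    # Import path used by this PR.
    from vllm.sampling_params import StructuredOutputsParams
except ImportError as err:
    # Hypothetical fallback: older releases place the class elsewhere, so
    # fail at import time with a clear message instead of at request time.
    raise RuntimeError(
        "structured-output judging requires a vLLM release that provides "
        "vllm.sampling_params.StructuredOutputsParams"
    ) from err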

Collaborator (author):
Good point. I tightened the range; 0.18.1 was working, and I believe StructuredOutputsParams is stable across the new range.

Collaborator (author):
Changed it to v0.19+ so that we can use the thinking-token-limit parameter; that release line also ships some fixes for Qwen3.5.
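
For reference, the tightened pin described here would look something like the following in pyproject.toml (illustrative sketch; the upper bound is a placeholder, not taken from the PR):

vllm = ["vllm>=0.19.0,<0.20.0", "transformers>=4.56.0,<5.0.0"]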

llamacpp = ["llama-cpp-python>=0.3.0"]
2 changes: 1 addition & 1 deletion slurmpilot_scripts/launch_generation_and_evaluation.py
@@ -73,7 +73,7 @@
"dataset": f"{language}-contexts",
"model_A": baseline,
"model_B": model,
"judge_model": "VLLM/Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8",
"judge_model": "VLLM/Qwen/Qwen3.5-27B-FP8",
"n_instructions": 100,
# "ignore_cache": None,
}
121 changes: 121 additions & 0 deletions tests/test_local_completion_loading.py
@@ -0,0 +1,121 @@
import pandas as pd

import judgearena.evaluate as evaluate
import judgearena.generate_and_evaluate as generate_and_evaluate
from judgearena.generate_and_evaluate import CliArgs
from judgearena.generate_and_evaluate import main as main_generate_and_eval


def test_build_pair_score_output_choices_covers_all_integer_pairs():
choices = evaluate.build_pair_score_output_choices()

assert len(choices) == 121
assert len(set(choices)) == 121
assert "score_A: 0\nscore_B: 0" in choices
assert "score_A: 10\nscore_B: 10" in choices


def test_main_aligns_local_reference_by_instruction_index(tmp_path, monkeypatch):
instructions = pd.DataFrame(
{"instruction": ["Instruction B", "Instruction A"]},
index=pd.Index(["b", "a"], name="instruction_index"),
)
reference_path = tmp_path / "m-arena-hard-en-reference.csv"
pd.DataFrame(
{
"instruction_index": ["a", "b"],
"output": ["Answer A", "Answer B"],
}
).to_csv(reference_path, index=False)

monkeypatch.setattr(
generate_and_evaluate,
"load_instructions",
lambda dataset, n_instructions=None: (
instructions.head(n_instructions)
if n_instructions is not None
else instructions
),
)
monkeypatch.setattr(
generate_and_evaluate,
"cache_function_dataframe",
lambda fun, **_kwargs: fun(),
)

captured = {}

def fake_judge_and_parse_prefs(
*,
judge_chat_model,
instructions,
completions_A,
completions_B,
swap_mode,
provide_explanation,
system_prompt,
user_prompt_template,
truncate_input_chars,
use_tqdm,
):
captured["instructions"] = instructions
captured["completions_A"] = completions_A
captured["completions_B"] = completions_B
annotations = [{"judge_completion": "score A: 0 score B: 10"}] * len(
instructions
)
prefs = pd.Series([1.0] * len(instructions))
return annotations, [], prefs

monkeypatch.setattr(
generate_and_evaluate,
"judge_and_parse_prefs",
fake_judge_and_parse_prefs,
)

prefs = main_generate_and_eval(
CliArgs(
dataset="m-arena-hard-en",
model_A="Dummy/no answer",
model_B=str(reference_path),
judge_model="Dummy/score A: 0 score B: 10",
n_instructions=2,
result_folder=str(tmp_path / "results"),
)
)

assert captured["instructions"] == ["Instruction B", "Instruction A"]
assert captured["completions_A"] == ["no answer", "no answer"]
assert captured["completions_B"] == ["Answer B", "Answer A"]
assert prefs.tolist() == [1.0, 1.0]


def test_annotate_battles_warns_when_judge_completions_are_truncated(
monkeypatch, capsys
):
captured = {}

def fake_do_inference(*, chat_model, inputs, use_tqdm):
captured["judge_prompt"] = inputs[0].to_messages()[1].content
return ["score_A: 0\nscore_B: 10"]

monkeypatch.setattr(evaluate, "do_inference", fake_do_inference)

annotations = evaluate.annotate_battles(
judge_chat_model=object(),
instructions=["Instruction"],
completions_A=["Answer A"],
completions_B=["Answer B"],
truncate_input_chars=3,
)

stdout = capsys.readouterr().out
assert (
"Warning: truncated 2 judge completions to 3 characters before evaluation."
in stdout
)
assert "Ans" in captured["judge_prompt"]
assert "Answer A" not in captured["judge_prompt"]
assert "Answer B" not in captured["judge_prompt"]
assert annotations[0].completion_A == "Answer A"
assert annotations[0].completion_B == "Answer B"