OpenEuroLLM · geoalgo · Apr 2, 2026 · Feb 14, 2026 · Feb 14, 2026 · Feb 15, 2026
diff --git a/README.md b/README.md
@@ -22,7 +22,7 @@ Compared to other libraries, here is a breakdown of features:
 | **Arena-Hard-Auto** | ❌  | ❌  | ✅  | ❌  | ❌                         | ❌                                            |
 | **Lighteval** | ✅  | ❌  | ❌  | ❌  | ❌                         | ❌                                       |
 | **Evalchemy** | ✅  | ✅  | ❌  | ❌  | ❌                         | ❌                                           |
-| **OpenJury** | 🔜  | ✅  | ✅  | ✅  | ✅                         | ✅                                          |
+| **OpenJury** | ✅  | ✅  | ✅  | ✅  | ✅                         | ✅                                          |
 
 The table has been done on Oct 2025, in case some libraries implemented missing features, please open an issue 
 or send a PR, we will be happy to update the information.
@@ -172,10 +172,29 @@ python openjury/generate_and_evaluate.py \
 
 This override applies to all vLLM models in the run. For remote providers (OpenAI, Together, OpenRouter), the flag is ignored since they handle templates server-side.
 
+### MT-Bench (Multi-Turn Evaluation)
+
+MT-Bench evaluates multi-turn conversation ability using 80 two-turn questions across 8 categories
+(writing, roleplay, reasoning, math, coding, extraction, STEM, humanities).
+It uses category-dependent judge prompts, plus reference answers for the math, reasoning, and coding categories.
+Questions are automatically downloaded from the [LMSYS MT-Bench HuggingFace space](https://huggingface.co/spaces/lmsys/mt-bench).
+
+```bash
+uv run python openjury/generate_and_evaluate.py \
+  --dataset mt-bench \
+  --model_A VLLM/Qwen/Qwen2.5-7B-Instruct \
+  --model_B OpenRouter/openai/gpt-4o \
+  --judge_model OpenRouter/deepseek/deepseek-chat-v3.1 \
+  --n_instructions 10
+```
+
+Results include per-category and per-turn win rate breakdowns. Use `--swap_mode both` to correct for judge position bias.
+
 ## 📊 Supported Datasets
 
 | Dataset               | Description                                                                                    |
 |-----------------------|------------------------------------------------------------------------------------------------|
+| `mt-bench`            | 80 multi-turn (2-turn) questions across 8 categories ([LMSYS MT-Bench](https://arxiv.org/abs/2306.05685)) |
 | `alpaca-eval`         | General instruction-following benchmark                                                        |
 | `arena-hard`          | More challenging evaluation suite                                                              |
 | `m-arena-hard`        | Translated version of Arena-Hard in 23 languages                                               |

diff --git a/openjury/evaluate.py b/openjury/evaluate.py
@@ -15,6 +15,7 @@
     data_root,
     download_hf,
     do_inference,
+    truncate,
 )
 
 
@@ -51,14 +52,22 @@ def get_regexp_match(self, s: str, regex: str, group_index: int = 1):
 
 def load_judge_system_and_user_prompt(
     provide_explanation: bool = True,
+    multi_turn: bool = False,
 ) -> tuple[str, str]:
     # Prepare judge
     with open(Path(__file__).parent / "prompts" / "system-prompt.txt", "r") as f:
         system_prompt = str(f.read())
 
-    prompt_filename = (
-        "prompt-with-explanation.txt" if provide_explanation else "prompt.txt"
-    )
+    if multi_turn:
+        prompt_filename = (
+            "prompt-multi-turn-with-explanation.txt"
+            if provide_explanation
+            else "prompt-multi-turn.txt"
+        )
+    else:
+        prompt_filename = (
+            "prompt-with-explanation.txt" if provide_explanation else "prompt.txt"
+        )
     with open(Path(__file__).parent / "prompts" / prompt_filename, "r") as f:
         user_prompt_template = str(f.read())
 
@@ -240,14 +249,6 @@ def annotate_battles(
         [("system", system_prompt), ("user", user_prompt_template)]
     )
 
-    def truncate(s: str, max_len: int | None = None):
-        if not isinstance(s, str):
-            return ""
-        if max_len is not None:
-            return s[:max_len]
-        else:
-            return s
-
     inputs = prompt_template.batch(
         [
             {

diff --git a/openjury/generate.py b/openjury/generate.py
@@ -4,16 +4,10 @@
 from openjury.utils import (
     do_inference,
     make_model,
+    truncate,
 )
 
 
-def truncate(s: str, max_len: int | None = None):
-    if max_len is not None:
-        return s[:max_len]
-    else:
-        return s
-
-
 def generate_instructions(
     instructions: pd.Series,
     model: str,
@@ -57,6 +51,92 @@ def generate_instructions(
     return df_outputs
 
 
+def generate_multiturn(
+    questions: pd.DataFrame,
+    model: str,
+    truncate_input_chars: int | None = 8192,
+    max_tokens: int | None = 8192,
+    use_tqdm: bool = True,
+    **model_kwargs,
+) -> pd.DataFrame:
+    """Generate two-turn completions for MT-Bench style questions.
+
+    Turn 1 answers are generated first and then replayed as conversation
+    context when generating the turn 2 answers.
+
+    Args:
+        questions: DataFrame with columns turn_1 and turn_2; its index is
+            returned as instruction_index.
+        model: Model specification string (e.g. "VLLM/model-name").
+        truncate_input_chars: max_len passed to truncate for every prompt
+            and for the replayed turn 1 answer.
+        max_tokens: Forwarded to make_model.
+        use_tqdm: Forwarded to do_inference.
+        **model_kwargs: Provider-specific options forwarded to make_model
+            (e.g. max_model_len, chat_template for VLLM).
+    Returns:
+        DataFrame with columns: instruction_index, completion_turn_1, completion_turn_2
+    """
+    chat_model = make_model(model, max_tokens=max_tokens, **model_kwargs)
+
+    system_prompt = "You are a helpful assistant."
+    turn1_template = ChatPromptTemplate.from_messages(
+        [("system", system_prompt), ("user", "{user_prompt}")]
+    )
+    # Built once and reused for every question that has a follow-up.
+    turn2_template = ChatPromptTemplate.from_messages(
+        [
+            ("system", system_prompt),
+            ("user", "{turn_1}"),
+            ("assistant", "{turn_1_answer}"),
+            ("user", "{turn_2}"),
+        ]
+    )
+
+    # Truncate each turn 1 prompt once; both turns send the same text.
+    turn1_prompts = [
+        truncate(turn_1, max_len=truncate_input_chars)
+        for turn_1 in questions["turn_1"]
+    ]
+    turn1_inputs = turn1_template.batch(
+        [{"user_prompt": prompt} for prompt in turn1_prompts]
+    )
+
+    print(f"Generating turn 1 completions ({len(turn1_inputs)} questions).")
+    completions_turn_1 = do_inference(
+        chat_model=chat_model, inputs=turn1_inputs, use_tqdm=use_tqdm
+    )
+
+    turn2_inputs = []
+    for turn_1, turn_2, t1_answer in zip(
+        turn1_prompts, questions["turn_2"], completions_turn_1
+    ):
+        # pandas may hold a missing follow-up as NaN rather than None, and an
+        # `is None` test would let NaN through as if it were a real prompt.
+        if pd.isna(turn_2):
+            prompt = turn1_template.invoke({"user_prompt": "No follow-up question."})
+        else:
+            answer = truncate(str(t1_answer), max_len=truncate_input_chars)
+            follow_up = truncate(turn_2, max_len=truncate_input_chars)
+            prompt = turn2_template.invoke(
+                {"turn_1": turn_1, "turn_1_answer": answer, "turn_2": follow_up}
+            )
+        turn2_inputs.append(prompt)
+
+    print(f"Generating turn 2 completions ({len(turn2_inputs)} questions).")
+    completions_turn_2 = do_inference(
+        chat_model=chat_model, inputs=turn2_inputs, use_tqdm=use_tqdm
+    )
+
+    return pd.DataFrame(
+        data={
+            "instruction_index": questions.index.tolist(),
+            "completion_turn_1": completions_turn_1,
+            "completion_turn_2": completions_turn_2,
+        },
+    )
+
+
 def generate_base(
     instructions: pd.Series,
     model: str,