Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions judgearena/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,9 @@ def _build_elo_args(
baseline_model=args.baseline_model,
judge_model=args.judge_model,
n_instructions=args.n_instructions,
judge_prompt_preset=args.judge_prompt_preset,
judge_system_prompt_file=args.judge_system_prompt_file,
judge_user_prompt_file=args.judge_user_prompt_file,
provide_explanation=args.provide_explanation,
swap_mode=args.swap_mode,
ignore_cache=args.ignore_cache,
Expand Down Expand Up @@ -221,6 +224,9 @@ def _build_generate_and_evaluate_args(
use_tqdm=args.use_tqdm,
judge_model=args.judge_model,
n_instructions=args.n_instructions,
judge_prompt_preset=args.judge_prompt_preset,
judge_system_prompt_file=args.judge_system_prompt_file,
judge_user_prompt_file=args.judge_user_prompt_file,
provide_explanation=args.provide_explanation,
swap_mode=args.swap_mode,
ignore_cache=args.ignore_cache,
Expand Down
44 changes: 44 additions & 0 deletions judgearena/cli_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,14 @@ class BaseCliArgs:
judge_model: str

n_instructions: int | None = None
# Judge-prompt selection (see ``judgearena.prompts.registry``).
# ``judge_prompt_preset`` picks a named preset; the ``_file`` overrides
# take a path on disk and win over the preset. ``provide_explanation``
# is kept for backward compatibility and is equivalent to setting the
# preset to ``default_with_explanation``.
judge_prompt_preset: str | None = None
judge_system_prompt_file: str | None = None
judge_user_prompt_file: str | None = None
provide_explanation: bool = False
swap_mode: str = "fixed"
ignore_cache: bool = False
Expand Down Expand Up @@ -59,10 +67,46 @@ def add_common_arguments(parser: argparse.ArgumentParser) -> None:
type=int,
required=False,
)
parser.add_argument(
"--judge_prompt_preset",
type=str,
required=False,
default=None,
help=(
"Name of a judge-prompt preset registered in "
"``judgearena.prompts.registry`` (e.g. ``default``, "
"``default_with_explanation``, ``fluency``, "
"``fastchat-pairwise``). When omitted, the per-task default "
"is used."
),
)
parser.add_argument(
"--judge_system_prompt_file",
type=str,
required=False,
default=None,
help=(
"Path to a custom judge system prompt; takes precedence over "
"--judge_prompt_preset. Must be combined with "
"--judge_user_prompt_file."
),
)
parser.add_argument(
"--judge_user_prompt_file",
type=str,
required=False,
default=None,
help=(
"Path to a custom judge user-prompt template; takes precedence "
"over --judge_prompt_preset. Must be combined with "
"--judge_system_prompt_file."
),
)
parser.add_argument(
"--provide_explanation",
action="store_true",
help=(
"Equivalent to --judge_prompt_preset default_with_explanation. "
"If specified, judge will provide explanation before making a "
"judgement. Does not necessarily improve the accuracy of the judge "
"but enables some result interpretation."
Expand Down
82 changes: 54 additions & 28 deletions judgearena/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@
is_arena_hard_dataset,
)
from judgearena.log import get_logger
from judgearena.prompts.registry import (
ResolvedJudgePrompt,
resolve_judge_prompt,
)
from judgearena.repro import _to_jsonable, write_run_metadata
from judgearena.utils import (
compute_pref_summary,
Expand Down Expand Up @@ -59,57 +63,79 @@ def get_regexp_match(self, s: str, regex: str, group_index: int = 1):
return float(m.group(group_index).strip(" "))


# Fragments spliced into the bundled judge user-prompt templates by the
# legacy (pre-registry) prompt loader: the completion label switches between
# single-turn and multi-turn wording, and the suffix either asks for an
# explanation or closes the score fence.
# NOTE(review): the registry refactor appears to move template handling into
# judgearena.prompts.registry — confirm these constants are still referenced
# before relying on them.
_COMPLETION_LABEL_SINGLE = "Answer"
_COMPLETION_LABEL_MULTI_TURN = "Conversation with User"
_EXPLANATION_SUFFIX = ", first starts with an explanation of your judgement"
_SCORE_FENCE = "\n```"


def load_judge_system_and_user_prompt(
    provide_explanation: bool = True,
    multi_turn: bool = False,
) -> tuple[str, str]:
    """Load the bundled default judge prompts (back-compat shim).

    Prefer :func:`judgearena.prompts.registry.resolve_judge_prompt` for new
    code; this function delegates to the registry but returns the
    ``(system, user_template)`` tuple expected by older callers.

    Parameters
    ----------
    provide_explanation:
        When true, select the ``default_with_explanation`` preset so the
        judge explains its reasoning before scoring; otherwise use
        ``default``.
    multi_turn:
        Forwarded to the registry so the user template labels the
        completion appropriately for multi-turn conversations.

    Returns
    -------
    tuple[str, str]
        ``(system_prompt, user_prompt_template)`` text.
    """
    # Map the legacy boolean onto the two named presets; the registry owns
    # all template substitution now.
    resolved = resolve_judge_prompt(
        preset=("default_with_explanation" if provide_explanation else "default"),
        multi_turn=multi_turn,
    )
    return resolved.system_text, resolved.user_template_text


def resolve_judge_prompts(
*,
provide_explanation: bool,
provide_explanation: bool = False,
multi_turn: bool = False,
system_prompt: str | None = None,
user_prompt_template: str | None = None,
task: str | None = None,
preset: str | None = None,
system_file: str | None = None,
user_file: str | None = None,
) -> tuple[str, str]:
default_system_prompt, default_user_prompt_template = (
load_judge_system_and_user_prompt(
provide_explanation=provide_explanation, multi_turn=multi_turn
)
"""Resolve the judge ``(system_prompt, user_prompt_template)`` for a run.

Direct ``system_prompt`` / ``user_prompt_template`` overrides win.
Otherwise the registry is consulted with ``task`` / ``preset`` /
``system_file`` / ``user_file``. Legacy callers that pass nothing
end up with the ``default`` preset (or ``default_with_explanation``
when ``provide_explanation=True``) for backward compatibility.
"""
if system_prompt is not None and user_prompt_template is not None:
return system_prompt, user_prompt_template

resolved = resolve_judge_prompt(
task=task,
preset=preset,
system_file=system_file,
user_file=user_file,
multi_turn=multi_turn,
provide_explanation=provide_explanation,
)
return (
system_prompt if system_prompt is not None else default_system_prompt,
system_prompt if system_prompt is not None else resolved.system_text,
(
user_prompt_template
if user_prompt_template is not None
else default_user_prompt_template
else resolved.user_template_text
),
)


def resolve_run_judge_prompt(task: str, cli_args) -> ResolvedJudgePrompt:
    """Resolve the judge prompt for a run from the CLI args dataclass.

    Accepts a :class:`judgearena.cli_common.BaseCliArgs` instance (or any
    object exposing the same attributes) and returns the full
    :class:`ResolvedJudgePrompt`, including hashes/paths for metadata.

    Parameters
    ----------
    task:
        Task name used by the registry to pick the per-task default preset
        when the CLI args do not select one explicitly.
    cli_args:
        Object with optional ``judge_prompt_preset``,
        ``judge_system_prompt_file``, ``judge_user_prompt_file`` and
        ``provide_explanation`` attributes.
    """
    # getattr with defaults keeps this working for older args objects that
    # predate the prompt-selection options.
    return resolve_judge_prompt(
        task=task,
        preset=getattr(cli_args, "judge_prompt_preset", None),
        system_file=getattr(cli_args, "judge_system_prompt_file", None),
        user_file=getattr(cli_args, "judge_user_prompt_file", None),
        provide_explanation=getattr(cli_args, "provide_explanation", False),
    )


def evaluate_completions(
dataset: str = "alpaca-eval",
judge_chat_model: LLM = None,
Expand Down
23 changes: 22 additions & 1 deletion judgearena/prompts/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,22 @@
"""Prompt templates bundled with JudgeArena."""
"""Prompt templates bundled with JudgeArena.

The :mod:`judgearena.prompts.registry` submodule exposes the named presets
used by the judge plus a per-task default mapping; see ``--judge_prompt_preset``
on the CLI.
"""

from judgearena.prompts.registry import (
PRESETS,
TASK_DEFAULT_PRESET,
ResolvedJudgePrompt,
default_preset_for_task,
resolve_judge_prompt,
)

__all__ = [
"PRESETS",
"TASK_DEFAULT_PRESET",
"ResolvedJudgePrompt",
"default_preset_for_task",
"resolve_judge_prompt",
]
Loading