Changes related to running benchmark experiments for the paper: Support Qwen3.5 and thinking models, Skywork, truncation tracking, benchmark changes, etc. #32
base: main
Changes from 5 commits
`pyproject.toml`:

```diff
@@ -81,5 +81,6 @@ quote-style = "double"
 indent-style = "space"
 
 [project.optional-dependencies]
-vllm = ["vllm==0.10.2", "transformers>=4.55.2,<5.0.0"]
+# vLLM on PyPI pins transformers<5; optional extra matches that so `uv lock` can resolve.
+vllm = ["vllm>=0.17.0,<1.0.0", "transformers>=4.56.0,<5.0.0"]
 llamacpp = ["llama-cpp-python>=0.3.0"]
```
Review thread on the `vllm` extra:

**Collaborator:** …

**Author:** Good point. I tightened the range. 0.18.1 was working. I think the …

**Author:** Changed it to …
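For anyone reproducing the environment, here is a minimal, hypothetical sanity check (not part of this PR) that the installed versions actually satisfy the tightened pins. It assumes only the real `vllm` and `transformers` distributions plus the `packaging` library:

```python
# Hypothetical sanity check, not part of this PR: confirm that the
# installed vllm/transformers versions satisfy the new optional-extra pins.
from importlib.metadata import version
from packaging.specifiers import SpecifierSet

pins = {
    "vllm": SpecifierSet(">=0.17.0,<1.0.0"),
    "transformers": SpecifierSet(">=4.56.0,<5.0.0"),
}
for name, spec in pins.items():
    installed = version(name)
    assert installed in spec, f"{name}=={installed} violates {spec}"
    print(f"{name}=={installed} satisfies {spec}")
```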
New test file (`@@ -0,0 +1,121 @@`):

```python
import pandas as pd

import judgearena.evaluate as evaluate
import judgearena.generate_and_evaluate as generate_and_evaluate
from judgearena.generate_and_evaluate import CliArgs
from judgearena.generate_and_evaluate import main as main_generate_and_eval


def test_build_pair_score_output_choices_covers_all_integer_pairs():
    choices = evaluate.build_pair_score_output_choices()

    assert len(choices) == 121
    assert len(set(choices)) == 121
    assert "score_A: 0\nscore_B: 0" in choices
    assert "score_A: 10\nscore_B: 10" in choices
```
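The first test pins down the full choice grid: 121 distinct strings, one per pair of integer scores from 0 to 10. A minimal sketch consistent with those assertions (the actual `build_pair_score_output_choices` lives in `judgearena.evaluate`; this re-derivation is inferred from the test only):

```python
# Inferred from the assertions above, not the actual implementation:
# one choice string per (score_A, score_B) pair, integer scores 0..10,
# giving 11 * 11 = 121 distinct outputs.
def build_pair_score_output_choices() -> list[str]:
    return [
        f"score_A: {a}\nscore_B: {b}"
        for a in range(11)
        for b in range(11)
    ]
```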
The file's second test:

```python
def test_main_aligns_local_reference_by_instruction_index(tmp_path, monkeypatch):
    instructions = pd.DataFrame(
        {"instruction": ["Instruction B", "Instruction A"]},
        index=pd.Index(["b", "a"], name="instruction_index"),
    )
    reference_path = tmp_path / "m-arena-hard-en-reference.csv"
    pd.DataFrame(
        {
            "instruction_index": ["a", "b"],
            "output": ["Answer A", "Answer B"],
        }
    ).to_csv(reference_path, index=False)

    monkeypatch.setattr(
        generate_and_evaluate,
        "load_instructions",
        lambda dataset, n_instructions=None: (
            instructions.head(n_instructions)
            if n_instructions is not None
            else instructions
        ),
    )
    monkeypatch.setattr(
        generate_and_evaluate,
        "cache_function_dataframe",
        lambda fun, **_kwargs: fun(),
    )

    captured = {}

    def fake_judge_and_parse_prefs(
        *,
        judge_chat_model,
        instructions,
        completions_A,
        completions_B,
        swap_mode,
        provide_explanation,
        system_prompt,
        user_prompt_template,
        truncate_input_chars,
        use_tqdm,
    ):
        captured["instructions"] = instructions
        captured["completions_A"] = completions_A
        captured["completions_B"] = completions_B
        annotations = [{"judge_completion": "score A: 0 score B: 10"}] * len(
            instructions
        )
        prefs = pd.Series([1.0] * len(instructions))
        return annotations, [], prefs

    monkeypatch.setattr(
        generate_and_evaluate,
        "judge_and_parse_prefs",
        fake_judge_and_parse_prefs,
    )

    prefs = main_generate_and_eval(
        CliArgs(
            dataset="m-arena-hard-en",
            model_A="Dummy/no answer",
            model_B=str(reference_path),
            judge_model="Dummy/score A: 0 score B: 10",
            n_instructions=2,
            result_folder=str(tmp_path / "results"),
        )
    )

    assert captured["instructions"] == ["Instruction B", "Instruction A"]
    assert captured["completions_A"] == ["no answer", "no answer"]
    assert captured["completions_B"] == ["Answer B", "Answer A"]
    assert prefs.tolist() == [1.0, 1.0]
```
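What this test verifies, in isolation: when `model_B` points at a local reference CSV, its rows are joined to the loaded instructions by `instruction_index`, not by row order. A hypothetical sketch of that alignment step (the function name and signature are illustrative, not the PR's actual code):

```python
import pandas as pd

# Illustrative only: reorder a reference CSV's outputs to follow the
# instructions DataFrame's instruction_index, as the assertions above expect.
def align_reference_outputs(instructions: pd.DataFrame, reference_path: str) -> list[str]:
    reference = pd.read_csv(reference_path).set_index("instruction_index")
    # .loc with the instructions' index reorders the reference rows: for the
    # fixture's index ["b", "a"], this yields ["Answer B", "Answer A"].
    return reference.loc[instructions.index, "output"].tolist()
```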
And the third test:

```python
def test_annotate_battles_warns_when_judge_completions_are_truncated(
    monkeypatch, capsys
):
    captured = {}

    def fake_do_inference(*, chat_model, inputs, use_tqdm):
        captured["judge_prompt"] = inputs[0].to_messages()[1].content
        return ["score_A: 0\nscore_B: 10"]

    monkeypatch.setattr(evaluate, "do_inference", fake_do_inference)

    annotations = evaluate.annotate_battles(
        judge_chat_model=object(),
        instructions=["Instruction"],
        completions_A=["Answer A"],
        completions_B=["Answer B"],
        truncate_input_chars=3,
    )

    stdout = capsys.readouterr().out
    assert (
        "Warning: truncated 2 judge completions to 3 characters before evaluation."
        in stdout
    )
    assert "Ans" in captured["judge_prompt"]
    assert "Answer A" not in captured["judge_prompt"]
    assert "Answer B" not in captured["judge_prompt"]
    assert annotations[0].completion_A == "Answer A"
    assert annotations[0].completion_B == "Answer B"
```
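The warning text and the assertions together fix the truncation contract: inputs are clipped to `truncate_input_chars` before they reach the judge prompt, a single warning reports how many strings were clipped, and the stored annotations keep the originals. A hypothetical sketch of that step (the real logic sits inside `annotate_battles`; this helper is invented for illustration):

```python
# Illustrative sketch of the truncation contract the last test checks.
def truncate_for_judge(
    completions_A: list[str], completions_B: list[str], truncate_input_chars: int
) -> tuple[list[str], list[str]]:
    texts = completions_A + completions_B
    n_truncated = sum(len(text) > truncate_input_chars for text in texts)
    if n_truncated:
        print(
            f"Warning: truncated {n_truncated} judge completions to "
            f"{truncate_input_chars} characters before evaluation."
        )
    clip = lambda items: [text[:truncate_input_chars] for text in items]
    # Only the judge prompt sees the clipped texts; callers keep the originals.
    return clip(completions_A), clip(completions_B)
```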