Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions judgearena/arenas_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from fast_langdetect import detect_language
from huggingface_hub import snapshot_download

from judgearena.dataset_revisions import hf_revision
from judgearena.log import get_logger

logger = get_logger(__name__)
Expand All @@ -30,11 +31,13 @@ def _load_arena_dataframe(
) -> pd.DataFrame:
assert arena in KNOWN_ARENAS
if arena == "LMArena-55k":
repo_id = "lmarena-ai/arena-human-preference-55k"
path = snapshot_download(
repo_id="lmarena-ai/arena-human-preference-55k",
repo_id=repo_id,
repo_type="dataset",
allow_patterns="*.csv",
force_download=False,
revision=hf_revision(repo_id),
)
df = pd.read_csv(Path(path) / "train.csv")

Expand Down Expand Up @@ -70,11 +73,13 @@ def _winner_55k(row) -> str | None:

elif "LMArena" in arena:
size = arena.split("-")[1] # "100k" or "140k"
repo_id = f"lmarena-ai/arena-human-preference-{size}"
path = snapshot_download(
repo_id=f"lmarena-ai/arena-human-preference-{size}",
repo_id=repo_id,
repo_type="dataset",
allow_patterns="*parquet",
force_download=False,
revision=hf_revision(repo_id),
)
parquet_files = sorted((Path(path) / "data").glob("*.parquet"))
df = pd.concat([pd.read_parquet(f) for f in parquet_files], ignore_index=True)
Expand Down Expand Up @@ -171,9 +176,15 @@ def get_winner(
return df


# Default ComparIA revision: prefer the pin table, but fall back to the
# historical SHA so ``load_arena_dataframe`` keeps its previous default even
# if the pin table drops/changes the ComparIA entry.
_DEFAULT_COMPARIA_REVISION = (
    hf_revision("ministere-culture/comparia-votes")
    or "7a40bce496c1f2aa3be4001da85a49cb4743042b"
)


def load_arena_dataframe(
arena: str | None,
comparia_revision: str = "7a40bce496c1f2aa3be4001da85a49cb4743042b",
comparia_revision: str = _DEFAULT_COMPARIA_REVISION,
) -> pd.DataFrame:
"""Load battles from one or all arenas.

Expand Down
61 changes: 61 additions & 0 deletions judgearena/dataset_revisions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
"""Pinned upstream revisions for every dataset/space JudgeArena downloads.

Pinning lets the run metadata answer "exactly which version of the data did
this run see?". When upstream rewrites a dataset (e.g. ComparIA periodically
republishes), an unpinned ``snapshot_download`` will silently start returning
different bytes; pinned revisions force callers to opt into upgrades.

To bump a revision, paste the new commit SHA from the dataset's HuggingFace
revision page (or the GitHub commit page for the FastChat raw URL).
"""

from __future__ import annotations

# HuggingFace dataset / space revisions. Keys are HuggingFace ``repo_id``
# strings; values are commit SHAs. ``None`` is allowed for repos where we
# do not yet have a stable pin and is recorded as such in the metadata so
# the gap is visible.
HF_DATASET_REVISIONS: dict[str, str | None] = {
    # LMArena human-preference battles.
    "lmarena-ai/arena-human-preference-100k": None,
    "lmarena-ai/arena-human-preference-140k": None,
    "lmarena-ai/arena-human-preference-55k": None,
    # ComparIA (already pinned via the legacy ``comparia_revision`` argument).
    "ministere-culture/comparia-votes": "7a40bce496c1f2aa3be4001da85a49cb4743042b",
    # m-ArenaHard (Cohere release).
    "CohereLabs/m-ArenaHard": None,
    # AlpacaEval instructions / model_outputs (geoalgo redistribution).
    "geoalgo/llmjudge": None,
    # MT-Bench questions (LMSYS Space).
    "lmsys/mt-bench": None,
    # Multilingual fluency contexts.
    "geoalgo/multilingual-contexts-to-be-completed": None,
    # Arena-Hard official source (used via ``datasets.load_dataset``).
    "lmarena-ai/arena-hard-auto": None,
}


# Pins for artifacts fetched as raw URLs (e.g. the FastChat reference answers
# pulled straight from GitHub). Maps a logical upstream repo name to a commit
# SHA; the downloader substitutes the pinned SHA into the URL it fetches.
RAW_URL_REVISIONS: dict[str, str | None] = {
    # FastChat MT-Bench GPT-4 reference answers — no stable pin recorded yet.
    "lm-sys/FastChat": None,
}


def hf_revision(repo_id: str) -> str | None:
    """Look up the pinned commit SHA for ``repo_id``.

    Returns ``None`` when no pin is recorded, in which case callers fall
    back to the repo's default branch.
    """
    pin = HF_DATASET_REVISIONS.get(repo_id)
    return pin


def all_dataset_revisions() -> dict[str, str | None]:
    """Return a copy of every pin recorded in this module.

    Consumed by :func:`judgearena.repro.write_run_metadata`, which stores the
    pin table next to each run so later readers can tell exactly which data
    version was visible when the run executed.
    """
    combined: dict[str, str | None] = dict(HF_DATASET_REVISIONS)
    for repo, sha in RAW_URL_REVISIONS.items():
        combined[f"raw:{repo}"] = sha
    return combined
14 changes: 10 additions & 4 deletions judgearena/instruction_dataset/arena_hard.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import pandas as pd
from datasets import Dataset, DatasetDict, IterableDataset, load_dataset

from judgearena.dataset_revisions import hf_revision

ARENA_HARD_HF_REPO_ID = "lmarena-ai/arena-hard-auto"


Expand Down Expand Up @@ -41,10 +43,14 @@ def arena_hard_baseline_model(dataset: str) -> str | None:


def _load_official_arena_hard_dataset(spec: ArenaHardSpec) -> pd.DataFrame:
    """Load the official Arena-Hard dataset variant described by ``spec``.

    Uses the pinned HuggingFace revision when one is recorded so repeated
    runs see identical bytes; otherwise falls back to the default branch.
    """
    revision = hf_revision(ARENA_HARD_HF_REPO_ID)
    load_kwargs: dict = {
        "path": ARENA_HARD_HF_REPO_ID,
        "data_dir": f"data/{spec.hf_variant}",
    }
    # Only pass ``revision`` when a pin exists; omitting it lets
    # ``load_dataset`` resolve the repo's default branch.
    if revision:
        load_kwargs["revision"] = revision
    data = load_dataset(**load_kwargs)
    return _dataset_like_to_dataframe(data)


Expand Down
5 changes: 4 additions & 1 deletion judgearena/instruction_dataset/m_arenahard.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,19 @@
import pandas as pd
from huggingface_hub import snapshot_download

from judgearena.dataset_revisions import hf_revision
from judgearena.utils import data_root


def load_m_arenahard(local_path, language: str | None = None):
repo_id = "CohereLabs/m-ArenaHard"
snapshot_download(
repo_id="CohereLabs/m-ArenaHard",
repo_id=repo_id,
repo_type="dataset",
allow_patterns="*",
local_dir=local_path / "m-ArenaHard",
force_download=False,
revision=hf_revision(repo_id),
)

df_union = []
Expand Down
21 changes: 16 additions & 5 deletions judgearena/instruction_dataset/mt_bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,21 @@
import pandas as pd
from huggingface_hub import snapshot_download

from judgearena.dataset_revisions import RAW_URL_REVISIONS, hf_revision
from judgearena.utils import data_root

def _fastchat_reference_url() -> str:
    """URL for FastChat MT-Bench GPT-4 references, pinned when available.

    Uses the FastChat commit recorded in ``RAW_URL_REVISIONS`` when one
    exists, otherwise falls back to the ``main`` branch.
    """
    rev = RAW_URL_REVISIONS.get("lm-sys/FastChat") or "main"
    return (
        f"https://raw.githubusercontent.com/lm-sys/FastChat/{rev}/"
        "fastchat/llm_judge/data/mt_bench/reference_answer/gpt-4.jsonl"
    )


# Computed once at import time so existing callers keep a module constant.
FASTCHAT_GPT4_REFERENCE_URL = _fastchat_reference_url()


def _download_gpt4_references(local_dir: Path) -> Path | None:
Expand Down Expand Up @@ -47,14 +56,16 @@ def download_mt_bench(local_dir: Path | None = None) -> tuple[Path, Path | None]
question_path = local_dir / "data" / "mt_bench" / "question.jsonl"
if not question_path.exists():
try:
mt_bench_repo = "lmsys/mt-bench"
snapshot_download(
repo_id="lmsys/mt-bench",
repo_id=mt_bench_repo,
repo_type="space",
allow_patterns=[
"data/mt_bench/question.jsonl",
],
local_dir=local_dir,
force_download=False,
revision=hf_revision(mt_bench_repo),
)
except Exception as e:
raise RuntimeError(
Expand Down
26 changes: 20 additions & 6 deletions judgearena/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,7 @@
from tqdm.asyncio import tqdm
from tqdm.contrib.logging import logging_redirect_tqdm

from judgearena.instruction_dataset.arena_hard import (
download_arena_hard,
is_arena_hard_dataset,
)
from judgearena.dataset_revisions import hf_revision
from judgearena.log import get_logger

logger = get_logger(__name__)
Expand All @@ -30,6 +27,9 @@ def _data_root_path() -> Path:
return Path("~/judgearena-data/").expanduser()


# Defined eagerly because the ``judgearena.instruction_dataset`` package
# resolves ``judgearena.utils.data_root`` from inside its own __init__, so
# ``data_root`` must be importable as soon as ``utils`` is partially loaded.
data_root = _data_root_path()


Expand All @@ -40,12 +40,14 @@ def set_langchain_cache():
def download_hf(name: str, local_path: Path):
    """Download files matching ``*name*`` from the ``geoalgo/llmjudge``
    HuggingFace dataset repo into ``local_path``.

    Downloads at the pinned revision when one is recorded in the pin table
    (``None`` pin means the repo's default branch is used).
    """
    local_path.mkdir(exist_ok=True, parents=True)
    repo_id = "geoalgo/llmjudge"
    snapshot_download(
        repo_id=repo_id,
        repo_type="dataset",
        allow_patterns=f"*{name}*",
        local_dir=local_path,
        force_download=False,
        revision=hf_revision(repo_id),
    )


Expand Down Expand Up @@ -482,12 +484,14 @@ def download_all():
else:
download_hf(name=dataset, local_path=local_path_tables)

contexts_repo = "geoalgo/multilingual-contexts-to-be-completed"
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

snapshot_download(
repo_id="geoalgo/multilingual-contexts-to-be-completed",
repo_id=contexts_repo,
repo_type="dataset",
allow_patterns="*",
local_dir=data_root / "contexts",
force_download=False,
revision=hf_revision(contexts_repo),
)

from judgearena.instruction_dataset.mt_bench import download_mt_bench
Expand Down Expand Up @@ -582,5 +586,15 @@ def _to_python(x):
return pd.read_csv(cache_file)


# Imported at the bottom so that ``data_root``, ``download_hf`` and ``read_df``
# - all touched by ``judgearena.instruction_dataset.__init__`` - are already
# defined when the package import chain reaches back into this module.
# Re-exported for backward compatibility (e.g. tests that monkeypatch
# ``judgearena.utils.download_arena_hard``).
from judgearena.instruction_dataset.arena_hard import ( # noqa: E402
download_arena_hard,
is_arena_hard_dataset,
)

# Manual entry point: prefetch every dataset this module knows how to load.
if __name__ == "__main__":
    download_all()
Loading