Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ repos:
- id: end-of-file-fixer
- id: check-yaml
- id: check-added-large-files
exclude: '^uv\.lock$'

- repo: local
hooks:
Expand Down
14 changes: 11 additions & 3 deletions judgearena/arenas_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from fast_langdetect import detect_language
from huggingface_hub import snapshot_download

from judgearena.dataset_revisions import hf_revision
from judgearena.log import get_logger

logger = get_logger(__name__)
Expand All @@ -30,11 +31,13 @@ def _load_arena_dataframe(
) -> pd.DataFrame:
assert arena in KNOWN_ARENAS
if arena == "LMArena-55k":
repo_id = "lmarena-ai/arena-human-preference-55k"
path = snapshot_download(
repo_id="lmarena-ai/arena-human-preference-55k",
repo_id=repo_id,
repo_type="dataset",
allow_patterns="*.csv",
force_download=False,
revision=hf_revision(repo_id),
)
df = pd.read_csv(Path(path) / "train.csv")

Expand Down Expand Up @@ -70,11 +73,13 @@ def _winner_55k(row) -> str | None:

elif "LMArena" in arena:
size = arena.split("-")[1] # "100k" or "140k"
repo_id = f"lmarena-ai/arena-human-preference-{size}"
path = snapshot_download(
repo_id=f"lmarena-ai/arena-human-preference-{size}",
repo_id=repo_id,
repo_type="dataset",
allow_patterns="*parquet",
force_download=False,
revision=hf_revision(repo_id),
)
parquet_files = sorted((Path(path) / "data").glob("*.parquet"))
df = pd.concat([pd.read_parquet(f) for f in parquet_files], ignore_index=True)
Expand Down Expand Up @@ -171,9 +176,12 @@ def get_winner(
return df


_DEFAULT_COMPARIA_REVISION = hf_revision("ministere-culture/comparia-votes")


def load_arena_dataframe(
arena: str | None,
comparia_revision: str = "7a40bce496c1f2aa3be4001da85a49cb4743042b",
comparia_revision: str | None = _DEFAULT_COMPARIA_REVISION,
) -> pd.DataFrame:
"""Load battles from one or all arenas.

Expand Down
63 changes: 63 additions & 0 deletions judgearena/dataset_revisions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
"""Pinned upstream revisions for every dataset/space JudgeArena downloads.

Pinning lets the run metadata answer "exactly which version of the data did
this run see?". When upstream rewrites a dataset (e.g. ComparIA periodically
republishes), an unpinned ``snapshot_download`` will silently start returning
different bytes; pinned revisions force callers to opt into upgrades.

To bump a revision, paste the new commit SHA from the dataset's HuggingFace
revision page (or the GitHub commit page for the FastChat raw URL).
"""

from __future__ import annotations

# HuggingFace dataset / space revisions. Keys are HuggingFace ``repo_id``
# strings; values are commit SHAs. ``None`` is allowed for repos where we
# do not yet have a stable pin and is recorded as such in the metadata so
# the gap is visible.
HF_DATASET_REVISIONS: dict[str, str | None] = {
# LMArena human-preference battles
"lmarena-ai/arena-human-preference-100k": "72e85b3ddc9c81bf7b659d6b03d4126dfd8fb34a",
"lmarena-ai/arena-human-preference-140k": "6322995ab34d7c2693e3f47dd13fa5caa0789a74",
"lmarena-ai/arena-human-preference-55k": "18c298340948c0e7f7727399fd459cca6ce0ca6f",
# ComparIA (already pinned via the legacy comparia_revision argument).
"ministere-culture/comparia-votes": "7a40bce496c1f2aa3be4001da85a49cb4743042b",
# m-ArenaHard (Cohere release)
"CohereLabs/m-ArenaHard": "ab393a96cd0b134a1acfa96e080af31e5e73a393",
# AlpacaEval instructions / model_outputs (geoalgo redistribution; the
# repo now redirects to ``judge-arena/judge-arena-dataset`` upstream, but
# ``snapshot_download`` follows the redirect transparently).
"geoalgo/llmjudge": "004c4a992956eeefffd36b63ade470f32fd0a582",
# MT-Bench questions (LMSYS Space).
"lmsys/mt-bench": "a4b674ca573c24143824ac7f60d9173e7081e37d",
# Multilingual fluency contexts.
"geoalgo/multilingual-contexts-to-be-completed": "06e73c95ad18d71a04b5a1b6464ed89d38195039",
# Arena-Hard official source (used via datasets.load_dataset).
"lmarena-ai/arena-hard-auto": "15f3746e21432264ce9b453999bde4f3c946d2e6",
}


# Raw-URL pins (e.g. FastChat reference answers fetched as a raw GitHub URL).
# Mapping is "logical name" -> commit SHA on the upstream repo. The downloader
# rewrites the URL to point at the pinned SHA.
RAW_URL_REVISIONS: dict[str, str | None] = {
"lm-sys/FastChat": "587d5cfa1609a43d192cedb8441cac3c17db105d",
}


def hf_revision(repo_id: str) -> str | None:
"""Return the pinned revision for ``repo_id`` (or ``None`` if not pinned)."""
return HF_DATASET_REVISIONS.get(repo_id)


def all_dataset_revisions() -> dict[str, str | None]:
"""Return a copy of every pin recorded in this module.

Used by :func:`judgearena.repro.write_run_metadata` to record the
pin table alongside each run so future readers know which version of
the data was visible at the time of the run.
"""
return {
**HF_DATASET_REVISIONS,
**{f"raw:{k}": v for k, v in RAW_URL_REVISIONS.items()},
}
2 changes: 1 addition & 1 deletion judgearena/instruction_dataset/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
)
from judgearena.instruction_dataset.m_arenahard import load_m_arenahard
from judgearena.log import get_logger
from judgearena.utils import data_root, download_hf, read_df
from judgearena.paths import data_root, download_hf, read_df

logger = get_logger(__name__)

Expand Down
14 changes: 10 additions & 4 deletions judgearena/instruction_dataset/arena_hard.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import pandas as pd
from datasets import Dataset, DatasetDict, IterableDataset, load_dataset

from judgearena.dataset_revisions import hf_revision

ARENA_HARD_HF_REPO_ID = "lmarena-ai/arena-hard-auto"


Expand Down Expand Up @@ -41,10 +43,14 @@ def arena_hard_baseline_model(dataset: str) -> str | None:


def _load_official_arena_hard_dataset(spec: ArenaHardSpec) -> pd.DataFrame:
    """Load the official Arena-Hard dataset variant described by ``spec``.

    Downloads via ``datasets.load_dataset`` from the
    ``lmarena-ai/arena-hard-auto`` repo, pinned to the revision recorded in
    :mod:`judgearena.dataset_revisions` when one exists.
    """
    # FIX: a leftover, unpinned ``load_dataset(...)`` call preceded this one
    # (diff-merge artifact) — it performed a redundant download whose result
    # was immediately discarded. Only the pinned call remains.
    revision = hf_revision(ARENA_HARD_HF_REPO_ID)
    load_kwargs: dict = {
        "path": ARENA_HARD_HF_REPO_ID,
        "data_dir": f"data/{spec.hf_variant}",
    }
    # Only pass ``revision`` when a pin exists, so unpinned repos keep the
    # library's default behaviour.
    if revision:
        load_kwargs["revision"] = revision
    data = load_dataset(**load_kwargs)
    return _dataset_like_to_dataframe(data)


Expand Down
7 changes: 5 additions & 2 deletions judgearena/instruction_dataset/m_arenahard.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,19 @@
import pandas as pd
from huggingface_hub import snapshot_download

from judgearena.utils import data_root
from judgearena.dataset_revisions import hf_revision
from judgearena.paths import data_root


def load_m_arenahard(local_path, language: str | None = None):
repo_id = "CohereLabs/m-ArenaHard"
snapshot_download(
repo_id="CohereLabs/m-ArenaHard",
repo_id=repo_id,
repo_type="dataset",
allow_patterns="*",
local_dir=local_path / "m-ArenaHard",
force_download=False,
revision=hf_revision(repo_id),
)

df_union = []
Expand Down
23 changes: 17 additions & 6 deletions judgearena/instruction_dataset/mt_bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,21 @@
import pandas as pd
from huggingface_hub import snapshot_download

from judgearena.utils import data_root
from judgearena.dataset_revisions import RAW_URL_REVISIONS, hf_revision
from judgearena.paths import data_root

def _fastchat_reference_url() -> str:
    """URL for FastChat MT-Bench GPT-4 references, pinned when available.

    Falls back to the ``main`` branch when no pin is recorded in
    ``RAW_URL_REVISIONS`` so downloads still work.
    """
    revision = RAW_URL_REVISIONS.get("lm-sys/FastChat")
    rev = revision if revision else "main"
    return (
        f"https://raw.githubusercontent.com/lm-sys/FastChat/{rev}/"
        "fastchat/llm_judge/data/mt_bench/reference_answer/gpt-4.jsonl"
    )


# FIX: removed the dead first assignment of this constant (the old, unpinned
# ``main``-branch URL) which was immediately shadowed by this pinned one.
FASTCHAT_GPT4_REFERENCE_URL = _fastchat_reference_url()


def _download_gpt4_references(local_dir: Path) -> Path | None:
Expand Down Expand Up @@ -47,14 +56,16 @@ def download_mt_bench(local_dir: Path | None = None) -> tuple[Path, Path | None]
question_path = local_dir / "data" / "mt_bench" / "question.jsonl"
if not question_path.exists():
try:
mt_bench_repo = "lmsys/mt-bench"
snapshot_download(
repo_id="lmsys/mt-bench",
repo_id=mt_bench_repo,
repo_type="space",
allow_patterns=[
"data/mt_bench/question.jsonl",
],
local_dir=local_dir,
force_download=False,
revision=hf_revision(mt_bench_repo),
)
except Exception as e:
raise RuntimeError(
Expand Down
55 changes: 55 additions & 0 deletions judgearena/paths.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
"""Filesystem paths and small file-IO helpers anchored at JudgeArena's data root.

This is a tiny leaf module so it can be imported by every other module
(including ``judgearena.instruction_dataset``) without pulling in the rest of
``judgearena.utils``, which would create an import cycle with the
``instruction_dataset`` package.

Symbols here are re-exported from :mod:`judgearena.utils` for backward
compatibility, so existing ``from judgearena.utils import data_root`` /
``from judgearena.utils import download_hf, read_df`` callers keep working.
"""

from __future__ import annotations

import os
from pathlib import Path

import pandas as pd
from huggingface_hub import snapshot_download

from judgearena.dataset_revisions import hf_revision


def _data_root_path() -> Path:
raw = os.environ.get("JUDGEARENA_DATA") or os.environ.get("OPENJURY_DATA")
if raw:
return Path(raw).expanduser()
return Path("~/judgearena-data/").expanduser()


data_root: Path = _data_root_path()


def download_hf(name: str, local_path: Path) -> None:
    """Download AlpacaEval-style instruction/output tables into ``local_path``.

    Files matching ``*name*`` are fetched from the ``geoalgo/llmjudge``
    HuggingFace dataset, pinned to the revision recorded in
    :mod:`judgearena.dataset_revisions` (``None`` means unpinned).
    """
    repo = "geoalgo/llmjudge"
    local_path.mkdir(exist_ok=True, parents=True)
    snapshot_download(
        repo_id=repo,
        repo_type="dataset",
        local_dir=local_path,
        allow_patterns=f"*{name}*",
        revision=hf_revision(repo),
        force_download=False,
    )


def read_df(filename: Path, **pandas_kwargs) -> pd.DataFrame:
    """Read a CSV/CSV-zip/parquet dataframe from disk.

    Parameters
    ----------
    filename: path to a ``.csv``, ``.csv.zip`` or ``.parquet`` file.
    pandas_kwargs: forwarded verbatim to the underlying pandas reader.
    """
    # FIX: both assert messages were f-strings with no placeholder, so the
    # offending path was never shown; interpolate it.
    assert filename.exists(), f"Dataframe file not found at {filename}"
    # ``endswith`` accepts a tuple — one call instead of an ``or`` chain.
    if filename.name.endswith((".csv.zip", ".csv")):
        return pd.read_csv(filename, **pandas_kwargs)
    assert filename.name.endswith(".parquet"), f"Unsupported extension: {filename.name}"
    return pd.read_parquet(filename, **pandas_kwargs)
43 changes: 12 additions & 31 deletions judgearena/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,50 +14,29 @@
from tqdm.asyncio import tqdm
from tqdm.contrib.logging import logging_redirect_tqdm

from judgearena.dataset_revisions import hf_revision
from judgearena.instruction_dataset.arena_hard import (
download_arena_hard,
is_arena_hard_dataset,
)
from judgearena.log import get_logger

logger = get_logger(__name__)


def _data_root_path() -> Path:
raw = os.environ.get("JUDGEARENA_DATA") or os.environ.get("OPENJURY_DATA")
if raw:
return Path(raw).expanduser()
return Path("~/judgearena-data/").expanduser()
# ``data_root``, ``download_hf`` and ``read_df`` live in the leaf
# :mod:`judgearena.paths` module so that ``judgearena.instruction_dataset`` can
# import them without going through ``judgearena.utils``. We re-export them
# here so existing callers that do ``from judgearena.utils import data_root``
# (or ``download_hf`` / ``read_df``) keep working.
from judgearena.paths import data_root, download_hf, read_df

logger = get_logger(__name__)

data_root = _data_root_path()
__all__ = ["data_root", "download_hf", "read_df"]


def set_langchain_cache():
    """Point LangChain's global LLM cache at a SQLite file under the data root."""
    cache_db = data_root / ".langchain.db"
    set_llm_cache(SQLiteCache(database_path=str(cache_db)))


def download_hf(name: str, local_path: Path) -> None:
    """Download files matching ``*name*`` from the ``geoalgo/llmjudge``
    HuggingFace dataset repo into ``local_path``.

    NOTE(review): this copy passes no ``revision``, so re-runs may silently
    see different bytes if upstream force-pushes the dataset.
    """
    local_path.mkdir(exist_ok=True, parents=True)
    # downloads the matching dataset files from huggingface into `local_path`
    snapshot_download(
        repo_id="geoalgo/llmjudge",
        repo_type="dataset",
        allow_patterns=f"*{name}*",
        local_dir=local_path,
        force_download=False,
    )


def read_df(filename: Path, **pandas_kwargs) -> pd.DataFrame:
    """Read a CSV, zipped-CSV, or parquet file into a dataframe.

    ``pandas_kwargs`` are forwarded verbatim to the underlying pandas reader.
    """
    # FIX: both assert messages were f-strings with no placeholder (ruff
    # F541), so the offending path never appeared in the failure message.
    assert filename.exists(), f"Dataframe file not found at {filename}"
    if filename.name.endswith((".csv.zip", ".csv")):
        return pd.read_csv(filename, **pandas_kwargs)
    assert filename.name.endswith(".parquet"), f"Unsupported extension: {filename.name}"
    return pd.read_parquet(filename, **pandas_kwargs)


def compute_pref_summary(prefs: pd.Series) -> dict[str, float | int]:
"""Compute win/loss/tie stats for preference series (0=A, 0.5=tie, 1=B)."""
prefs = pd.Series(prefs, dtype="float64")
Expand Down Expand Up @@ -482,12 +461,14 @@ def download_all():
else:
download_hf(name=dataset, local_path=local_path_tables)

contexts_repo = "geoalgo/multilingual-contexts-to-be-completed"
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

snapshot_download(
repo_id="geoalgo/multilingual-contexts-to-be-completed",
repo_id=contexts_repo,
repo_type="dataset",
allow_patterns="*",
local_dir=data_root / "contexts",
force_download=False,
revision=hf_revision(contexts_repo),
)

from judgearena.instruction_dataset.mt_bench import download_mt_bench
Expand Down
Loading