-
Notifications
You must be signed in to change notification settings - Fork 5
Pin dataset revisions for reproducibility #39
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,61 @@ | ||
| """Pinned upstream revisions for every dataset/space JudgeArena downloads. | ||
|
|
||
| Pinning lets the run metadata answer "exactly which version of the data did | ||
| this run see?". When upstream rewrites a dataset (e.g. ComparIA periodically | ||
| republishes), an unpinned ``snapshot_download`` will silently start returning | ||
| different bytes; pinned revisions force callers to opt into upgrades. | ||
|
|
||
| To bump a revision, paste the new commit SHA from the dataset's HuggingFace | ||
| revision page (or the GitHub commit page for the FastChat raw URL). | ||
| """ | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| # HuggingFace dataset / space revisions. Keys are HuggingFace ``repo_id`` | ||
| # strings; values are commit SHAs. ``None`` is allowed for repos where we | ||
| # do not yet have a stable pin and is recorded as such in the metadata so | ||
| # the gap is visible. | ||
| HF_DATASET_REVISIONS: dict[str, str | None] = { | ||
| # LMArena human-preference battles | ||
| "lmarena-ai/arena-human-preference-100k": None, | ||
| "lmarena-ai/arena-human-preference-140k": None, | ||
| "lmarena-ai/arena-human-preference-55k": None, | ||
| # ComparIA (already pinned via the legacy comparia_revision argument). | ||
| "ministere-culture/comparia-votes": ("7a40bce496c1f2aa3be4001da85a49cb4743042b"), | ||
| # m-ArenaHard (Cohere release) | ||
| "CohereLabs/m-ArenaHard": None, | ||
| # AlpacaEval instructions / model_outputs (geoalgo redistribution). | ||
| "geoalgo/llmjudge": None, | ||
| # MT-Bench questions (LMSYS Space). | ||
| "lmsys/mt-bench": None, | ||
| # Multilingual fluency contexts. | ||
| "geoalgo/multilingual-contexts-to-be-completed": None, | ||
| # Arena-Hard official source (used via datasets.load_dataset). | ||
| "lmarena-ai/arena-hard-auto": None, | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can we tie all of those to the current commit as well?
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What SHAs were used for ablations?
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed! Got the latest SHAs for each repo
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
We tagged ComparIA as this dataset was changing all the time (which is great). The others have not received any commits, as they were static dumps as far as I know. |
||
| } | ||
|
|
||
|
|
||
| # Raw-URL pins (e.g. FastChat reference answers fetched as a raw GitHub URL). | ||
| # Mapping is "logical name" -> commit SHA on the upstream repo. The downloader | ||
| # rewrites the URL to point at the pinned SHA. | ||
| RAW_URL_REVISIONS: dict[str, str | None] = { | ||
| "lm-sys/FastChat": None, | ||
| } | ||
|
|
||
|
|
||
| def hf_revision(repo_id: str) -> str | None: | ||
| """Return the pinned revision for ``repo_id`` (or ``None`` if not pinned).""" | ||
| return HF_DATASET_REVISIONS.get(repo_id) | ||
|
|
||
|
|
||
| def all_dataset_revisions() -> dict[str, str | None]: | ||
| """Return a copy of every pin recorded in this module. | ||
|
|
||
| Used by :func:`judgearena.repro.write_run_metadata` to record the | ||
| pin table alongside each run so future readers know which version of | ||
| the data was visible at the time of the run. | ||
| """ | ||
| return { | ||
| **HF_DATASET_REVISIONS, | ||
| **{f"raw:{k}": v for k, v in RAW_URL_REVISIONS.items()}, | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -14,10 +14,7 @@ | |
| from tqdm.asyncio import tqdm | ||
| from tqdm.contrib.logging import logging_redirect_tqdm | ||
|
|
||
| from judgearena.instruction_dataset.arena_hard import ( | ||
| download_arena_hard, | ||
| is_arena_hard_dataset, | ||
| ) | ||
| from judgearena.dataset_revisions import hf_revision | ||
| from judgearena.log import get_logger | ||
|
|
||
| logger = get_logger(__name__) | ||
|
|
@@ -30,6 +27,9 @@ def _data_root_path() -> Path: | |
| return Path("~/judgearena-data/").expanduser() | ||
|
|
||
|
|
||
| # Defined eagerly because the ``judgearena.instruction_dataset`` package | ||
| # resolves ``judgearena.utils.data_root`` from inside its own __init__, so | ||
| # ``data_root`` must be importable as soon as ``utils`` is partially loaded. | ||
| data_root = _data_root_path() | ||
|
|
||
|
|
||
|
|
@@ -40,12 +40,14 @@ def set_langchain_cache(): | |
| def download_hf(name: str, local_path: Path): | ||
| local_path.mkdir(exist_ok=True, parents=True) | ||
| # downloads the model from huggingface into `local_path` folder | ||
| repo_id = "geoalgo/llmjudge" | ||
|
geoalgo marked this conversation as resolved.
Outdated
|
||
| snapshot_download( | ||
| repo_id="geoalgo/llmjudge", | ||
| repo_id=repo_id, | ||
| repo_type="dataset", | ||
| allow_patterns=f"*{name}*", | ||
| local_dir=local_path, | ||
| force_download=False, | ||
| revision=hf_revision(repo_id), | ||
| ) | ||
|
|
||
|
|
||
|
|
@@ -482,12 +484,14 @@ def download_all(): | |
| else: | ||
| download_hf(name=dataset, local_path=local_path_tables) | ||
|
|
||
| contexts_repo = "geoalgo/multilingual-contexts-to-be-completed" | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same here.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| snapshot_download( | ||
| repo_id="geoalgo/multilingual-contexts-to-be-completed", | ||
| repo_id=contexts_repo, | ||
| repo_type="dataset", | ||
| allow_patterns="*", | ||
| local_dir=data_root / "contexts", | ||
| force_download=False, | ||
| revision=hf_revision(contexts_repo), | ||
| ) | ||
|
|
||
| from judgearena.instruction_dataset.mt_bench import download_mt_bench | ||
|
|
@@ -582,5 +586,15 @@ def _to_python(x): | |
| return pd.read_csv(cache_file) | ||
|
|
||
|
|
||
| # Imported at the bottom so that ``data_root``, ``download_hf`` and ``read_df`` | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can we rather move things into a util module? It is better to import at the top of files. |
||
| # - all touched by ``judgearena.instruction_dataset.__init__`` - are already | ||
| # defined when the package import chain reaches back into this module. | ||
| # Re-exported for backward compatibility (e.g. tests that monkeypatch | ||
| # ``judgearena.utils.download_arena_hard``). | ||
| from judgearena.instruction_dataset.arena_hard import ( # noqa: E402 | ||
| download_arena_hard, | ||
| is_arena_hard_dataset, | ||
| ) | ||
|
|
||
| if __name__ == "__main__": | ||
| download_all() | ||
Uh oh!
There was an error while loading. Please reload this page.