-
-
Notifications
You must be signed in to change notification settings - Fork 50
feat: implement GET /run/{id} endpoints (#37) #319
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
1685dec
4ca818c
4461c63
34c0e28
d7f7cbb
c7cc8f5
aef8fce
595c1d1
e9b9da7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,6 +22,185 @@ async def exist(id_: int, expdb: AsyncConnection) -> bool: | |
| return bool(row.one_or_none()) | ||
|
|
||
|
|
||
async def get(run_id: int, expdb: AsyncConnection) -> Row | None:
    """Look up a single run record by its primary key.

    Selects the core columns from the `run` table. Returns the matching
    row, or None when no run with `run_id` exists. The `error_message`
    column is NULL for runs that completed without errors.
    """
    query = text(
        """
        SELECT `rid`, `uploader`, `setup`, `task_id`, `error_message`
        FROM `run`
        WHERE `rid` = :run_id
        """,
    )
    result = await expdb.execute(query, parameters={"run_id": run_id})
    return result.one_or_none()
|
|
||
|
|
||
async def get_uploader_name(uploader_id: int, userdb: AsyncConnection) -> str | None:
    """Resolve a user id to a human-readable display name.

    Looks the user up in the `users` table of the separate openml user
    database and joins first and last name with a single space. Returns
    None when no user with `uploader_id` exists.
    """
    query = text(
        """
        SELECT CONCAT(`first_name`, ' ', `last_name`) AS `name`
        FROM `users`
        WHERE `id` = :uploader_id
        """,
    )
    response = await userdb.execute(query, parameters={"uploader_id": uploader_id})
    user = response.one_or_none()
    return None if user is None else user.name
|
coderabbitai[bot] marked this conversation as resolved.
Outdated
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we could rather extend the
then you can use that method instead.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. That is a really good point; I will extend the existing method accordingly. |
||
|
|
||
|
|
||
async def get_tags(run_id: int, expdb: AsyncConnection) -> list[str]:
    """Return every tag attached to the given run.

    Reads the `run_tag` table, whose `id` column stores the run id.
    """
    query = text(
        """
        SELECT `tag`
        FROM `run_tag`
        WHERE `id` = :run_id
        """,
    )
    result = await expdb.execute(query, parameters={"run_id": run_id})
    return [tag_row.tag for tag_row in result.all()]
|
|
||
|
|
||
async def get_input_data(run_id: int, expdb: AsyncConnection) -> list[Row]:
    """List the dataset(s) that served as input to a run.

    The `input_data` table is joined against `dataset` so each returned
    row carries the dataset id (`did`), its name, and its ARFF file url.
    """
    query = text(
        """
        SELECT `id`.`data` AS `did`, `d`.`name`, `d`.`url`
        FROM `input_data` `id`
        JOIN `dataset` `d` ON `id`.`data` = `d`.`did`
        WHERE `id`.`run` = :run_id
        """,
    )
    result = await expdb.execute(query, parameters={"run_id": run_id})
    return cast("list[Row]", result.all())
|
|
||
|
|
||
async def get_output_files(run_id: int, expdb: AsyncConnection) -> list[Row]:
    """Return the files a run produced, as recorded in the `runfile` table.

    Each row exposes `file_id` and `field`, where `field` is the label of
    the file (for example "description" or "predictions"). Typical entries
    are the description XML and the predictions ARFF.
    """
    query = text(
        """
        SELECT `file_id`, `field`
        FROM `runfile`
        WHERE `source` = :run_id
        """,
    )
    result = await expdb.execute(query, parameters={"run_id": run_id})
    return cast("list[Row]", result.all())
|
|
||
|
|
||
async def get_evaluations(
    run_id: int,
    expdb: AsyncConnection,
    *,
    evaluation_engine_ids: list[int],
) -> list[Row]:
    """Return the evaluation metric rows recorded for a run.

    The `evaluation` table stores only a `function_id`, so it is joined
    with `math_function` to obtain the metric name. Only evaluations from
    engines listed in `evaluation_engine_ids` are returned; that list is
    configurable via `config.toml [run] evaluation_engine_ids`.

    Named bind parameters are generated dynamically, one per engine id,
    for aiomysql compatibility.
    """
    if not evaluation_engine_ids:
        return []

    # One named placeholder (:eid_0, :eid_1, ...) per configured engine id.
    engine_params = {
        f"eid_{index}": engine_id
        for index, engine_id in enumerate(evaluation_engine_ids)
    }
    in_clause = ", ".join(f":{name}" for name in engine_params)

    query = text(
        f"""
        SELECT `m`.`name`, `e`.`value`, `e`.`array_data`
        FROM `evaluation` `e`
        JOIN `math_function` `m` ON `e`.`function_id` = `m`.`id`
        WHERE `e`.`source` = :run_id
            AND `e`.`evaluation_engine_id` IN ({in_clause})
        """,  # noqa: S608 # placeholders are trusted integer params, not user input
    )
    result = await expdb.execute(query, parameters={"run_id": run_id, **engine_params})
    return cast("list[Row]", result.all())
|
|
||
|
|
||
async def get_task_type(task_id: int, expdb: AsyncConnection) -> str | None:
    """Resolve the human-readable task type name for a task.

    `task` and `task_type` are joined on `ttid`, yielding names such as
    "Supervised Classification". Returns None when the task is unknown.
    """
    query = text(
        """
        SELECT `tt`.`name`
        FROM `task` `t`
        JOIN `task_type` `tt` ON `t`.`ttid` = `tt`.`ttid`
        WHERE `t`.`task_id` = :task_id
        """,
    )
    response = await expdb.execute(query, parameters={"task_id": task_id})
    match = response.one_or_none()
    return None if match is None else match.name
|
|
||
|
|
||
async def get_task_evaluation_measure(task_id: int, expdb: AsyncConnection) -> str | None:
    """Return the evaluation measure configured for a task, if any.

    Looks for the `task_inputs` row whose `input` column equals
    'evaluation_measures'. When no such row exists, None is returned
    rather than an empty string, so callers can rely on a uniformly
    falsy result.
    """
    query = text(
        """
        SELECT `value`
        FROM `task_inputs`
        WHERE `task_id` = :task_id
            AND `input` = 'evaluation_measures'
        """,
    )
    response = await expdb.execute(query, parameters={"task_id": task_id})
    match = response.one_or_none()
    return None if match is None else match.value
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please move these to the tasks module. I will have a look at consolidating these if appropriate. |
||
|
|
||
|
|
||
| async def get_trace(run_id: int, expdb: AsyncConnection) -> Sequence[Row]: | ||
| """Get trace rows for a run from the trace table.""" | ||
| rows = await expdb.execute( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,14 +1,29 @@ | ||
| """Endpoints for run-related data.""" | ||
|
|
||
| from typing import Annotated | ||
| import asyncio | ||
| from typing import TYPE_CHECKING, Annotated, cast | ||
|
|
||
| from fastapi import APIRouter, Depends | ||
|
|
||
| if TYPE_CHECKING: | ||
| from sqlalchemy import Row | ||
| from sqlalchemy.ext.asyncio import AsyncConnection | ||
|
|
||
| import config | ||
| import database.flows | ||
| import database.runs | ||
| import database.setups | ||
| from core.errors import RunNotFoundError, RunTraceNotFoundError | ||
| from routers.dependencies import expdb_connection | ||
| from schemas.runs import RunTrace, TraceIteration | ||
| from routers.dependencies import expdb_connection, userdb_connection | ||
| from schemas.runs import ( | ||
| EvaluationScore, | ||
| InputDataset, | ||
| OutputFile, | ||
| ParameterSetting, | ||
| Run, | ||
| RunTrace, | ||
| TraceIteration, | ||
| ) | ||
|
|
||
| router = APIRouter(prefix="/run", tags=["run"]) | ||
|
|
||
|
|
@@ -42,3 +57,122 @@ async def get_run_trace( | |
| for row in trace_rows | ||
| ], | ||
| ) | ||
|
|
||
|
|
||
@router.post(
    path="/{run_id}",
    description="Provided for convenience, same as `GET` endpoint.",
    response_model_exclude_none=True,
)
@router.get("/{run_id}", response_model_exclude_none=True)
async def get_run(
    run_id: int,
    expdb: Annotated[AsyncConnection, Depends(expdb_connection)],
    userdb: Annotated[AsyncConnection, Depends(userdb_connection)],
) -> Run:
    """Get full metadata for a run by ID.

    No authentication or visibility check is performed — all runs are
    publicly accessible.
    """
    # Core run record — all other data depends on uploader, setup, and task_id.
    run = await database.runs.get(run_id, expdb)
    if run is None:
        msg = f"Run {run_id} not found."
        # Reuse RunNotFoundError and pass code=236 at the call site for
        # backward compat with the PHP GET /run/{id} error code
        raise RunNotFoundError(msg, code=236)

    # Evaluation engine IDs come from config.toml [run] so they can be
    # extended when a new evaluation engine is deployed, without code changes.
    engine_ids: list[int] = config.load_run_configuration().get("evaluation_engine_ids", [1])

    # The queries below run sequentially on purpose: a SQLAlchemy
    # AsyncConnection is stateful and must not be used from multiple
    # concurrent asyncio tasks (e.g. via asyncio.gather). Awaiting them
    # one after another on the shared connections is safe.
    uploader_name = await database.runs.get_uploader_name(run.uploader, userdb)
    tags = await database.runs.get_tags(run_id, expdb)
    input_data_rows = await database.runs.get_input_data(run_id, expdb)
    output_file_rows = await database.runs.get_output_files(run_id, expdb)
    evaluation_rows = await database.runs.get_evaluations(
        run_id,
        expdb,
        evaluation_engine_ids=engine_ids,
    )
    task_type = await database.runs.get_task_type(run.task_id, expdb)
    task_evaluation_measure = await database.runs.get_task_evaluation_measure(run.task_id, expdb)
    setup = await database.setups.get(run.setup, expdb)
    parameter_rows = await database.setups.get_parameters(run.setup, expdb)

    # Flow is fetched last because it requires setup.implementation_id.
    # flows.get() selects fullName AS full_name for reliable case-insensitive access.
    flow = await database.flows.get(setup.implementation_id, expdb) if setup else None

    # Build parameter_setting list from the denormalised parameter rows
    # returned by database.setups.get_parameters (which already JOINs input + implementation).
    parameter_settings = [
        ParameterSetting(
            name=p["name"],
            value=p["value"],
            component=p["flow_id"],  # implementation_id of the sub-flow owning this param
        )
        for p in parameter_rows
    ]

    input_datasets = [
        InputDataset(did=row.did, name=row.name, url=row.url) for row in input_data_rows
    ]

    # runfile.field is the file label (e.g. "description", "predictions").
    # The PHP response also included a deprecated `did` field hardcoded to
    # "-1" for each file; it is omitted entirely here.
    output_files = [OutputFile(file_id=row.file_id, name=row.field) for row in output_file_rows]

    evaluations = [
        EvaluationScore(
            name=row.name,
            # Whole-number floats (e.g. counts) are converted to int to match PHP's
            # integer representation. e.g. 253.0 → 253, 0.0 → 0.
            value=int(row.value)
            if isinstance(row.value, float) and row.value.is_integer()
            else row.value,
            array_data=row.array_data,
        )
        for row in evaluation_rows
    ]

    # Normalise task_evaluation_measure: empty string → None so the field is
    # excluded entirely by response_model_exclude_none=True (matches PHP behaviour
    # of returning "" but we opt to omit rather than return an empty string).
    normalised_measure = task_evaluation_measure or None

    # error_message is NULL in the DB when the run has no error.
    # The PHP response returns an empty array [] in that case.
    error_messages = [run.error_message] if run.error_message else []

    return Run(
        run_id=run_id,
        uploader=run.uploader,
        uploader_name=uploader_name,
        task_id=run.task_id,
        task_type=task_type,
        task_evaluation_measure=normalised_measure,
        flow_id=setup.implementation_id if setup else 0,
        flow_name=flow.full_name if flow else None,
        setup_id=run.setup,
        setup_string=setup.setup_string if setup else None,
        parameter_setting=parameter_settings,
        error_message=error_messages,
        tag=tags,
        # Preserve PHP envelope structure for backward compat
        input_data={"dataset": input_datasets},
        output_data={"file": output_files, "evaluation": evaluations},
    )
|
coderabbitai[bot] marked this conversation as resolved.
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
When resolving merge conflicts, please make this a
DevelopmentConfiguration option for now.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
sure.