diff --git a/ami/main/admin.py b/ami/main/admin.py index 46fbb70b9..404325a93 100644 --- a/ami/main/admin.py +++ b/ami/main/admin.py @@ -13,6 +13,9 @@ from ami import tasks from ami.jobs.models import Job from ami.ml.models.project_pipeline_config import ProjectPipelineConfig +from ami.ml.post_processing.admin.actions import make_post_processing_action +from ami.ml.post_processing.admin.small_size_filter_form import SmallSizeFilterActionForm +from ami.ml.post_processing.small_size_filter import SmallSizeFilterTask from ami.ml.tasks import remove_duplicate_classifications from .models import ( @@ -34,6 +37,30 @@ Taxon, ) +# PostgreSQL ``bigint`` upper bound. The primary keys on these models are +# BigAutoField, so an all-digit search term longer than this cannot be a valid id. +_BIGINT_MAX = 9223372036854775807 + + +class IdSearchAdminMixin: + """Treat an all-digit admin search term as an exact primary-key lookup. + + The ids on these models are numeric and their text search fields (taxon and + determination names, image paths) never are, so a bare number is unambiguous and + jumps straight to that row. Anything else falls through to the normal + ``search_fields`` search. A number too large to be a valid id returns no results + rather than raising a database ``DataError``. 
+ """ + + def get_search_results(self, request: HttpRequest, queryset: QuerySet[Any], search_term: str): + term = search_term.strip() + if term.isdigit(): + pk = int(term) + if pk > _BIGINT_MAX: + return queryset.none(), False + return queryset.filter(pk=pk), False + return super().get_search_results(request, queryset, search_term) # type: ignore[misc] + class ProjectPipelineConfigInline(admin.TabularInline): model = ProjectPipelineConfig @@ -355,6 +382,9 @@ def get_queryset(self, request: HttpRequest) -> QuerySet[Any]: class ClassificationInline(admin.TabularInline): model = Classification extra = 0 + # Link each row to its Classification change page, where the full scores / + # logits and the applied_to chain (post-processing provenance) are visible. + show_change_link = True fields = ( "taxon", "algorithm", @@ -378,6 +408,9 @@ def get_queryset(self, request: HttpRequest) -> QuerySet[Any]: class DetectionInline(admin.TabularInline): model = Detection extra = 0 + # Link each row to its Detection change page, where the classifications + # inline shows which algorithms (including post-processing) were applied. + show_change_link = True fields = ( "detection_algorithm", "source_image", @@ -395,7 +428,7 @@ class DetectionInline(admin.TabularInline): @admin.register(Detection) -class DetectionAdmin(admin.ModelAdmin[Detection]): +class DetectionAdmin(IdSearchAdminMixin, admin.ModelAdmin[Detection]): """Admin panel example for ``Detection`` model.""" list_display = ( @@ -409,11 +442,32 @@ class DetectionAdmin(admin.ModelAdmin[Detection]): ) autocomplete_fields = ("source_image", "occurrence") + # A digit term jumps to that detection by id (IdSearchAdminMixin); text searches path. + search_fields = ("source_image__path",) + # Skip the extra unfiltered COUNT(*) the changelist runs for its total; on a large + # table that count is as expensive as the page query it accompanies. 
+ show_full_result_count = False def get_queryset(self, request: HttpRequest) -> QuerySet[Any]: - qs = super().get_queryset(request) - return qs.select_related("source_image", "occurrence").annotate( - classifications_count=models.Count("classifications"), + from django.db.models.functions import Coalesce + + qs = super().get_queryset(request).select_related("source_image", "occurrence") + # Correlated subquery instead of Count("classifications") + GROUP BY. The + # grouped aggregate over the whole detection x classification join must run + # before ORDER BY ... LIMIT can take a page, which on a large table is slow + # enough to exhaust work_mem and error out. The subquery runs only for the + # rows on the page. Coalesce maps "no classifications" to 0. + classifications_count = ( + Classification.objects.filter(detection=models.OuterRef("pk")) + .order_by() + .values("detection") + .annotate(c=models.Count("*")) + .values("c") + ) + return qs.annotate( + classifications_count=Coalesce( + models.Subquery(classifications_count, output_field=models.IntegerField()), 0 + ) ) @admin.display( @@ -423,13 +477,15 @@ def get_queryset(self, request: HttpRequest) -> QuerySet[Any]: def classifications_count(self, obj) -> int: return obj.classifications_count - ordering = ("-created_at",) + # Order by -id (indexed PK) rather than -created_at, which has no index and + # forces a full sort of the table to find the newest page. + ordering = ("-id",) inlines = [ClassificationInline] @admin.register(Occurrence) -class OccurrenceAdmin(admin.ModelAdmin[Occurrence]): +class OccurrenceAdmin(IdSearchAdminMixin, admin.ModelAdmin[Occurrence]): """Admin panel example for ``Occurrence`` model.""" list_display = ( @@ -450,19 +506,33 @@ class OccurrenceAdmin(admin.ModelAdmin[Occurrence]): "determination__rank", "created_at", ) + # A digit term jumps to that occurrence by id (IdSearchAdminMixin); text searches names. 
search_fields = ("determination__name", "determination__search_names") + # Skip the extra unfiltered COUNT(*) the changelist runs for its total; on a large + # table that count is as expensive as the page query it accompanies. + show_full_result_count = False def get_queryset(self, request: HttpRequest) -> QuerySet[Any]: + from django.db.models.functions import Coalesce + qs = super().get_queryset(request) qs = qs.select_related("determination", "project", "deployment", "event") - # Add detections count to queryset - qs = qs.annotate(detections_count=models.Count("detections")) - # Add min, max and avg detection__classifications counts to queryset - # qs = qs.annotate( - # min_detection_classifications=models.Min("detections__classifications"), - # max_detection_classifications=models.Max("detections__classifications"), - # avg_detection_classifications=models.Avg("detections__classifications"), - # ) + # Count detections with a correlated subquery instead of a JOIN + GROUP BY. + # A grouped count must aggregate the whole occurrence x detection join before + # the changelist's ORDER BY ... LIMIT can take a page, so it scans every row to + # show 25 (~15s on a 1.3M-row table). The subquery runs only for the rows that + # survive the limit. Coalesce maps "no detections" to 0 (a bare subquery is NULL, + # where the old JOIN count returned 0). + detections_count = ( + Detection.objects.filter(occurrence=models.OuterRef("pk")) + .order_by() + .values("occurrence") + .annotate(c=models.Count("*")) + .values("c") + ) + qs = qs.annotate( + detections_count=Coalesce(models.Subquery(detections_count, output_field=models.IntegerField()), 0) + ) return qs @admin.display( @@ -472,14 +542,45 @@ def get_queryset(self, request: HttpRequest) -> QuerySet[Any]: def detections_count(self, obj) -> int: return obj.detections_count - ordering = ("-created_at",) + # Per-occurrence post-processing trigger. 
Same factory as the capture-set + # action on SourceImageCollectionAdmin, scoped to one occurrence — the fast + # spot/dev path for iterating on a filter without running a whole collection. + # New per-occurrence tasks add their own action here the same way. + run_small_size_filter = make_post_processing_action( + SmallSizeFilterTask, + SmallSizeFilterActionForm, + scope_resolver=lambda occurrence: {"occurrence_id": occurrence.pk}, + name_resolver=lambda task_cls, occurrence: (f"Post-processing: {task_cls.name} on Occurrence {occurrence.pk}"), + ) + + @admin.action(description="Recompute determination from current classifications and identifications") + def recompute_determination(self, request: HttpRequest, queryset: QuerySet[Any]) -> None: + """Re-derive each selected occurrence's determination from its current + predictions and human identifications. + + Editing an occurrence's classifications by hand does not recompute its + determination — only Occurrence and Identification saves do — so this action + is the way to refresh it after manual changes. + """ + count = 0 + for occurrence in queryset: + occurrence.save(update_determination=True) + count += 1 + self.message_user(request, f"Recomputed determination for {count} occurrence(s).") + + actions = [run_small_size_filter, recompute_determination] + + # Order by -id (the indexed primary key) rather than -created_at, which has no + # index and would force a full sort of the table to find the newest page. id + # increases with insertion time, so newest-first is preserved. 
+ ordering = ("-id",) # Add classifications as inline inlines = [DetectionInline] @admin.register(Classification) -class ClassificationAdmin(admin.ModelAdmin[Classification]): +class ClassificationAdmin(IdSearchAdminMixin, admin.ModelAdmin[Classification]): list_display = ( "__str__", "taxon", @@ -499,13 +600,38 @@ class ClassificationAdmin(admin.ModelAdmin[Classification]): "detection__source_image__project", "taxon__rank", ) + # FK fields render as AJAX autocompletes instead of uses that name. + self.assertIn(b'name="size_threshold"', response.content) + # No Job created on the GET-equivalent step. + self.assertEqual( + Job.objects.filter(project=self.project, job_type_key="post_processing").count(), + 0, + ) + + def test_select_across_is_refused_without_creating_jobs(self): + # "Select all across pages" would serialize the whole table into hidden + # inputs; the action refuses it instead of rendering an unbounded form. + response = self._post({"confirm": "yes", "size_threshold": "0.001", "select_across": "1"}) + self.assertEqual(response.status_code, 302) + self.assertEqual( + Job.objects.filter(project=self.project, job_type_key="post_processing").count(), + 0, + ) + + def test_invalid_threshold_rerenders_form_with_error(self): + response = self._post({"confirm": "yes", "size_threshold": "2.0"}) + self.assertEqual(response.status_code, 200) + # No Job created when form fails. + self.assertEqual( + Job.objects.filter(project=self.project, job_type_key="post_processing").count(), + 0, + ) + # Field error rendered inline (Django's errorlist, not the page-level errornote banner). 
+ self.assertIn(b"errorlist", response.content) + + +class TestSmallSizeFilterCreatesJob(_SmallSizeFilterAdminCase): + def test_valid_post_creates_one_job_with_threshold_in_config(self): + response = self._post({"confirm": "yes", "size_threshold": "0.001"}) + self.assertEqual(response.status_code, 302) + + job = Job.objects.get( + project=self.project, + job_type_key="post_processing", + ) + self.assertEqual(job.params["task"], "small_size_filter") + self.assertEqual(job.params["config"]["size_threshold"], 0.001) + self.assertEqual(job.params["config"]["source_image_collection_id"], self.collection.pk) + + def test_success_message_links_to_the_created_job(self): + """The post-run admin message links each created Job to its admin change + page so the operator can follow progress and read any failure reason.""" + from django.contrib.messages import get_messages + + response = self._post({"confirm": "yes", "size_threshold": "0.001"}) + self.assertEqual(response.status_code, 302) + job = Job.objects.get(project=self.project, job_type_key="post_processing") + + text = " ".join(str(message) for message in get_messages(response.wsgi_request)) + self.assertIn(reverse("admin:jobs_job_change", args=[job.pk]), text) + self.assertIn(f">Job {job.pk}", text) + + +class TestSmallSizeFilterOccurrenceScope(TestCase): + """The per-occurrence trigger on OccurrenceAdmin uses the same factory with an + ``occurrence_id`` scope — the fast spot/dev path for iterating on a filter.""" + + @classmethod + def setUpTestData(cls) -> None: + cls.superuser = User.objects.create_superuser(email="ssfocc@example.com", password="x") + cls.project = Project.objects.create(name="SSF occurrence-scope test") + cls.occurrence = Occurrence.objects.create(project=cls.project) + + def setUp(self) -> None: + self.client = Client() + self.client.force_login(self.superuser) + + def test_valid_post_creates_one_job_scoped_to_the_occurrence(self): + url = reverse("admin:main_occurrence_changelist") + response = 
self.client.post( + url, + data={ + "action": "run_small_size_filter", + django_admin.helpers.ACTION_CHECKBOX_NAME: [str(self.occurrence.pk)], + "confirm": "yes", + "size_threshold": "0.001", + }, + ) + self.assertEqual(response.status_code, 302) + + job = Job.objects.get(project=self.project, job_type_key="post_processing") + self.assertEqual(job.params["task"], "small_size_filter") + self.assertEqual(job.params["config"]["occurrence_id"], self.occurrence.pk) + # Collection scope stays absent so the schema's exactly-one-scope rule holds. + self.assertIsNone(job.params["config"].get("source_image_collection_id")) + + +class TestSmallSizeFilterMultiCollection(_SmallSizeFilterAdminCase): + @classmethod + def setUpTestData(cls) -> None: + super().setUpTestData() + # Second collection in a different project. + cls.other_project = Project.objects.create(name="SSF admin test (other project)") + cls.other_collection = SourceImageCollection.objects.create( + project=cls.other_project, + name="Other collection", + ) + + def test_multi_collection_creates_one_job_per_collection_with_correct_project_fk(self): + response = self._post( + {"confirm": "yes", "size_threshold": "0.001"}, + pks=[self.collection.pk, self.other_collection.pk], + ) + self.assertEqual(response.status_code, 302) + + jobs = Job.objects.filter(job_type_key="post_processing").order_by("project_id") + self.assertEqual(jobs.count(), 2) + + by_project = {j.project_id: j for j in jobs} + self.assertEqual( + by_project[self.project.pk].params["config"]["source_image_collection_id"], + self.collection.pk, + ) + self.assertEqual( + by_project[self.other_project.pk].params["config"]["source_image_collection_id"], + self.other_collection.pk, + ) diff --git a/ami/ml/tests.py b/ami/ml/tests.py index 898a69d11..6781f80ee 100644 --- a/ami/ml/tests.py +++ b/ami/ml/tests.py @@ -13,11 +13,13 @@ Deployment, Detection, Event, + Identification, Occurrence, Project, SourceImage, SourceImageCollection, Taxon, + TaxonRank, 
group_images_into_events, ) from ami.ml.models import Algorithm, Pipeline, ProcessingService @@ -1436,24 +1438,29 @@ def test_labels_data_conversion_methods(self): class TestPostProcessingTasks(TestCase): - def setUp(self): - # Create test project, deployment, and default setup - self.project, self.deployment = setup_test_project() - create_taxa(project=self.project) - self._create_images_with_dimensions(deployment=self.deployment) - group_images_into_events(deployment=self.deployment) + @classmethod + def setUpTestData(cls): + # Project, taxa, images, events, and the collection are read-only from the + # tests' point of view — build them once per class. Detections (and the + # task runs that mutate them) happen per-test inside each test's + # rolled-back transaction. + cls.project, cls.deployment = setup_test_project() + create_taxa(project=cls.project) + cls._create_images_with_dimensions(deployment=cls.deployment) + group_images_into_events(deployment=cls.deployment) # Create a simple SourceImageCollection for testing - self.collection = SourceImageCollection.objects.create( + cls.collection = SourceImageCollection.objects.create( name="Test PostProcessing Collection", - project=self.project, + project=cls.project, method="manual", - kwargs={"image_ids": list(self.deployment.captures.values_list("pk", flat=True))}, + kwargs={"image_ids": list(cls.deployment.captures.values_list("pk", flat=True))}, ) - self.collection.populate_sample() + cls.collection.populate_sample() + @classmethod def _create_images_with_dimensions( - self, + cls, deployment, num_images: int = 5, width: int = 640, @@ -1526,6 +1533,136 @@ def test_small_size_filter_assigns_not_identifiable(self): f"Occurrence {occurrence.pk} should have its determination set to 'Not identifiable'.", ) + def test_occurrence_scope_only_touches_that_occurrence(self): + """Per-occurrence scope: running with ``occurrence_id`` flags only that + occurrence's detections and leaves sibling occurrences untouched.""" + 
detections = [] + for image in self.collection.images.all(): + det = Detection.objects.create( + source_image=image, + bbox=[0, 0, 10, 10], # small + created_at=datetime.datetime.now(datetime.timezone.utc), + ) + det.associate_new_occurrence() + detections.append(det) + self.assertGreaterEqual(len(detections), 2) + + target = detections[0] + SmallSizeFilterTask(occurrence_id=target.occurrence_id, size_threshold=0.01).run() + + not_identifiable_taxon = Taxon.objects.get(name="Not identifiable") + self.assertEqual( + Classification.objects.filter(detection=target, taxon=not_identifiable_taxon).count(), + 1, + "The scoped occurrence's detection should be flagged.", + ) + for other in detections[1:]: + self.assertFalse( + Classification.objects.filter(detection=other, taxon=not_identifiable_taxon).exists(), + f"Detection {other.pk} outside the scoped occurrence should be untouched.", + ) + + def test_run_reports_stage_metrics_on_job(self): + """The task surfaces ``detections_checked`` / ``detections_flagged`` / + ``occurrences_updated`` as stage params on its Job so an operator can see + what a run examined and changed without reading the log.""" + from ami.jobs.models import Job + + for image in self.collection.images.all(): + Detection.objects.create( + source_image=image, + bbox=[0, 0, 10, 10], # small → flagged + created_at=datetime.datetime.now(datetime.timezone.utc), + ).associate_new_occurrence() + total = Detection.objects.filter(source_image__in=self.collection.images.all()).count() + self.assertGreater(total, 0) + + job = Job.objects.create( + project=self.project, + name="stage metrics test", + job_type_key="post_processing", + params={ + "task": "small_size_filter", + "config": {"source_image_collection_id": self.collection.pk, "size_threshold": 0.01}, + }, + ) + job.progress.add_stage("Post Processing", key="post_processing") + job.save() + + SmallSizeFilterTask( + job=job, + source_image_collection_id=self.collection.pk, + size_threshold=0.01, + ).run() + 
+ job.refresh_from_db() + params = {p.name: p.value for p in job.progress.get_stage("post_processing").params} + self.assertEqual(params.get("detections_checked"), total) + self.assertEqual(params.get("detections_flagged"), total) # every detection is small + # Each detection has its own occurrence here, so the deduped occurrence + # count equals the detection count. + self.assertEqual(params.get("occurrences_updated"), total) + + def test_occurrences_updated_counts_only_changed_determinations(self): + """``occurrences_updated`` counts occurrences whose determination actually + changed, not every occurrence the filter re-saved. + + An occurrence already pinned to a human identification keeps that + determination when its detection is flagged "Not identifiable", so it must + not inflate the metric. Only the un-identified occurrence, whose + determination flips, is counted. + """ + from ami.jobs.models import Job + + images = list(self.collection.images.all()) + self.assertGreaterEqual(len(images), 2) + + # Occurrence A: small detection, but a human identification pins the + # determination — flagging the detection does not change it. + human_taxon = Taxon.objects.create(name="Human-pinned species", rank=TaxonRank.SPECIES) + identifier = User.objects.create_user(email="identifier@insectai.org") # type: ignore[attr-defined] + det_with_id = Detection.objects.create( + source_image=images[0], + bbox=[0, 0, 10, 10], + created_at=datetime.datetime.now(datetime.timezone.utc), + ) + det_with_id.associate_new_occurrence() + Identification.objects.create(user=identifier, occurrence=det_with_id.occurrence, taxon=human_taxon) + + # Occurrence B: small detection, no identification — its determination + # flips to "Not identifiable" and is the only real change. 
+ det_plain = Detection.objects.create( + source_image=images[1], + bbox=[0, 0, 10, 10], + created_at=datetime.datetime.now(datetime.timezone.utc), + ) + det_plain.associate_new_occurrence() + + job = Job.objects.create( + project=self.project, + name="changed-determination metric test", + job_type_key="post_processing", + params={ + "task": "small_size_filter", + "config": {"source_image_collection_id": self.collection.pk, "size_threshold": 0.01}, + }, + ) + job.progress.add_stage("Post Processing", key="post_processing") + job.save() + + SmallSizeFilterTask( + job=job, + source_image_collection_id=self.collection.pk, + size_threshold=0.01, + ).run() + + job.refresh_from_db() + params = {p.name: p.value for p in job.progress.get_stage("post_processing").params} + # Both detections are flagged small, but only the un-identified + # occurrence's determination changes, so only it is counted. + self.assertEqual(params.get("detections_flagged"), 2) + self.assertEqual(params.get("occurrences_updated"), 1) + class TestTaskStateManager(TestCase): """Test TaskStateManager for job progress tracking.""" diff --git a/ami/templates/admin/post_processing/_form_fieldset.html b/ami/templates/admin/post_processing/_form_fieldset.html new file mode 100644 index 000000000..d20478d1a --- /dev/null +++ b/ami/templates/admin/post_processing/_form_fieldset.html @@ -0,0 +1,9 @@ +{{ form.non_field_errors }} +{% for field in form %} +
+ {{ field.label_tag }} + {{ field }} + {% if field.help_text %}

{{ field.help_text }}

{% endif %} + {{ field.errors }} +
+{% endfor %} diff --git a/ami/templates/admin/post_processing/confirmation.html b/ami/templates/admin/post_processing/confirmation.html new file mode 100644 index 000000000..565f3f2e0 --- /dev/null +++ b/ami/templates/admin/post_processing/confirmation.html @@ -0,0 +1,37 @@ +{% extends "admin/base_site.html" %} + +{% load i18n admin_urls %} + +{% block title %} + {{ title }} | {{ site_title|default:_("Django site admin") }} +{% endblock title %} +{% block breadcrumbs %} + +{% endblock breadcrumbs %} +{% block content %} +
+ {% csrf_token %} + {% block intro %} +

+ You are about to run {{ task_label }} on + {{ selected_count }} selected + {{ model_meta.verbose_name }}{{ selected_count|pluralize }}. +

+ {% endblock intro %} +
+ {% translate "Parameters" %} + {% include "admin/post_processing/_form_fieldset.html" with form=form %} +
+ {% for pk in selected_pks %}{% endfor %} + + +
+ + {% translate 'Cancel' %} +
+
+{% endblock content %} diff --git a/docs/claude/planning/2026-05-01-post-processing-admin-scaffolding-design.md b/docs/claude/planning/2026-05-01-post-processing-admin-scaffolding-design.md new file mode 100644 index 000000000..d2ebbc991 --- /dev/null +++ b/docs/claude/planning/2026-05-01-post-processing-admin-scaffolding-design.md @@ -0,0 +1,347 @@ +# Post-Processing Admin Scaffolding — Design + +**Status:** Draft (awaiting user review) +**Date:** 2026-05-01 +**Branch:** `feat/post-processing-admin-scaffolding` +**Author:** Michael Bunsen (with Claude Opus 4.7) + +> **Update (2026-06-04):** Following review feedback, the admin controller glue +> was abstracted rather than left as per-task copy-paste. The confirm/render/ +> validate/enqueue flow now lives in `ami/ml/post_processing/admin/actions.py` +> as `make_post_processing_action(task_cls, form_class, scope_resolver=..., build_jobs=...)`. +> Each task declares only what varies (task class, knob form, row→Job mapping); +> tasks that don't fit one-Job-per-row (e.g. #1272's per-project event +> partitioning) pass a custom `build_jobs` callable. Config validation is owned +> solely by the task's pydantic `config_schema` — the knob form no longer +> re-encodes the bounds, and schema errors are mapped back onto the form for +> inline display. This supersedes the "module-private `_render_confirmation`, +> lift later" plan in the "Admin Action Rewrite" section below. + +## Context + +`ami/ml/post_processing/` currently ships one task on main: `SmallSizeFilterTask` (PR #954, merged). Two open PRs add more post-processing tasks and each independently grew its own admin-trigger plumbing: + +- **PR #999** (`feat/postprocessing-class-masking`, mohamedelabbas1996, open since 2025-10-14) adds `class_masking` and `rank_rollup` tasks. Admin trigger uses hand-rolled HTML in the action method; no `forms.Form` class. +- **PR #1272** (`claude/revive-tracking-feature-OyMO3`, current author, open) adds `tracking` task. 
Admin trigger uses a `forms.Form` subclass in `ami/ml/post_processing/admin_forms.py` with scope-aware dropdown init and per-project Job partitioning. + +Both PRs touch `ami/main/admin.py`, `ami/ml/post_processing/registry.py`, and add a near-identical `*_confirmation.html` template. Three independent ad-hoc patterns are emerging where one shared one would do. + +The existing `SmallSizeFilterTask` already reads `size_threshold` from `Job.params['config']` (default `0.0008`) — but the admin trigger hardcodes empty config, so the knob is unreachable. There's even a TODO comment in `small_size_filter.py:14` asking *"Could we use a pydantic model for config validation if it's just for this task?"*. This precursor PR answers that question and lands the answer as the shared pattern. + +## Goal + +Land a small precursor PR that establishes the shared admin-trigger pattern for post-processing tasks, using `SmallSizeFilterTask` as the migration consumer. Both #999 and #1272 rebase onto it and adopt the pattern. No new domain logic, no PR coordination drama, no carving up another contributor's work. + +Admin is **not** the long-term primary trigger surface for post-processing — REST API + UI will eventually drive this. The scaffolding here optimises for current-state needs (admin-only) without painting future API integration into a corner. + +## Scope + +### In + +1. **Pydantic config schema contract** on `BasePostProcessingTask` +2. **`BasePostProcessingActionForm`** — django form base class, `cleaned_data → config` contract +3. **Parameterized confirmation template** + form-fieldset partial +4. 
**Migrate `SmallSizeFilterTask`** to new pattern: + - Add `SmallSizeFilterConfig(size_threshold: float = 0.0008, source_image_collection_id: int)` pydantic model + - Add `SmallSizeFilterActionForm` with one field: `size_threshold` (`FloatField`, validation: `0 < x < 1`) + - Rewrite `SourceImageCollectionAdmin.run_small_size_filter` to render intermediate confirmation page using new template + form +5. **Tests** for scaffolding + migrated task + +### Out + +- Project-partitioning helper (defer to whichever multi-scope adopter lands first — #999 or #1272) +- REST API endpoints for triggering post-processing +- Management commands +- pgvector migrations (#1272 territory) +- Rank rollup (stays in #999 — PR coordination unnecessary now) +- Class masking (stays in #999) +- Tracking (stays in #1272) + +## Module Layout + +```text +ami/ml/post_processing/ +├── base.py # MODIFIED — +config_schema contract +├── registry.py # unchanged +├── small_size_filter.py # MODIFIED — schema-validated config, .config now BaseModel +├── admin/ # NEW +│ ├── __init__.py +│ ├── forms.py # BasePostProcessingActionForm +│ └── small_size_filter_form.py # SmallSizeFilterActionForm +└── tests/ # NEW directory (existing tests in ami/ml/tests.py) + ├── __init__.py + ├── test_base_schema.py + ├── test_admin_form.py + └── test_small_size_filter_admin.py + +ami/templates/admin/post_processing/ # NEW +├── confirmation.html # parameterized shell +└── _form_fieldset.html # partial — renders form fields uniformly + +ami/main/admin.py # MODIFIED — run_small_size_filter rewrites onto new pattern +``` + +Path note: spec uses `ami/ml/post_processing/admin/` as user requested. Tracking PR's `admin_forms.py` (top-level module) becomes `admin/tracking_form.py` on its rebase. + +## Pydantic Schema Contract + +`BasePostProcessingTask` gains a required class attribute `config_schema: type[BaseModel]` and validates config at construction. 
+ +```python +# ami/ml/post_processing/base.py (sketch) +import pydantic + +class BasePostProcessingTask(abc.ABC): + key: str + name: str + config_schema: type[pydantic.BaseModel] # NEW + + def __init_subclass__(cls, **kwargs): + super().__init_subclass__(**kwargs) + for attr in ("key", "name", "config_schema"): + if not hasattr(cls, attr) or getattr(cls, attr) is None: + raise TypeError(f"{cls.__name__} must define '{attr}' class attribute") + + def __init__(self, job=None, logger=None, **config): + self.job = job + # Validate config against schema. Raises pydantic.ValidationError on bad input. + self.config: pydantic.BaseModel = self.config_schema(**config) + # ... existing logger + algorithm setup unchanged ... +``` + +**Pydantic version:** repo uses Pydantic v1 (per `requirements/base.txt` + container-side memory). Use v1 syntax: `BaseModel`, `Field`, `validator`, `.dict()`. No `model_dump`/`model_validate`. + +**Validation timing:** + +- **Worker side (always):** `BasePostProcessingTask.__init__` validates. Bad config in a Job's params → task crashes with clear pydantic error in job logs. +- **Admin side (added in this PR):** the rewritten `run_small_size_filter` calls `SmallSizeFilterConfig(**form.to_config(), source_image_collection_id=collection.pk)` *before* `Job.objects.create`. Validation error → form re-renders with error, no Job created. + +**Per-task schema example:** + +```python +# ami/ml/post_processing/small_size_filter.py +class SmallSizeFilterConfig(pydantic.BaseModel): + source_image_collection_id: int + size_threshold: float = 0.0008 + + @pydantic.validator("size_threshold") + def _threshold_in_unit_interval(cls, v): + if not (0.0 < v < 1.0): + raise ValueError("size_threshold must be in (0, 1) exclusive") + return v + + class Config: + extra = "forbid" # unknown keys rejected — catches typos +``` + +**Migration impact for existing in-flight Jobs:** none. 
`Job.params['config']` payloads already match the new schema (only `source_image_collection_id` is required; `size_threshold` defaults). Workers picking up old jobs after deploy will validate cleanly. + +## Admin Form Base + +```python +# ami/ml/post_processing/admin/forms.py +class BasePostProcessingActionForm(forms.Form): + """Base for admin forms that build BasePostProcessingTask config dicts. + + Subclass adds knob fields. Override to_config() if mapping isn't 1:1 + (e.g. drop empty optional fields, derive computed values). + """ + + def to_config(self) -> dict: + return dict(self.cleaned_data) +``` + +That's it. The form base is intentionally thin — it's a contract marker (so admin actions know which type of form to render) plus one helper. Scope-aware kwargs (`events=`, `collection=`) are subclass-specific and don't belong on the base. + +## Confirmation Template + +```html +{# ami/templates/admin/post_processing/confirmation.html #} +{% extends "admin/base_site.html" %} +{% load i18n admin_urls %} + +{% block title %}{{ title }} | {{ site_title|default:_("Django site admin") }}{% endblock %} + +{% block breadcrumbs %} + +{% endblock %} + +{% block content %} +
+ {% csrf_token %} + + {% block intro %} +

You are about to run {{ task_label }} on + {{ selected_count }} selected + {{ model_meta.verbose_name }}{{ selected_count|pluralize }}.

+ {% endblock %} + +
+ {% translate "Parameters" %} + {% include "admin/post_processing/_form_fieldset.html" with form=form %} +
+ + {% for pk in selected_pks %} + + {% endfor %} + + + +
+ + {% translate 'Cancel' %} +
+
+{% endblock %} +``` + +```html +{# ami/templates/admin/post_processing/_form_fieldset.html #} +{% for field in form %} +
+ {{ field.label_tag }} + {{ field }} + {% if field.help_text %}

{{ field.help_text }}

{% endif %} + {% for error in field.errors %}

{{ error }}

{% endfor %} +
+{% endfor %} +``` + +Per-task templates (e.g. tracking) extend the shell + override `{% block intro %}` for task-specific preamble. Small-size-filter uses bare shell. + +## Admin Action Rewrite + +```python +# ami/main/admin.py (sketch — only the changed action) +from ami.ml.post_processing.admin.small_size_filter_form import SmallSizeFilterActionForm +from ami.ml.post_processing.small_size_filter import SmallSizeFilterConfig + +@admin.action(description="Run Small Size Filter post-processing task (async)") +def run_small_size_filter(self, request, queryset): + if request.POST.get("confirm"): + form = SmallSizeFilterActionForm(request.POST) + if not form.is_valid(): + return _render_confirmation(self, request, queryset, form) + cfg = form.to_config() + jobs = [] + for collection in queryset: + try: + validated = SmallSizeFilterConfig( + **cfg, + source_image_collection_id=collection.pk, + ) + except pydantic.ValidationError as exc: + self.message_user(request, f"Bad config for collection {collection.pk}: {exc}", level="error") + continue + job = Job.objects.create( + name=f"Post-processing: SmallSizeFilter on Capture Set {collection.pk}", + project=collection.project, + job_type_key="post_processing", + params={"task": "small_size_filter", "config": validated.dict()}, + ) + job.enqueue() + jobs.append(job.pk) + self.message_user(request, f"Queued Small Size Filter for {len(jobs)} capture set(s). 
Jobs: {jobs}") + return None + + return _render_confirmation(self, request, queryset, SmallSizeFilterActionForm()) + + +def _render_confirmation(modeladmin, request, queryset, form): + return TemplateResponse( + request, + "admin/post_processing/confirmation.html", + { + **modeladmin.admin_site.each_context(request), + "title": "Run Small Size Filter", + "task_label": "Small Size Filter", + "form": form, + "selected_count": queryset.count(), + "selected_pks": [str(o.pk) for o in queryset], + "action_name": "run_small_size_filter", + "submit_label": "Run Small Size Filter", + "changelist_url": reverse("admin:main_sourceimagecollection_changelist"), + "model_meta": modeladmin.model._meta, + "opts": modeladmin.model._meta, + "action_checkbox_name": admin.helpers.ACTION_CHECKBOX_NAME, + }, + ) +``` + +`_render_confirmation` is a module-private helper near the action, not a method on the admin class; it takes the `ModelAdmin` instance as its first argument because it needs `admin_site` and `model._meta`. If a future PR finds itself duplicating it across admins, lift it to `ami/ml/post_processing/admin/helpers.py` then. + +## Tests + +All four test files live under `ami/ml/post_processing/tests/`. New tests do not touch the existing `ami/ml/tests.py` file (which holds older post-processing smoke tests). 
+ +**`test_base_schema.py`:** +- Subclassing without `config_schema` raises `TypeError` +- Bad config dict raises `pydantic.ValidationError` at task construction +- Valid config builds task, `task.config` is a `BaseModel` instance +- Unknown keys rejected (`extra="forbid"` semantics) + +**`test_admin_form.py`:** +- `BasePostProcessingActionForm.to_config()` returns dict matching `cleaned_data` +- `SmallSizeFilterActionForm` validates `size_threshold` in `(0, 1)` exclusive +- Form errors render in confirmation template (smoke render via Django test client) + +**`test_small_size_filter_admin.py`:** +- GET-equivalent (POST without `confirm`) renders intermediate page; no Job created +- POST with valid `confirm=yes` + `size_threshold=0.001` creates Job per collection with that threshold in `params['config']` +- POST with `size_threshold=2.0` re-renders form with error; no Job +- Multi-collection POST creates one Job per collection, each with correct project FK + +Existing `SmallSizeFilterTask` behavior tests (in `ami/ml/tests.py`, if any) should still pass — schema validation is additive, default value preserved. + +## Rebase Impact + +### PR #1272 (tracking) + +Net change: smaller diff. 
+ +- `ami/ml/post_processing/admin_forms.py` → `ami/ml/post_processing/admin/tracking_form.py` (location move) +- `TrackingActionForm` extends `BasePostProcessingActionForm`, gains `to_config()` override that drops empty `feature_extraction_algorithm_id` +- `tracking_confirmation.html` extends new shell + overrides `{% block intro %}` for "you are about to run tracking on N events…" preamble +- New `TrackingConfig(pydantic.BaseModel)` schema replaces freeform dict; `tracking_task.py` reads typed `self.config` +- Admin actions in `ami/main/admin.py` reuse new template via `_render_confirmation` helper (or its lifted version) +- Per-project Job partitioning loop stays in #1272 (this PR doesn't ship the helper) + +Coordination: I (current author) own both #1272 and the precursor, so this rebase is internal. + +### PR #999 (class masking) + +Net change: smaller diff, but bigger lift than #1272 because #999 used hand-rolled HTML. + +- Hand-rolled form-field HTML → `ClassMaskingActionForm(BasePostProcessingActionForm)` with `ModelChoiceField(queryset=TaxaList.objects.…)` + `ModelChoiceField(queryset=Algorithm.objects.…)` +- `class_masking_confirmation.html` becomes thin override of new shell with masking-specific intro/preview block +- New `ClassMaskingConfig` + `RankRollupConfig` schemas +- Admin action validates form + builds typed config via `to_config()` instead of pulling from `request.POST` directly + +Coordination: post-merge of precursor PR, message mohamedelabbas1996 in PR #999 with rebase guidance + concrete diff suggestions. Their existing rank-rollup work is unaffected; only the class-masking trigger needs reshaping. + +## Risks + +1. **Pydantic v1 vs v2 mismatch.** Container is v1 (per memory `MEMORY.md`: "Container uses Pydantic v1 — use `.dict()` / `.json()`, not `.model_dump()` / `.model_dump_json()`"). Spec uses v1 syntax throughout. CI runs in container, so v1 is enforced. + +2. 
**`__init_subclass__` strictness change.** Adding `config_schema` to required attrs breaks any out-of-tree subclass. Only in-tree consumers exist; check shows: `SmallSizeFilterTask` (will be migrated in this PR), and the `BasePostProcessingTask` referenced in #1272 + #999 (rebase territory). Acceptable. + +3. **Pydantic `BaseModel` in `Job.params['config']`.** Stored as dict via `validated.dict()`. JSONField round-trip is lossless for primitive-typed schemas. Risk: if a future schema uses `datetime` or non-JSON-native types, serialization needs explicit `.json()` → `json.loads(...)` round-trip. Out of scope for this PR (small-size-filter has only `int` + `float`). + +4. **Test runner uses `--keepdb`.** Existing `test_ami` DB has prior `SmallSizeFilterTask` migration. New tests don't add migrations. Should pass cleanly; verify with `docker compose -f docker-compose.ci.yml run --rm django python manage.py test ami.ml.post_processing.tests --keepdb`. + +5. **Form action POST vs GET ergonomics.** Django admin actions are POST-only. The "render confirmation page" leg uses POST without `confirm` flag. Existing pattern in #1272 + #999. No new risk. + +## Out of Scope (Future Work) + +- **Project-partitioning helper** (`enqueue_post_processing_jobs(queryset, task_cls, cleaned_data, scope_resolver)`). Belongs in whichever multi-scope adopter lands first (likely #1272, since tracking partitions; #999's masking is single-Occurrence-scoped per row). +- **REST API surface** for triggering post-processing from UI. Eventual replacement for admin trigger as primary surface. +- **Schema-driven form generation** (auto-build a `ModelForm`-style form from a pydantic schema). Tempting but premature; current task count = 1, second adopter has scope-aware dropdowns that don't fit auto-generation. 
+- **Job param schema versioning.** Once multiple post-processing tasks ship and config schemas evolve, a `schema_version` field on the schema may be needed for backward-compat with old Job rows. Defer until first breaking schema change. + +## Implementation Plan + +To be drafted by writing-plans skill after user approves this spec.