fix: BROS-499: Ground truth queue for onboarding mode skipped when “Desired agreement threshold” is enabled (#8586)

makseq · web-flow · commit 4487d49b95e1 · 2025-10-03T22:16:38.000+01:00
diff --git a/label_studio/core/settings/base.py b/label_studio/core/settings/base.py
@@ -617,6 +617,7 @@
 TASK_MIXIN = 'tasks.mixins.TaskMixin'
 LSE_PROJECT = None
 GET_TASKS_AGREEMENT_QUERYSET = None
+SHOULD_ATTEMPT_GROUND_TRUTH_FIRST = None
 ANNOTATION_MIXIN = 'tasks.mixins.AnnotationMixin'
 ORGANIZATION_MIXIN = 'organizations.mixins.OrganizationMixin'
 USER_MIXIN = 'users.mixins.UserMixin'
diff --git a/label_studio/data_manager/managers.py b/label_studio/data_manager/managers.py
@@ -548,11 +548,7 @@ def annotate_completed_at(queryset: TaskQuerySet) -> TaskQuerySet:
     is_lse_project = bool(LseProject)
     has_custom_agreement_queryset = bool(get_tasks_agreement_queryset)
 
-    if (
-        is_lse_project
-        and has_custom_agreement_queryset
-        and flag_set('fflag_feat_optic_161_project_settings_for_low_agreement_threshold_score_short', user='auto')
-    ):
+    if is_lse_project and has_custom_agreement_queryset:
         return annotated_completed_at_considering_agreement_threshold(queryset)
 
     return base_annotate_completed_at(queryset)
diff --git a/label_studio/projects/functions/next_task.md b/label_studio/projects/functions/next_task.md
@@ -0,0 +1,98 @@
+# Label Stream: Next Task Selection (LSE)
+
+This doc summarizes how Label Studio Enterprise selects the next task for labeling, based on the current code in `label_studio/projects/functions/next_task.py` and LSE-specific logic from `label_studio_enterprise/lse_projects/functions.py`.
+
+Notes
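+- Queue labels shown in the UI can indicate a strategy that was applied rather than the final source of the selected task (e.g. "Show overlap first" is added when the overlap filter runs); "Ground truth queue" and "Low agreement queue" are added only when that stage returns a task.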
+- Feature flags impact ordering and inclusion at several steps (see Legend below).
+
+## High-level flow
+
+```mermaid
+flowchart TD
+  A["Input: prepared_tasks, dm_queue, assigned_flag, user, project"] --> B
+
+  subgraph Build candidate pool not_solved_tasks
+    B[Start from prepared_tasks] --> B1[Exclude tasks annotated by this user]
+    B1 --> B2[Exclude user's postponed drafts]
+    B2 --> B3{assigned_flag?}
+    B3 -- yes --> B5[Skip agreement logic] --> B7
+
+    B3 -- no --> B4{"LSE low-agreement path?<br/>lse_project exists<br/>agreement_threshold set<br/>user is annotator"}
+    B4 -- yes --> B6["Filter by agreement threshold<br/>and annotator capacity<br/>(GT tasks always kept when show_ground_truth_first)"] --> B7[Optionally prioritize by low agreement]
+
+    B4 -- no --> B8{"Evaluation mode?<br/>show_ground_truth_first"}
+    B8 -- yes --> B7
+    B8 -- no --> B9[Filter: is_labeled=false] --> B7
+  end
+
+  B7 --> C{dm_queue?}
+  C -- yes --> DM["Data manager queue<br/>not_solved_tasks.first()"] --> K
+  C -- no --> D
+
+  subgraph No DM queue path
+    D{assigned_flag?} -- yes --> M["Manually assigned queue<br/>first() from not_solved_tasks"] --> K
+    D -- no --> E["Check existing lock for user<br/>if exists: Task lock"] --> G
+
+    G{"GT-first gating?<br/>should_attempt_ground_truth_first(user, project)"} -- yes --> GT["Ground truth queue<br/>_try_ground_truth()"] --> F
+    G -- no --> F
+
+    F{prioritized_low_agreement?} -- yes --> LAL["Low agreement queue<br/>first unlocked"] --> K
+    F -- no --> H
+
+    H{project.maximum_annotations > 1?} -- yes --> BF["Breadth first queue<br/>_try_breadth_first()"] --> I
+    H -- no --> I
+
+    I{"FF overlap-after?<br/>fflag FIX-BACK-LSDV-4523 AND show_overlap_first<br/>AND no next_task"}
+    I -- yes --> OF["Filter to overlap>1<br/>Show overlap first"] --> S
+    I -- no --> S
+
+    S{next_task selected?}
+    S -- yes --> P[Check post-queues]
+    S -- no --> T{project.sampling}
+    T -- Sequence --> SQ["Sequence queue<br/>first unlocked"] --> P
+    T -- Uncertainty --> AL["Active learning or random queue"] --> P
+    T -- Uniform --> UR["Uniform random queue<br/>random unlocked"] --> P
+  end
+
+  subgraph Post queues user-specific
+    P --> PD["Postponed draft queue<br/>user drafts: was_postponed=true, is_labeled=false"] --> SK
+    SK["Skipped queue (REQUEUE_FOR_ME)<br/>user annotations: was_cancelled=true, is_labeled=false"] --> K
+  end
+
+  K["Finalize<br/>- Set task lock if required<br/>- add_stream_history()<br/>- return next_task + queue_info"]
+```
+
+## Legend and flags
+
+- fflag FIX-BACK-LSDV-4523 (Overlap First Ordering): applies the "Show overlap first" filtering after GT/low-agreement/breadth-first attempts; otherwise, it is applied earlier while building the candidate pool.
+
+### GT-first gating
+- `should_attempt_ground_truth_first(user, project)` returns true when:
+  - `show_ground_truth_first=True` and either no `lse_project` or `annotator_evaluation_minimum_tasks` is not set, or
+  - the user's completed GT-equipped tasks < `annotator_evaluation_minimum_tasks`, or
+  - minimum tasks reached but the user's GT agreement score is missing or below `annotator_evaluation_minimum_score` (percent).
+- Otherwise returns false (GT-first disabled; proceed via low-agreement/overlap/sampling).
+
+## Queue labels appended to response
+
+The `queue_info` string aggregates labels as specific stages are attempted:
+- "Manually assigned queue" when `assigned_flag` path is used.
+- "Task lock" when returning a task already locked by the user.
+- "Low agreement queue" when the prioritized low-agreement branch returns a task.
+- "Ground truth queue" when the GT-first attempt returns a task.
+- "Breadth first queue" for in-progress tasks (when `maximum_annotations > 1`).
+- "Show overlap first" when overlap filtering is applied.
+- Sampling labels:
+  - "Sequence queue"
+  - "Active learning or random queue" (uncertainty)
+  - "Uniform random queue"
+- Post queues:
+  - "Postponed draft queue"
+  - "Skipped queue"
+
+## References
+- Core selection: `label_studio/projects/functions/next_task.py`
+- LSE agreement & counters: `label_studio_enterprise/lse_projects/functions.py`
+
+
diff --git a/label_studio/projects/functions/next_task.py b/label_studio/projects/functions/next_task.py
@@ -15,7 +15,17 @@
 
 logger = logging.getLogger(__name__)
 
+
+# Default hook for GT-first gating; used when settings.SHOULD_ATTEMPT_GROUND_TRUTH_FIRST provides no override
+def _oss_should_attempt_gt_first(user: User, project: Project) -> bool:
+    # Open-source default: allow GT-first whenever the project enables it; `user` is not consulted (no onboarding gates)
+    return bool(project.show_ground_truth_first)
+
+
 get_tasks_agreement_queryset = load_func(settings.GET_TASKS_AGREEMENT_QUERYSET)
+should_attempt_ground_truth_first = (
+    load_func(settings.SHOULD_ATTEMPT_GROUND_TRUTH_FIRST) or _oss_should_attempt_gt_first
+)
 
 
 def get_next_task_logging_level(user: User) -> int:
@@ -158,33 +168,41 @@ def get_not_solved_tasks_qs(
     prioritized_on_agreement = False
     # if annotator is assigned for tasks, he must solve it regardless of is_labeled=True
     if not assigned_flag:
-        # include tasks that have been completed if their agreement is not at threshold if threshold setting is set
+        # low agreement strategy for auto-assigned annotators:
+        # Include tasks that have been completed if their agreement is not at threshold if threshold setting is set
         lse_project = getattr(project, 'lse_project', None)
         if (
             lse_project
-            and flag_set('fflag_feat_optic_161_project_settings_for_low_agreement_threshold_score_short', user='auto')
             and lse_project.agreement_threshold is not None
             and get_tasks_agreement_queryset
             and user.is_project_annotator(project)
         ):
-            not_solved_tasks = (
-                get_tasks_agreement_queryset(not_solved_tasks)
-                # include tasks that are not labeled or are labeled but fall below the agreement threshold
-                .filter(
-                    Q(_agreement__lt=lse_project.agreement_threshold, is_labeled=True) | Q(is_labeled=False)
-                ).annotate(annotators=Count('annotations__completed_by', distinct=True))
-                # skip tasks that have been annotated by the maximum additional number of annotators
-                .filter(annotators__lt=F('overlap') + lse_project.max_additional_annotators_assignable)
+            # Onboarding mode (GT-first) should keep GT tasks eligible regardless of is_labeled/agreement
+            qs = get_tasks_agreement_queryset(not_solved_tasks)
+            qs = qs.annotate(annotators=Count('annotations__completed_by', distinct=True))
+
+            low_agreement_pred = Q(_agreement__lt=lse_project.agreement_threshold, is_labeled=True) | Q(
+                is_labeled=False
             )
+            capacity_pred = Q(annotators__lt=F('overlap') + (lse_project.max_additional_annotators_assignable or 0))
+
+            if project.show_ground_truth_first:
+                gt_subq = Annotation.objects.filter(task=OuterRef('pk'), ground_truth=True)
+                qs = qs.annotate(has_ground_truths=Exists(gt_subq))
+                # Keep all GT tasks + apply low-agreement+capacity to the rest. For sure, we can do:
+                # - if user.solved_tasks_array.count < lse_project.annotator_evaluation_minimum_tasks
+                # - else, apply low-agreement+capacity to the rest (maybe performance will be better)
+                # but it's a question - what is better here. This version is simpler at least from the code perspective.
+                not_solved_tasks = qs.filter(Q(has_ground_truths=True) | (low_agreement_pred & capacity_pred))
+            else:
+                not_solved_tasks = qs.filter(low_agreement_pred & capacity_pred)
+
             prioritized_on_agreement, not_solved_tasks = _prioritize_low_agreement_tasks(not_solved_tasks, lse_project)
 
         # otherwise, filtering out completed tasks is sufficient
         else:
             # ignore tasks that are already labeled for onboarding mode
-            if not (
-                flag_set('fflag_feat_all_leap_1825_annotator_evaluation_short', user='auto')
-                and project.show_ground_truth_first
-            ):
+            if not project.show_ground_truth_first:
                 not_solved_tasks = not_solved_tasks.filter(is_labeled=False)
 
     if not flag_set('fflag_fix_back_lsdv_4523_show_overlap_first_order_27022023_short'):
@@ -220,33 +238,39 @@ def get_next_task_without_dm_queue(
     use_task_lock = True
     queue_info = ''
 
-    # ordered by data manager
+    # Manually assigned tasks
     if assigned_flag:
         logger.debug(f'User={user} try to get task from assigned')
         next_task = not_solved_tasks.first()
         use_task_lock = False
         queue_info += (' & ' if queue_info else '') + 'Manually assigned queue'
 
-    # If current user has already lock one task - return it (without setting the lock again)
+    # Task lock: if current user already has a locked task, return it (without setting the lock again)
     if not next_task:
         next_task = Task.get_locked_by(user, tasks=not_solved_tasks)
         if next_task:
             logger.debug(f'User={user} got already locked for them {next_task}')
             use_task_lock = False
             queue_info += (' & ' if queue_info else '') + 'Task lock'
 
+    # Ground truth: label GT first only during onboarding window for user (gated by min tasks and min score)
+    allow_gt_first = should_attempt_ground_truth_first(user, project)
+    if not next_task and allow_gt_first:
+        logger.debug(f'User={user} tries ground truth from prepared tasks')
+        next_task = _try_ground_truth(not_solved_tasks, project, user)
+        if next_task:
+            queue_info += (' & ' if queue_info else '') + 'Ground truth queue'
+
+    # Low agreement strategy: reassign this annotator to low agreement tasks
     if not next_task and prioritized_low_agreement:
         logger.debug(f'User={user} tries low agreement from prepared tasks')
         next_task = _get_first_unlocked(not_solved_tasks, user)
-        queue_info += (' & ' if queue_info else '') + 'Low agreement queue'
-
-    if not next_task and project.show_ground_truth_first:
-        logger.debug(f'User={user} tries ground truth from prepared tasks')
-        next_task = _try_ground_truth(not_solved_tasks, project, user)
-        queue_info += (' & ' if queue_info else '') + 'Ground truth queue'
+        if next_task:
+            queue_info += (' & ' if queue_info else '') + 'Low agreement queue'
 
+    # Breadth first: label in-progress tasks first;
     if not next_task and project.maximum_annotations > 1:
-        # if there are any tasks in progress (with maximum number of annotations), randomly sampling from them
+        # if there are already labeled tasks, but task.overlap still < project.maximum_annotations, randomly sampling from them
         logger.debug(f'User={user} tries depth first from prepared tasks')
         next_task = _try_breadth_first(not_solved_tasks, user)
         if next_task:
@@ -358,7 +382,7 @@ def get_next_task(
                 # don't output anything - just filter tasks with overlap
                 logger.debug(f'User={user} tries overlap first from prepared tasks')
                 _, tasks_with_overlap = _try_tasks_with_overlap(not_solved_tasks)
-                queue_info += 'Show overlap first'
+                queue_info += (' & ' if queue_info else '') + 'Show overlap first'
                 next_task, queue_info = get_task_from_qs_with_sampling(
                     tasks_with_overlap, user_solved_tasks_array, prepared_tasks, user, project, queue_info
                 )
diff --git a/label_studio/projects/migrations/0031_alter_project_show_ground_truth_first.py b/label_studio/projects/migrations/0031_alter_project_show_ground_truth_first.py
@@ -0,0 +1,22 @@
+# Generated by Django 5.1.12 on 2025-10-03 12:10
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):  # records the updated definition of Project.show_ground_truth_first
+
+    dependencies = [
+        ("projects", "0030_project_search_vector_index"),
+    ]
+
+    operations = [
+        migrations.AlterField(  # re-declares the boolean field with the help_text added to the model
+            model_name="project",
+            name="show_ground_truth_first",
+            field=models.BooleanField(
+                default=False,
+                help_text="Onboarding mode (true): show ground truth tasks first in the labeling stream",
+                verbose_name="show ground truth first",
+            ),
+        ),
+    ]
diff --git a/label_studio/projects/models.py b/label_studio/projects/models.py
@@ -266,7 +266,11 @@ class SkipQueue(models.TextChoices):
     skip_queue = models.CharField(
         max_length=100, choices=SkipQueue.choices, null=True, default=SkipQueue.REQUEUE_FOR_OTHERS
     )
-    show_ground_truth_first = models.BooleanField(_('show ground truth first'), default=False)
+    show_ground_truth_first = models.BooleanField(
+        _('show ground truth first'),
+        default=False,
+        help_text='Onboarding mode (true): show ground truth tasks first in the labeling stream',
+    )
     show_overlap_first = models.BooleanField(_('show overlap first'), default=False)
     overlap_cohort_percentage = models.IntegerField(_('overlap_cohort_percentage'), default=100)
 
diff --git a/label_studio/tasks/models.py b/label_studio/tasks/models.py
@@ -276,9 +276,7 @@ def has_lock(self, user=None):
         """
         from projects.functions.next_task import get_next_task_logging_level
 
-        if self.project.show_ground_truth_first and flag_set(
-            'fflag_feat_all_leap_1825_annotator_evaluation_short', user='auto'
-        ):
+        if self.project.show_ground_truth_first:
             # in show_ground_truth_first mode(onboarding)
             # we ignore overlap setting for ground_truth tasks
             # https://humansignal.atlassian.net/browse/LEAP-1963