eval-sys · Cierra0506 · May 13, 2026 · May 13, 2026 · May 24, 2026 · May 24, 2026
diff --git a/pipeline.py b/pipeline.py
@@ -113,7 +113,7 @@ def main():
     parser.add_argument(
         "--reasoning-effort",
         default="default",
-        choices=["default", "minimal", "low", "medium", "high"],
+        choices=["default", "minimal", "low", "medium", "high", "xhigh"],
         help="Reasoning effort level for supported models (default: None)",
     )
 

diff --git a/src/model_config.py b/src/model_config.py
@@ -50,6 +50,11 @@ class ModelConfig:
             "api_key_var": "OPENAI_API_KEY",
             "litellm_input_model_name": "openai/gpt-5.2",
         },
+        "gpt-5.5": {
+            "provider": "openai",
+            "api_key_var": "OPENAI_API_KEY",
+            "litellm_input_model_name": "openai/gpt-5.5",
+        },
         "gpt-5": {
             "provider": "openai",
             "api_key_var": "OPENAI_API_KEY",

diff --git a/tasks/postgres/standard/dvdrental/customer_analysis_fix/description.md b/tasks/postgres/standard/dvdrental/customer_analysis_fix/description.md
@@ -136,4 +136,6 @@ Debug and fix the query to produce accurate results. Then create a table with yo
 
 2. **Create a table** called `customer_analysis_fixed` in the `public` schema with your corrected query results. The table should have the same columns as the original query output.
 
-**Important**: The business logic and output columns should remain the same - only fix the data accuracy issues.
+**Important**: The business logic and output columns should remain the same - only fix the data accuracy issues.
+
+**Note on payments**: A rental's revenue is determined by the rental itself — every payment recorded against a rental counts toward the customer who made that rental, regardless of which `customer_id` happens to appear on the payment row.
diff --git a/tasks/postgres/standard/employees/employee_demographics_report/description.md b/tasks/postgres/standard/employees/employee_demographics_report/description.md
@@ -12,7 +12,7 @@ Generate a comprehensive employee demographics and basic statistics report for t
    * `age_group` (varchar) — age range ('20-29', '30-39', '40-49', '50-59', '60+')
    * `employee_count` (integer) — number of current employees in age group
    * `avg_salary` (decimal) — average current salary for age group
-   * `avg_tenure_days` (decimal) — average days of service
+   * `avg_tenure_days` (decimal) — average days of service as of the reference date `2002-08-01`
 
 3. **Create the birth month distribution table** — build a table called `birth_month_distribution` in the `employees` schema with:
    * `birth_month` (integer) — month number (1-12)
@@ -26,7 +26,7 @@ Generate a comprehensive employee demographics and basic statistics report for t
    * `still_employed` (integer) — how many from that year are still employed
    * `retention_rate` (decimal) — percentage still employed (still_employed/employees_hired * 100)
 
-5. **Apply age group classification** based on current age:
+5. **Apply age group classification** based on each employee's age as of the reference date `2002-08-01`. Only include age groups that contain at least one current employee — empty buckets must not appear in `age_group_analysis`.
    * **20-29**: Ages 20-29
    * **30-39**: Ages 30-39  
    * **40-49**: Ages 40-49

diff --git a/tasks/postgres/standard/employees/employee_demographics_report/verify.py b/tasks/postgres/standard/employees/employee_demographics_report/verify.py
@@ -119,7 +119,7 @@ def verify_age_group_results(conn) -> bool:
   SELECT
     e.id AS employee_id,
     e.hire_date,
-    EXTRACT(YEAR FROM AGE(CURRENT_DATE, e.birth_date))::INT AS age_years
+    EXTRACT(YEAR FROM AGE(DATE '2002-08-01', e.birth_date))::INT AS age_years
   FROM employees.employee e
   WHERE e.birth_date IS NOT NULL
 )
@@ -133,7 +133,7 @@ def verify_age_group_results(conn) -> bool:
   END AS age_group,
   COUNT(*)::INT AS employee_count,
   AVG(cs.amount) AS avg_salary,
-  AVG((CURRENT_DATE - a.hire_date)::INT) AS avg_tenure_days
+  AVG((DATE '2002-08-01' - a.hire_date)::INT) AS avg_tenure_days
 FROM emp_age a
 JOIN current_salary cs ON cs.employee_id = a.employee_id
 WHERE a.age_years >= 20

diff --git a/tasks/postgres/standard/employees/employee_performance_analysis/description.md b/tasks/postgres/standard/employees/employee_performance_analysis/description.md
@@ -24,4 +24,8 @@ Create a comprehensive employee performance evaluation system that analyzes care
 
 5. **Calculate salary equity metrics** — populate the department table with current salary statistics for active employees only to identify potential pay equity issues across departments.
 
-The analysis should help leadership make informed decisions about promotions, salary adjustments, and talent retention strategies.
+The analysis should help leadership make informed decisions about promotions, salary adjustments, and talent retention strategies.
+
+### Important Notes
+
+- Do NOT use ROUND functions - keep the full precision of calculated values
diff --git a/tasks/postgres/standard/employees/employee_retention_analysis/description.md b/tasks/postgres/standard/employees/employee_retention_analysis/description.md
@@ -13,7 +13,7 @@ Analyze employee retention patterns and identify factors contributing to turnove
    * `employee_id` (bigint) — the employee's ID  
    * `full_name` (varchar) — concatenated first and last name
    * `current_department` (varchar) — current department name
-   * `tenure_days` (integer) — days with the company
+   * `tenure_days` (integer) — days with the company as of the reference date `2002-08-01`
    * `current_salary` (integer) — current salary amount
    * `risk_category` (varchar) — risk level ('high_risk', 'medium_risk', 'low_risk')
 
@@ -25,7 +25,7 @@ Analyze employee retention patterns and identify factors contributing to turnove
    * `avg_tenure_days` (decimal) — average tenure in days for employees who left that year
    * `avg_final_salary` (decimal) — average final salary of departed employees that year
 
-4. **Apply risk assessment criteria** for current employees:
+4. **Apply risk assessment criteria** for current employees (measure tenure as of the reference date `2002-08-01`):
    * **High risk**: Employees in departments with retention rate < 80% AND tenure < 1095 days (3 years)
    * **Medium risk**: Employees in departments with retention rate < 85% AND tenure < 1825 days (5 years)  
    * **Low risk**: All other current employees

diff --git a/tasks/postgres/standard/employees/employee_retention_analysis/verify.py b/tasks/postgres/standard/employees/employee_retention_analysis/verify.py
@@ -136,11 +136,11 @@ def verify_high_risk_results(conn) -> bool:
             e.id AS employee_id,
             CONCAT(e.first_name, ' ', e.last_name) AS full_name,
             d.dept_name AS current_department,
-            (CURRENT_DATE - e.hire_date)::INTEGER AS tenure_days,
+            (DATE '2002-08-01' - e.hire_date)::INTEGER AS tenure_days,
             cs.current_amount::INTEGER AS current_salary,
             CASE
-                WHEN dr.retention_rate < 80  AND (CURRENT_DATE - e.hire_date) < 1095 THEN 'high_risk'
-                WHEN dr.retention_rate < 85  AND (CURRENT_DATE - e.hire_date) < 1825 THEN 'medium_risk'
+                WHEN dr.retention_rate < 80  AND (DATE '2002-08-01' - e.hire_date) < 1095 THEN 'high_risk'
+                WHEN dr.retention_rate < 85  AND (DATE '2002-08-01' - e.hire_date) < 1825 THEN 'medium_risk'
                 ELSE 'low_risk'
             END AS risk_category
             FROM employees.employee e

diff --git a/tasks/postgres/standard/employees/executive_dashboard_automation/description.md b/tasks/postgres/standard/employees/executive_dashboard_automation/description.md
@@ -8,7 +8,7 @@ Design a comprehensive reporting and automation system for executive dashboard a
    * `department_name` (varchar) — department name
    * `total_employees` (integer) — current active employee count
    * `avg_salary` (decimal) — average current salary
-   * `total_payroll` (bigint) — total monthly payroll cost
+   * `total_payroll` (bigint) — sum of current salary amounts for active employees in the department
    * `manager_name` (varchar) — current department manager name
 
    **View 2: `exec_hiring_trends`**  

diff --git a/tasks/postgres/standard/employees/executive_dashboard_automation/verify.py b/tasks/postgres/standard/employees/executive_dashboard_automation/verify.py
@@ -339,12 +339,15 @@ def verify_materialized_views(conn) -> bool:
 def verify_stored_procedures(conn) -> bool:
     """Verify that stored procedure was created."""
     with conn.cursor() as cur:
-        # Check if procedure exists
+        # Check if the routine exists in pg_proc. pg_proc lists both
+        # FUNCTION and PROCEDURE entries, so we don't have to filter on
+        # type — accepts either form of "stored procedure".
         cur.execute("""
-            SELECT routine_name FROM information_schema.routines 
-            WHERE routine_schema = 'employees' 
-            AND routine_type = 'FUNCTION'
-            AND routine_name = 'generate_monthly_report'
+            SELECT p.proname
+            FROM pg_proc p
+            JOIN pg_namespace n ON n.oid = p.pronamespace
+            WHERE n.nspname = 'employees'
+              AND p.proname = 'generate_monthly_report'
         """)
         procedures = [row[0] for row in cur.fetchall()]
 

diff --git a/tasks/postgres/standard/lego/consistency_enforcement/description.md b/tasks/postgres/standard/lego/consistency_enforcement/description.md
@@ -4,7 +4,7 @@ Implement a data consistency enforcement system for the LEGO database. The syste
 For any given `set_num`, the following invariant must be maintained:
 `lego_sets.num_parts = SUM(quantity)` FROM `lego_inventory_parts` WHERE `inventory_id` IN (latest inventory for that set) AND `is_spare` = false
 
-**Important**: If a set has no inventory records, the consistency check should be skipped.
+**Important**: If a set has no inventory records (or no non-spare parts in its latest inventory), treat the actual part count as `0`. The consistency check still applies — `num_parts` must equal `0` for such sets.
 
 # Your Tasks:
 
@@ -15,7 +15,7 @@ Write a single `SELECT` query to find all sets where the stored `num_parts` does
 
 1.  **Find the Latest Inventory**: For each `set_num`, find its latest inventory id by getting the `MAX(version)` from the `lego_inventories` table.
 2.  **Calculate Actual Part Count**: For these latest inventories, join with `lego_inventory_parts` and calculate the `SUM(quantity)`, but only for parts where `is_spare` is false.
-3.  **Compare and Filter**: Join this calculated result back to the `lego_sets` table and return the rows where `lego_sets.num_parts` is different from your calculated sum.
+3.  **Compare and Filter**: Starting from the `lego_sets` table, `LEFT JOIN` this calculated result (so that sets with no inventory are kept) and return the rows where `lego_sets.num_parts` is different from `COALESCE(actual_parts, 0)`, so that sets without inventory are also surfaced when their `num_parts` is non-zero.
 
 ## Task 2: Fix Existing Inconsistencies
 

diff --git a/tasks/postgres/standard/lego/consistency_enforcement/verify.py b/tasks/postgres/standard/lego/consistency_enforcement/verify.py
@@ -1,6 +1,5 @@
 """
 Verification script for PostgreSQL LEGO Task 1: Parts Consistency Fix & Constraints
-Version 2.1: Relaxed consistency check to allow for one known corner case mismatch.
 """
 
 import os
@@ -82,17 +81,15 @@ def get_mismatch_count(cur) -> int:
 def verify_data_consistency(conn) -> bool:
     """
     TASK 1 VERIFICATION: Checks if the initial data fix was successful.
-    (Relaxed: Allows for one corner-case mismatch).
     """
-    print("\n-- Verifying Task 1: Data Consistency Fix (Relaxed) --")
+    print("\n-- Verifying Task 1: Data Consistency Fix --")
     with conn.cursor() as cur:
         count = get_mismatch_count(cur)
-        # RELAXED CONDITION: Allow 0 or 1 mismatch to pass.
-        if count > 1:
-            print(f"❌ FAIL: Found {count} sets with inconsistent part counts. Expected 0 or 1 after fix.")
+        if count > 0:
+            print(f"❌ FAIL: Found {count} sets with inconsistent part counts. Expected 0 after fix.")
             return False
-        
-        print("✅ PASS: Data consistency check passed (allowing for one known mismatch).")
+
+        print("✅ PASS: All sets have consistent part counts.")
         return True
 
 

diff --git a/tasks/postgres/standard/lego/database_security_policies/verify.py b/tasks/postgres/standard/lego/database_security_policies/verify.py
@@ -183,16 +183,19 @@ def test_theme_analyst_access(conn) -> bool:
                 return False
             print("✅ PASS: Reference tables appear to be accessible.")
 
-            # Test 4 & 5: Check related tables
+            # Test 4 & 5: Check related tables — counts must match exactly
+            # what is reachable through theme_id=18 (Star Wars: 65081-1 + K8008-1).
             cur.execute("SELECT COUNT(*) FROM lego_inventories;")
-            if cur.fetchone()[0] == 0:
-                print("❌ FAIL: No inventories are visible for the allowed sets.")
+            inv_count = cur.fetchone()[0]
+            if inv_count != 2:
+                print(f"❌ FAIL: Expected 2 inventories for Star Wars sets, got {inv_count}.")
                 cur.execute("RESET ROLE;")
                 return False
-            
+
             cur.execute("SELECT COUNT(*) FROM lego_inventory_parts;")
-            if cur.fetchone()[0] == 0:
-                print("❌ FAIL: No inventory parts are visible for the allowed sets.")
+            parts_count = cur.fetchone()[0]
+            if parts_count != 3:
+                print(f"❌ FAIL: Expected 3 inventory parts for Star Wars sets, got {parts_count}.")
                 cur.execute("RESET ROLE;")
                 return False
             print("✅ PASS: Related tables (inventories, inventory_parts) are correctly filtered.")

diff --git a/tasks/postgres/standard/lego/transactional_inventory_transfer/description.md b/tasks/postgres/standard/lego/transactional_inventory_transfer/description.md
@@ -36,9 +36,11 @@ Create a PostgreSQL function to handle inventory part transfers between LEGO set
    - Verify both inventory IDs exist in `lego_inventories` table
    - Verify part exists in `lego_parts` table
    - Verify color exists in `lego_colors` table
-   - Check source has sufficient quantity (including spare parts)
+   - Check that the source's non-spare row for this `(part_num, color_id)` has sufficient quantity
    - Prevent self-transfers (source and target cannot be the same)
 
+   *Note: The function operates on non-spare rows only (`is_spare = false`).*
+
    **Validation B: Business Rules**
    - Maximum transfer quantity is 500 parts per operation
    - Minimum transfer quantity is 1 part
@@ -52,26 +54,22 @@ Create a PostgreSQL function to handle inventory part transfers between LEGO set
    - Calculate transfer feasibility
 
    **Step B: Source Inventory Update**
-   - Decrease quantity in source inventory
+   - Decrease quantity on the source's non-spare row
    - If quantity becomes zero, delete the row
-   - Handle spare parts appropriately (maintain `is_spare` flag)
 
    **Step C: Target Inventory Update**
-   - Check if part exists in target inventory
+   - Check if a non-spare row for `(part_num, color_id)` exists in target inventory
    - If exists: increase quantity
-   - If not exists: insert new record
-   - Handle spare parts appropriately
+   - If not exists: insert a new non-spare row (`is_spare = false`)
 
    **Step D: Audit Logging**
    - Log successful transfers with details
-   - Log failed transfers with error messages
    - Include transfer reason and status
 
 5. **Error handling requirements**:
    - Use `RAISE EXCEPTION` with descriptive error messages
    - Handle all validation failures gracefully
    - Ensure complete rollback on any failure
-   - Log all attempts (successful and failed)
 
 6. **Return value**:
    - Return success message: `'Successfully transferred {quantity} parts ({part_num}, color_id: {color_id}) from inventory {source_id} to inventory {target_id}. Reason: {reason}'`
@@ -81,7 +79,7 @@ Create a PostgreSQL function to handle inventory part transfers between LEGO set
 
 - **Transaction Safety**: All operations wrapped in transaction block
 - **Data Integrity**: No partial updates possible
-- **Audit Trail**: Complete logging of all transfer attempts
+- **Audit Trail**: Logging of successful transfers
 - **Validation**: Comprehensive input and business rule validation
 - **Error Recovery**: Failed transfers leave database unchanged
 - **Performance**: Use appropriate locking to prevent race conditions
@@ -105,8 +103,7 @@ SELECT transfer_parts(14469, 14469, '3024', 15, 10, 'self_transfer');
 ## Verification Criteria:
 
 - Function handles all validation rules correctly
-- Audit logging captures all transfer attempts
-- Failed transfers are properly logged with error details
+- Audit logging captures successful transfers
 - Self-transfers are prevented
 - Quantity limits are enforced
 - Database state remains consistent after failures