diff --git a/pipeline.py b/pipeline.py index 6d932f6e..6cb77444 100644 --- a/pipeline.py +++ b/pipeline.py @@ -113,7 +113,7 @@ def main(): parser.add_argument( "--reasoning-effort", default="default", - choices=["default", "minimal", "low", "medium", "high"], + choices=["default", "minimal", "low", "medium", "high", "xhigh"], help="Reasoning effort level for supported models (default: None)", ) diff --git a/src/model_config.py b/src/model_config.py index 11d1e67d..db5f5026 100644 --- a/src/model_config.py +++ b/src/model_config.py @@ -50,6 +50,11 @@ class ModelConfig: "api_key_var": "OPENAI_API_KEY", "litellm_input_model_name": "openai/gpt-5.2", }, + "gpt-5.5": { + "provider": "openai", + "api_key_var": "OPENAI_API_KEY", + "litellm_input_model_name": "openai/gpt-5.5", + }, "gpt-5": { "provider": "openai", "api_key_var": "OPENAI_API_KEY", diff --git a/tasks/postgres/standard/dvdrental/customer_analysis_fix/description.md b/tasks/postgres/standard/dvdrental/customer_analysis_fix/description.md index 1d617e1a..af93d156 100644 --- a/tasks/postgres/standard/dvdrental/customer_analysis_fix/description.md +++ b/tasks/postgres/standard/dvdrental/customer_analysis_fix/description.md @@ -136,4 +136,6 @@ Debug and fix the query to produce accurate results. Then create a table with yo 2. **Create a table** called `customer_analysis_fixed` in the `public` schema with your corrected query results. The table should have the same columns as the original query output. -**Important**: The business logic and output columns should remain the same - only fix the data accuracy issues. \ No newline at end of file +**Important**: The business logic and output columns should remain the same - only fix the data accuracy issues. + +**Note on payments**: A rental's revenue is determined by the rental itself — every payment recorded against a rental counts toward the customer who made that rental, regardless of which `customer_id` happens to appear on the payment row. \ No newline at end of file diff --git a/tasks/postgres/standard/employees/employee_demographics_report/description.md b/tasks/postgres/standard/employees/employee_demographics_report/description.md index c79cb6e2..380121b3 100644 --- a/tasks/postgres/standard/employees/employee_demographics_report/description.md +++ b/tasks/postgres/standard/employees/employee_demographics_report/description.md @@ -12,7 +12,7 @@ Generate a comprehensive employee demographics and basic statistics report for t * `age_group` (varchar) — age range ('20-29', '30-39', '40-49', '50-59', '60+') * `employee_count` (integer) — number of current employees in age group * `avg_salary` (decimal) — average current salary for age group - * `avg_tenure_days` (decimal) — average days of service + * `avg_tenure_days` (decimal) — average days of service as of the reference date `2002-08-01` 3. **Create the birth month distribution table** — build a table called `birth_month_distribution` in the `employees` schema with: * `birth_month` (integer) — month number (1-12) @@ -26,7 +26,7 @@ Generate a comprehensive employee demographics and basic statistics report for t * `still_employed` (integer) — how many from that year are still employed * `retention_rate` (decimal) — percentage still employed (still_employed/employees_hired * 100) -5. **Apply age group classification** based on current age: +5. **Apply age group classification** based on each employee's age as of the reference date `2002-08-01`. Only include age groups that contain at least one current employee — empty buckets must not appear in `age_group_analysis`. * **20-29**: Ages 20-29 * **30-39**: Ages 30-39 * **40-49**: Ages 40-49 diff --git a/tasks/postgres/standard/employees/employee_demographics_report/verify.py b/tasks/postgres/standard/employees/employee_demographics_report/verify.py index ef059874..b039208c 100644 --- a/tasks/postgres/standard/employees/employee_demographics_report/verify.py +++ b/tasks/postgres/standard/employees/employee_demographics_report/verify.py @@ -119,7 +119,7 @@ def verify_age_group_results(conn) -> bool: SELECT e.id AS employee_id, e.hire_date, - EXTRACT(YEAR FROM AGE(CURRENT_DATE, e.birth_date))::INT AS age_years + EXTRACT(YEAR FROM AGE(DATE '2002-08-01', e.birth_date))::INT AS age_years FROM employees.employee e WHERE e.birth_date IS NOT NULL ) @@ -133,7 +133,7 @@ def verify_age_group_results(conn) -> bool: END AS age_group, COUNT(*)::INT AS employee_count, AVG(cs.amount) AS avg_salary, - AVG((CURRENT_DATE - a.hire_date)::INT) AS avg_tenure_days + AVG((DATE '2002-08-01' - a.hire_date)::INT) AS avg_tenure_days FROM emp_age a JOIN current_salary cs ON cs.employee_id = a.employee_id WHERE a.age_years >= 20 diff --git a/tasks/postgres/standard/employees/employee_performance_analysis/description.md b/tasks/postgres/standard/employees/employee_performance_analysis/description.md index fff8496b..2877f5c5 100644 --- a/tasks/postgres/standard/employees/employee_performance_analysis/description.md +++ b/tasks/postgres/standard/employees/employee_performance_analysis/description.md @@ -24,4 +24,8 @@ Create a comprehensive employee performance evaluation system that analyzes care 5. **Calculate salary equity metrics** — populate the department table with current salary statistics for active employees only to identify potential pay equity issues across departments. -The analysis should help leadership make informed decisions about promotions, salary adjustments, and talent retention strategies. \ No newline at end of file +The analysis should help leadership make informed decisions about promotions, salary adjustments, and talent retention strategies. + +### Important Notes + +- Do NOT use ROUND functions - keep the full precision of calculated values \ No newline at end of file diff --git a/tasks/postgres/standard/employees/employee_retention_analysis/description.md b/tasks/postgres/standard/employees/employee_retention_analysis/description.md index 3d5f70ae..05b17863 100644 --- a/tasks/postgres/standard/employees/employee_retention_analysis/description.md +++ b/tasks/postgres/standard/employees/employee_retention_analysis/description.md @@ -13,7 +13,7 @@ Analyze employee retention patterns and identify factors contributing to turnove * `employee_id` (bigint) — the employee's ID * `full_name` (varchar) — concatenated first and last name * `current_department` (varchar) — current department name - * `tenure_days` (integer) — days with the company + * `tenure_days` (integer) — days with the company as of the reference date `2002-08-01` * `current_salary` (integer) — current salary amount * `risk_category` (varchar) — risk level ('high_risk', 'medium_risk', 'low_risk') @@ -25,7 +25,7 @@ Analyze employee retention patterns and identify factors contributing to turnove * `avg_tenure_days` (decimal) — average tenure in days for employees who left that year * `avg_final_salary` (decimal) — average final salary of departed employees that year -4. **Apply risk assessment criteria** for current employees: +4. **Apply risk assessment criteria** for current employees (measure tenure as of the reference date `2002-08-01`): * **High risk**: Employees in departments with retention rate < 80% AND tenure < 1095 days (3 years) * **Medium risk**: Employees in departments with retention rate < 85% AND tenure < 1825 days (5 years) * **Low risk**: All other current employees diff --git a/tasks/postgres/standard/employees/employee_retention_analysis/verify.py b/tasks/postgres/standard/employees/employee_retention_analysis/verify.py index 744cc33f..913f5be0 100644 --- a/tasks/postgres/standard/employees/employee_retention_analysis/verify.py +++ b/tasks/postgres/standard/employees/employee_retention_analysis/verify.py @@ -136,11 +136,11 @@ def verify_high_risk_results(conn) -> bool: e.id AS employee_id, CONCAT(e.first_name, ' ', e.last_name) AS full_name, d.dept_name AS current_department, - (CURRENT_DATE - e.hire_date)::INTEGER AS tenure_days, + (DATE '2002-08-01' - e.hire_date)::INTEGER AS tenure_days, cs.current_amount::INTEGER AS current_salary, CASE - WHEN dr.retention_rate < 80 AND (CURRENT_DATE - e.hire_date) < 1095 THEN 'high_risk' - WHEN dr.retention_rate < 85 AND (CURRENT_DATE - e.hire_date) < 1825 THEN 'medium_risk' + WHEN dr.retention_rate < 80 AND (DATE '2002-08-01' - e.hire_date) < 1095 THEN 'high_risk' + WHEN dr.retention_rate < 85 AND (DATE '2002-08-01' - e.hire_date) < 1825 THEN 'medium_risk' ELSE 'low_risk' END AS risk_category FROM employees.employee e diff --git a/tasks/postgres/standard/employees/executive_dashboard_automation/description.md b/tasks/postgres/standard/employees/executive_dashboard_automation/description.md index 490cdc41..4f00e5d4 100644 --- a/tasks/postgres/standard/employees/executive_dashboard_automation/description.md +++ b/tasks/postgres/standard/employees/executive_dashboard_automation/description.md @@ -8,7 +8,7 @@ Design a comprehensive reporting and automation system for executive dashboard a * `department_name` (varchar) — department name * `total_employees` (integer) — current active employee count * `avg_salary` (decimal) — average current salary - * `total_payroll` (bigint) — total monthly payroll cost + * `total_payroll` (bigint) — sum of current salary amounts for active employees in the department * `manager_name` (varchar) — current department manager name **View 2: `exec_hiring_trends`** diff --git a/tasks/postgres/standard/employees/executive_dashboard_automation/verify.py b/tasks/postgres/standard/employees/executive_dashboard_automation/verify.py index 1a34fcc9..3988358b 100644 --- a/tasks/postgres/standard/employees/executive_dashboard_automation/verify.py +++ b/tasks/postgres/standard/employees/executive_dashboard_automation/verify.py @@ -339,12 +339,15 @@ def verify_materialized_views(conn) -> bool: def verify_stored_procedures(conn) -> bool: """Verify that stored procedure was created.""" with conn.cursor() as cur: - # Check if procedure exists + # Check if the routine exists in pg_proc. pg_proc lists both + # FUNCTION and PROCEDURE entries, so we don't have to filter on + # type — accepts either form of "stored procedure". cur.execute(""" - SELECT routine_name FROM information_schema.routines - WHERE routine_schema = 'employees' - AND routine_type = 'FUNCTION' - AND routine_name = 'generate_monthly_report' + SELECT p.proname + FROM pg_proc p + JOIN pg_namespace n ON n.oid = p.pronamespace + WHERE n.nspname = 'employees' + AND p.proname = 'generate_monthly_report' """) procedures = [row[0] for row in cur.fetchall()] diff --git a/tasks/postgres/standard/lego/consistency_enforcement/description.md b/tasks/postgres/standard/lego/consistency_enforcement/description.md index f02a21cc..5e6dc6f7 100644 --- a/tasks/postgres/standard/lego/consistency_enforcement/description.md +++ b/tasks/postgres/standard/lego/consistency_enforcement/description.md @@ -4,7 +4,7 @@ Implement a data consistency enforcement system for the LEGO database. The syste For any given `set_num`, the following invariant must be maintained: `lego_sets.num_parts = SUM(quantity)` FROM `lego_inventory_parts` WHERE `inventory_id` IN (latest inventory for that set) AND `is_spare` = false -**Important**: If a set has no inventory records, the consistency check should be skipped. +**Important**: If a set has no inventory records (or no non-spare parts in its latest inventory), treat the actual part count as `0`. The consistency check still applies — `num_parts` must equal `0` for such sets. # Your Tasks: @@ -15,7 +15,7 @@ Write a single `SELECT` query to find all sets where the stored `num_parts` does 1. **Find the Latest Inventory**: For each `set_num`, find its latest inventory id by getting the `MAX(version)` from the `lego_inventories` table. 2. **Calculate Actual Part Count**: For these latest inventories, join with `lego_inventory_parts` and calculate the `SUM(quantity)`, but only for parts where `is_spare` is false. -3. **Compare and Filter**: Join this calculated result back to the `lego_sets` table and return the rows where `lego_sets.num_parts` is different from your calculated sum. +3. **Compare and Filter**: `LEFT JOIN` this calculated result back to the `lego_sets` table and return the rows where `lego_sets.num_parts` is different from your calculated sum, using `COALESCE(actual_parts, 0)` so that sets without inventory are also surfaced when their `num_parts` is non-zero. ## Task 2: Fix Existing Inconsistencies diff --git a/tasks/postgres/standard/lego/consistency_enforcement/verify.py b/tasks/postgres/standard/lego/consistency_enforcement/verify.py index d0222cc4..c1aaa23d 100644 --- a/tasks/postgres/standard/lego/consistency_enforcement/verify.py +++ b/tasks/postgres/standard/lego/consistency_enforcement/verify.py @@ -1,6 +1,5 @@ """ Verification script for PostgreSQL LEGO Task 1: Parts Consistency Fix & Constraints -Version 2.1: Relaxed consistency check to allow for one known corner case mismatch. """ import os @@ -82,17 +81,15 @@ def get_mismatch_count(cur) -> int: def verify_data_consistency(conn) -> bool: """ TASK 1 VERIFICATION: Checks if the initial data fix was successful. - (Relaxed: Allows for one corner-case mismatch). """ - print("\n-- Verifying Task 1: Data Consistency Fix (Relaxed) --") + print("\n-- Verifying Task 1: Data Consistency Fix --") with conn.cursor() as cur: count = get_mismatch_count(cur) - # RELAXED CONDITION: Allow 0 or 1 mismatch to pass. - if count > 1: - print(f"❌ FAIL: Found {count} sets with inconsistent part counts. Expected 0 or 1 after fix.") + if count > 0: + print(f"❌ FAIL: Found {count} sets with inconsistent part counts. Expected 0 after fix.") return False - - print("✅ PASS: Data consistency check passed (allowing for one known mismatch).") + + print("✅ PASS: All sets have consistent part counts.") return True diff --git a/tasks/postgres/standard/lego/database_security_policies/verify.py b/tasks/postgres/standard/lego/database_security_policies/verify.py index 0a019c95..16384485 100644 --- a/tasks/postgres/standard/lego/database_security_policies/verify.py +++ b/tasks/postgres/standard/lego/database_security_policies/verify.py @@ -183,16 +183,19 @@ def test_theme_analyst_access(conn) -> bool: return False print("✅ PASS: Reference tables appear to be accessible.") - # Test 4 & 5: Check related tables + # Test 4 & 5: Check related tables — counts must match exactly + # what is reachable through theme_id=18 (Star Wars: 65081-1 + K8008-1). cur.execute("SELECT COUNT(*) FROM lego_inventories;") - if cur.fetchone()[0] == 0: - print("❌ FAIL: No inventories are visible for the allowed sets.") + inv_count = cur.fetchone()[0] + if inv_count != 2: + print(f"❌ FAIL: Expected 2 inventories for Star Wars sets, got {inv_count}.") cur.execute("RESET ROLE;") return False - + cur.execute("SELECT COUNT(*) FROM lego_inventory_parts;") - if cur.fetchone()[0] == 0: - print("❌ FAIL: No inventory parts are visible for the allowed sets.") + parts_count = cur.fetchone()[0] + if parts_count != 3: + print(f"❌ FAIL: Expected 3 inventory parts for Star Wars sets, got {parts_count}.") cur.execute("RESET ROLE;") return False print("✅ PASS: Related tables (inventories, inventory_parts) are correctly filtered.") diff --git a/tasks/postgres/standard/lego/transactional_inventory_transfer/description.md b/tasks/postgres/standard/lego/transactional_inventory_transfer/description.md index 9b5deda9..126ff5bb 100644 --- a/tasks/postgres/standard/lego/transactional_inventory_transfer/description.md +++ b/tasks/postgres/standard/lego/transactional_inventory_transfer/description.md @@ -36,9 +36,11 @@ Create a PostgreSQL function to handle inventory part transfers between LEGO set - Verify both inventory IDs exist in `lego_inventories` table - Verify part exists in `lego_parts` table - Verify color exists in `lego_colors` table - - Check source has sufficient quantity (including spare parts) + - Check the source's non-spare row for this `(part_num, color_id)` has sufficient quantity - Prevent self-transfers (source and target cannot be the same) + *Note: The function operates on non-spare rows only (`is_spare = false`).* + **Validation B: Business Rules** - Maximum transfer quantity is 500 parts per operation - Minimum transfer quantity is 1 part @@ -52,26 +54,22 @@ Create a PostgreSQL function to handle inventory part transfers between LEGO set - Calculate transfer feasibility **Step B: Source Inventory Update** - - Decrease quantity in source inventory + - Decrease quantity on the source's non-spare row - If quantity becomes zero, delete the row - - Handle spare parts appropriately (maintain `is_spare` flag) **Step C: Target Inventory Update** - - Check if part exists in target inventory + - Check if a non-spare row for `(part_num, color_id)` exists in target inventory - If exists: increase quantity - - If not exists: insert new record - - Handle spare parts appropriately + - If not exists: insert a new non-spare row (`is_spare = false`) **Step D: Audit Logging** - Log successful transfers with details - - Log failed transfers with error messages - Include transfer reason and status 5. **Error handling requirements**: - Use `RAISE EXCEPTION` with descriptive error messages - Handle all validation failures gracefully - Ensure complete rollback on any failure - - Log all attempts (successful and failed) 6. **Return value**: - Return success message: `'Successfully transferred {quantity} parts ({part_num}, color_id: {color_id}) from inventory {source_id} to inventory {target_id}. Reason: {reason}'` @@ -81,7 +79,7 @@ Create a PostgreSQL function to handle inventory part transfers between LEGO set - **Transaction Safety**: All operations wrapped in transaction block - **Data Integrity**: No partial updates possible -- **Audit Trail**: Complete logging of all transfer attempts +- **Audit Trail**: Logging of successful transfer attempts - **Validation**: Comprehensive input and business rule validation - **Error Recovery**: Failed transfers leave database unchanged - **Performance**: Use appropriate locking to prevent race conditions @@ -105,8 +103,7 @@ SELECT transfer_parts(14469, 14469, '3024', 15, 10, 'self_transfer'); ## Verification Criteria: - Function handles all validation rules correctly -- Audit logging captures all transfer attempts -- Failed transfers are properly logged with error details +- Audit logging captures successful transfer attempts - Self-transfers are prevented - Quantity limits are enforced - Database state remains consistent after failures \ No newline at end of file diff --git a/tasks/postgres/standard/lego/transactional_inventory_transfer/verify.py b/tasks/postgres/standard/lego/transactional_inventory_transfer/verify.py index 933e1a87..49a2fbf8 100644 --- a/tasks/postgres/standard/lego/transactional_inventory_transfer/verify.py +++ b/tasks/postgres/standard/lego/transactional_inventory_transfer/verify.py @@ -293,7 +293,26 @@ def verify_business_rule_validation(conn) -> bool: finally: conn.rollback() # Rollback after third test - return test1_passed and test2_passed and test3_passed + # Test 4: Negative transfer quantity (should fail) + print("Test 4: Negative transfer quantity (should fail)") + test4_passed = False + try: + with conn.cursor() as cur: + cur.execute( + "SELECT transfer_parts(%s, %s, %s, %s, %s, %s)", + (14469, 14686, '3024', 15, -5, 'negative_transfer') + ) + result = cur.fetchone() + print(f"❌ FAIL: Negative transfer should have failed but succeeded: {result[0]}") + except psycopg2.Error: + print(f"✅ PASS: Negative transfer correctly failed") + test4_passed = True + except Exception as e: + print(f"❌ FAIL: Negative transfer test failed with unexpected error: {e}") + finally: + conn.rollback() + + return test1_passed and test2_passed and test3_passed and test4_passed def verify_insufficient_quantity_error(conn) -> bool: @@ -341,49 +360,43 @@ def verify_insufficient_quantity_error(conn) -> bool: def verify_invalid_inventory_error(conn) -> bool: - """Test that transfer fails with invalid inventory IDs.""" - print("\n-- Verifying Invalid Inventory Error --") - passed = False - try: - source_id = 99999 # Non-existent inventory - target_id = 14686 - part_num = '3024' - color_id = 15 - transfer_qty = 10 - reason = 'invalid_test' - - target_initial = get_inventory_part_quantity(conn, target_id, part_num, color_id) - - with conn.cursor() as cur: - try: + """Test that transfer fails when any of inventory_id / part_num / color_id is invalid.""" + print("\n-- Verifying Invalid Reference Errors --") + + # Each case: (label, source_id, target_id, part_num, color_id) + cases = [ + ("invalid source inventory", 99999, 14686, '3024', 15), + ("invalid target inventory", 14469, 99999, '3024', 15), + ("invalid part_num", 14469, 14686, 'NOT_A_REAL_PART', 15), + ("invalid color_id", 14469, 14686, '3024', 99999), + ] + transfer_qty = 10 + reason = 'invalid_test' + all_passed = True + for label, source_id, target_id, part_num, color_id in cases: + try: + with conn.cursor() as cur: cur.execute( "SELECT transfer_parts(%s, %s, %s, %s, %s, %s)", (source_id, target_id, part_num, color_id, transfer_qty, reason) ) result = cur.fetchone() - print(f"❌ FAIL: Transfer should have failed but succeeded: {result[0]}") - except psycopg2.Error as e: - print(f"✅ PASS: Transfer correctly failed with an exception.") - # Rollback the aborted transaction - conn.rollback() - - target_final = get_inventory_part_quantity(conn, target_id, part_num, color_id) - if target_final != target_initial: - print(f"❌ FAIL: Target quantity changed from {target_initial} to {target_final}") - else: - print("✅ PASS: Database state unchanged after invalid inventory error") - passed = True - finally: - conn.rollback() - return passed + print(f"❌ FAIL ({label}): Transfer should have failed but succeeded: {result[0]}") + all_passed = False + except psycopg2.Error: + print(f"✅ PASS ({label}): Transfer correctly failed with an exception.") + finally: + conn.rollback() + return all_passed def verify_audit_logging(conn) -> bool: """ - Test that audit logging captures both successful and failed transfers. - This function uses commits to separate test cases and work around the - transactional paradox of logging a failure within a transaction that - is about to be rolled back by the client. + Test audit logging behavior: + - Part 1: a successful transfer must produce a log row within the transaction. + - Part 2: if the function raises (e.g., self-transfer), the whole transaction + rolls back — any log row the function may have written disappears too. + This is standard PostgreSQL transaction semantics for RAISE EXCEPTION. """ print("\n-- Verifying Audit Logging --") @@ -432,9 +445,7 @@ def verify_audit_logging(conn) -> bool: "SELECT transfer_parts(14469, 14469, '3024', 15, 5, 'audit_test_fail')" ) except psycopg2.Error: - # This is the expected failure path. - # The function should have logged the failure before raising the error. - # Now, we check the log table. + # Expected: self-transfer raises an exception, aborting the transaction. pass # The transaction is now in an aborted state. We must rollback to issue new commands. @@ -465,13 +476,13 @@ def verify_exact_quantity_transfer(conn) -> bool: target_id = 14686 # Use a fixed target inventory try: - # Find a part with a small quantity that doesn't conflict with the target inventory + # Find a non-spare part with a small quantity that doesn't conflict with the target inventory with conn.cursor() as cur: cur.execute( """ SELECT inventory_id, part_num, color_id, quantity FROM public.lego_inventory_parts - WHERE quantity BETWEEN 5 AND 20 AND inventory_id != %s + WHERE quantity BETWEEN 5 AND 20 AND inventory_id != %s AND is_spare = false LIMIT 1 """, (target_id,) diff --git a/tasks/postgres/standard/security/rls_business_access/description.md b/tasks/postgres/standard/security/rls_business_access/description.md index be00f6b7..8e49b5d5 100644 --- a/tasks/postgres/standard/security/rls_business_access/description.md +++ b/tasks/postgres/standard/security/rls_business_access/description.md @@ -18,13 +18,13 @@ Build RLS policies for a social platform where users create posts and comments i - **DELETE**: Only channel owners can delete channels ### 3. Posts Table Access Rules: -- **SELECT**: Users can read all posts in channels they have access to +- **SELECT**: Users can read all posts in channels they have access to. (Authors do NOT get a separate read privilege — visibility is determined solely by channel accessibility.) - **INSERT**: Authenticated users can create posts in any channel - **UPDATE**: Post authors OR channel moderators OR channel owners can edit posts - **DELETE**: Post authors OR channel moderators OR channel owners can delete posts ### 4. Comments Table Access Rules: -- **SELECT**: Users can read comments on posts they can access +- **SELECT**: Users can read comments on posts they can access. (Comment authors do NOT get a separate read privilege — visibility follows the post's channel accessibility only.) - **INSERT**: Authenticated users can comment on posts they can see - **UPDATE**: Comment authors OR post authors OR channel moderators OR channel owners can edit comments - **DELETE**: Comment authors OR post authors OR channel moderators OR channel owners can delete comments @@ -36,7 +36,7 @@ Build RLS policies for a social platform where users create posts and comments i ## Session Context: -Use `current_setting('app.current_user_id')` to get the current user ID from session context. +The session sets `app.current_user_id` to the user's UUID, or `''` for anonymous users. Use the pre-created helper `app_current_user_id()` in your policies — it returns the UUID or `NULL` for anonymous (a raw `::UUID` cast on the empty string would error). ## Schema Requirements: @@ -48,11 +48,12 @@ Use `current_setting('app.current_user_id')` to get the current user ID from ses 1. **Enable RLS** on all five tables 2. **Create policies** for SELECT, INSERT, UPDATE, DELETE operations on each table -3. **Helper functions** to check permissions efficiently: +3. **Helper functions are pre-created** — use them in your policies: + - `app_current_user_id()` — returns the current user UUID (or `NULL` for anonymous) - `is_channel_owner(channel_id, user_id)` - `is_channel_moderator(channel_id, user_id)` - `can_moderate_channel(channel_id, user_id)` -4. **Proper indexing** to ensure RLS policies perform well +4. **Performance indexes are pre-created** — focus on writing correct, efficient policies. ## Test Scenarios: diff --git a/tasks/postgres/standard/security/rls_business_access/prepare_environment.py b/tasks/postgres/standard/security/rls_business_access/prepare_environment.py index 645fc608..3c3cecf8 100644 --- a/tasks/postgres/standard/security/rls_business_access/prepare_environment.py +++ b/tasks/postgres/standard/security/rls_business_access/prepare_environment.py @@ -25,6 +25,18 @@ def setup_rls_environment(): conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) cur = conn.cursor() + # Defensive cleanup: ROLEs are cluster-level objects and survive + # `DROP DATABASE` of a previous task run. Drop stale `test_user` + # (created by verify.py) so re-runs don't trip over it. + try: + cur.execute("DROP OWNED BY test_user CASCADE;") + except psycopg2.Error: + pass # role may not exist or own nothing + try: + cur.execute("DROP ROLE IF EXISTS test_user;") + except psycopg2.Error as e: + print(f"⚠ Could not drop stale role test_user: {e}") + # 1. Users Table (with correct field name for verification) cur.execute(""" CREATE TABLE IF NOT EXISTS users ( diff --git a/tasks/postgres/standard/security/rls_business_access/verify.py b/tasks/postgres/standard/security/rls_business_access/verify.py index 8fef4527..9bd71a7f 100644 --- a/tasks/postgres/standard/security/rls_business_access/verify.py +++ b/tasks/postgres/standard/security/rls_business_access/verify.py @@ -94,7 +94,10 @@ def verify_rls_implementation(): SET email = 'alice.updated@example.com' WHERE id = '11111111-1111-1111-1111-111111111111' """) - test_results.append("✓ Users can update their own profile") + if cur.rowcount > 0: + test_results.append("✓ Users can update their own profile") + else: + test_results.append("✗ User update affected 0 rows (RLS too restrictive on own profile)") except Exception as e: test_results.append(f"✗ User cannot update own profile: {e}") @@ -125,7 +128,10 @@ def verify_rls_implementation(): SET description = 'Updated by Alice' WHERE id = 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa' """) - test_results.append("✓ Channel owners can update their channels") + if cur.rowcount > 0: + test_results.append("✓ Channel owners can update their channels") + else: + test_results.append("✗ Channel owner update affected 0 rows (RLS too restrictive on own channel)") except Exception as e: test_results.append(f"✗ Channel owner cannot update channel: {e}") @@ -156,7 +162,10 @@ def verify_rls_implementation(): SET title = 'Updated by Alice' WHERE id = 'dddddddd-dddd-dddd-dddd-dddddddddddd' """) - test_results.append("✓ Post authors can update their posts") + if cur.rowcount > 0: + test_results.append("✓ Post authors can update their posts") + else: + test_results.append("✗ Post author update affected 0 rows (RLS too restrictive on own post)") except Exception as e: test_results.append(f"✗ Post author cannot update post: {e}") @@ -168,7 +177,10 @@ def verify_rls_implementation(): SET content = 'Moderated by Bob' WHERE id = 'dddddddd-dddd-dddd-dddd-dddddddddddd' """) - test_results.append("✓ Channel moderators can update posts in their channels") + if cur.rowcount > 0: + test_results.append("✓ Channel moderators can update posts in their channels") + else: + test_results.append("✗ Moderator update affected 0 rows (RLS too restrictive on moderator path)") except Exception as e: test_results.append(f"✗ Channel moderator cannot update post: {e}") @@ -199,7 +211,10 @@ def verify_rls_implementation(): SET content = 'Updated by Bob himself' WHERE id = '99999999-9999-9999-9999-999999999999' """) - test_results.append("✓ Comment authors can update their comments") + if cur.rowcount > 0: + test_results.append("✓ Comment authors can update their comments") + else: + test_results.append("✗ Comment author update affected 0 rows (RLS too restrictive on own comment)") except Exception as e: test_results.append(f"✗ Comment author cannot update comment: {e}") @@ -211,7 +226,10 @@ def verify_rls_implementation(): SET content = 'Moderated by post author Alice' WHERE id = '99999999-9999-9999-9999-999999999999' """) - test_results.append("✓ Post authors can moderate comments on their posts") + if cur.rowcount > 0: + test_results.append("✓ Post authors can moderate comments on their posts") + else: + test_results.append("✗ Post author moderation affected 0 rows (RLS too restrictive on post-author path)") except Exception as e: test_results.append(f"✗ Post author cannot moderate comment: {e}") @@ -247,20 +265,29 @@ def verify_rls_implementation(): # Test 7: Content visibility based on user context print("\n7. Testing content visibility...") - # Count posts visible to Alice + # Count posts visible to Alice (general+tech-talk public channels: 3 posts) cur.execute("SET app.current_user_id = '11111111-1111-1111-1111-111111111111';") # Alice cur.execute("SELECT COUNT(*) FROM posts;") alice_posts = cur.fetchone()[0] - # Count posts visible to Eve + # Count posts visible to Eve (same two public channels: 3 posts; she does NOT see her own + # post in the private 'random' channel since she's not owner/moderator there) cur.execute("SET app.current_user_id = '55555555-5555-5555-5555-555555555555';") # Eve cur.execute("SELECT COUNT(*) FROM posts;") eve_posts = cur.fetchone()[0] - if alice_posts >= 2 and eve_posts >= 1: # Alice should see posts in channels she has access to + # Count posts visible to Charlie (owner of private 'random' channel: 3 public + 1 own = 4) + cur.execute("SET app.current_user_id = '33333333-3333-3333-3333-333333333333';") # Charlie + cur.execute("SELECT COUNT(*) FROM posts;") + charlie_posts = cur.fetchone()[0] + + if alice_posts == 3 and eve_posts == 3 and charlie_posts == 4: test_results.append("✓ Content visibility varies correctly based on user context") else: - test_results.append(f"✗ Content visibility issue: Alice sees {alice_posts}, Eve sees {eve_posts}") + test_results.append( + f"✗ Content visibility issue: Alice sees {alice_posts} (expected 3), " + f"Eve sees {eve_posts} (expected 3), Charlie sees {charlie_posts} (expected 4)" + ) # Test 8: Anonymous user access print("\n8. Testing anonymous user restrictions...") @@ -271,9 +298,10 @@ def verify_rls_implementation(): anon_users = cur.fetchone()[0] # Anonymous users should be able to see public user profiles per requirements - # Count public users that should be visible + # Count public users that should be visible (re-counted under anon's view; with + # correct RLS this equals the total number of public users) cur.execute("SELECT COUNT(*) FROM users WHERE is_public = true;") - public_users = cur.fetchone()[0] if cur.rowcount > 0 else 0 + public_users = cur.fetchone()[0] if anon_users == public_users and anon_users > 0: test_results.append(f"✓ Anonymous users can see {anon_users} public user profiles (correct)") @@ -282,7 +310,9 @@ def verify_rls_implementation(): else: test_results.append(f"✗ Anonymous users can see {anon_users} users but expected {public_users} public users") except Exception as e: - test_results.append("✓ Anonymous users properly restricted") + # An exception here usually means the policy failed to handle empty/NULL session + # context (e.g., `current_setting('app.current_user_id')::UUID` on empty string). + test_results.append(f"✗ Anonymous-user query raised an exception: {e}") # Print results print("\n" + "="*60) diff --git a/tasks/postgres/standard/security/user_permission_audit/description.md b/tasks/postgres/standard/security/user_permission_audit/description.md index 59a566bb..d0bd751a 100644 --- a/tasks/postgres/standard/security/user_permission_audit/description.md +++ b/tasks/postgres/standard/security/user_permission_audit/description.md @@ -10,9 +10,9 @@ You've been hired as a security consultant to audit the PostgreSQL database perm 2. **Catalog all database users and roles**: Use `pg_user`, `pg_roles`, and `pg_auth_members` to find all accounts 3. **Analyze current permissions**: Use `information_schema.table_privileges` to map permissions 4. **Identify security issues**: - - **Dangling users**: Inactive accounts that should be removed + - **Dangling users**: A *dangling user* is a database role that has been granted privileges on one or more business tables but is **not** assigned to any of the expected business roles in `USER_ROLE` below. (This definition naturally excludes PostgreSQL system roles such as `postgres`, `pg_read_all_data`, etc., since they aren't granted on business tables directly.) - **Missing permissions**: Users lacking permissions required for their business role - - **Excessive permissions**: Users with unnecessary permissions that should be revoked + - **Excessive permissions**: Any privilege that does not belong to the user's expected business role. **This includes every grant currently held by a dangling user** — each grant must still be reported as a separate `EXCESSIVE_PERMISSION` row in addition to the per-user `DANGLING_USER` row. ## Expected permissions by role (what they SHOULD have) @@ -95,8 +95,8 @@ CREATE TABLE security_audit_results ( audit_id SERIAL PRIMARY KEY, audit_type VARCHAR(50) NOT NULL, -- 'DANGLING_USERS', 'MISSING_PERMISSIONS', 'EXCESSIVE_PERMISSIONS' total_issues INTEGER NOT NULL, - users_affected INTEGER NOT NULL, - tables_affected INTEGER NOT NULL + users_affected INTEGER NOT NULL, -- COUNT(DISTINCT username) for this audit_type + tables_affected INTEGER NOT NULL -- COUNT(DISTINCT table_name) for this audit_type; NULL table_name does not count (so DANGLING_USERS is 0) ); ``` @@ -108,7 +108,7 @@ CREATE TABLE security_audit_details ( issue_type VARCHAR(50) NOT NULL, -- 'DANGLING_USER', 'MISSING_PERMISSION', 'EXCESSIVE_PERMISSION' table_name VARCHAR(50), -- NULL for dangling users permission_type VARCHAR(20), -- 'SELECT', 'INSERT', 'UPDATE', 'DELETE', NULL for dangling users - expected_access BOOLEAN NOT NULL -- TRUE if user should have access, FALSE if should not + expected_access BOOLEAN NOT NULL -- TRUE if user should have access (MISSING_PERMISSION); FALSE otherwise (EXCESSIVE_PERMISSION, DANGLING_USER) ); ``` @@ -118,18 +118,4 @@ Your audit should populate both tables with: - **Summary data**: High-level counts of different types of security issues - **Detailed findings**: Specific permission gaps for each user and table combination -## Business Role Expectations - -Analyze usernames and infer their intended business roles based on naming patterns: - -- **analytics_user** → Analytics Team (needs user behavior and statistics data) -- **marketing_user** → Marketing Department (needs customer and product data for campaigns) -- **customer_service** → Customer Service (needs user profiles and order management) -- **finance_user** → Finance Team (needs financial and order data) -- **product_manager** → Product Management (needs full product catalog access) -- **security_auditor** → Security Team (needs audit logs and credential data) -- **developer_user** → Development Team (needs limited access for testing) -- **backup_user** → Backup Service (needs read-only access to all business data) -- **temp_contractor, old_employee, test_account** → Inactive/Temporary (should have NO permissions) - The verification process will check that your findings correctly identify the actual permission gaps in the system by comparing against expected results. diff --git a/tasks/postgres/standard/security/user_permission_audit/verify.py b/tasks/postgres/standard/security/user_permission_audit/verify.py index 13f1c2af..3bb5d908 100644 --- a/tasks/postgres/standard/security/user_permission_audit/verify.py +++ b/tasks/postgres/standard/security/user_permission_audit/verify.py @@ -97,6 +97,34 @@ def verify_security_audit(): } } + # Validate structure first — if the schema is wrong, abort before parsing. + structure_valid = True + for i, finding in enumerate(findings): + if len(finding) != 6: + print(f"| FAIL: Finding {i + 1} has wrong number of columns (expected 6, got {len(finding)})") + structure_valid = False + continue + + _detail_id, username, issue_type, _table_name, _permission_type, expected_access = finding + + if not username: + print(f"| FAIL: Finding {i + 1} missing username") + structure_valid = False + + if issue_type not in ['DANGLING_USER', 'MISSING_PERMISSION', 'EXCESSIVE_PERMISSION']: + print(f"| FAIL: Finding {i + 1} invalid issue_type: {issue_type}") + structure_valid = False + + if expected_access not in [True, False]: + print(f"| FAIL: Finding {i + 1} invalid expected_access: {expected_access}") + structure_valid = False + + if not structure_valid: + print("| FAIL: aborting further checks because finding rows have invalid structure") + return False + + print("| ✓ structure is valid") + found_dangling = set() found_missing_permissions = set() found_excessive_permissions = set() @@ -130,54 +158,42 @@ def verify_security_audit(): missing_excessive_perms = expected_findings['excessive_permissions'] - found_excessive_permissions extra_excessive_perms = found_excessive_permissions - expected_findings['excessive_permissions'] - # Validate structure - structure_valid = True - for i, finding in enumerate(findings): - if len(finding) != 6: # Should have 6 columns - print(f"| FAIL: Finding {i + 1} has wrong number of columns (expected 6, got {len(finding)})") - structure_valid = False - continue - - detail_id, username, issue_type, table_name, permission_type, expected_access = finding - - if not username: - print(f"| FAIL: Finding {i + 1} missing username") - structure_valid = False - - if issue_type not in ['DANGLING_USER', 'MISSING_PERMISSION', 'EXCESSIVE_PERMISSION']: - print(f"| FAIL: Finding {i + 1} invalid issue_type: {issue_type}") - structure_valid = False - - if expected_access not in [True, False]: - print(f"| FAIL: Finding {i + 1} invalid expected_access: {expected_access}") - structure_valid = False - - if structure_valid: - print(f"| ✓ structure is valid") - # Check for missing findings all_correct = True print(f"| Expected dangling users: {expected_findings['dangling_users']} Found: {found_dangling}") if missing_dangling: - print(f"| Missing dangling users: {missing_dangling}") + print(f"| FAIL: Missing dangling users (not reported): {missing_dangling}") + all_correct = False + if extra_dangling: + print(f"| FAIL: Unexpected dangling users (extra): {extra_dangling}") all_correct = False print( - f"| Expected missing permissions: {len(expected_findings['missing_permissions'])} Found: {len(found_missing_permissions)} Missing: {len(missing_missing_perms)}") + f"| Expected missing permissions: {len(expected_findings['missing_permissions'])} Found: {len(found_missing_permissions)} Not reported: {len(missing_missing_perms)} Extra: {len(extra_missing_perms)}") if missing_missing_perms: - print(f"| Missing 'missing permission' findings:") + print(f"| FAIL: Missing 'missing permission' findings:") for perm in sorted(missing_missing_perms): print(f"| - {perm[0]} should be granted {perm[2]} on {perm[1]}") all_correct = False + if extra_missing_perms: + print(f"| FAIL: Unexpected 'missing permission' findings:") + for perm in sorted(extra_missing_perms): + print(f"| - {perm[0]} / {perm[1]} / {perm[2]} (not expected as missing)") + all_correct = False print( - f"| Expected excessive permissions: {len(expected_findings['excessive_permissions'])} Found: {len(found_excessive_permissions)} Missing: {len(missing_excessive_perms)}") + f"| Expected excessive permissions: {len(expected_findings['excessive_permissions'])} Found: {len(found_excessive_permissions)} Not reported: {len(missing_excessive_perms)} Extra: {len(extra_excessive_perms)}") if missing_excessive_perms: - print(f"| Missing 'excessive permission' findings:") + print(f"| FAIL: Missing 'excessive permission' findings:") for perm in sorted(missing_excessive_perms): print(f"| - {perm[0]} should have {perm[2]} revoked on {perm[1]}") all_correct = False + if extra_excessive_perms: + print(f"| FAIL: Unexpected 'excessive permission' findings:") + for perm in sorted(extra_excessive_perms): + print(f"| - {perm[0]} / {perm[1]} / {perm[2]} (not expected as excessive)") + all_correct = False # Check audit summary table cur.execute( @@ -204,13 +220,19 @@ def verify_security_audit(): else: print(f"| ✓ {audit_type} summary matches expected values") - # Assert exact counts match expected - assert len(found_dangling) == 3, f"Expected 3 dangling users, found {len(found_dangling)}" - assert len(found_missing_permissions) == 13, f"Expected 13 missing permissions, found {len(found_missing_permissions)}" - assert len(found_excessive_permissions) == 13, f"Expected 13 excessive permissions, found {len(found_excessive_permissions)}" - - if all_correct and structure_valid and summary_correct: - print("| ✓ All assertions passed") + # Exact-count sanity checks (catch e.g. duplicate findings that summary missed). + count_correct = True + for label, found_set, expected_count in [ + ("dangling users", found_dangling, 3), + ("missing permissions", found_missing_permissions, 13), + ("excessive permissions", found_excessive_permissions, 13), + ]: + if len(found_set) != expected_count: + print(f"| FAIL: Expected {expected_count} {label}, found {len(found_set)}") + count_correct = False + + if all_correct and structure_valid and summary_correct and count_correct: + print("| ✓ All checks passed") return True else: return False diff --git a/tasks/postgres/standard/vectors/dba_vector_analysis/description.md b/tasks/postgres/standard/vectors/dba_vector_analysis/description.md index 97936c0e..ccff2e0a 100644 --- a/tasks/postgres/standard/vectors/dba_vector_analysis/description.md +++ b/tasks/postgres/standard/vectors/dba_vector_analysis/description.md @@ -54,7 +54,7 @@ CREATE TABLE vector_analysis_columns ( column_name VARCHAR(100), dimensions INTEGER, data_type VARCHAR(50), - has_constraints BOOLEAN, + has_constraints BOOLEAN, -- true if the column has any non-default constraint (NOT NULL, CHECK, FK); false otherwise rows BIGINT ); ``` diff --git a/tasks/postgres/standard/vectors/dba_vector_analysis/ground_truth.sql b/tasks/postgres/standard/vectors/dba_vector_analysis/ground_truth.sql index 12995a3a..4e4a74cc 100644 --- a/tasks/postgres/standard/vectors/dba_vector_analysis/ground_truth.sql +++ b/tasks/postgres/standard/vectors/dba_vector_analysis/ground_truth.sql @@ -3,7 +3,7 @@ /* ================================================================================ -EXPECTED VECTOR DATABASE STRUCTURE (created by vectors_setup.py) +EXPECTED VECTOR DATABASE STRUCTURE (created by prepare_environment.py) ================================================================================ Tables with Vector Columns: diff --git a/tasks/postgres/standard/vectors/dba_vector_analysis/prepare_environment.py b/tasks/postgres/standard/vectors/dba_vector_analysis/prepare_environment.py index 52a981cd..42ecbb58 100644 --- a/tasks/postgres/standard/vectors/dba_vector_analysis/prepare_environment.py +++ b/tasks/postgres/standard/vectors/dba_vector_analysis/prepare_environment.py @@ -1,26 +1,500 @@ """ Environment preparation script for Vector Database DBA Analysis task. -This script imports and uses the shared vector database setup utilities. +Sets up a PostgreSQL database with the pgvector extension, sample RAG-style +tables (documents / document_chunks / user_queries plus three non-vector +metadata tables), HNSW indexes on the vector columns, and sample data. """ -import sys +import os import logging -from pathlib import Path +import psycopg2 +import json +import random +import numpy as np +from typing import List -# Add the vectors directory to import the shared utilities -sys.path.append(str(Path(__file__).resolve().parents[1])) +logger = logging.getLogger(__name__) -from vectors_setup import prepare_vector_environment -logger = logging.getLogger(__name__) +def get_connection_params(): + """Get database connection parameters from environment variables.""" + return { + 'host': os.getenv('POSTGRES_HOST', 'localhost'), + 'port': os.getenv('POSTGRES_PORT', '5432'), + 'user': os.getenv('POSTGRES_USERNAME', 'postgres'), + 'password': os.getenv('POSTGRES_PASSWORD', 'password'), + 'database': os.getenv('POSTGRES_DATABASE', 'postgres') + } + + +def generate_mock_embedding(dimensions: int = 1536) -> List[float]: + """Generate a mock embedding vector with specified dimensions.""" + # Generate random values between -1 and 1, then normalize + vector = np.random.uniform(-1, 1, dimensions) + # Normalize to unit vector (common practice for embeddings) + norm = np.linalg.norm(vector) + if norm > 0: + vector = vector / norm + return vector.tolist() + + +def create_vector_extension(): + """Create the pgvector extension.""" + conn_params = get_connection_params() + + try: + conn = psycopg2.connect(**conn_params) + conn.autocommit = True + + with conn.cursor() as cur: + logger.info("Creating pgvector extension...") + cur.execute("CREATE EXTENSION IF NOT EXISTS vector;") + logger.info("pgvector extension created successfully") + + conn.close() + + except psycopg2.Error as e: + logger.error(f"Failed to create pgvector extension: {e}") + raise + + +def create_vector_tables(): + """Create sample tables with vector columns for RAG applications.""" + conn_params = get_connection_params() + + try: + conn = psycopg2.connect(**conn_params) + conn.autocommit = True + + with conn.cursor() as cur: + logger.info("Creating vector database tables...") + + # Create documents table for document embeddings + cur.execute(""" + CREATE TABLE IF NOT EXISTS documents ( + id SERIAL PRIMARY KEY, + title TEXT NOT NULL, + content TEXT NOT NULL, + source_url TEXT, + document_type VARCHAR(50) DEFAULT 'article', + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + word_count INTEGER, + embedding vector(1536) + ); + """) + + # Create chunks table for document chunks (common in RAG) + cur.execute(""" + CREATE TABLE IF NOT EXISTS document_chunks ( + id SERIAL PRIMARY KEY, + document_id INTEGER REFERENCES documents(id) ON DELETE CASCADE, + chunk_index INTEGER NOT NULL, + chunk_text TEXT NOT NULL, + chunk_size INTEGER, + overlap_size INTEGER DEFAULT 0, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + embedding vector(1536) + ); + """) + + # Create queries table for storing user queries and their embeddings + cur.execute(""" + CREATE TABLE IF NOT EXISTS user_queries ( + id SERIAL PRIMARY KEY, + query_text TEXT NOT NULL, + user_id VARCHAR(100), + session_id VARCHAR(100), + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + response_time_ms INTEGER, + embedding vector(1536) + ); + """) + + # Create embeddings metadata table + cur.execute(""" + CREATE TABLE IF NOT EXISTS embedding_models ( + id SERIAL PRIMARY KEY, + model_name VARCHAR(100) NOT NULL UNIQUE, + provider VARCHAR(50) NOT NULL, + dimensions INTEGER NOT NULL, + max_tokens INTEGER, + cost_per_token DECIMAL(10, 8), + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + is_active BOOLEAN DEFAULT TRUE + ); + """) + + # Create knowledge base table + cur.execute(""" + CREATE TABLE IF NOT EXISTS knowledge_base ( + id SERIAL PRIMARY KEY, + kb_name VARCHAR(100) NOT NULL, + description TEXT, + domain VARCHAR(50), + language VARCHAR(10) DEFAULT 'en', + total_documents INTEGER DEFAULT 0, + total_chunks INTEGER DEFAULT 0, + total_storage_mb DECIMAL(10, 2), + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP + ); + """) + + # Create similarity search results cache + cur.execute(""" + CREATE TABLE IF NOT EXISTS search_cache ( + id SERIAL PRIMARY KEY, + query_hash VARCHAR(64) NOT NULL, + query_text TEXT NOT NULL, + results_json JSONB, + result_count INTEGER, + search_time_ms INTEGER, + similarity_threshold DECIMAL(4, 3), + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + expires_at TIMESTAMP + ); + """) + + logger.info("Vector database tables created successfully") + + conn.close() + + except psycopg2.Error as e: + logger.error(f"Failed to create vector tables: {e}") + raise + + +def create_vector_indexes(): + """Create indexes for vector columns and other frequently queried fields.""" + conn_params = get_connection_params() + + try: + conn = psycopg2.connect(**conn_params) + conn.autocommit = True + + with conn.cursor() as cur: + logger.info("Creating vector indexes...") + + # Vector indexes using HNSW (Hierarchical Navigable Small World) + indexes = [ + ("documents_embedding_idx", "documents", "embedding", "hnsw"), + ("chunks_embedding_idx", "document_chunks", "embedding", "hnsw"), + ("queries_embedding_idx", "user_queries", "embedding", "hnsw"), + ] + + for idx_name, table_name, column_name, method in indexes: + try: + if method == "hnsw": + cur.execute(f""" + CREATE INDEX IF NOT EXISTS {idx_name} + ON {table_name} USING hnsw ({column_name} vector_cosine_ops); + """) + else: + cur.execute(f""" + CREATE INDEX IF NOT EXISTS {idx_name} + ON {table_name} USING ivfflat ({column_name} vector_cosine_ops) WITH (lists = 100); + """) + logger.info(f"Created index {idx_name} on {table_name}") + except psycopg2.Error as e: + logger.warning(f"Could not create {method} index {idx_name}: {e}") + # Try with IVFFlat as fallback + if method == "hnsw": + try: + cur.execute(f""" + CREATE INDEX IF NOT EXISTS {idx_name}_ivf + ON {table_name} USING ivfflat ({column_name} vector_cosine_ops) WITH (lists = 100); + """) + logger.info(f"Created fallback IVFFlat index {idx_name}_ivf on {table_name}") + except psycopg2.Error as e2: + logger.warning(f"Could not create fallback index: {e2}") + + # Regular indexes for performance + regular_indexes = [ + ("documents_title_idx", "documents", "title"), + ("documents_type_idx", "documents", "document_type"), + ("documents_created_idx", "documents", "created_at"), + ("chunks_doc_id_idx", "document_chunks", "document_id"), + ("chunks_index_idx", "document_chunks", "chunk_index"), + ("queries_user_idx", "user_queries", "user_id"), + ("queries_created_idx", "user_queries", "created_at"), + ("cache_hash_idx", "search_cache", "query_hash"), + ("cache_expires_idx", "search_cache", "expires_at"), + ] + + for idx_name, table_name, column_name in regular_indexes: + try: + cur.execute(f"CREATE INDEX IF NOT EXISTS {idx_name} ON {table_name} ({column_name});") + logger.debug(f"Created regular index {idx_name}") + except psycopg2.Error as e: + logger.warning(f"Could not create regular index {idx_name}: {e}") + + logger.info("Vector indexes created successfully") + + conn.close() + + except psycopg2.Error as e: + logger.error(f"Failed to create vector indexes: {e}") + raise + + +def insert_sample_data(): + """Insert sample data into vector tables.""" + conn_params = get_connection_params() + + try: + conn = psycopg2.connect(**conn_params) + conn.autocommit = True + + with conn.cursor() as cur: + logger.info("Inserting sample data...") + + # Insert embedding models + embedding_models = [ + ('text-embedding-3-small', 'OpenAI', 1536, 8192, 0.00000002, True), + ('text-embedding-3-large', 'OpenAI', 3072, 8192, 0.00000013, True), + ('text-embedding-ada-002', 'OpenAI', 1536, 8192, 0.00000010, False), + ('all-MiniLM-L6-v2', 'Sentence-Transformers', 384, 512, 0.0, True), + ('all-mpnet-base-v2', 'Sentence-Transformers', 768, 514, 0.0, True), + ] + + for model_data in embedding_models: + cur.execute(""" + INSERT INTO embedding_models (model_name, provider, dimensions, max_tokens, cost_per_token, is_active) + VALUES (%s, %s, %s, %s, %s, %s) + ON CONFLICT (model_name) DO NOTHING; + """, model_data) + + # Insert knowledge bases + knowledge_bases = [ + ('Technical Documentation', 'Software engineering and API documentation', 'technology'), + ('Research Papers', 'Academic papers and research publications', 'research'), + ('Customer Support', 'FAQ and troubleshooting guides', 'support'), + ('Product Catalog', 'Product descriptions and specifications', 'commerce'), + ('Legal Documents', 'Contracts, policies, and legal texts', 'legal'), + ] + + kb_ids = [] + for kb_data in knowledge_bases: + cur.execute(""" + INSERT INTO knowledge_base (kb_name, description, domain, total_documents, total_chunks, total_storage_mb) + VALUES (%s, %s, %s, %s, %s, %s) + RETURNING id; + """, kb_data + (random.randint(50, 500), random.randint(200, 2000), round(random.uniform(10.5, 250.8), 2))) + kb_ids.append(cur.fetchone()[0]) + + # Insert sample documents + sample_documents = [ + ("PostgreSQL Performance Tuning", "Comprehensive guide to optimizing PostgreSQL database performance including indexing strategies, query optimization, and configuration tuning.", "https://example.com/pg-performance", "technical_guide"), + ("Vector Similarity Search", "Understanding vector embeddings and similarity search algorithms for AI applications and recommendation systems.", "https://example.com/vector-search", "technical_guide"), + ("RAG Implementation Best Practices", "Best practices for implementing Retrieval-Augmented Generation systems using vector databases and large language models.", "https://example.com/rag-practices", "best_practices"), + ("Database Security Guidelines", "Security considerations and implementation guidelines for PostgreSQL databases in production environments.", "https://example.com/db-security", "security_guide"), + ("Machine Learning with SQL", "Integrating machine learning workflows with SQL databases and leveraging database extensions for AI applications.", "https://example.com/ml-sql", "tutorial"), + ("API Documentation Standards", "Standards and best practices for creating comprehensive and user-friendly API documentation.", "https://example.com/api-docs", "documentation"), + ("Microservices Architecture", "Design patterns and implementation strategies for microservices architecture in modern applications.", "https://example.com/microservices", "architecture_guide"), + ("Data Pipeline Optimization", "Optimizing data processing pipelines for scalability, reliability, and performance in enterprise environments.", "https://example.com/data-pipelines", "optimization_guide"), + ("Cloud Database Migration", "Step-by-step guide for migrating on-premises databases to cloud infrastructure with minimal downtime.", "https://example.com/cloud-migration", "migration_guide"), + ("NoSQL vs SQL Comparison", "Detailed comparison of NoSQL and SQL databases, including use cases, performance characteristics, and selection criteria.", "https://example.com/nosql-sql", "comparison_guide"), + ] + + doc_ids = [] + for title, content, url, doc_type in sample_documents: + embedding = generate_mock_embedding(1536) + word_count = len(content.split()) + + cur.execute(""" + INSERT INTO documents (title, content, source_url, document_type, word_count, embedding) + VALUES (%s, %s, %s, %s, %s, %s) + RETURNING id; + """, (title, content, url, doc_type, word_count, embedding)) + doc_ids.append(cur.fetchone()[0]) + + # Insert document chunks + chunk_count = 0 + for doc_id in doc_ids: + # Generate 3-7 chunks per document + num_chunks = random.randint(3, 7) + for chunk_idx in range(num_chunks): + chunk_text = f"This is chunk {chunk_idx + 1} of document {doc_id}. " + \ + "It contains relevant information that would be useful for similarity search and RAG applications. " + \ + "The content includes technical details, examples, and best practices." + chunk_size = len(chunk_text) + overlap_size = random.randint(20, 50) if chunk_idx > 0 else 0 + embedding = generate_mock_embedding(1536) + + cur.execute(""" + INSERT INTO document_chunks (document_id, chunk_index, chunk_text, chunk_size, overlap_size, embedding) + VALUES (%s, %s, %s, %s, %s, %s); + """, (doc_id, chunk_idx, chunk_text, chunk_size, overlap_size, embedding)) + chunk_count += 1 + + # Insert sample user queries + sample_queries = [ + ("How to optimize PostgreSQL performance?", "user123", "session_abc1"), + ("What are vector embeddings?", "user456", "session_def2"), + ("Best practices for RAG implementation", "user789", "session_ghi3"), + ("Database security checklist", "user123", "session_abc2"), + ("Machine learning with databases", "user456", "session_def3"), + ("API documentation examples", "user321", "session_jkl1"), + ("Microservices design patterns", "user654", "session_mno2"), + ("Data pipeline best practices", "user987", "session_pqr3"), + ("Cloud migration strategies", "user111", "session_stu4"), + ("NoSQL vs SQL databases", "user222", "session_vwx5"), + ] + + for query_text, user_id, session_id in sample_queries: + embedding = generate_mock_embedding(1536) + response_time = random.randint(50, 500) + + cur.execute(""" + INSERT INTO user_queries (query_text, user_id, session_id, response_time_ms, embedding) + VALUES (%s, %s, %s, %s, %s); + """, (query_text, user_id, session_id, response_time, embedding)) + + # Insert some search cache entries + for i in range(5): + query_hash = f"hash_{random.randint(100000, 999999)}" + query_text = f"Sample cached query {i + 1}" + results = [{"doc_id": random.randint(1, len(doc_ids)), "similarity": round(random.uniform(0.7, 0.95), 3)} for _ in range(3)] + result_count = len(results) + search_time = random.randint(10, 100) + threshold = round(random.uniform(0.6, 0.8), 3) + + cur.execute(""" + INSERT INTO search_cache (query_hash, query_text, results_json, result_count, search_time_ms, similarity_threshold) + VALUES (%s, %s, %s, %s, %s, %s); + """, (query_hash, query_text, json.dumps(results), result_count, search_time, threshold)) + + logger.info(f"Sample data inserted successfully:") + logger.info(f" {len(sample_documents)} documents") + logger.info(f" {chunk_count} document chunks") + logger.info(f" {len(sample_queries)} user queries") + logger.info(f" {len(embedding_models)} embedding models") + logger.info(f" {len(knowledge_bases)} knowledge bases") + + conn.close() + + except psycopg2.Error as e: + logger.error(f"Failed to insert sample data: {e}") + raise + + +def verify_vector_setup(): + """Verify that the vector database was set up correctly.""" + conn_params = get_connection_params() + + try: + conn = psycopg2.connect(**conn_params) + + with conn.cursor() as cur: + logger.info("Verifying vector database setup...") + + # Check extension + cur.execute("SELECT extname FROM pg_extension WHERE extname = 'vector';") + if cur.fetchone(): + logger.info("pgvector extension is installed") + else: + logger.error("pgvector extension not found") + return False + + # Check tables and record counts + tables_to_check = [ + 'documents', 'document_chunks', 'user_queries', + 'embedding_models', 'knowledge_base', 'search_cache' + ] + + table_counts = {} + for table in tables_to_check: + cur.execute(f'SELECT COUNT(*) FROM {table}') + count = cur.fetchone()[0] + table_counts[table] = count + logger.info(f"Table {table}: {count} records") + + # Check vector columns + cur.execute(""" + SELECT table_name, column_name, data_type + FROM information_schema.columns + WHERE data_type = 'USER-DEFINED' + AND udt_name = 'vector' + ORDER BY table_name, column_name; + """) + + vector_columns = cur.fetchall() + logger.info(f"Found {len(vector_columns)} vector columns:") + for table, column, dtype in vector_columns: + logger.info(f" {table}.{column} ({dtype})") + + # Check indexes + cur.execute(""" + SELECT schemaname, tablename, indexname, indexdef + FROM pg_indexes + WHERE indexdef LIKE '%vector%' OR indexdef LIKE '%hnsw%' OR indexdef LIKE '%ivfflat%' + ORDER BY tablename, indexname; + """) + + vector_indexes = cur.fetchall() + logger.info(f"Found {len(vector_indexes)} vector indexes:") + for schema, table, index, definition in vector_indexes: + logger.info(f" {index} on {table}") + + # Test a simple vector similarity query + mock_embedding = generate_mock_embedding(1536) + cur.execute(""" + SELECT id, title, embedding <-> %s::vector as distance + FROM documents + ORDER BY embedding <-> %s::vector + LIMIT 3; + """, (mock_embedding, mock_embedding)) + + results = cur.fetchall() + logger.info(f"Vector similarity query returned {len(results)} results") + + conn.close() + logger.info("Vector database verification completed successfully") + return table_counts, vector_columns, vector_indexes + + except psycopg2.Error as e: + logger.error(f"Verification failed: {e}") + raise def prepare_environment(): """Main function to prepare the vector database environment.""" - prepare_vector_environment() + logger.info("Preparing vector database environment...") + + try: + # Create pgvector extension + create_vector_extension() + + # Create vector tables + create_vector_tables() + + # Insert sample data first + insert_sample_data() + + # Create indexes after data insertion for better performance + create_vector_indexes() + + # Verify the setup + table_counts, vector_columns, vector_indexes = verify_vector_setup() + + logger.info("Vector database environment prepared successfully!") + logger.info(f"Total tables created: {len(table_counts)}") + logger.info(f"Total vector columns: {len(vector_columns)}") + logger.info(f"Total vector indexes: {len(vector_indexes)}") + + return { + 'table_counts': table_counts, + 'vector_columns': vector_columns, + 'vector_indexes': vector_indexes + } + + except Exception as e: + logger.error(f"Failed to prepare vector environment: {e}") + raise if __name__ == "__main__": logging.basicConfig(level=logging.INFO) - prepare_environment() \ No newline at end of file + prepare_environment() diff --git a/tasks/postgres/standard/vectors/dba_vector_analysis/verify.py b/tasks/postgres/standard/vectors/dba_vector_analysis/verify.py index bdf4e8eb..634d83b1 100644 --- a/tasks/postgres/standard/vectors/dba_vector_analysis/verify.py +++ b/tasks/postgres/standard/vectors/dba_vector_analysis/verify.py @@ -92,7 +92,25 @@ def verify_vector_analysis_columns(conn) -> Dict[str, Any]: if extra_vectors: results['issues'].append(f"Non-existing: {extra_vectors}") - if not missing and not extra and count > 0 and not missing_vectors and not extra_vectors: + # Verify analysis values: dimensions and rows must match reality. + values_ok = True + cur.execute(""" + SELECT table_name, column_name, dimensions, rows + FROM vector_analysis_columns + ORDER BY table_name, column_name; + """) + for tbl, col, dims, rows in cur.fetchall(): + if dims != 1536: + results['issues'].append(f"{tbl}.{col}: dimensions={dims}, expected 1536") + values_ok = False + if (tbl, col) in actual_vector_columns: + cur.execute(f'SELECT COUNT(*) FROM "{tbl}"') + actual_rows = cur.fetchone()[0] + if rows != actual_rows: + results['issues'].append(f"{tbl}.{col}: rows={rows}, expected {actual_rows}") + values_ok = False + + if not missing and not extra and count > 0 and not missing_vectors and not extra_vectors and values_ok: results['passed'] = True except psycopg2.Error as e: @@ -167,7 +185,36 @@ def verify_vector_analysis_storage_consumption(conn) -> Dict[str, Any]: if extra_tables: results['issues'].append(f"Agent analyzed non-vector tables: {extra_tables}") - if not missing and not extra and count > 0 and not missing_tables and not extra_tables: + # Verify analysis values for each row: size/byte/pct/count sanity. + values_ok = True + cur.execute(""" + SELECT table_name, total_size_bytes, vector_data_bytes, + regular_data_bytes, vector_storage_pct, row_count + FROM vector_analysis_storage_consumption + ORDER BY table_name; + """) + for tbl, total_b, vec_b, reg_b, pct, row_cnt in cur.fetchall(): + if tbl not in actual_vector_tables: + continue + cur.execute(f'SELECT COUNT(*) FROM "{tbl}"') + actual_rows = cur.fetchone()[0] + if row_cnt != actual_rows: + results['issues'].append(f"{tbl}: row_count={row_cnt}, expected {actual_rows}") + values_ok = False + if total_b is None or total_b <= 0: + results['issues'].append(f"{tbl}: total_size_bytes={total_b}, expected > 0") + values_ok = False + if vec_b is None or vec_b <= 0: + results['issues'].append(f"{tbl}: vector_data_bytes={vec_b}, expected > 0") + values_ok = False + if reg_b is None or reg_b < 0: + results['issues'].append(f"{tbl}: regular_data_bytes={reg_b}, expected >= 0") + values_ok = False + if pct is None or not (0 <= pct <= 100): + results['issues'].append(f"{tbl}: vector_storage_pct={pct}, expected within [0, 100]") + values_ok = False + + if not missing and not extra and count > 0 and not missing_tables and not extra_tables and values_ok: results['passed'] = True except psycopg2.Error as e: @@ -240,7 +287,28 @@ def verify_vector_analysis_indices(conn) -> Dict[str, Any]: # Allow agent to find more indexes than just vector ones (they might include related indexes) # but at least they should find the vector-specific ones - if not missing and not extra and count > 0 and not missing_indexes: + # Verify analysis values: column_name, index_type, and index_size_bytes. + values_ok = True + actual_index_names = {ix for _s, _t, ix in actual_vector_indexes} + cur.execute(""" + SELECT index_name, column_name, index_type, index_size_bytes + FROM vector_analysis_indices + ORDER BY table_name, index_name; + """) + for idx_name, col_name, idx_type, idx_size in cur.fetchall(): + if idx_name not in actual_index_names: + continue # skip extra/unrelated indexes the agent may have added + if col_name != 'embedding': + results['issues'].append(f"{idx_name}: column_name={col_name!r}, expected 'embedding'") + values_ok = False + if (idx_type or '').lower() not in ('hnsw', 'ivfflat'): + results['issues'].append(f"{idx_name}: index_type={idx_type!r}, expected 'hnsw' or 'ivfflat'") + values_ok = False + if idx_size is None or idx_size <= 0: + results['issues'].append(f"{idx_name}: index_size_bytes={idx_size}, expected > 0") + values_ok = False + + if not missing and not extra and count > 0 and not missing_indexes and values_ok: results['passed'] = True except psycopg2.Error as e: @@ -315,8 +383,6 @@ def main(): print(f"Results: {passed_checks}/{total_checks} checks passed") if passed_checks == total_checks: sys.exit(0) - elif passed_checks >= total_checks * 0.75: - sys.exit(0) else: sys.exit(1) except psycopg2.Error as e: diff --git a/tasks/postgres/standard/vectors/vectors_setup.py b/tasks/postgres/standard/vectors/vectors_setup.py deleted file mode 100644 index c28dcf10..00000000 --- a/tasks/postgres/standard/vectors/vectors_setup.py +++ /dev/null @@ -1,500 +0,0 @@ -""" -Shared Vector Database Setup Utilities - -This module provides utilities for setting up a complete PostgreSQL database -with pgvector extension and sample RAG-related tables with vector data. -Used by all vector database tasks. -""" - -import os -import logging -import psycopg2 -import json -import random -import numpy as np -from typing import List - -logger = logging.getLogger(__name__) - -def get_connection_params(): - """Get database connection parameters from environment variables.""" - return { - 'host': os.getenv('POSTGRES_HOST', 'localhost'), - 'port': os.getenv('POSTGRES_PORT', '5432'), - 'user': os.getenv('POSTGRES_USERNAME', 'postgres'), - 'password': os.getenv('POSTGRES_PASSWORD', 'password'), - 'database': os.getenv('POSTGRES_DATABASE', 'postgres') - } - - -def generate_mock_embedding(dimensions: int = 1536) -> List[float]: - """Generate a mock embedding vector with specified dimensions.""" - # Generate random values between -1 and 1, then normalize - vector = np.random.uniform(-1, 1, dimensions) - # Normalize to unit vector (common practice for embeddings) - norm = np.linalg.norm(vector) - if norm > 0: - vector = vector / norm - return vector.tolist() - - -def create_vector_extension(): - """Create the pgvector extension.""" - conn_params = get_connection_params() - - try: - conn = psycopg2.connect(**conn_params) - conn.autocommit = True - - with conn.cursor() as cur: - logger.info("Creating pgvector extension...") - cur.execute("CREATE EXTENSION IF NOT EXISTS vector;") - logger.info("pgvector extension created successfully") - - conn.close() - - except psycopg2.Error as e: - logger.error(f"Failed to create pgvector extension: {e}") - raise - - -def create_vector_tables(): - """Create sample tables with vector columns for RAG applications.""" - conn_params = get_connection_params() - - try: - conn = psycopg2.connect(**conn_params) - conn.autocommit = True - - with conn.cursor() as cur: - logger.info("Creating vector database tables...") - - # Create documents table for document embeddings - cur.execute(""" - CREATE TABLE IF NOT EXISTS documents ( - id SERIAL PRIMARY KEY, - title TEXT NOT NULL, - content TEXT NOT NULL, - source_url TEXT, - document_type VARCHAR(50) DEFAULT 'article', - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - word_count INTEGER, - embedding vector(1536) - ); - """) - - # Create chunks table for document chunks (common in RAG) - cur.execute(""" - CREATE TABLE IF NOT EXISTS document_chunks ( - id SERIAL PRIMARY KEY, - document_id INTEGER REFERENCES documents(id) ON DELETE CASCADE, - chunk_index INTEGER NOT NULL, - chunk_text TEXT NOT NULL, - chunk_size INTEGER, - overlap_size INTEGER DEFAULT 0, - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - embedding vector(1536) - ); - """) - - # Create queries table for storing user queries and their embeddings - cur.execute(""" - CREATE TABLE IF NOT EXISTS user_queries ( - id SERIAL PRIMARY KEY, - query_text TEXT NOT NULL, - user_id VARCHAR(100), - session_id VARCHAR(100), - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - response_time_ms INTEGER, - embedding vector(1536) - ); - """) - - # Create embeddings metadata table - cur.execute(""" - CREATE TABLE IF NOT EXISTS embedding_models ( - id SERIAL PRIMARY KEY, - model_name VARCHAR(100) NOT NULL UNIQUE, - provider VARCHAR(50) NOT NULL, - dimensions INTEGER NOT NULL, - max_tokens INTEGER, - cost_per_token DECIMAL(10, 8), - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - is_active BOOLEAN DEFAULT TRUE - ); - """) - - # Create knowledge base table - cur.execute(""" - CREATE TABLE IF NOT EXISTS knowledge_base ( - id SERIAL PRIMARY KEY, - kb_name VARCHAR(100) NOT NULL, - description TEXT, - domain VARCHAR(50), - language VARCHAR(10) DEFAULT 'en', - total_documents INTEGER DEFAULT 0, - total_chunks INTEGER DEFAULT 0, - total_storage_mb DECIMAL(10, 2), - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP - ); - """) - - # Create similarity search results cache - cur.execute(""" - CREATE TABLE IF NOT EXISTS search_cache ( - id SERIAL PRIMARY KEY, - query_hash VARCHAR(64) NOT NULL, - query_text TEXT NOT NULL, - results_json JSONB, - result_count INTEGER, - search_time_ms INTEGER, - similarity_threshold DECIMAL(4, 3), - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - expires_at TIMESTAMP - ); - """) - - logger.info("Vector database tables created successfully") - - conn.close() - - except psycopg2.Error as e: - logger.error(f"Failed to create vector tables: {e}") - raise - - -def create_vector_indexes(): - """Create indexes for vector columns and other frequently queried fields.""" - conn_params = get_connection_params() - - try: - conn = psycopg2.connect(**conn_params) - conn.autocommit = True - - with conn.cursor() as cur: - logger.info("Creating vector indexes...") - - # Vector indexes using HNSW (Hierarchical Navigable Small World) - indexes = [ - ("documents_embedding_idx", "documents", "embedding", "hnsw"), - ("chunks_embedding_idx", "document_chunks", "embedding", "hnsw"), - ("queries_embedding_idx", "user_queries", "embedding", "hnsw"), - ] - - for idx_name, table_name, column_name, method in indexes: - try: - if method == "hnsw": - cur.execute(f""" - CREATE INDEX IF NOT EXISTS {idx_name} - ON {table_name} USING hnsw ({column_name} vector_cosine_ops); - """) - else: - cur.execute(f""" - CREATE INDEX IF NOT EXISTS {idx_name} - ON {table_name} USING ivfflat ({column_name} vector_cosine_ops) WITH (lists = 100); - """) - logger.info(f"Created index {idx_name} on {table_name}") - except psycopg2.Error as e: - logger.warning(f"Could not create {method} index {idx_name}: {e}") - # Try with IVFFlat as fallback - if method == "hnsw": - try: - cur.execute(f""" - CREATE INDEX IF NOT EXISTS {idx_name}_ivf - ON {table_name} USING ivfflat ({column_name} vector_cosine_ops) WITH (lists = 100); - """) - logger.info(f"Created fallback IVFFlat index {idx_name}_ivf on {table_name}") - except psycopg2.Error as e2: - logger.warning(f"Could not create fallback index: {e2}") - - # Regular indexes for performance - regular_indexes = [ - ("documents_title_idx", "documents", "title"), - ("documents_type_idx", "documents", "document_type"), - ("documents_created_idx", "documents", "created_at"), - ("chunks_doc_id_idx", "document_chunks", "document_id"), - ("chunks_index_idx", "document_chunks", "chunk_index"), - ("queries_user_idx", "user_queries", "user_id"), - ("queries_created_idx", "user_queries", "created_at"), - ("cache_hash_idx", "search_cache", "query_hash"), - ("cache_expires_idx", "search_cache", "expires_at"), - ] - - for idx_name, table_name, column_name in regular_indexes: - try: - cur.execute(f"CREATE INDEX IF NOT EXISTS {idx_name} ON {table_name} ({column_name});") - logger.debug(f"Created regular index {idx_name}") - except psycopg2.Error as e: - logger.warning(f"Could not create regular index {idx_name}: {e}") - - logger.info("Vector indexes created successfully") - - conn.close() - - except psycopg2.Error as e: - logger.error(f"Failed to create vector indexes: {e}") - raise - - -def insert_sample_data(): - """Insert sample data into vector tables.""" - conn_params = get_connection_params() - - try: - conn = psycopg2.connect(**conn_params) - conn.autocommit = True - - with conn.cursor() as cur: - logger.info("Inserting sample data...") - - # Insert embedding models - embedding_models = [ - ('text-embedding-3-small', 'OpenAI', 1536, 8192, 0.00000002, True), - ('text-embedding-3-large', 'OpenAI', 3072, 8192, 0.00000013, True), - ('text-embedding-ada-002', 'OpenAI', 1536, 8192, 0.00000010, False), - ('all-MiniLM-L6-v2', 'Sentence-Transformers', 384, 512, 0.0, True), - ('all-mpnet-base-v2', 'Sentence-Transformers', 768, 514, 0.0, True), - ] - - for model_data in embedding_models: - cur.execute(""" - INSERT INTO embedding_models (model_name, provider, dimensions, max_tokens, cost_per_token, is_active) - VALUES (%s, %s, %s, %s, %s, %s) - ON CONFLICT (model_name) DO NOTHING; - """, model_data) - - # Insert knowledge bases - knowledge_bases = [ - ('Technical Documentation', 'Software engineering and API documentation', 'technology'), - ('Research Papers', 'Academic papers and research publications', 'research'), - ('Customer Support', 'FAQ and troubleshooting guides', 'support'), - ('Product Catalog', 'Product descriptions and specifications', 'commerce'), - ('Legal Documents', 'Contracts, policies, and legal texts', 'legal'), - ] - - kb_ids = [] - for kb_data in knowledge_bases: - cur.execute(""" - INSERT INTO knowledge_base (kb_name, description, domain, total_documents, total_chunks, total_storage_mb) - VALUES (%s, %s, %s, %s, %s, %s) - RETURNING id; - """, kb_data + (random.randint(50, 500), random.randint(200, 2000), round(random.uniform(10.5, 250.8), 2))) - kb_ids.append(cur.fetchone()[0]) - - # Insert sample documents - sample_documents = [ - ("PostgreSQL Performance Tuning", "Comprehensive guide to optimizing PostgreSQL database performance including indexing strategies, query optimization, and configuration tuning.", "https://example.com/pg-performance", "technical_guide"), - ("Vector Similarity Search", "Understanding vector embeddings and similarity search algorithms for AI applications and recommendation systems.", "https://example.com/vector-search", "technical_guide"), - ("RAG Implementation Best Practices", "Best practices for implementing Retrieval-Augmented Generation systems using vector databases and large language models.", "https://example.com/rag-practices", "best_practices"), - ("Database Security Guidelines", "Security considerations and implementation guidelines for PostgreSQL databases in production environments.", "https://example.com/db-security", "security_guide"), - ("Machine Learning with SQL", "Integrating machine learning workflows with SQL databases and leveraging database extensions for AI applications.", "https://example.com/ml-sql", "tutorial"), - ("API Documentation Standards", "Standards and best practices for creating comprehensive and user-friendly API documentation.", "https://example.com/api-docs", "documentation"), - ("Microservices Architecture", "Design patterns and implementation strategies for microservices architecture in modern applications.", "https://example.com/microservices", "architecture_guide"), - ("Data Pipeline Optimization", "Optimizing data processing pipelines for scalability, reliability, and performance in enterprise environments.", "https://example.com/data-pipelines", "optimization_guide"), - ("Cloud Database Migration", "Step-by-step guide for migrating on-premises databases to cloud infrastructure with minimal downtime.", "https://example.com/cloud-migration", "migration_guide"), - ("NoSQL vs SQL Comparison", "Detailed comparison of NoSQL and SQL databases, including use cases, performance characteristics, and selection criteria.", "https://example.com/nosql-sql", "comparison_guide"), - ] - - doc_ids = [] - for title, content, url, doc_type in sample_documents: - embedding = generate_mock_embedding(1536) - word_count = len(content.split()) - - cur.execute(""" - INSERT INTO documents (title, content, source_url, document_type, word_count, embedding) - VALUES (%s, %s, %s, %s, %s, %s) - RETURNING id; - """, (title, content, url, doc_type, word_count, embedding)) - doc_ids.append(cur.fetchone()[0]) - - # Insert document chunks - chunk_count = 0 - for doc_id in doc_ids: - # Generate 3-7 chunks per document - num_chunks = random.randint(3, 7) - for chunk_idx in range(num_chunks): - chunk_text = f"This is chunk {chunk_idx + 1} of document {doc_id}. " + \ - "It contains relevant information that would be useful for similarity search and RAG applications. " + \ - "The content includes technical details, examples, and best practices." - chunk_size = len(chunk_text) - overlap_size = random.randint(20, 50) if chunk_idx > 0 else 0 - embedding = generate_mock_embedding(1536) - - cur.execute(""" - INSERT INTO document_chunks (document_id, chunk_index, chunk_text, chunk_size, overlap_size, embedding) - VALUES (%s, %s, %s, %s, %s, %s); - """, (doc_id, chunk_idx, chunk_text, chunk_size, overlap_size, embedding)) - chunk_count += 1 - - # Insert sample user queries - sample_queries = [ - ("How to optimize PostgreSQL performance?", "user123", "session_abc1"), - ("What are vector embeddings?", "user456", "session_def2"), - ("Best practices for RAG implementation", "user789", "session_ghi3"), - ("Database security checklist", "user123", "session_abc2"), - ("Machine learning with databases", "user456", "session_def3"), - ("API documentation examples", "user321", "session_jkl1"), - ("Microservices design patterns", "user654", "session_mno2"), - ("Data pipeline best practices", "user987", "session_pqr3"), - ("Cloud migration strategies", "user111", "session_stu4"), - ("NoSQL vs SQL databases", "user222", "session_vwx5"), - ] - - for query_text, user_id, session_id in sample_queries: - embedding = generate_mock_embedding(1536) - response_time = random.randint(50, 500) - - cur.execute(""" - INSERT INTO user_queries (query_text, user_id, session_id, response_time_ms, embedding) - VALUES (%s, %s, %s, %s, %s); - """, (query_text, user_id, session_id, response_time, embedding)) - - # Insert some search cache entries - for i in range(5): - query_hash = f"hash_{random.randint(100000, 999999)}" - query_text = f"Sample cached query {i + 1}" - results = [{"doc_id": random.randint(1, len(doc_ids)), "similarity": round(random.uniform(0.7, 0.95), 3)} for _ in range(3)] - result_count = len(results) - search_time = random.randint(10, 100) - threshold = round(random.uniform(0.6, 0.8), 3) - - cur.execute(""" - INSERT INTO search_cache (query_hash, query_text, results_json, result_count, search_time_ms, similarity_threshold) - VALUES (%s, %s, %s, %s, %s, %s); - """, (query_hash, query_text, json.dumps(results), result_count, search_time, threshold)) - - logger.info(f"Sample data inserted successfully:") - logger.info(f" {len(sample_documents)} documents") - logger.info(f" {chunk_count} document chunks") - logger.info(f" {len(sample_queries)} user queries") - logger.info(f" {len(embedding_models)} embedding models") - logger.info(f" {len(knowledge_bases)} knowledge bases") - - conn.close() - - except psycopg2.Error as e: - logger.error(f"Failed to insert sample data: {e}") - raise - - -def verify_vector_setup(): - """Verify that the vector database was set up correctly.""" - conn_params = get_connection_params() - - try: - conn = psycopg2.connect(**conn_params) - - with conn.cursor() as cur: - logger.info("Verifying vector database setup...") - - # Check extension - cur.execute("SELECT extname FROM pg_extension WHERE extname = 'vector';") - if cur.fetchone(): - logger.info("pgvector extension is installed") - else: - logger.error("pgvector extension not found") - return False - - # Check tables and record counts - tables_to_check = [ - 'documents', 'document_chunks', 'user_queries', - 'embedding_models', 'knowledge_base', 'search_cache' - ] - - table_counts = {} - for table in tables_to_check: - cur.execute(f'SELECT COUNT(*) FROM {table}') - count = cur.fetchone()[0] - table_counts[table] = count - logger.info(f"Table {table}: {count} records") - - # Check vector columns - cur.execute(""" - SELECT table_name, column_name, data_type - FROM information_schema.columns - WHERE data_type = 'USER-DEFINED' - AND udt_name = 'vector' - ORDER BY table_name, column_name; - """) - - vector_columns = cur.fetchall() - logger.info(f"Found {len(vector_columns)} vector columns:") - for table, column, dtype in vector_columns: - logger.info(f" {table}.{column} ({dtype})") - - # Check indexes - cur.execute(""" - SELECT schemaname, tablename, indexname, indexdef - FROM pg_indexes - WHERE indexdef LIKE '%vector%' OR indexdef LIKE '%hnsw%' OR indexdef LIKE '%ivfflat%' - ORDER BY tablename, indexname; - """) - - vector_indexes = cur.fetchall() - logger.info(f"Found {len(vector_indexes)} vector indexes:") - for schema, table, index, definition in vector_indexes: - logger.info(f" {index} on {table}") - - # Test a simple vector similarity query - mock_embedding = generate_mock_embedding(1536) - cur.execute(""" - SELECT id, title, embedding <-> %s::vector as distance - FROM documents - ORDER BY embedding <-> %s::vector - LIMIT 3; - """, (mock_embedding, mock_embedding)) - - results = cur.fetchall() - logger.info(f"Vector similarity query returned {len(results)} results") - - conn.close() - logger.info("Vector database verification completed successfully") - return table_counts, vector_columns, vector_indexes - - except psycopg2.Error as e: - logger.error(f"Verification failed: {e}") - raise - - -def prepare_vector_environment(): - """Main function to prepare the vector database environment.""" - logger.info("Preparing vector database environment...") - - try: - # Create pgvector extension - create_vector_extension() - - # Create vector tables - create_vector_tables() - - # Insert sample data first - insert_sample_data() - - # Create indexes after data insertion for better performance - create_vector_indexes() - - # Verify the setup - table_counts, vector_columns, vector_indexes = verify_vector_setup() - - logger.info("Vector database environment prepared successfully!") - logger.info(f"Total tables created: {len(table_counts)}") - logger.info(f"Total vector columns: {len(vector_columns)}") - logger.info(f"Total vector indexes: {len(vector_indexes)}") - - return { - 'table_counts': table_counts, - 'vector_columns': vector_columns, - 'vector_indexes': vector_indexes - } - - except Exception as e: - logger.error(f"Failed to prepare vector environment: {e}") - raise - - -if __name__ == "__main__": - # Allow running this module directly for testing - logging.basicConfig(level=logging.INFO) - prepare_vector_environment()