chore: merge main into epic — keep Bishop State/Postgres versions, accept LICENSE and csv_only updates from main

William-Hill · William-Hill · commit 47253f624376 · 2026-02-22T01:55:41.000-05:00
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 /dev/color
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/ai_model/complete_ml_pipeline_csv_only.py b/ai_model/complete_ml_pipeline_csv_only.py
@@ -17,11 +17,10 @@
 import pandas as pd
 import numpy as np
 from sklearn.model_selection import train_test_split, cross_val_score
-from sklearn.preprocessing import LabelEncoder, StandardScaler
+from sklearn.preprocessing import LabelEncoder
 from sklearn.metrics import (
     accuracy_score, precision_score, recall_score, f1_score, 
-    roc_auc_score, confusion_matrix, classification_report,
-    mean_squared_error, mean_absolute_error, r2_score
+    roc_auc_score, confusion_matrix, mean_squared_error, mean_absolute_error, r2_score
 )
 from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
 import xgboost as xgb
@@ -157,7 +156,7 @@ def assign_credential_type(row):
 
 df['target_credential_type'] = df.apply(assign_credential_type, axis=1)
 
-print(f"Created target variables:")
+print("Created target variables:")
 print(f"  - Retention: {df['target_retention'].value_counts().to_dict()}")
 print(f"  - At Risk: {df['target_at_risk'].value_counts().to_dict()}")
 print(f"  - Credential Type: {df['target_credential_type'].value_counts().to_dict()}")
@@ -264,8 +263,8 @@ def preprocess_features(df, feature_list):
 print("TESTING MULTIPLE MODELS WITH CROSS-VALIDATION")
 print("-" * 80)
 
-from sklearn.linear_model import LogisticRegression
-from sklearn.model_selection import StratifiedKFold
+from sklearn.linear_model import LogisticRegression  # noqa: E402
+from sklearn.model_selection import StratifiedKFold  # noqa: E402
 
 models_to_test = {
     'Logistic Regression': LogisticRegression(
@@ -328,11 +327,11 @@ def preprocess_features(df, feature_list):
     print(f"  Gap:          {gap:.4f} ({gap*100:.2f}%)")
     
     if gap < 0.05:
-        print(f"  ✓ No overfitting (gap < 5%)")
+        print("  ✓ No overfitting (gap < 5%)")
     elif gap < 0.10:
-        print(f"  ⚠ Minimal overfitting (gap < 10%)")
+        print("  ⚠ Minimal overfitting (gap < 10%)")
     else:
-        print(f"  ✗ Overfitting detected (gap > 10%)")
+        print("  ✗ Overfitting detected (gap > 10%)")
     
     model_comparison.append({
         'Model': model_name,
@@ -392,8 +391,8 @@ def preprocess_features(df, feature_list):
 
 print("\nConfusion Matrix:")
 cm = confusion_matrix(y_test, y_pred)
-print(f"                Predicted")
-print(f"              Not Ret  Retained")
+print("                Predicted")
+print("              Not Ret  Retained")
 print(f"Actual Not    {cm[0,0]:6d}    {cm[0,1]:6d}")
 print(f"       Ret    {cm[1,0]:6d}    {cm[1,1]:6d}")
 
@@ -515,8 +514,8 @@ def assign_alert_level(risk_score):
 low_retention_low_risk = df[(df['retention_probability'] < 0.3) & (df['at_risk_alert'] == 'LOW')]
 print(f"Students with <30% retention flagged as LOW: {len(low_retention_low_risk)} (should be very few)")
 
-print(f"\nEarly warning system aligned with retention predictions")
-print(f"\nAlert distribution:")
+print("\nEarly warning system aligned with retention predictions")
+print("\nAlert distribution:")
 print(df['at_risk_alert'].value_counts().sort_index())
 
 # ============================================================================
@@ -572,7 +571,7 @@ def assign_alert_level(risk_score):
     df['predicted_time_to_credential'] = time_model.predict(X_full_retention)
     df['predicted_graduation_year'] = df['Cohort'].str[:4].astype(float) + df['predicted_time_to_credential']
     
-    print(f"Time predictions generated")
+    print("Time predictions generated")
 else:
     print("Warning: Insufficient data for time-to-credential model")
     df['predicted_time_to_credential'] = np.nan
@@ -591,7 +590,7 @@ def assign_alert_level(risk_score):
 y_credential = y_credential[valid_idx]
 
 print(f"\nDataset size: {len(X_cred):,} students")
-print(f"Credential type distribution:")
+print("Credential type distribution:")
 cred_labels = {0: 'No Credential', 1: 'Certificate', 2: 'Associate', 3: 'Bachelor'}
 for k, v in y_credential.value_counts().sort_index().items():
     print(f"  {cred_labels.get(k, k)}: {v:,} ({v/len(y_credential)*100:.1f}%)")
@@ -651,7 +650,7 @@ def assign_alert_level(risk_score):
     if class_idx < len(prob_labels):
         df[prob_labels[int(class_idx)]] = proba[:, i]
 
-print(f"Credential type predictions generated")
+print("Credential type predictions generated")
 
 # ============================================================================
 # STEP 8: MODEL 5 - GATEWAY MATH SUCCESS PREDICTION
@@ -736,8 +735,8 @@ def assign_alert_level(risk_score):
 
 print("\nConfusion Matrix:")
 cm = confusion_matrix(y_test, y_pred)
-print(f"                Predicted")
-print(f"              No Pass    Pass")
+print("                Predicted")
+print("              No Pass    Pass")
 print(f"Actual No     {cm[0,0]:6d}    {cm[0,1]:6d}")
 print(f"       Pass   {cm[1,0]:6d}    {cm[1,1]:6d}")
 
@@ -753,7 +752,7 @@ def assign_alert_level(risk_score):
     labels=['High Risk', 'Moderate Risk', 'Likely Pass', 'Very Likely Pass']
 )
 
-print(f"Gateway math predictions generated")
+print("Gateway math predictions generated")
 
 # ============================================================================
 # STEP 9: MODEL 6 - GATEWAY ENGLISH SUCCESS PREDICTION
@@ -838,8 +837,8 @@ def assign_alert_level(risk_score):
 
 print("\nConfusion Matrix:")
 cm = confusion_matrix(y_test, y_pred)
-print(f"                Predicted")
-print(f"              No Pass    Pass")
+print("                Predicted")
+print("              No Pass    Pass")
 print(f"Actual No     {cm[0,0]:6d}    {cm[0,1]:6d}")
 print(f"       Pass   {cm[1,0]:6d}    {cm[1,1]:6d}")
 
@@ -855,7 +854,7 @@ def assign_alert_level(risk_score):
     labels=['High Risk', 'Moderate Risk', 'Likely Pass', 'Very Likely Pass']
 )
 
-print(f"Gateway English predictions generated")
+print("Gateway English predictions generated")
 
 # ============================================================================
 # STEP 10: MODEL 7 - FIRST-SEMESTER GPA < 2.0 PREDICTION
@@ -943,8 +942,8 @@ def assign_alert_level(risk_score):
 
 print("\nConfusion Matrix:")
 cm = confusion_matrix(y_test, y_pred)
-print(f"                Predicted")
-print(f"              GPA>=2.0  GPA<2.0")
+print("                Predicted")
+print("              GPA>=2.0  GPA<2.0")
 print(f"Actual >=2.0  {cm[0,0]:6d}    {cm[0,1]:6d}")
 print(f"       <2.0   {cm[1,0]:6d}    {cm[1,1]:6d}")
 
@@ -958,7 +957,7 @@ def assign_alert_level(risk_score):
     labels=['Low Risk', 'Moderate Risk', 'High Risk', 'Critical Risk']
 )
 
-print(f"Low GPA predictions generated")
+print("Low GPA predictions generated")
 
 # ============================================================================
 # STEP 11: MODEL 8 - GPA PREDICTION (CONTINUOUS)
@@ -1027,9 +1026,9 @@ def assign_alert_level(risk_score):
     axis=1
 )
 
-print(f"GPA predictions generated")
+print("GPA predictions generated")
 print(f"Mean predicted GPA: {df['predicted_gpa'].mean():.2f}")
-print(f"\nPerformance Distribution:")
+print("\nPerformance Distribution:")
 print(df['gpa_performance'].value_counts().to_string())
 
 # ============================================================================
@@ -1042,7 +1041,7 @@ def assign_alert_level(risk_score):
 # Save student-level predictions with all columns
 output_file = os.path.join(DATA_DIR, 'bishop_state_student_level_with_predictions.csv')
 df.to_csv(output_file, index=False)
-print(f"\n✓ Saved student-level predictions to CSV:")
+print("\n✓ Saved student-level predictions to CSV:")
 print(f"  File: {output_file}")
 print(f"  Records: {len(df):,}")
 print(f"  Columns: {len(df.columns)}")
@@ -1093,7 +1092,7 @@ def assign_alert_level(risk_score):
 # Save course-level predictions
 output_file = os.path.join(DATA_DIR, 'bishop_state_merged_with_predictions.csv')
 merged_with_predictions.to_csv(output_file, index=False)
-print(f"\n✓ Saved course-level predictions to CSV:")
+print("\n✓ Saved course-level predictions to CSV:")
 print(f"  File: {output_file}")
 print(f"  Records: {len(merged_with_predictions):,}")
 print(f"  Columns: {len(merged_with_predictions.columns)}")
@@ -1133,7 +1132,7 @@ def assign_alert_level(risk_score):
     pct = count / len(df) * 100
     summary_report += f"     {cat:20s} {count:6,} ({pct:5.1f}%)\n"
 
-summary_report += f"""
+summary_report += """
 2. EARLY WARNING SYSTEM
    Algorithm: Composite Risk Score (Retention + Performance Metrics)
    Approach: Aligned with retention predictions to eliminate contradictions