1717import pandas as pd
1818import numpy as np
1919from sklearn .model_selection import train_test_split , cross_val_score
20- from sklearn .preprocessing import LabelEncoder , StandardScaler
20+ from sklearn .preprocessing import LabelEncoder
2121from sklearn .metrics import (
2222 accuracy_score , precision_score , recall_score , f1_score ,
23- roc_auc_score , confusion_matrix , classification_report ,
24- mean_squared_error , mean_absolute_error , r2_score
23+ roc_auc_score , confusion_matrix , mean_squared_error , mean_absolute_error , r2_score
2524)
2625from sklearn .ensemble import RandomForestClassifier , RandomForestRegressor
2726import xgboost as xgb
@@ -157,7 +156,7 @@ def assign_credential_type(row):
157156
158157df ['target_credential_type' ] = df .apply (assign_credential_type , axis = 1 )
159158
160- print (f "Created target variables:" )
159+ print ("Created target variables:" )
161160print (f" - Retention: { df ['target_retention' ].value_counts ().to_dict ()} " )
162161print (f" - At Risk: { df ['target_at_risk' ].value_counts ().to_dict ()} " )
163162print (f" - Credential Type: { df ['target_credential_type' ].value_counts ().to_dict ()} " )
@@ -264,8 +263,8 @@ def preprocess_features(df, feature_list):
264263print ("TESTING MULTIPLE MODELS WITH CROSS-VALIDATION" )
265264print ("-" * 80 )
266265
267- from sklearn .linear_model import LogisticRegression
268- from sklearn .model_selection import StratifiedKFold
266+ from sklearn .linear_model import LogisticRegression # noqa: E402
267+ from sklearn .model_selection import StratifiedKFold # noqa: E402
269268
270269models_to_test = {
271270 'Logistic Regression' : LogisticRegression (
@@ -328,11 +327,11 @@ def preprocess_features(df, feature_list):
328327 print (f" Gap: { gap :.4f} ({ gap * 100 :.2f} %)" )
329328
330329 if gap < 0.05 :
331- print (f " ✓ No overfitting (gap < 5%)" )
330+ print (" ✓ No overfitting (gap < 5%)" )
332331 elif gap < 0.10 :
333- print (f " ⚠ Minimal overfitting (gap < 10%)" )
332+ print (" ⚠ Minimal overfitting (gap < 10%)" )
334333 else :
335- print (f " ✗ Overfitting detected (gap > 10%)" )
334+ print (" ✗ Overfitting detected (gap > 10%)" )
336335
337336 model_comparison .append ({
338337 'Model' : model_name ,
@@ -392,8 +391,8 @@ def preprocess_features(df, feature_list):
392391
393392print ("\n Confusion Matrix:" )
394393cm = confusion_matrix (y_test , y_pred )
395- print (f " Predicted" )
396- print (f " Not Ret Retained" )
394+ print (" Predicted" )
395+ print (" Not Ret Retained" )
397396print (f"Actual Not { cm [0 ,0 ]:6d} { cm [0 ,1 ]:6d} " )
398397print (f" Ret { cm [1 ,0 ]:6d} { cm [1 ,1 ]:6d} " )
399398
@@ -515,8 +514,8 @@ def assign_alert_level(risk_score):
515514low_retention_low_risk = df [(df ['retention_probability' ] < 0.3 ) & (df ['at_risk_alert' ] == 'LOW' )]
516515print (f"Students with <30% retention flagged as LOW: { len (low_retention_low_risk )} (should be very few)" )
517516
518- print (f "\n Early warning system aligned with retention predictions" )
519- print (f "\n Alert distribution:" )
517+ print ("\n Early warning system aligned with retention predictions" )
518+ print ("\n Alert distribution:" )
520519print (df ['at_risk_alert' ].value_counts ().sort_index ())
521520
522521# ============================================================================
@@ -572,7 +571,7 @@ def assign_alert_level(risk_score):
572571 df ['predicted_time_to_credential' ] = time_model .predict (X_full_retention )
573572 df ['predicted_graduation_year' ] = df ['Cohort' ].str [:4 ].astype (float ) + df ['predicted_time_to_credential' ]
574573
575- print (f "Time predictions generated" )
574+ print ("Time predictions generated" )
576575else :
577576 print ("Warning: Insufficient data for time-to-credential model" )
578577 df ['predicted_time_to_credential' ] = np .nan
@@ -591,7 +590,7 @@ def assign_alert_level(risk_score):
591590y_credential = y_credential [valid_idx ]
592591
593592print (f"\n Dataset size: { len (X_cred ):,} students" )
594- print (f "Credential type distribution:" )
593+ print ("Credential type distribution:" )
595594cred_labels = {0 : 'No Credential' , 1 : 'Certificate' , 2 : 'Associate' , 3 : 'Bachelor' }
596595for k , v in y_credential .value_counts ().sort_index ().items ():
597596 print (f" { cred_labels .get (k , k )} : { v :,} ({ v / len (y_credential )* 100 :.1f} %)" )
@@ -651,7 +650,7 @@ def assign_alert_level(risk_score):
651650 if class_idx < len (prob_labels ):
652651 df [prob_labels [int (class_idx )]] = proba [:, i ]
653652
654- print (f "Credential type predictions generated" )
653+ print ("Credential type predictions generated" )
655654
656655# ============================================================================
657656# STEP 8: MODEL 5 - GATEWAY MATH SUCCESS PREDICTION
@@ -736,8 +735,8 @@ def assign_alert_level(risk_score):
736735
737736print ("\n Confusion Matrix:" )
738737cm = confusion_matrix (y_test , y_pred )
739- print (f " Predicted" )
740- print (f " No Pass Pass" )
738+ print (" Predicted" )
739+ print (" No Pass Pass" )
741740print (f"Actual No { cm [0 ,0 ]:6d} { cm [0 ,1 ]:6d} " )
742741print (f" Pass { cm [1 ,0 ]:6d} { cm [1 ,1 ]:6d} " )
743742
@@ -753,7 +752,7 @@ def assign_alert_level(risk_score):
753752 labels = ['High Risk' , 'Moderate Risk' , 'Likely Pass' , 'Very Likely Pass' ]
754753)
755754
756- print (f "Gateway math predictions generated" )
755+ print ("Gateway math predictions generated" )
757756
758757# ============================================================================
759758# STEP 9: MODEL 6 - GATEWAY ENGLISH SUCCESS PREDICTION
@@ -838,8 +837,8 @@ def assign_alert_level(risk_score):
838837
839838print ("\n Confusion Matrix:" )
840839cm = confusion_matrix (y_test , y_pred )
841- print (f " Predicted" )
842- print (f " No Pass Pass" )
840+ print (" Predicted" )
841+ print (" No Pass Pass" )
843842print (f"Actual No { cm [0 ,0 ]:6d} { cm [0 ,1 ]:6d} " )
844843print (f" Pass { cm [1 ,0 ]:6d} { cm [1 ,1 ]:6d} " )
845844
@@ -855,7 +854,7 @@ def assign_alert_level(risk_score):
855854 labels = ['High Risk' , 'Moderate Risk' , 'Likely Pass' , 'Very Likely Pass' ]
856855)
857856
858- print (f "Gateway English predictions generated" )
857+ print ("Gateway English predictions generated" )
859858
860859# ============================================================================
861860# STEP 10: MODEL 7 - FIRST-SEMESTER GPA < 2.0 PREDICTION
@@ -943,8 +942,8 @@ def assign_alert_level(risk_score):
943942
944943print ("\n Confusion Matrix:" )
945944cm = confusion_matrix (y_test , y_pred )
946- print (f " Predicted" )
947- print (f " GPA>=2.0 GPA<2.0" )
945+ print (" Predicted" )
946+ print (" GPA>=2.0 GPA<2.0" )
948947print (f"Actual >=2.0 { cm [0 ,0 ]:6d} { cm [0 ,1 ]:6d} " )
949948print (f" <2.0 { cm [1 ,0 ]:6d} { cm [1 ,1 ]:6d} " )
950949
@@ -958,7 +957,7 @@ def assign_alert_level(risk_score):
958957 labels = ['Low Risk' , 'Moderate Risk' , 'High Risk' , 'Critical Risk' ]
959958)
960959
961- print (f "Low GPA predictions generated" )
960+ print ("Low GPA predictions generated" )
962961
963962# ============================================================================
964963# STEP 11: MODEL 8 - GPA PREDICTION (CONTINUOUS)
@@ -1027,9 +1026,9 @@ def assign_alert_level(risk_score):
10271026 axis = 1
10281027)
10291028
1030- print (f "GPA predictions generated" )
1029+ print ("GPA predictions generated" )
10311030print (f"Mean predicted GPA: { df ['predicted_gpa' ].mean ():.2f} " )
1032- print (f "\n Performance Distribution:" )
1031+ print ("\n Performance Distribution:" )
10331032print (df ['gpa_performance' ].value_counts ().to_string ())
10341033
10351034# ============================================================================
@@ -1042,7 +1041,7 @@ def assign_alert_level(risk_score):
10421041# Save student-level predictions with all columns
10431042output_file = os .path .join (DATA_DIR , 'bishop_state_student_level_with_predictions.csv' )
10441043df .to_csv (output_file , index = False )
1045- print (f "\n ✓ Saved student-level predictions to CSV:" )
1044+ print ("\n ✓ Saved student-level predictions to CSV:" )
10461045print (f" File: { output_file } " )
10471046print (f" Records: { len (df ):,} " )
10481047print (f" Columns: { len (df .columns )} " )
@@ -1093,7 +1092,7 @@ def assign_alert_level(risk_score):
10931092# Save course-level predictions
10941093output_file = os .path .join (DATA_DIR , 'bishop_state_merged_with_predictions.csv' )
10951094merged_with_predictions .to_csv (output_file , index = False )
1096- print (f "\n ✓ Saved course-level predictions to CSV:" )
1095+ print ("\n ✓ Saved course-level predictions to CSV:" )
10971096print (f" File: { output_file } " )
10981097print (f" Records: { len (merged_with_predictions ):,} " )
10991098print (f" Columns: { len (merged_with_predictions .columns )} " )
@@ -1133,7 +1132,7 @@ def assign_alert_level(risk_score):
11331132 pct = count / len (df ) * 100
11341133 summary_report += f" { cat :20s} { count :6,} ({ pct :5.1f} %)\n "
11351134
1136- summary_report += f """
1135+ summary_report += """
113711362. EARLY WARNING SYSTEM
11381137 Algorithm: Composite Risk Score (Retention + Performance Metrics)
11391138 Approach: Aligned with retention predictions to eliminate contradictions
0 commit comments