Skip to content

Commit 47253f6

Browse files
committed
chore: merge main into epic — keep Bishop State/Postgres versions, accept LICENSE and csv_only updates from main
2 parents 31ce6bb + 7087437 commit 47253f6

2 files changed

Lines changed: 50 additions & 30 deletions

File tree

LICENSE

Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,21 @@
1+
MIT License
2+
3+
Copyright (c) 2025 /dev/color
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

ai_model/complete_ml_pipeline_csv_only.py

Lines changed: 29 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -17,11 +17,10 @@
1717
import pandas as pd
1818
import numpy as np
1919
from sklearn.model_selection import train_test_split, cross_val_score
20-
from sklearn.preprocessing import LabelEncoder, StandardScaler
20+
from sklearn.preprocessing import LabelEncoder
2121
from sklearn.metrics import (
2222
accuracy_score, precision_score, recall_score, f1_score,
23-
roc_auc_score, confusion_matrix, classification_report,
24-
mean_squared_error, mean_absolute_error, r2_score
23+
roc_auc_score, confusion_matrix, mean_squared_error, mean_absolute_error, r2_score
2524
)
2625
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
2726
import xgboost as xgb
@@ -157,7 +156,7 @@ def assign_credential_type(row):
157156

158157
df['target_credential_type'] = df.apply(assign_credential_type, axis=1)
159158

160-
print(f"Created target variables:")
159+
print("Created target variables:")
161160
print(f" - Retention: {df['target_retention'].value_counts().to_dict()}")
162161
print(f" - At Risk: {df['target_at_risk'].value_counts().to_dict()}")
163162
print(f" - Credential Type: {df['target_credential_type'].value_counts().to_dict()}")
@@ -264,8 +263,8 @@ def preprocess_features(df, feature_list):
264263
print("TESTING MULTIPLE MODELS WITH CROSS-VALIDATION")
265264
print("-" * 80)
266265

267-
from sklearn.linear_model import LogisticRegression
268-
from sklearn.model_selection import StratifiedKFold
266+
from sklearn.linear_model import LogisticRegression # noqa: E402
267+
from sklearn.model_selection import StratifiedKFold # noqa: E402
269268

270269
models_to_test = {
271270
'Logistic Regression': LogisticRegression(
@@ -328,11 +327,11 @@ def preprocess_features(df, feature_list):
328327
print(f" Gap: {gap:.4f} ({gap*100:.2f}%)")
329328

330329
if gap < 0.05:
331-
print(f" ✓ No overfitting (gap < 5%)")
330+
print(" ✓ No overfitting (gap < 5%)")
332331
elif gap < 0.10:
333-
print(f" ⚠ Minimal overfitting (gap < 10%)")
332+
print(" ⚠ Minimal overfitting (gap < 10%)")
334333
else:
335-
print(f" ✗ Overfitting detected (gap > 10%)")
334+
print(" ✗ Overfitting detected (gap > 10%)")
336335

337336
model_comparison.append({
338337
'Model': model_name,
@@ -392,8 +391,8 @@ def preprocess_features(df, feature_list):
392391

393392
print("\nConfusion Matrix:")
394393
cm = confusion_matrix(y_test, y_pred)
395-
print(f" Predicted")
396-
print(f" Not Ret Retained")
394+
print(" Predicted")
395+
print(" Not Ret Retained")
397396
print(f"Actual Not {cm[0,0]:6d} {cm[0,1]:6d}")
398397
print(f" Ret {cm[1,0]:6d} {cm[1,1]:6d}")
399398

@@ -515,8 +514,8 @@ def assign_alert_level(risk_score):
515514
low_retention_low_risk = df[(df['retention_probability'] < 0.3) & (df['at_risk_alert'] == 'LOW')]
516515
print(f"Students with <30% retention flagged as LOW: {len(low_retention_low_risk)} (should be very few)")
517516

518-
print(f"\nEarly warning system aligned with retention predictions")
519-
print(f"\nAlert distribution:")
517+
print("\nEarly warning system aligned with retention predictions")
518+
print("\nAlert distribution:")
520519
print(df['at_risk_alert'].value_counts().sort_index())
521520

522521
# ============================================================================
@@ -572,7 +571,7 @@ def assign_alert_level(risk_score):
572571
df['predicted_time_to_credential'] = time_model.predict(X_full_retention)
573572
df['predicted_graduation_year'] = df['Cohort'].str[:4].astype(float) + df['predicted_time_to_credential']
574573

575-
print(f"Time predictions generated")
574+
print("Time predictions generated")
576575
else:
577576
print("Warning: Insufficient data for time-to-credential model")
578577
df['predicted_time_to_credential'] = np.nan
@@ -591,7 +590,7 @@ def assign_alert_level(risk_score):
591590
y_credential = y_credential[valid_idx]
592591

593592
print(f"\nDataset size: {len(X_cred):,} students")
594-
print(f"Credential type distribution:")
593+
print("Credential type distribution:")
595594
cred_labels = {0: 'No Credential', 1: 'Certificate', 2: 'Associate', 3: 'Bachelor'}
596595
for k, v in y_credential.value_counts().sort_index().items():
597596
print(f" {cred_labels.get(k, k)}: {v:,} ({v/len(y_credential)*100:.1f}%)")
@@ -651,7 +650,7 @@ def assign_alert_level(risk_score):
651650
if class_idx < len(prob_labels):
652651
df[prob_labels[int(class_idx)]] = proba[:, i]
653652

654-
print(f"Credential type predictions generated")
653+
print("Credential type predictions generated")
655654

656655
# ============================================================================
657656
# STEP 8: MODEL 5 - GATEWAY MATH SUCCESS PREDICTION
@@ -736,8 +735,8 @@ def assign_alert_level(risk_score):
736735

737736
print("\nConfusion Matrix:")
738737
cm = confusion_matrix(y_test, y_pred)
739-
print(f" Predicted")
740-
print(f" No Pass Pass")
738+
print(" Predicted")
739+
print(" No Pass Pass")
741740
print(f"Actual No {cm[0,0]:6d} {cm[0,1]:6d}")
742741
print(f" Pass {cm[1,0]:6d} {cm[1,1]:6d}")
743742

@@ -753,7 +752,7 @@ def assign_alert_level(risk_score):
753752
labels=['High Risk', 'Moderate Risk', 'Likely Pass', 'Very Likely Pass']
754753
)
755754

756-
print(f"Gateway math predictions generated")
755+
print("Gateway math predictions generated")
757756

758757
# ============================================================================
759758
# STEP 9: MODEL 6 - GATEWAY ENGLISH SUCCESS PREDICTION
@@ -838,8 +837,8 @@ def assign_alert_level(risk_score):
838837

839838
print("\nConfusion Matrix:")
840839
cm = confusion_matrix(y_test, y_pred)
841-
print(f" Predicted")
842-
print(f" No Pass Pass")
840+
print(" Predicted")
841+
print(" No Pass Pass")
843842
print(f"Actual No {cm[0,0]:6d} {cm[0,1]:6d}")
844843
print(f" Pass {cm[1,0]:6d} {cm[1,1]:6d}")
845844

@@ -855,7 +854,7 @@ def assign_alert_level(risk_score):
855854
labels=['High Risk', 'Moderate Risk', 'Likely Pass', 'Very Likely Pass']
856855
)
857856

858-
print(f"Gateway English predictions generated")
857+
print("Gateway English predictions generated")
859858

860859
# ============================================================================
861860
# STEP 10: MODEL 7 - FIRST-SEMESTER GPA < 2.0 PREDICTION
@@ -943,8 +942,8 @@ def assign_alert_level(risk_score):
943942

944943
print("\nConfusion Matrix:")
945944
cm = confusion_matrix(y_test, y_pred)
946-
print(f" Predicted")
947-
print(f" GPA>=2.0 GPA<2.0")
945+
print(" Predicted")
946+
print(" GPA>=2.0 GPA<2.0")
948947
print(f"Actual >=2.0 {cm[0,0]:6d} {cm[0,1]:6d}")
949948
print(f" <2.0 {cm[1,0]:6d} {cm[1,1]:6d}")
950949

@@ -958,7 +957,7 @@ def assign_alert_level(risk_score):
958957
labels=['Low Risk', 'Moderate Risk', 'High Risk', 'Critical Risk']
959958
)
960959

961-
print(f"Low GPA predictions generated")
960+
print("Low GPA predictions generated")
962961

963962
# ============================================================================
964963
# STEP 11: MODEL 8 - GPA PREDICTION (CONTINUOUS)
@@ -1027,9 +1026,9 @@ def assign_alert_level(risk_score):
10271026
axis=1
10281027
)
10291028

1030-
print(f"GPA predictions generated")
1029+
print("GPA predictions generated")
10311030
print(f"Mean predicted GPA: {df['predicted_gpa'].mean():.2f}")
1032-
print(f"\nPerformance Distribution:")
1031+
print("\nPerformance Distribution:")
10331032
print(df['gpa_performance'].value_counts().to_string())
10341033

10351034
# ============================================================================
@@ -1042,7 +1041,7 @@ def assign_alert_level(risk_score):
10421041
# Save student-level predictions with all columns
10431042
output_file = os.path.join(DATA_DIR, 'bishop_state_student_level_with_predictions.csv')
10441043
df.to_csv(output_file, index=False)
1045-
print(f"\n✓ Saved student-level predictions to CSV:")
1044+
print("\n✓ Saved student-level predictions to CSV:")
10461045
print(f" File: {output_file}")
10471046
print(f" Records: {len(df):,}")
10481047
print(f" Columns: {len(df.columns)}")
@@ -1093,7 +1092,7 @@ def assign_alert_level(risk_score):
10931092
# Save course-level predictions
10941093
output_file = os.path.join(DATA_DIR, 'bishop_state_merged_with_predictions.csv')
10951094
merged_with_predictions.to_csv(output_file, index=False)
1096-
print(f"\n✓ Saved course-level predictions to CSV:")
1095+
print("\n✓ Saved course-level predictions to CSV:")
10971096
print(f" File: {output_file}")
10981097
print(f" Records: {len(merged_with_predictions):,}")
10991098
print(f" Columns: {len(merged_with_predictions.columns)}")
@@ -1133,7 +1132,7 @@ def assign_alert_level(risk_score):
11331132
pct = count / len(df) * 100
11341133
summary_report += f" {cat:20s} {count:6,} ({pct:5.1f}%)\n"
11351134

1136-
summary_report += f"""
1135+
summary_report += """
11371136
2. EARLY WARNING SYSTEM
11381137
Algorithm: Composite Risk Score (Retention + Performance Metrics)
11391138
Approach: Aligned with retention predictions to eliminate contradictions

0 commit comments

Comments
 (0)