|
"""
Merge Bishop State Data Files into a Single Student-Level File
===============================================================
Reads three source files:
  1. data/bishop_state_cohorts_with_zip.csv   (one row per student)
  2. data/ar_bscc_with_zip.csv                (one row per student)
  3. data/bishop_state_courses.csv            (multiple rows per student)

Outputs:
  data/bishop_state_student_level_with_zip.csv  (one row per student, with
  aggregated course-level features)

The output schema matches what complete_ml_pipeline.py expects from
kctcs_student_level_with_zip.csv.
"""
| 16 | + |
| 17 | +import os |
| 18 | +import pandas as pd |
| 19 | +import numpy as np |
| 20 | +from datetime import datetime |
| 21 | + |
# Resolve all data paths relative to the project root (one level above
# the directory containing this script), so the script works from any CWD.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
PROJECT_ROOT = os.path.dirname(SCRIPT_DIR)
DATA_DIR = os.path.join(PROJECT_ROOT, 'data')

_rule = "=" * 80
print(_rule)
print("MERGING BISHOP STATE DATA FILES")
print(_rule)
| 30 | + |
# ============================================================================
# 1. Load source files
# ============================================================================

def _read_source(step, filename):
    """Read one CSV from DATA_DIR, echoing progress in the script's format.

    Parameters
    ----------
    step : int
        1-based position in the load sequence (used only in the message).
    filename : str
        File name inside DATA_DIR.

    Returns
    -------
    pandas.DataFrame
        The loaded file.
    """
    print(f"\n{step}. Reading {filename}...")
    df = pd.read_csv(os.path.join(DATA_DIR, filename))
    print(f"   - Loaded {len(df)} records")
    print(f"   - Columns: {len(df.columns)}")
    return df

# One row per student.
ar_df = _read_source(1, 'ar_bscc_with_zip.csv')
# One row per student.
cohorts_df = _read_source(2, 'bishop_state_cohorts_with_zip.csv')
# Multiple rows per student (aggregated to student level further below).
courses_df = _read_source(3, 'bishop_state_courses.csv')
| 49 | + |
# ============================================================================
# 2. Merge cohorts with AR data (one-to-one on Student_GUID)
# ============================================================================

print("\n" + "=" * 80)
print("MERGING COHORT + AR DATA")
print("=" * 80)

# Bring the AR file onto the same join key as the cohort file.
ar_features = ar_df.rename(columns={'student_id': 'Student_GUID'})

# zip_code already lives on the cohort side; keep only that copy.
ar_features = ar_features.drop(columns=['zip_code'], errors='ignore')

# Namespace every remaining AR column under an 'ar_' prefix so nothing
# collides with cohort columns after the merge ('id' becomes 'ar_id').
_prefix_map = {col: ('ar_id' if col == 'id' else f'ar_{col}')
               for col in ar_features.columns
               if col != 'Student_GUID'}
ar_df_renamed = ar_features.rename(columns=_prefix_map)

# Left join: every cohort student is kept, with or without an AR record.
merged = cohorts_df.merge(
    ar_df_renamed,
    on='Student_GUID',
    how='left',
    suffixes=('', '_ar'),
)
print(f"   - Cohort + AR merge: {len(merged)} records, {len(merged.columns)} columns")
| 79 | + |
# ============================================================================
# 3. Aggregate course-level data to one row per student
# ============================================================================

print("\n" + "=" * 80)
print("AGGREGATING COURSE-LEVEL FEATURES")
print("=" * 80)

# Letter-grade -> quality points on a 4.0 scale.
grade_map = {
    'A+': 4.0, 'A': 4.0, 'A-': 3.7,
    'B+': 3.3, 'B': 3.0, 'B-': 2.7,
    'C+': 2.3, 'C': 2.0, 'C-': 1.7,
    'D+': 1.3, 'D': 1.0, 'D-': 0.7,
    'F': 0.0,
    # Pass / Satisfactory have no letter value; scored here as 3.0
    # (B-equivalent).
    'P': 3.0, 'S': 3.0,
}
# Grades counted as unsuccessful outcomes. NOTE(review): includes
# withdrawals and 'I' (incomplete) -- confirm against registrar rules.
failing_grades = {'F', 'W', 'WF', 'WN', 'FN', 'U', 'I', 'E'}

_grades = courses_df['Grade']
courses_df['grade_numeric'] = _grades.map(grade_map)
_failed = _grades.isin(failing_grades)
courses_df['is_failing'] = _failed.astype(int)
# "Passing" = any recorded grade that is not in the failing set.
courses_df['is_passing'] = (_grades.notna() & ~_failed).astype(int)

# Credit columns may arrive as strings; coerce to numeric (bad values -> NaN).
for _credit_col in ('Number_of_Credits_Attempted', 'Number_of_Credits_Earned'):
    courses_df[_credit_col] = pd.to_numeric(courses_df[_credit_col],
                                            errors='coerce')
| 107 | + |
| 108 | + |
def aggregate_courses(group):
    """Collapse one student's course rows into a single feature Series.

    Expects the derived columns ``grade_numeric``, ``is_failing`` and
    ``is_passing`` to already exist on *group* (added at module level).
    """
    feats = {}

    # --- Volume ---
    feats['total_courses_enrolled'] = len(group)
    feats['unique_course_prefixes'] = group['Course_Prefix'].nunique()

    # --- Credits ---
    attempted = group['Number_of_Credits_Attempted'].sum()
    earned = group['Number_of_Credits_Earned'].sum()
    feats['total_credits_attempted'] = attempted
    feats['total_credits_earned'] = earned
    feats['avg_credits_per_course'] = group['Number_of_Credits_Attempted'].mean()
    feats['course_completion_rate'] = earned / attempted if attempted > 0 else 0.0

    # --- Grades (only rows whose letter grade mapped to a number) ---
    graded = group['grade_numeric'].dropna()
    has_grades = len(graded) > 0
    feats['courses_with_grades'] = len(graded)
    feats['average_grade'] = graded.mean() if has_grades else np.nan
    feats['min_grade'] = graded.min() if has_grades else np.nan
    feats['max_grade'] = graded.max() if has_grades else np.nan
    # std of a single grade is undefined; report 0.0 instead of NaN.
    feats['grade_std_dev'] = graded.std() if len(graded) > 1 else 0.0

    n_failed = group['is_failing'].sum()
    n_passed = group['is_passing'].sum()
    feats['failing_grades_count'] = n_failed
    graded_outcomes = n_passed + n_failed
    feats['passing_rate'] = n_passed / graded_outcomes if graded_outcomes > 0 else 0.0

    # --- Course types ---
    gateway = group['Math_or_English_Gateway']
    feats['core_courses_taken'] = (group['Core_Course'] == 'Y').sum()
    # 'Y' appears to mark a generic gateway flag; counted in both buckets.
    feats['gateway_math_courses'] = gateway.isin(['M', 'Y']).sum()
    feats['gateway_english_courses'] = gateway.isin(['E', 'Y']).sum()
    feats['corequisite_courses'] = (group['Co_requisite_Course'] == 'Y').sum()

    # --- Delivery method (O=online, F=face-to-face, H=hybrid) ---
    delivery = group['Delivery_Method'].fillna('')
    online = (delivery == 'O').sum()
    in_person = (delivery == 'F').sum()
    hybrid = (delivery == 'H').sum()
    feats['online_courses'] = online
    feats['face_to_face_courses'] = in_person
    feats['hybrid_courses'] = hybrid
    # Share of online among courses with a recognized delivery code only.
    known_delivery = online + in_person + hybrid
    feats['pct_online'] = online / known_delivery if known_delivery > 0 else 0.0

    # --- Term / year spread ---
    feats['unique_academic_years'] = group['Academic_Year'].nunique()
    feats['unique_academic_terms'] = group['Academic_Term'].nunique()
    term_upper = group['Academic_Term'].fillna('').str.upper()
    for season in ('FALL', 'SPRING', 'SUMMER'):
        feats[f'{season.lower()}_courses'] = term_upper.str.contains(season).sum()

    # --- Instructor employment status ---
    emp_status = group['Course_Instructor_Employment_Status'].fillna('')
    feats['courses_with_fulltime_instructors'] = (emp_status == 'FT').sum()
    feats['courses_with_parttime_instructors'] = (emp_status == 'PT').sum()

    # --- Concurrent enrollment elsewhere ---
    feats['enrolled_other_institutions'] = (
        group['Enrolled_at_Other_Institutions'].fillna('N') == 'Y'
    ).sum()

    return pd.Series(feats)
| 173 | + |
| 174 | + |
print("   Aggregating courses per student...")
# include_groups=False keeps the grouping key out of the frame handed to
# aggregate_courses (pandas >= 2.2 behavior); it returns via reset_index().
per_student = courses_df.groupby('Student_GUID').apply(
    aggregate_courses, include_groups=False
)
course_agg = per_student.reset_index()
print(f"   - Aggregated to {len(course_agg)} student rows")
| 180 | + |
# ============================================================================
# 4. Join aggregated course features onto cohort+AR data
# ============================================================================

print("\n" + "=" * 80)
print("FINAL MERGE: COHORT+AR + COURSE AGGREGATES")
print("=" * 80)

# Left join keeps every cohort student, including those with no course rows
# (their aggregated features come through as NaN).
final = merged.merge(course_agg, on='Student_GUID', how='left')
print(f"   - Final dataset: {len(final)} students, {len(final.columns)} columns")

# ============================================================================
# 5. Save output
# ============================================================================

output_path = os.path.join(DATA_DIR, 'bishop_state_student_level_with_zip.csv')
print(f"\nSaving to {output_path}...")
final.to_csv(output_path, index=False)
| 199 | + |
# ============================================================================
# Summary
# ============================================================================

print("\n" + "=" * 80)
print("MERGE COMPLETE!")
print("=" * 80)
print(f"Output file: {output_path}")
print(f"Total students: {len(final):,}")
print(f"Total columns: {len(final.columns)}")

# total_courses_enrolled is NaN exactly for students absent from the
# course file (left-join miss), so notna/isna split the population.
students_with_courses = final['total_courses_enrolled'].notna().sum()
students_without_courses = final['total_courses_enrolled'].isna().sum()
print(f"\nData breakdown:")
print(f"   - Students with course records: {students_with_courses:,}")
print(f"   - Students without course records: {students_without_courses:,}")
if students_with_courses > 0:
    # Series.mean() already skips NaN, so no explicit notna mask is needed.
    avg_courses = final['total_courses_enrolled'].mean()
    print(f"   - Average courses per student (for those with courses): {avg_courses:.1f}")

print(f"\nColumn categories:")
# Classify every output column into exactly one bucket. The join key
# (Student_GUID) originates in the cohort file, so it must count as a
# cohort column; the previous check against course_agg.columns dropped
# it from all three categories, making the counts not sum to the total.
course_feat_cols = [c for c in course_agg.columns if c != 'Student_GUID']
ar_cols = [c for c in final.columns if c.startswith('ar_')]
cohort_cols = [c for c in final.columns
               if not c.startswith('ar_') and c not in course_feat_cols]
print(f"   - Cohort columns: {len(cohort_cols)}")
print(f"   - AR columns: {len(ar_cols)}")
print(f"   - Aggregated course features: {len(course_feat_cols)}")

print("\n" + "=" * 80)