Skip to content

Commit 01629cc

Browse files
committed
feat: add Bishop State data merge script and merged dataset
Merge cohort, AR, and course-level data into a single student-level file (4,000 students, 135 columns) for ML pipeline consumption.
1 parent c8d5559 commit 01629cc

2 files changed

Lines changed: 4230 additions & 0 deletions

File tree

Lines changed: 229 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,229 @@
1+
"""
2+
Merge Bishop State Data Files into a Single Student-Level File
3+
===============================================================
4+
Reads three source files:
5+
1. data/bishop_state_cohorts_with_zip.csv (one row per student)
6+
2. data/ar_bscc_with_zip.csv (one row per student)
7+
3. data/bishop_state_courses.csv (multiple rows per student)
8+
9+
Outputs:
10+
data/bishop_state_student_level_with_zip.csv (one row per student, with
11+
aggregated course-level features)
12+
13+
The output schema matches what complete_ml_pipeline.py expects from
14+
kctcs_student_level_with_zip.csv.
15+
"""
16+
17+
import os
18+
import pandas as pd
19+
import numpy as np
20+
from datetime import datetime
21+
22+
# Resolve paths relative to the project root: the data directory is
# <parent of this script's directory>/data.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
PROJECT_ROOT = os.path.dirname(SCRIPT_DIR)
DATA_DIR = os.path.join(PROJECT_ROOT, 'data')

print("=" * 80)
print("MERGING BISHOP STATE DATA FILES")
print("=" * 80)

# ============================================================================
# 1. Load source files
# ============================================================================


def _read_source(filename):
    """Read one CSV from DATA_DIR and print its record and column counts."""
    frame = pd.read_csv(os.path.join(DATA_DIR, filename))
    print(f" - Loaded {len(frame)} records")
    print(f" - Columns: {len(frame.columns)}")
    return frame


print("\n1. Reading ar_bscc_with_zip.csv...")
ar_df = _read_source('ar_bscc_with_zip.csv')

print("\n2. Reading bishop_state_cohorts_with_zip.csv...")
cohorts_df = _read_source('bishop_state_cohorts_with_zip.csv')

print("\n3. Reading bishop_state_courses.csv...")
courses_df = _read_source('bishop_state_courses.csv')
49+
50+
# ============================================================================
# 2. Merge cohorts with AR data (one-to-one on Student_GUID)
# ============================================================================

print("\n" + "=" * 80)
print("MERGING COHORT + AR DATA")
print("=" * 80)

# Rename student_id in AR data to Student_GUID so both frames share a join key.
ar_df_renamed = ar_df.rename(columns={'student_id': 'Student_GUID'})

# Drop zip_code from AR data (already present in cohorts); a missing column
# is simply ignored.
ar_df_renamed = ar_df_renamed.drop(columns=['zip_code'], errors='ignore')

# Prefix every AR column except the join key with 'ar_' to avoid name
# conflicts with cohort columns. This also turns 'id' into 'ar_id'.
ar_columns_to_rename = {
    col: f'ar_{col}'
    for col in ar_df_renamed.columns
    if col != 'Student_GUID'
}
ar_df_renamed = ar_df_renamed.rename(columns=ar_columns_to_rename)

# Left join keeps every cohort row. `validate` enforces the one-row-per-student
# assumption on both sides: a duplicated Student_GUID raises
# pandas.errors.MergeError instead of silently duplicating student rows.
merged = pd.merge(
    cohorts_df,
    ar_df_renamed,
    on='Student_GUID',
    how='left',
    suffixes=('', '_ar'),
    validate='one_to_one',
)
print(f" - Cohort + AR merge: {len(merged)} records, {len(merged.columns)} columns")
79+
80+
# ============================================================================
# 3. Aggregate course-level data to one row per student
# ============================================================================

print("\n" + "=" * 80)
print("AGGREGATING COURSE-LEVEL FEATURES")
print("=" * 80)

# Grade-to-numeric mapping for GPA-style calculations
grade_map = {
    'A': 4.0, 'B': 3.0, 'C': 2.0, 'D': 1.0, 'F': 0.0,
    'A+': 4.0, 'A-': 3.7, 'B+': 3.3, 'B-': 2.7,
    'C+': 2.3, 'C-': 1.7, 'D+': 1.3, 'D-': 0.7,
    # Pass / Satisfactory are scored 3.0, i.e. the same as a 'B' on this scale.
    # NOTE(review): this was previously described as "C-equivalent", which
    # would be 2.0 here -- confirm which value is intended.
    'P': 3.0, 'S': 3.0,
}
# Grade codes counted as failing. Apart from 'F', none of them has an entry in
# grade_map, so they are excluded from the numeric grade statistics.
failing_grades = {'F', 'W', 'WF', 'WN', 'FN', 'U', 'I', 'E'}

_grades = courses_df['Grade']
_failing_mask = _grades.isin(failing_grades)

# Grades missing from grade_map become NaN.
courses_df['grade_numeric'] = _grades.map(grade_map)
courses_df['is_failing'] = _failing_mask.astype(int)
# A course passes when it has a grade and that grade is not a failing code.
courses_df['is_passing'] = (~_failing_mask & _grades.notna()).astype(int)

# Ensure numeric credit columns; unparseable values become NaN.
for _credit_col in ('Number_of_Credits_Attempted', 'Number_of_Credits_Earned'):
    courses_df[_credit_col] = pd.to_numeric(courses_df[_credit_col], errors='coerce')
107+
108+
109+
def aggregate_courses(group):
    """Aggregate course rows for a single student.

    Args:
        group: DataFrame with one student's course rows. It must contain the
            course columns read below (``Course_Prefix``, the two credit
            columns, ``Core_Course``, ``Math_or_English_Gateway``,
            ``Co_requisite_Course``, ``Delivery_Method``, ``Academic_Year``,
            ``Academic_Term``, ``Course_Instructor_Employment_Status``,
            ``Enrolled_at_Other_Institutions``) plus the derived
            ``grade_numeric``, ``is_failing`` and ``is_passing`` columns.

    Returns:
        pd.Series with one entry per aggregated feature, in a fixed order.
    """
    row = {}

    # Volume metrics
    row['total_courses_enrolled'] = len(group)
    row['unique_course_prefixes'] = group['Course_Prefix'].nunique()

    # Credit metrics (sums skip NaN, so missing credits count as zero)
    credits_attempted = group['Number_of_Credits_Attempted']
    attempted = credits_attempted.sum()
    earned = group['Number_of_Credits_Earned'].sum()
    row['total_credits_attempted'] = attempted
    row['total_credits_earned'] = earned
    row['avg_credits_per_course'] = credits_attempted.mean()

    # Completion rate; defined as 0.0 when no credits were attempted
    row['course_completion_rate'] = (earned / attempted) if attempted > 0 else 0.0

    # Grade metrics, computed only over courses with a numeric grade
    graded = group['grade_numeric'].dropna()
    n_graded = len(graded)
    row['courses_with_grades'] = n_graded
    row['average_grade'] = graded.mean() if n_graded > 0 else np.nan
    row['min_grade'] = graded.min() if n_graded > 0 else np.nan
    row['max_grade'] = graded.max() if n_graded > 0 else np.nan
    # Sample std needs at least two grades; fewer than two yields 0.0.
    row['grade_std_dev'] = graded.std() if n_graded > 1 else 0.0

    failing = group['is_failing'].sum()
    passing = group['is_passing'].sum()
    row['failing_grades_count'] = failing
    total_graded = passing + failing
    row['passing_rate'] = (passing / total_graded) if total_graded > 0 else 0.0

    # Course type counts; a gateway flag of 'Y' counts toward both math and
    # English.
    gateway = group['Math_or_English_Gateway']
    row['core_courses_taken'] = (group['Core_Course'] == 'Y').sum()
    row['gateway_math_courses'] = gateway.isin(['M', 'Y']).sum()
    row['gateway_english_courses'] = gateway.isin(['E', 'Y']).sum()
    row['corequisite_courses'] = (group['Co_requisite_Course'] == 'Y').sum()

    # Delivery method
    delivery = group['Delivery_Method'].fillna('')
    row['online_courses'] = (delivery == 'O').sum()
    row['face_to_face_courses'] = (delivery == 'F').sum()
    row['hybrid_courses'] = (delivery == 'H').sum()
    total_delivery = (row['online_courses'] + row['face_to_face_courses']
                      + row['hybrid_courses'])
    row['pct_online'] = (row['online_courses'] / total_delivery) if total_delivery > 0 else 0.0

    # Term / year diversity
    row['unique_academic_years'] = group['Academic_Year'].nunique()
    row['unique_academic_terms'] = group['Academic_Term'].nunique()

    # Stringify before using .str so a non-string term column (e.g. numeric
    # codes) yields zero season matches instead of raising. Season names are
    # matched as literal, case-insensitive substrings.
    term = group['Academic_Term'].fillna('').astype(str).str.upper()
    row['fall_courses'] = term.str.contains('FALL', regex=False).sum()
    row['spring_courses'] = term.str.contains('SPRING', regex=False).sum()
    row['summer_courses'] = term.str.contains('SUMMER', regex=False).sum()

    # Instructor employment status
    emp = group['Course_Instructor_Employment_Status'].fillna('')
    row['courses_with_fulltime_instructors'] = (emp == 'FT').sum()
    row['courses_with_parttime_instructors'] = (emp == 'PT').sum()

    # Enrolled at other institutions (missing values are treated as 'N')
    other = group['Enrolled_at_Other_Institutions'].fillna('N')
    row['enrolled_other_institutions'] = (other == 'Y').sum()

    return pd.Series(row)
173+
174+
175+
print(" Aggregating courses per student...")
# One call to aggregate_courses per student; the grouping key is excluded
# from the frame handed to the function and restored as a column afterwards.
_courses_by_student = courses_df.groupby('Student_GUID')
_per_student_features = _courses_by_student.apply(
    aggregate_courses, include_groups=False
)
course_agg = _per_student_features.reset_index()
print(f" - Aggregated to {len(course_agg)} student rows")

# ============================================================================
# 4. Join aggregated course features onto cohort+AR data
# ============================================================================

print("\n" + "=" * 80)
print("FINAL MERGE: COHORT+AR + COURSE AGGREGATES")
print("=" * 80)

# Left join: students without any course rows are kept, with NaN in every
# aggregated course feature.
final = merged.merge(course_agg, how='left', on='Student_GUID')
print(f" - Final dataset: {len(final)} students, {len(final.columns)} columns")

# ============================================================================
# 5. Save output
# ============================================================================

output_path = os.path.join(DATA_DIR, 'bishop_state_student_level_with_zip.csv')
print(f"\nSaving to {output_path}...")
final.to_csv(output_path, index=False)
199+
200+
# ============================================================================
# Summary
# ============================================================================

print("\n" + "=" * 80)
print("MERGE COMPLETE!")
print("=" * 80)
print(f"Output file: {output_path}")
print(f"Total students: {len(final):,}")
print(f"Total columns: {len(final.columns)}")

# After the left join, a student has course data exactly when the aggregated
# course count is present.
_has_courses = final['total_courses_enrolled'].notna()
students_with_courses = _has_courses.sum()
students_without_courses = (~_has_courses).sum()
print("\nData breakdown:")
print(f" - Students with course records: {students_with_courses:,}")
print(f" - Students without course records: {students_without_courses:,}")
if students_with_courses > 0:
    avg_courses = final.loc[_has_courses, 'total_courses_enrolled'].mean()
    print(f" - Average courses per student (for those with courses): {avg_courses:.1f}")

print("\nColumn categories:")
# Student_GUID is the join key, not a course feature. Excluding only the real
# course features keeps it among the cohort columns, so the three categories
# add up to the total column count.
course_feat_cols = [c for c in course_agg.columns if c != 'Student_GUID']
_course_feat_set = set(course_feat_cols)
ar_cols = [c for c in final.columns if c.startswith('ar_')]
cohort_cols = [c for c in final.columns
               if not c.startswith('ar_') and c not in _course_feat_set]
print(f" - Cohort columns: {len(cohort_cols)}")
print(f" - AR columns: {len(ar_cols)}")
print(f" - Aggregated course features: {len(course_feat_cols)}")

print("\n" + "=" * 80)

0 commit comments

Comments
 (0)