Untitled
user_1718919
plain_text
4 months ago
14 kB
4
Indexable
# Cell 1: Import necessary libraries import pandas as pd import numpy as np from pathlib import Path import os import re from datetime import datetime # Create processed_data directory if it doesn't exist if not os.path.exists('processed_data'): os.makedirs('processed_data') # Cell 2: Helper functions for name standardization def clean_string(s): """Remove extra spaces and standardize string format""" if pd.isna(s): return '' return str(s).strip().replace(' ', ' ') def extract_names_from_manager(manager_name): """ Extract first name, known as, and last name from manager format Example: 'Joanne Jo, Loss' -> ('Joanne', 'Jo', 'Loss') """ if pd.isna(manager_name) or manager_name == '': return '', '', '' try: # Split by comma to separate last name name_parts = manager_name.split(',') if len(name_parts) != 2: return manager_name, '', '' first_part = name_parts[0].strip() last_name = name_parts[1].strip() # Split first part to get first name and known as first_parts = first_part.split(' ') if len(first_parts) == 1: return first_parts[0], '', last_name else: return first_parts[0], first_parts[-1], last_name except: return manager_name, '', '' def standardize_staff_name(first_name, known_as, last_name): """Create standardized versions of staff names""" first_name = clean_string(first_name) known_as = clean_string(known_as) last_name = clean_string(last_name) # If known_as is empty, use first_name if not known_as: known_as = first_name return { 'first_name': first_name, 'known_as': known_as, 'last_name': last_name, 'full_name_first': f"{first_name} {last_name}", 'full_name_known': f"{known_as} {last_name}" } # Cell 3: Process Monthly Reports def process_monthly_reports(): """Process all monthly ACL reports""" monthly_data = [] monthly_reports_path = Path('monthly_reports') for file in monthly_reports_path.glob('ACL Monthly Report *.xlsx'): # Extract month and year from filename date_match = re.search(r'(\d{2})\.(\d{2})\.(\d{2})\.xlsx$', file.name) if date_match: day, month, year = date_match.groups() report_date = f"20{year}-{month}-{day}" # Read Excel file starting from row 3 (0-based index 2) df = pd.read_excel(file, skiprows=2) # Add report date df['report_date'] = pd.to_datetime(report_date) # Clean column names df.columns = [col.strip() for col in df.columns] # Filter for Tutors and Learning Support Assistants mask = df['Position Name'].isin(['Learning Support Assistant', 'Tutor']) df = df[mask] # Calculate target hours df['working_hours'] = pd.to_numeric(df['Working Hours'].str.replace('.', ''), errors='coerce') df['is_salaried'] = df['working_hours'] > 3 # Calculate target hours (only for salaried staff) df['fte'] = pd.to_numeric(df['Full-Time Equivalent'], errors='coerce') df['target_hours'] = np.where(df['is_salaried'], df['fte'] * 840, 0) # Process names df['first_name'] = df['First Name'].apply(clean_string) df['known_as'] = df['Known As'].apply(clean_string) df['last_name'] = df['Last Name'].apply(clean_string) # Process line manager names manager_names = df['Line Manager Name'].apply(extract_names_from_manager) df['manager_first_name'] = [x[0] for x in manager_names] df['manager_known_as'] = [x[1] for x in manager_names] df['manager_last_name'] = [x[2] for x in manager_names] monthly_data.append(df) # Combine all monthly data if monthly_data: combined_df = pd.concat(monthly_data, ignore_index=True) # Save processed data combined_df.to_csv('processed_data/processed_monthly_reports.csv', index=False) return combined_df else: raise Exception("No monthly reports found in the specified directory") # Cell 4: Process Tutor Pay Report def process_tutor_pay_report(file_path='TutorPayReport_New 23.24 FY 22.11.24.xlsx'): """Process the Tutor Pay Report""" df = pd.read_excel(file_path) # Clean column names df.columns = [col.strip() for col in df.columns] # Filter for relevant staff roles df = df[df['STAFFROLE'].str.strip().isin(['Tutor', 'Learning Support'])] # Clean names df['PEOPLENAME'] = df['PEOPLENAME'].apply(clean_string) df['PEOPLESURNAME'] = df['PEOPLESURNAME'].apply(clean_string) # Group assessment activities assessment_types = [ 'Access Assessments', 'Assessments', 'Beauty OL Assess', 'CFL PA Vols Assess', 'Creative Assessments', 'E&M ApprenticeAssess', 'English FS Assess', 'English GCSE Assess', 'ESOL F2F Assess', 'ESOL Online Assess', 'Hair/Barb F2F Assess', 'Maths Assessments', 'SWiS Assessments' ] df['ACTIVITY_GROUP'] = np.where( df['ACTIVITYTYPE'].isin(assessment_types), 'Assessment', df['ACTIVITYTYPE'] ) # Calculate actual hours def convert_ms_to_hours(ms): if pd.isna(ms): return 0 return abs(float(ms)) / (1000 * 60 * 60) # Convert ms to hours df['early_adjust'] = df['TIMEADJUSTEARLY'].apply(convert_ms_to_hours) df['late_adjust'] = df['TIMEADJUSTLATE'].apply(convert_ms_to_hours) df['actual_hours'] = df['Potential Hours'] - (df['early_adjust'] + df['late_adjust']) df['actual_hours'] = df['actual_hours'].clip(lower=0) # Replace negative values with 0 # Convert event date to datetime df['EVENTDATE'] = pd.to_datetime(df['EVENTDATE']) # Save processed data df.to_csv('processed_data/processed_tutor_pay_report.csv', index=False) return df # Cell 5: Process Qualification Reference Data def process_qualification_references(): """Process both Approved and Archived qualification lists""" def read_qual_list(file_path): df = pd.read_excel(file_path) return df[['QualificationName', 'QualificationNumber']].copy() # Read both files approved_df = read_qual_list('Approved List for Terms Caseloading FY2324.xlsx') archived_df = read_qual_list('Archived List for Terms Caseloading FY2324.xlsx') # Combine them with a source indicator approved_df['source'] = 'approved' archived_df['source'] = 'archived' combined_df = pd.concat([approved_df, archived_df], ignore_index=True) # Remove 'P' or 'N' from the end of qualification numbers combined_df['QualificationNumber_clean'] = combined_df['QualificationNumber'].str.replace(r'[PN] , '', regex=True) # Save processed data combined_df.to_csv('processed_data/processed_qualification_refs.csv', index=False) return combined_df # Cell 6: Process Terms Caseloading Data def process_terms_caseloading(qual_ref_df): """Process Terms caseloading data with qualification reference lookup""" df = pd.read_excel('Terms caseloading 23.24 FY.xlsx') # Clean funding references (remove P/N and lookup qualification names) df['funding_ref_clean'] = df['Funding Reference'].str.replace(r'[PN] , '', regex=True) # Create qualification name lookup dictionary from both sources qual_lookup = qual_ref_df.set_index('QualificationNumber_clean')['QualificationName'].to_dict() # Apply qualification name lookup, fall back to Activity Name if not found df['QualificationName'] = df['funding_ref_clean'].map(qual_lookup).fillna(df['Activity Name']) # Convert dates date_columns = ['Start Date', 'End Date'] for col in date_columns: df[col] = pd.to_datetime(df[col]) # Clean staff names df['Primary Staff Name'] = df['Primary Staff Name'].apply(clean_string) df['Primary Staff Surname'] = df['Primary Staff Surname'].apply(clean_string) # Calculate learner counts by course and tutor learner_counts = df.groupby(['Primary Staff Name', 'Primary Staff Surname', 'QualificationName', 'Level', 'Activity Group', pd.Grouper(key='Start Date', freq='M')])['Name'].count().reset_index() learner_counts.rename(columns={'Name': 'learner_count'}, inplace=True) # Save processed data df.to_csv('processed_data/processed_terms_caseloading.csv', index=False) learner_counts.to_csv('processed_data/terms_learner_counts.csv', index=False) return df, learner_counts # Cell 7: Process PICS Hours Lookup def process_pics_hours_lookup(): """Process PICS Hours lookup table""" df = pd.read_excel('PICS Hours for Assessor Look up.xlsx', skiprows=2) # Convert time format (H:MM) to decimal hours def time_to_decimal(time_str): if pd.isna(time_str): return 0 try: hours, minutes = str(time_str).split(':') return float(hours) + float(minutes) / 60 except: return 0 df['weighted_hours'] = df['Weighted Monthly Hours (1.6)'].apply(time_to_decimal) # Save processed lookup df.to_csv('processed_data/processed_pics_hours_lookup.csv', index=False) return df # Cell 8: Process PICS Caseload Data def process_pics_caseload(hours_lookup_df): """Process PICS caseload data with hours lookup""" df = pd.read_excel('PICS caseload for PBI.xlsx') # Convert dates date_cols = ['Apprenticeship Achieved Date', 'Start Date', 'Learning Expected End', 'Actual End'] for col in date_cols: df[col] = pd.to_datetime(df[col], errors='coerce') # Create mapping for weighted hours hours_map = hours_lookup_df.set_index('Standard title')['weighted_hours'].to_dict() # Add weighted hours to each apprenticeship df['weighted_hours'] = df['Apprenticeship Standard Title'].map(hours_map) # Calculate active periods for each student df['end_date'] = df['Actual End'].fillna(df['Learning Expected End']) # Create monthly summary def get_monthly_student_counts(group): """Calculate student counts for each month in the date range""" if len(group) == 0: return pd.DataFrame() # Get date range for the group start_date = group['Start Date'].min() end_date = group['end_date'].max() if pd.isna(start_date) or pd.isna(end_date): return pd.DataFrame() # Create date range by month date_range = pd.date_range(start_date, end_date, freq='M') counts = [] for date in date_range: # Count students active in this month active_students = group[ (group['Start Date'] <= date) & (group['end_date'] >= date) ].shape[0] # Calculate total weighted hours for the month total_weighted_hours = group[ (group['Start Date'] <= date) & (group['end_date'] >= date) ]['weighted_hours'].sum() counts.append({ 'month': date, 'student_count': active_students, 'total_weighted_hours': total_weighted_hours }) return pd.DataFrame(counts) # Group by assessor and calculate monthly counts monthly_summary = df.groupby('Assessor Full Name').apply(get_monthly_student_counts).reset_index() # Save processed data df.to_csv('processed_data/processed_pics_caseload.csv', index=False) monthly_summary.to_csv('processed_data/pics_monthly_summary.csv', index=False) return df, monthly_summary # Cell 9: Main processing function def process_all_data(): """Process all data sources and create final outputs""" try: print("Processing monthly reports...") monthly_data = process_monthly_reports() print("Processing tutor pay report...") tutor_pay_data = process_tutor_pay_report() print("Processing qualification references...") qual_ref_data = process_qualification_references() print("Processing Terms caseloading...") terms_data, learner_counts = process_terms_caseloading(qual_ref_data) print("Processing PICS hours lookup...") pics_hours_lookup = process_pics_hours_lookup() print("Processing PICS caseload...") pics_data, pics_summary = process_pics_caseload(pics_hours_lookup) print("All data processed successfully!") return True except Exception as e: print(f"Error processing data: {str(e)}") return False # Call the main processing function if running the notebook if __name__ == "__main__": process_all_data()
Editor is loading...
Leave a Comment