C12
user_1718919
plain_text
4 months ago
6.0 kB
4
Indexable
# --- PART 4: PICS DATA PROCESSING --- def convert_time_to_hours(time_str): """Convert time string in format HH:MM to decimal hours""" if pd.isna(time_str): return 0 try: hours, minutes = map(int, str(time_str).split(':')) return hours + minutes/60 except: return 0 def process_pics_data(caseload_file, hours_lookup_file): """Process PICS caseload data and hours lookup""" print("\nProcessing PICS data...") # Read hours lookup first print("Reading hours lookup data...") hours_df = pd.read_excel(hours_lookup_file, skiprows=2) # Skip first two rows # Process hours lookup hours_lookup = hours_df[[ 'Standard title', 'Weighted Monthly Hours (1.6)' ]].copy() # Convert time format to decimal hours hours_lookup['Monthly_Hours'] = hours_lookup['Weighted Monthly Hours (1.6)'].apply( convert_time_to_hours ) # Clean and standardize titles hours_lookup['Standard_Title'] = hours_lookup['Standard title'].str.strip() print(f"Processed {len(hours_lookup)} standard titles with hours") # Read caseload data print("\nReading PICS caseload data...") caseload_df = pd.read_excel(caseload_file) # Select and rename relevant columns cols_to_keep = { 'Assessor Full Name': 'Assessor_Name', 'Programme': 'Programme_Level', 'Apprenticeship Standard Title': 'Standard_Title', 'Apprenticeship Achieved Date': 'Achieved_Date', 'Start Date': 'Start_Date', 'Learning Expected End': 'Expected_End', 'Actual End': 'Actual_End' } df = caseload_df[cols_to_keep.keys()].copy() df.rename(columns=cols_to_keep, inplace=True) # Convert dates date_columns = ['Achieved_Date', 'Start_Date', 'Expected_End', 'Actual_End'] for col in date_columns: df[col] = pd.to_datetime(df[col], errors='coerce') # Clean titles for matching with lookup df['Standard_Title'] = df['Standard_Title'].str.strip() # Add hours from lookup df = df.merge( hours_lookup[['Standard_Title', 'Monthly_Hours']], on='Standard_Title', how='left' ) # Fill missing hours with 0 df['Monthly_Hours'] = df['Monthly_Hours'].fillna(0) # Function to check if a student was active in a given month def was_active_in_month(row, check_date): start_date = row['Start_Date'] end_date = row['Actual_End'] if pd.notna(row['Actual_End']) else row['Expected_End'] if pd.isna(start_date) or pd.isna(end_date): return False return start_date <= check_date <= end_date # Create monthly snapshots for FY23/24 monthly_data = [] # Generate dates for FY23/24 dates = pd.date_range(start='2023-04-01', end='2024-03-31', freq='M') print("\nGenerating monthly snapshots...") for date in dates: # Filter for active students in this month month_mask = df.apply(lambda row: was_active_in_month(row, date), axis=1) month_data = df[month_mask].copy() # Add month info month_data['Snapshot_Date'] = date month_data['Year'] = date.year month_data['Month'] = date.month monthly_data.append(month_data) monthly_df = pd.concat(monthly_data, ignore_index=True) # Create summaries # 1. Monthly summary per assessor monthly_summary = monthly_df.groupby( ['Assessor_Name', 'Year', 'Month'] ).agg({ 'Standard_Title': 'count', # Number of active students 'Monthly_Hours': 'sum' # Total hours needed }).reset_index() monthly_summary.rename(columns={ 'Standard_Title': 'Active_Students' }, inplace=True) # 2. Programme level summary programme_summary = monthly_df.groupby( ['Assessor_Name', 'Programme_Level', 'Year', 'Month'] ).agg({ 'Standard_Title': 'count' }).reset_index() programme_summary.rename(columns={ 'Standard_Title': 'Students_In_Programme' }, inplace=True) # 3. Standard title summary standard_summary = monthly_df.groupby( ['Assessor_Name', 'Standard_Title', 'Year', 'Month'] ).agg({ 'Standard_Title': 'count', 'Monthly_Hours': 'sum' }).reset_index() standard_summary.rename(columns={ 'Standard_Title': 'Students_In_Standard', 'Monthly_Hours': 'Required_Hours' }, inplace=True) print("PICS data processing completed!") return { 'detailed_monthly': monthly_df, 'monthly_summary': monthly_summary, 'programme_summary': programme_summary, 'standard_summary': standard_summary, 'hours_reference': hours_lookup } # Execute processing if __name__ == "__main__": print("\nStarting PICS data processing...") # Define input files caseload_file = "PICS caseload for PBI.xlsx" hours_lookup_file = "PICS Hours for Assessor Look up.xlsx" try: # Process PICS data pics_data = process_pics_data(caseload_file, hours_lookup_file) # Save results output_folder = "processed_data" # Using same output folder as before if not os.path.exists(output_folder): os.makedirs(output_folder) # Save all dataframes for name, df in pics_data.items(): output_file = os.path.join(output_folder, f'pics_{name}.csv') df.to_csv(output_file, index=False) print(f"Saved {name} to {output_file}") print("\nPICS data processing completed successfully!") except Exception as e: print(f"\nError during processing: {str(e)}") raise
Editor is loading...
Leave a Comment