# Cell 4: PICS Data Processing and Main Execution
# Relies on helpers defined in earlier cells: safe_read_excel,
# validate_processed_data and clean_name.
import datetime
import logging
from typing import Dict

import pandas as pd


def process_pics_hours_lookup(hours_df: pd.DataFrame) -> pd.DataFrame:
    """Process the PICS hours lookup data"""
    hours_lookup = hours_df[[
        'Standard title',
        'Weighted Monthly Hours (1.6)'
    ]].copy()

    def parse_weighted_hours(time_str):
        """Convert 'HH:MM' strings, time/Timestamp objects or numbers to float hours."""
        if pd.isna(time_str):
            return 0.0
        try:
            if isinstance(time_str, str) and ':' in time_str:
                hours, minutes = map(int, time_str.split(':')[:2])
                return hours + (minutes / 60)
            elif isinstance(time_str, (datetime.time, pd.Timestamp)):
                return time_str.hour + (time_str.minute / 60)
            return float(time_str)
        except Exception as e:
            logging.warning(f"Error parsing weighted hours '{time_str}': {str(e)}")
            return 0.0

    hours_lookup['Monthly_Hours'] = hours_lookup['Weighted Monthly Hours (1.6)'].apply(parse_weighted_hours)
    hours_lookup['Standard_Title'] = hours_lookup['Standard title'].str.strip()

    # Log zero hours entries
    zero_hours = hours_lookup[hours_lookup['Monthly_Hours'] == 0]
    if not zero_hours.empty:
        logging.warning(f"Found {len(zero_hours)} standards with zero hours:")
        for _, row in zero_hours.iterrows():
            logging.warning(f"  - {row['Standard_Title']}: {row['Weighted Monthly Hours (1.6)']}")

    return hours_lookup[['Standard_Title', 'Monthly_Hours']]


def process_pics_data(
    caseload_file: str,
    hours_lookup_file: str,
    contract_history: pd.DataFrame
) -> Dict[str, pd.DataFrame]:
    """Process PICS caseload data and hours lookup"""
    logging.info("Processing PICS data...")

    # Read hours lookup first
    logging.info("Reading hours lookup data...")
    hours_df, error = safe_read_excel(hours_lookup_file, skiprows=2)
    if error:
        logging.error(f"Failed to read hours lookup: {error}")
        raise ValueError(f"Cannot process PICS data: {error}")

    # Process hours lookup
    hours_lookup = process_pics_hours_lookup(hours_df)
    logging.info(f"Processed {len(hours_lookup)} standard titles with hours")

    # Read and validate caseload data
    logging.info("Reading PICS caseload data...")
    caseload_df, error = safe_read_excel(caseload_file)
    if error:
        logging.error(f"Failed to read caseload data: {error}")
        raise ValueError(f"Cannot process PICS data: {error}")

    required_columns = [
        'Assessor Full Name', 'Programme',
        'Apprenticeship Standard Title', 'Apprenticeship Achieved Date',
        'Start Date', 'Learning Expected End', 'Actual End'
    ]

    # Validate and select columns
    try:
        validate_processed_data(caseload_df, required_columns, 'PICS_caseload')
        df = caseload_df[required_columns].copy()
    except ValueError as e:
        logging.error(f"Validation failed: {str(e)}")
        raise

    # Rename columns for consistency
    column_mapping = {
        'Assessor Full Name': 'Assessor_Name',
        'Programme': 'Programme_Level',
        'Apprenticeship Standard Title': 'Standard_Title',
        'Apprenticeship Achieved Date': 'Achieved_Date',
        'Start Date': 'Start_Date',
        'Learning Expected End': 'Expected_End',
        'Actual End': 'Actual_End'
    }
    df.rename(columns=column_mapping, inplace=True)

    # Clean assessor names
    df['Assessor_Name'] = df['Assessor_Name'].apply(clean_name)

    # Convert dates with validation and proper error handling
    date_columns = ['Achieved_Date', 'Start_Date', 'Expected_End', 'Actual_End']
    for col in date_columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')
        invalid_dates = df[col].isna()
        invalid_count = invalid_dates.sum()
        if invalid_count > 0:
            logging.warning(f"Found {invalid_count} invalid dates in {col}")
            if col not in ['Achieved_Date', 'Actual_End']:  # These can legitimately be null
                logging.warning("Sample of rows with invalid dates:")
                sample_invalid = df[invalid_dates].head()
                for _, row in sample_invalid.iterrows():
                    logging.warning(f"Invalid {col} for {row['Assessor_Name']}: {row['Standard_Title']}")

    # Clean titles and match with hours lookup
    df['Standard_Title'] = df['Standard_Title'].str.strip()

    # Add hours from lookup with validation
    df = df.merge(
        hours_lookup,
        on='Standard_Title',
        how='left',
        validate='m:1'
    )

    # Check for unmatched standards
    unmatched = df[df['Monthly_Hours'].isna()]['Standard_Title'].unique()
    if len(unmatched) > 0:
        logging.warning(f"Found {len(unmatched)} standards without matching hours:")
        for title in unmatched:
            logging.warning(f"  - {title}")
    df['Monthly_Hours'] = df['Monthly_Hours'].fillna(0)

    # Create monthly snapshots with improved date handling
    monthly_data = []
    # Month-end dates for FY 2023/24 ('M'; newer pandas prefers the alias 'ME')
    date_range = pd.date_range(start='2023-04-01', end='2024-03-31', freq='M')

    logging.info("Generating monthly snapshots...")
    for date in date_range:
        month_start = date.replace(day=1)
        month_end = date
        logging.info(f"Processing month: {month_start.strftime('%B %Y')}")

        # Get active students for this month with proper date comparison:
        # started on or before month end, and not ended before month start
        month_mask = (
            (df['Start_Date'].notna() & (df['Start_Date'] <= month_end))
            & ((df['Actual_End'].isna() | (df['Actual_End'] >= month_start))
               | (df['Expected_End'].isna() | (df['Expected_End'] >= month_start)))
        )
        month_data = df[month_mask].copy()

        if not month_data.empty:
            month_data['Snapshot_Date'] = month_end
            month_data['Year'] = month_end.year
            month_data['Month'] = month_end.month

            # Add contract info for this month
            def get_assessor_contract(row):
                staff_contracts = contract_history[
                    contract_history['Staff_Name'] == row['Assessor_Name']
                ]
                relevant_contracts = staff_contracts[
                    (staff_contracts['Start_Date'] <= month_end) &
                    (staff_contracts['End_Date'] >= month_start)
                ]
                if len(relevant_contracts) > 0:
                    positions = relevant_contracts['Position'].unique()
                    contract_types = relevant_contracts['Contract_Type'].unique()
                    # Calculate total target hours only for salaried contracts
                    total_target = relevant_contracts[
                        relevant_contracts['Contract_Type'].str.contains('Salaried', na=False)
                    ]['Target_Hours'].sum()
                    return pd.Series({
                        'Assessor_Position': ' & '.join(positions),
                        'Assessor_Contract': ' & '.join(contract_types),
                        'Assessor_Target_Hours': total_target if total_target > 0 else None,
                        'Multiple_Contracts': len(relevant_contracts) > 1
                    })
                return pd.Series({
                    'Assessor_Position': None,
                    'Assessor_Contract': None,
                    'Assessor_Target_Hours': None,
                    'Multiple_Contracts': False
                })

            # Add contract details for the month
            contract_details = month_data.apply(get_assessor_contract, axis=1)
            month_data = pd.concat([month_data, contract_details], axis=1)
            monthly_data.append(month_data)
            logging.info(f"Active students in {month_start.strftime('%B %Y')}: {len(month_data)}")

    # Combine monthly data with empty DataFrame handling
    monthly_df = pd.concat(monthly_data, ignore_index=True) if monthly_data else pd.DataFrame()
    if monthly_df.empty:
        logging.warning("No monthly data generated!")
        return {}

    # Create summaries with proper grouping
    logging.info("Creating summary views...")

    # Monthly summary per assessor; dropna=False keeps assessors whose
    # Position/Contract are None because no contract matched the month
    monthly_summary = monthly_df.groupby(
        ['Assessor_Name', 'Year', 'Month',
         'Assessor_Position', 'Assessor_Contract', 'Multiple_Contracts'],
        dropna=False
    ).agg({
        'Standard_Title': 'nunique',
        'Monthly_Hours': 'sum',
        'Assessor_Target_Hours': 'first'
    }).reset_index()
    monthly_summary.rename(columns={
        'Standard_Title': 'Active_Students',
        'Monthly_Hours': 'Required_Hours'
    }, inplace=True)

    # Programme level summary
    programme_summary = monthly_df.groupby(
        ['Assessor_Name', 'Programme_Level', 'Year', 'Month']
    ).agg({
        'Standard_Title': 'nunique',
        'Assessor_Position': 'first',
        'Assessor_Contract': 'first',
        'Multiple_Contracts': 'first',
        'Assessor_Target_Hours': 'first'
    }).reset_index()
    programme_summary.rename(columns={
        'Standard_Title': 'Students_In_Programme'
    }, inplace=True)

    # Standard title summary. Named aggregation is used here because
    # counting 'Assessor_Name' via an agg dict raises a KeyError once that
    # column becomes a groupby key; counting Monthly_Hours (never null
    # after the fillna above) gives the same per-group row count.
    standard_summary = monthly_df.groupby(
        ['Assessor_Name', 'Standard_Title', 'Year', 'Month']
    ).agg(
        Required_Hours=('Monthly_Hours', 'sum'),
        Students_In_Standard=('Monthly_Hours', 'count'),
        Assessor_Position=('Assessor_Position', 'first'),
        Assessor_Contract=('Assessor_Contract', 'first'),
        Multiple_Contracts=('Multiple_Contracts', 'first'),
        Assessor_Target_Hours=('Assessor_Target_Hours', 'first')
    ).reset_index()

    logging.info("PICS data processing completed!")

    return {
        'detailed_monthly': monthly_df,
        'monthly_summary': monthly_summary,
        'programme_summary': programme_summary,
        'standard_summary': standard_summary,
        'hours_reference': hours_lookup
    }
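
# The cell header promises "Main Execution", but no driver code appears in
# the paste. Below is a minimal sketch of what that step might look like.
# The file names, the contract_history variable (assumed to come from an
# earlier cell) and the output workbook are all assumptions, not part of
# the original.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    pics_results = process_pics_data(
        caseload_file='pics_caseload.xlsx',          # assumed filename
        hours_lookup_file='pics_hours_lookup.xlsx',  # assumed filename
        contract_history=contract_history            # from an earlier cell
    )

    # Each value in the returned dict is a DataFrame; one simple way to
    # review them is to write each to its own sheet of a workbook.
    if pics_results:
        with pd.ExcelWriter('pics_review.xlsx') as writer:
            for name, frame in pics_results.items():
                # Excel caps sheet names at 31 characters
                frame.to_excel(writer, sheet_name=name[:31], index=False)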