C3
user_1718919
plain_text
4 months ago
10 kB
4
Indexable
# Activity types rolled up under the 'Assessment' category.
# Hoisted to module level as a frozenset so the per-row apply() below does
# not rebuild the list and rescan it linearly for every row.
_ASSESSMENT_TYPES = frozenset([
    'Access Assessments', 'Assessments', 'Beauty OL Assess',
    'CFL PA Vols Assess', 'Creative Assessments', 'E&M ApprenticeAssess',
    'English FS Assess', 'English GCSE Assess', 'ESOL F2F Assess',
    'ESOL Online Assess', 'Hair/Barb F2F Assess', 'Maths Assessments',
    'SWiS Assessments',
])


def calculate_activity_hours(row: pd.Series) -> Tuple[str, float]:
    """
    Calculate and categorize activity hours for a single report row.

    Args:
        row: DataFrame row containing at least 'ACTIVITYTYPE' and
            'Actual_Hours'.

    Returns:
        Tuple of (activity_category, actual_hours). Category is
        'Assessment' for any known assessment activity type, the activity
        name itself for 'Community Engagement' / 'Tutorials/Drop Ins', and
        'Other' for everything else (including a missing activity type).
    """
    # FIX: coerce hours to float up front so the missing-activity path
    # returns the same type as the normal path (the original returned the
    # raw cell value when ACTIVITYTYPE was NaN, despite the float annotation).
    hours = float(row['Actual_Hours'])

    if pd.isna(row['ACTIVITYTYPE']):
        return 'Other', hours

    activity = str(row['ACTIVITYTYPE']).strip()

    if activity in _ASSESSMENT_TYPES:
        return 'Assessment', hours
    elif activity in ('Community Engagement', 'Tutorials/Drop Ins'):
        return activity, hours
    return 'Other', hours


def process_tutor_report(
    file_path: str,
    staff_data_path: str,
    contract_history: pd.DataFrame
) -> Optional[Dict[str, pd.DataFrame]]:
    """
    Process the TutorPayReport and link with staff data.

    Args:
        file_path: Path to tutor pay report file (Excel).
        staff_data_path: Path to processed staff data (CSV).
        contract_history: Contract history DataFrame; read below via its
            'Staff_Name', 'Start_Date', 'End_Date', 'Position',
            'Contract_Type' and 'Target_Hours' columns.

    Returns:
        Dictionary of processed DataFrames ('daily_summary',
        'weekly_summary', 'monthly_summary', 'venue_summary',
        'curriculum_summary', 'detailed_data') or None if processing fails.
    """
    logging.info("Processing Tutor Pay Report...")

    # Read files with error handling
    df, error = safe_read_excel(file_path)
    if error:
        logging.error(f"Failed to read tutor pay report: {error}")
        return None

    # NOTE(review): staff_df is loaded but never used below; the read is kept
    # so a missing/unreadable staff file still fails loudly here — confirm
    # whether a join against staff data was intended.
    staff_df = pd.read_csv(staff_data_path)

    required_columns = [
        'PEOPLENAME', 'PEOPLESURNAME', 'STAFFROLE', 'CONTRACTNAME',
        'ACTIVITYTYPE', 'Potential Hours', 'TIMEADJUSTEARLY',
        'TIMEADJUSTLATE', 'EVENTDATE', 'Course Funding Group',
        'Course Prov Mon C', 'Course Prov Mon D', 'VENUE'
    ]
    try:
        validate_processed_data(df, required_columns, 'tutor_pay_report')
    except ValueError as e:
        logging.error(f"Validation failed: {str(e)}")
        return None

    # Clean and standardize names, then build the canonical staff key used
    # by every summary below.
    name_columns = ['PEOPLENAME', 'PEOPLESURNAME']
    df = standardize_names(df, name_columns)
    df['Staff_Standard'] = df.apply(
        lambda x: f"{x['PEOPLENAME']} {x['PEOPLESURNAME']}",
        axis=1
    )

    # Filter staff roles and clean
    df['STAFFROLE'] = df['STAFFROLE'].apply(clean_name)
    role_mask = df['STAFFROLE'].isin(['Tutor', 'Learning Support'])
    invalid_roles = df[~role_mask]['STAFFROLE'].unique()
    if len(invalid_roles) > 0:
        logging.warning(f"Found invalid staff roles: {invalid_roles}")
    df = df[role_mask].copy()

    # FIX: guard the empty case — the zip(*df.apply(...)) in the
    # categorization step below raises on a zero-row frame.
    if df.empty:
        logging.warning("No Tutor/Learning Support rows found after filtering")
        return None

    # Calculate actual hours with time adjustments
    logging.info("Calculating actual hours...")

    # Convert time adjustments (0 / non-numeric -> 0 hours)
    for col in ['TIMEADJUSTEARLY', 'TIMEADJUSTLATE']:
        df[f'{col}_Hours'] = pd.to_numeric(df[col], errors='coerce').fillna(0).apply(
            lambda x: convert_time_to_hours(x) if x != 0 else 0
        )

    # Actual hours = potential hours minus early/late adjustments, floored at 0
    df['Potential Hours'] = pd.to_numeric(df['Potential Hours'], errors='coerce').fillna(0)
    df['Actual_Hours'] = df.apply(
        lambda x: max(0, x['Potential Hours'] -
                      (x['TIMEADJUSTEARLY_Hours'] + x['TIMEADJUSTLATE_Hours'])),
        axis=1
    )

    # Verify hour calculations
    zero_hours = df['Actual_Hours'].eq(0).sum()
    if zero_hours > 0:
        logging.warning(f"Found {zero_hours} entries with zero actual hours")
        logging.debug("Sample of zero-hour entries:")
        sample_zero = df[df['Actual_Hours'] == 0].head()
        for _, row in sample_zero.iterrows():
            logging.debug(f" Activity: {row['ACTIVITYTYPE']}, "
                          f"Potential: {row['Potential Hours']}, "
                          f"Adjustments: {row['TIMEADJUSTEARLY_Hours'] + row['TIMEADJUSTLATE_Hours']}")

    # Process dates and create time periods.
    # FIX: snapshot the raw values BEFORE coercion — the original sampled the
    # already-coerced column, so every "invalid date" debug line printed NaT
    # instead of the offending source value.
    raw_event_dates = df['EVENTDATE'].copy()
    df['EVENTDATE'] = pd.to_datetime(df['EVENTDATE'], format='%d %b %Y', errors='coerce')
    invalid_mask = df['EVENTDATE'].isna()
    invalid_dates = invalid_mask.sum()
    if invalid_dates > 0:
        logging.warning(f"Found {invalid_dates} invalid event dates")
        logging.debug("Sample of invalid dates:")
        for date in raw_event_dates[invalid_mask].head():
            logging.debug(f" Invalid date value: {date}")

    df['Year'] = df['EVENTDATE'].dt.year
    df['Month'] = df['EVENTDATE'].dt.month
    df['Week'] = df['EVENTDATE'].dt.isocalendar().week

    # Categorize activities and calculate hours
    logging.info("Categorizing activities...")
    df['Activity_Category'], df['Category_Hours'] = zip(
        *df.apply(calculate_activity_hours, axis=1)
    )

    # Keep required columns
    columns_to_keep = [
        'Staff_Standard', 'STAFFROLE', 'CONTRACTNAME', 'Activity_Category',
        'Actual_Hours', 'Category_Hours', 'EVENTDATE', 'Year', 'Month',
        'Week', 'Course Funding Group', 'Course Prov Mon C',
        'Course Prov Mon D', 'VENUE'
    ]
    df = df[columns_to_keep]

    # Create monthly summaries
    logging.info("Creating monthly summaries...")
    monthly_hours = df.groupby(
        ['Staff_Standard', 'Year', 'Month', 'Activity_Category']
    ).agg({
        'Actual_Hours': 'sum',
        'Category_Hours': 'sum'
    }).reset_index()

    def get_monthly_contract(row):
        """Look up the contract(s) covering the first day of the row's month."""
        month_date = pd.Timestamp(year=int(row['Year']), month=int(row['Month']), day=1)
        staff_contracts = contract_history[
            contract_history['Staff_Name'] == row['Staff_Standard']
        ]
        # Contracts whose [Start_Date, End_Date] span includes month_date
        relevant_contracts = staff_contracts[
            (staff_contracts['Start_Date'] <= month_date) &
            (staff_contracts['End_Date'] >= month_date)
        ]
        if len(relevant_contracts) > 0:
            # If multiple contracts, combine information
            positions = relevant_contracts['Position'].unique()
            contract_types = relevant_contracts['Contract_Type'].unique()
            total_target = relevant_contracts['Target_Hours'].sum()
            return pd.Series({
                'Position': ' & '.join(positions),
                'Contract_Type': ' & '.join(contract_types),
                'Target_Hours': total_target,
                'Multiple_Contracts': len(relevant_contracts) > 1
            })
        return pd.Series({
            'Position': None,
            'Contract_Type': None,
            'Target_Hours': None,
            'Multiple_Contracts': False
        })

    # Add contract info to monthly summary
    logging.info("Adding contract information...")
    contract_details = monthly_hours.apply(get_monthly_contract, axis=1)
    monthly_hours = pd.concat([monthly_hours, contract_details], axis=1)

    # Calculate utilization per staff member per month
    logging.info("Calculating monthly utilization...")
    monthly_utilization = []
    for name in monthly_hours['Staff_Standard'].unique():
        staff_monthly = monthly_hours[
            monthly_hours['Staff_Standard'] == name
        ].copy()

        for (year, month), month_group in staff_monthly.groupby(['Year', 'Month']):
            # Only calculate utilization if we have target hours.
            # NOTE(review): Target_Hours is assumed uniform within a month
            # (every row gets the same contract lookup), hence iloc[0] —
            # confirm against get_monthly_contract's output.
            if pd.notna(month_group['Target_Hours'].iloc[0]):
                utilization = calculate_monthly_utilization(
                    month_group,
                    month_group['Target_Hours'].iloc[0],
                    exclude_categories=['Assessment']
                )
                staff_monthly.loc[
                    (staff_monthly['Year'] == year) &
                    (staff_monthly['Month'] == month),
                    'Utilization_Percentage'
                ] = utilization

        monthly_utilization.append(staff_monthly)

    if not monthly_utilization:
        logging.warning("No monthly utilization data generated!")
        return None

    monthly_summary = pd.concat(monthly_utilization)

    # Create additional summaries
    logging.info("Creating additional summaries...")

    # Daily summary
    daily_summary = df.groupby(
        ['Staff_Standard', 'EVENTDATE', 'Activity_Category']
    ).agg({
        'Actual_Hours': 'sum',
        'Category_Hours': 'sum'
    }).reset_index()

    # Weekly summary
    weekly_summary = df.groupby(
        ['Staff_Standard', 'Year', 'Week', 'Activity_Category']
    ).agg({
        'Actual_Hours': 'sum',
        'Category_Hours': 'sum'
    }).reset_index()

    # Add total hours columns for PowerBI
    monthly_summary['Total_Hours'] = monthly_summary['Actual_Hours']
    monthly_summary['Assessment_Hours'] = monthly_summary.apply(
        lambda x: x['Category_Hours'] if x['Activity_Category'] == 'Assessment' else 0,
        axis=1
    )
    monthly_summary['Non_Assessment_Hours'] = monthly_summary.apply(
        lambda x: x['Category_Hours'] if x['Activity_Category'] != 'Assessment' else 0,
        axis=1
    )

    # Venue and curriculum summaries
    venue_summary = df.groupby(
        ['Staff_Standard', 'Year', 'Month', 'VENUE']
    ).agg({
        'Actual_Hours': 'sum',
        'Category_Hours': 'sum'
    }).reset_index()

    curriculum_summary = df.groupby(
        ['Staff_Standard', 'Year', 'Month', 'Course Prov Mon C']
    ).agg({
        'Actual_Hours': 'sum',
        'Category_Hours': 'sum'
    }).reset_index()

    logging.info("Tutor pay report processing completed!")

    # Return all processed data
    return {
        'daily_summary': daily_summary,
        'weekly_summary': weekly_summary,
        'monthly_summary': monthly_summary,
        'venue_summary': venue_summary,
        'curriculum_summary': curriculum_summary,
        'detailed_data': df
    }
Editor is loading...
Leave a Comment