C8
user_1718919
plain_text
10 months ago
7.6 kB
5
Indexable
import pandas as pd
import numpy as np
import os
from datetime import datetime
def clean_name(name):
"""Clean extra spaces from names"""
if pd.isna(name):
return name
return ' '.join(str(name).split())
def process_tutor_report(file_path, staff_data_path):
"""Process the TutorPayReport and link with staff data"""
print(f"\nReading tutor report from: {file_path}")
print(f"Reading staff data from: {staff_data_path}")
# Read the TutorPayReport
df = pd.read_excel(file_path)
print(f"\nTutor report shape: {df.shape}")
# Clean name fields
df['PEOPLENAME'] = df['PEOPLENAME'].apply(clean_name)
df['PEOPLESURNAME'] = df['PEOPLESURNAME'].apply(clean_name)
# Read the processed staff data for name matching
staff_df = pd.read_csv(staff_data_path)
print(f"\nStaff data shape: {staff_df.shape}")
# Create standardized name format for matching
df['Staff_Standard'] = df.apply(
lambda x: f"{x['PEOPLENAME']} {x['PEOPLESURNAME']}", axis=1)
print("\nSample of cleaned names from tutor report:")
print(df[['Staff_Standard']].head())
print("\nSample of names from staff data:")
print(staff_df[['Staff_Standard']].head())
# Filter for relevant staff roles
df = df[df['STAFFROLE'].isin(['Tutor', 'Learning Support'])]
# Process time adjustments
def convert_ms_to_hours(ms_value):
if pd.isna(ms_value) or ms_value == 0:
return 0
return abs(float(ms_value)) / (1000 * 60 * 60)
# Convert time adjustments to hours, handling potential string values
df['EarlyAdjust'] = pd.to_numeric(df['TIMEADJUSTEARLY'], errors='coerce').fillna(0)
df['LateAdjust'] = pd.to_numeric(df['TIMEADJUSTLATE'], errors='coerce').fillna(0)
df['EarlyAdjust'] = df['EarlyAdjust'].apply(convert_ms_to_hours)
df['LateAdjust'] = df['LateAdjust'].apply(convert_ms_to_hours)
# Calculate actual hours
df['Potential Hours'] = pd.to_numeric(df['Potential Hours'], errors='coerce').fillna(0)
df['Actual_Hours'] = df.apply(
lambda x: max(0, x['Potential Hours'] - (x['EarlyAdjust'] + x['LateAdjust'])),
axis=1
)
# Group activity types
assessment_types = [
'Access Assessments', 'Assessments', 'Beauty OL Assess',
'CFL PA Vols Assess', 'Creative Assessments', 'E&M ApprenticeAssess',
'English FS Assess', 'English GCSE Assess', 'ESOL F2F Assess',
'ESOL Online Assess', 'Hair/Barb F2F Assess', 'Maths Assessments',
'SWiS Assessments'
]
def categorize_activity(activity):
if pd.isna(activity):
return 'Other'
activity = str(activity).strip()
if activity in assessment_types:
return 'Assessment'
elif activity in ['Community Engagement', 'Tutorials/Drop Ins']:
return activity
return 'Other'
df['Activity_Category'] = df['ACTIVITYTYPE'].apply(categorize_activity)
# Convert event date to datetime
df['EVENTDATE'] = pd.to_datetime(df['EVENTDATE'], format='%d %b %Y', errors='coerce')
print("\nProcessing summaries...")
# Create summary dataframes
# 1. Daily summary
daily_summary = df.groupby(
['Staff_Standard', 'EVENTDATE', 'Activity_Category']
).agg({
'Actual_Hours': 'sum'
}).reset_index()
print(f"Daily summary records: {len(daily_summary)}")
# 2. Weekly summary
df['Week'] = df['EVENTDATE'].dt.isocalendar().week
df['Year'] = df['EVENTDATE'].dt.isocalendar().year
weekly_summary = df.groupby(
['Staff_Standard', 'Year', 'Week', 'Activity_Category']
).agg({
'Actual_Hours': 'sum'
}).reset_index()
print(f"Weekly summary records: {len(weekly_summary)}")
# 3. Monthly summary
df['Month'] = df['EVENTDATE'].dt.month
monthly_summary = df.groupby(
['Staff_Standard', 'Year', 'Month', 'Activity_Category']
).agg({
'Actual_Hours': 'sum'
}).reset_index()
print(f"Monthly summary records: {len(monthly_summary)}")
def calculate_utilization(group_df, target_hours, period_divisor):
"""Calculate utilization percentage"""
non_assessment_hours = group_df[
group_df['Activity_Category'] != 'Assessment'
]['Actual_Hours'].sum()
period_target = target_hours / period_divisor
return (non_assessment_hours / period_target * 100) if period_target > 0 else 0
# Add utilization to monthly summary
monthly_utilization = []
for staff in monthly_summary['Staff_Standard'].unique():
staff_records = staff_df[staff_df['Staff_Standard'] == staff]
if len(staff_records) > 0:
staff_target = staff_records['Target_Hours'].iloc[-1]
staff_monthly = monthly_summary[
monthly_summary['Staff_Standard'] == staff
].copy()
if len(staff_monthly) > 0:
staff_monthly['Utilization_Percentage'] = calculate_utilization(
staff_monthly, staff_target, 12
)
monthly_utilization.append(staff_monthly)
if not monthly_utilization:
print("\nWARNING: No monthly utilization data was generated!")
return None
monthly_summary_with_utilization = pd.concat(monthly_utilization)
print("\nProcessing completed successfully!")
return {
'daily_summary': daily_summary,
'weekly_summary': weekly_summary,
'monthly_summary': monthly_summary_with_utilization,
'detailed_data': df
}
def save_processed_data(data_dict, output_folder):
"""Save all processed dataframes to CSV files"""
if not os.path.exists(output_folder):
os.makedirs(output_folder)
for name, df in data_dict.items():
output_file = os.path.join(output_folder, f'{name}.csv')
df.to_csv(output_file, index=False)
print(f"Saved {name} to {output_file}")
# Usage
if __name__ == "__main__":
# Print current directory and files for debugging
current_dir = os.getcwd()
print(f"Current working directory: {current_dir}")
print("\nFiles in current directory:")
for file in os.listdir(current_dir):
print(f"- {file}")
# Define paths
input_file = "TutorPayReport_New 23.24 FY 22.11.24.xlsx"
staff_data = os.path.join("processed_data", "staff_monthly_data.csv")
output_folder = "processed_data"
# Verify files exist
if not os.path.exists(input_file):
print(f"\nERROR: Could not find {input_file}")
print("Please make sure the TutorPayReport file is in the same folder as this notebook")
exit()
if not os.path.exists(staff_data):
print(f"\nERROR: Could not find {staff_data}")
print("Please make sure the processed_data folder contains staff_monthly_data.csv")
exit()
# Process the data
processed_data = process_tutor_report(input_file, staff_data)
if processed_data is not None:
# Save the results
save_processed_data(processed_data, output_folder)
print("\nSummary stats:")
for name, df in processed_data.items():
print(f"{name}: {len(df)} records")
else:
print("\nERROR: Data processing failed!")Editor is loading...
Leave a Comment