# Paste metadata (non-code): C3 — user_1718919, plain_text, 10 months ago, 10 kB, 7 views, Indexable
def calculate_activity_hours(row: pd.Series) -> Tuple[str, float]:
    """
    Calculate and categorize activity hours for a single report row.

    Args:
        row: DataFrame row containing activity information; must provide
            'ACTIVITYTYPE' and 'Actual_Hours' entries.

    Returns:
        Tuple of (activity_category, actual_hours):
        - 'Assessment' for any known assessment activity type;
        - the activity name itself for 'Community Engagement' and
          'Tutorials/Drop Ins';
        - 'Other' for everything else, including a missing activity type.
    """
    # Known assessment activity names; a set gives O(1) membership tests.
    assessment_types = {
        'Access Assessments', 'Assessments', 'Beauty OL Assess',
        'CFL PA Vols Assess', 'Creative Assessments', 'E&M ApprenticeAssess',
        'English FS Assess', 'English GCSE Assess', 'ESOL F2F Assess',
        'ESOL Online Assess', 'Hair/Barb F2F Assess', 'Maths Assessments',
        'SWiS Assessments',
    }
    # Bug fix: hours are coerced to float on EVERY path now. Previously the
    # missing-activity branch returned the raw 'Actual_Hours' value, which
    # could violate the declared Tuple[str, float] return type.
    hours = float(row['Actual_Hours'])
    if pd.isna(row['ACTIVITYTYPE']):
        return 'Other', hours
    activity = str(row['ACTIVITYTYPE']).strip()
    if activity in assessment_types:
        return 'Assessment', hours
    if activity in ('Community Engagement', 'Tutorials/Drop Ins'):
        return activity, hours
    return 'Other', hours
def process_tutor_report(
    file_path: str,
    staff_data_path: str,
    contract_history: pd.DataFrame
) -> Optional[Dict[str, pd.DataFrame]]:
    """
    Process the TutorPayReport and link with staff data.

    Reads the tutor pay report, validates required columns, computes actual
    hours (potential hours minus early/late time adjustments), categorizes
    activities, and builds daily / weekly / monthly / venue / curriculum
    summaries, attaching per-month contract details and utilization.

    Args:
        file_path: Path to tutor pay report file (read via safe_read_excel).
        staff_data_path: Path to processed staff data (CSV).
        contract_history: Contract history DataFrame; accessed below through
            the columns Staff_Name, Start_Date, End_Date, Position,
            Contract_Type and Target_Hours.

    Returns:
        Dictionary of processed DataFrames or None if processing fails
        (read error, validation error, or no utilization data generated).
    """
    logging.info("Processing Tutor Pay Report...")
    # Read files with error handling
    df, error = safe_read_excel(file_path)
    if error:
        logging.error(f"Failed to read tutor pay report: {error}")
        return None
    # NOTE(review): staff_df is loaded but never used in this function —
    # confirm whether the "link with staff data" step was dropped on purpose.
    staff_df = pd.read_csv(staff_data_path)
    # Columns the report must contain before any processing is attempted.
    required_columns = [
        'PEOPLENAME', 'PEOPLESURNAME', 'STAFFROLE', 'CONTRACTNAME',
        'ACTIVITYTYPE', 'Potential Hours', 'TIMEADJUSTEARLY',
        'TIMEADJUSTLATE', 'EVENTDATE', 'Course Funding Group',
        'Course Prov Mon C', 'Course Prov Mon D', 'VENUE'
    ]
    try:
        validate_processed_data(df, required_columns, 'tutor_pay_report')
    except ValueError as e:
        logging.error(f"Validation failed: {str(e)}")
        return None
    # Clean and standardize names, then build a single "First Last" key that
    # is later matched against contract_history['Staff_Name'].
    name_columns = ['PEOPLENAME', 'PEOPLESURNAME']
    df = standardize_names(df, name_columns)
    df['Staff_Standard'] = df.apply(
        lambda x: f"{x['PEOPLENAME']} {x['PEOPLESURNAME']}", axis=1
    )
    # Filter staff roles and clean: only Tutor / Learning Support rows are
    # kept; any other role is logged and dropped.
    df['STAFFROLE'] = df['STAFFROLE'].apply(clean_name)
    role_mask = df['STAFFROLE'].isin(['Tutor', 'Learning Support'])
    invalid_roles = df[~role_mask]['STAFFROLE'].unique()
    if len(invalid_roles) > 0:
        logging.warning(f"Found invalid staff roles: {invalid_roles}")
    df = df[role_mask].copy()
    # Calculate actual hours with time adjustments
    logging.info("Calculating actual hours...")
    # Convert time adjustments; non-numeric values coerce to 0 (treated as
    # "no adjustment"), and 0 short-circuits past convert_time_to_hours.
    for col in ['TIMEADJUSTEARLY', 'TIMEADJUSTLATE']:
        df[f'{col}_Hours'] = pd.to_numeric(df[col], errors='coerce').fillna(0).apply(
            lambda x: convert_time_to_hours(x) if x != 0 else 0
        )
    # Calculate actual hours: potential minus both adjustments, floored at 0.
    df['Potential Hours'] = pd.to_numeric(df['Potential Hours'], errors='coerce').fillna(0)
    df['Actual_Hours'] = df.apply(
        lambda x: max(0, x['Potential Hours'] -
                      (x['TIMEADJUSTEARLY_Hours'] + x['TIMEADJUSTLATE_Hours'])),
        axis=1
    )
    # Verify hour calculations — zero hours means the adjustments consumed
    # the whole session, or 'Potential Hours' failed numeric coercion.
    zero_hours = df['Actual_Hours'].eq(0).sum()
    if zero_hours > 0:
        logging.warning(f"Found {zero_hours} entries with zero actual hours")
        logging.debug("Sample of zero-hour entries:")
        sample_zero = df[df['Actual_Hours'] == 0].head()
        for _, row in sample_zero.iterrows():
            logging.debug(f" Activity: {row['ACTIVITYTYPE']}, "
                          f"Potential: {row['Potential Hours']}, "
                          f"Adjustments: {row['TIMEADJUSTEARLY_Hours'] + row['TIMEADJUSTLATE_Hours']}")
    # Process dates and create time periods. Dates must look like
    # "01 Jan 2024" (%d %b %Y); anything else becomes NaT.
    df['EVENTDATE'] = pd.to_datetime(df['EVENTDATE'], format='%d %b %Y', errors='coerce')
    invalid_dates = df['EVENTDATE'].isna().sum()
    if invalid_dates > 0:
        logging.warning(f"Found {invalid_dates} invalid event dates")
        logging.debug("Sample of invalid dates:")
        # NOTE(review): this samples the already-coerced column, so every
        # printed value is NaT rather than the offending raw string.
        sample_invalid = df[df['EVENTDATE'].isna()]['EVENTDATE'].head()
        for date in sample_invalid:
            logging.debug(f" Invalid date value: {date}")
    # Year/Month/Week are NaN for rows whose EVENTDATE failed to parse.
    df['Year'] = df['EVENTDATE'].dt.year
    df['Month'] = df['EVENTDATE'].dt.month
    df['Week'] = df['EVENTDATE'].dt.isocalendar().week
    # Categorize activities and calculate hours; zip(*...) splits the
    # (category, hours) tuples into two new columns in one pass.
    logging.info("Categorizing activities...")
    df['Activity_Category'], df['Category_Hours'] = zip(*df.apply(calculate_activity_hours, axis=1))
    # Keep required columns
    columns_to_keep = [
        'Staff_Standard', 'STAFFROLE', 'CONTRACTNAME',
        'Activity_Category', 'Actual_Hours', 'Category_Hours',
        'EVENTDATE', 'Year', 'Month', 'Week',
        'Course Funding Group', 'Course Prov Mon C',
        'Course Prov Mon D', 'VENUE'
    ]
    df = df[columns_to_keep]
    # Create monthly summaries
    logging.info("Creating monthly summaries...")
    monthly_hours = df.groupby(
        ['Staff_Standard', 'Year', 'Month', 'Activity_Category']
    ).agg({
        'Actual_Hours': 'sum',
        'Category_Hours': 'sum'
    }).reset_index()

    # Get contract details for each month
    def get_monthly_contract(row):
        """Return contract details active on the 1st of the row's month.

        A contract is considered applicable when its Start_Date/End_Date
        span covers the first day of the month. Overlapping contracts are
        combined: labels joined with ' & ', target hours summed.
        """
        month_date = pd.Timestamp(year=int(row['Year']), month=int(row['Month']), day=1)
        staff_contracts = contract_history[
            contract_history['Staff_Name'] == row['Staff_Standard']
        ]
        # Find applicable contracts for the month
        relevant_contracts = staff_contracts[
            (staff_contracts['Start_Date'] <= month_date) &
            (staff_contracts['End_Date'] >= month_date)
        ]
        if len(relevant_contracts) > 0:
            # If multiple contracts, combine information
            positions = relevant_contracts['Position'].unique()
            contract_types = relevant_contracts['Contract_Type'].unique()
            total_target = relevant_contracts['Target_Hours'].sum()
            return pd.Series({
                'Position': ' & '.join(positions),
                'Contract_Type': ' & '.join(contract_types),
                'Target_Hours': total_target,
                'Multiple_Contracts': len(relevant_contracts) > 1
            })
        # No contract covers this month.
        return pd.Series({
            'Position': None,
            'Contract_Type': None,
            'Target_Hours': None,
            'Multiple_Contracts': False
        })
    # Add contract info to monthly summary
    logging.info("Adding contract information...")
    contract_details = monthly_hours.apply(get_monthly_contract, axis=1)
    monthly_hours = pd.concat([monthly_hours, contract_details], axis=1)
    # Calculate utilization per staff member per month.
    logging.info("Calculating monthly utilization...")
    monthly_utilization = []
    for name in monthly_hours['Staff_Standard'].unique():
        staff_monthly = monthly_hours[
            monthly_hours['Staff_Standard'] == name
        ].copy()
        # Process each month
        for month_idx, month_group in staff_monthly.groupby(['Year', 'Month']):
            year, month = month_idx
            # Only calculate utilization if we have target hours.
            # NOTE(review): .iloc[0] assumes every category row of a month
            # carries identical Target_Hours — true for the per-month
            # get_monthly_contract output above, but worth keeping in mind.
            if pd.notna(month_group['Target_Hours'].iloc[0]):
                utilization = calculate_monthly_utilization(
                    month_group,
                    month_group['Target_Hours'].iloc[0],
                    exclude_categories=['Assessment']
                )
                # Write the single utilization figure back onto every row
                # of this staff member's month.
                staff_monthly.loc[
                    (staff_monthly['Year'] == year) &
                    (staff_monthly['Month'] == month),
                    'Utilization_Percentage'
                ] = utilization
        monthly_utilization.append(staff_monthly)
    if not monthly_utilization:
        logging.warning("No monthly utilization data generated!")
        return None
    monthly_summary = pd.concat(monthly_utilization)
    # Create additional summaries
    logging.info("Creating additional summaries...")
    # Daily summary
    daily_summary = df.groupby(
        ['Staff_Standard', 'EVENTDATE', 'Activity_Category']
    ).agg({
        'Actual_Hours': 'sum',
        'Category_Hours': 'sum'
    }).reset_index()
    # Weekly summary
    weekly_summary = df.groupby(
        ['Staff_Standard', 'Year', 'Week', 'Activity_Category']
    ).agg({
        'Actual_Hours': 'sum',
        'Category_Hours': 'sum'
    }).reset_index()
    # Add total hours columns for PowerBI: split category hours into
    # assessment vs non-assessment measures on each row.
    monthly_summary['Total_Hours'] = monthly_summary['Actual_Hours']
    monthly_summary['Assessment_Hours'] = monthly_summary.apply(
        lambda x: x['Category_Hours'] if x['Activity_Category'] == 'Assessment' else 0,
        axis=1
    )
    monthly_summary['Non_Assessment_Hours'] = monthly_summary.apply(
        lambda x: x['Category_Hours'] if x['Activity_Category'] != 'Assessment' else 0,
        axis=1
    )
    # Venue and curriculum summaries
    venue_summary = df.groupby(
        ['Staff_Standard', 'Year', 'Month', 'VENUE']
    ).agg({
        'Actual_Hours': 'sum',
        'Category_Hours': 'sum'
    }).reset_index()
    curriculum_summary = df.groupby(
        ['Staff_Standard', 'Year', 'Month', 'Course Prov Mon C']
    ).agg({
        'Actual_Hours': 'sum',
        'Category_Hours': 'sum'
    }).reset_index()
    logging.info("Tutor pay report processing completed!")
    # Return all processed data
    return {
        'daily_summary': daily_summary,
        'weekly_summary': weekly_summary,
        'monthly_summary': monthly_summary,
        'venue_summary': venue_summary,
        'curriculum_summary': curriculum_summary,
        'detailed_data': df
    }