Untitled
user_1718919
plain_text
10 months ago
12 kB
4
Indexable
# Import necessary libraries
import pandas as pd
import numpy as np
from pathlib import Path
import os
import re
from datetime import datetime
# Create processed_data directory if it doesn't exist
if not os.path.exists('processed_data'):
os.makedirs('processed_data')
# Helper functions for name standardization
def clean_string(s):
"""Remove extra spaces and standardize string format"""
if pd.isna(s):
return ''
return str(s).strip().replace(' ', ' ')
def extract_names_from_manager(manager_name):
"""Extract first name, known as, and last name from manager format"""
if pd.isna(manager_name) or manager_name == '':
return '', '', ''
try:
name_parts = manager_name.split(',')
if len(name_parts) != 2:
return manager_name, '', ''
first_part = name_parts[0].strip()
last_name = name_parts[1].strip()
first_parts = first_part.split(' ')
if len(first_parts) == 1:
return first_parts[0], '', last_name
else:
return first_parts[0], first_parts[-1], last_name
except:
return manager_name, '', ''
def standardize_staff_name(first_name, known_as, last_name):
"""Create standardized versions of staff names"""
first_name = clean_string(first_name)
known_as = clean_string(known_as)
last_name = clean_string(last_name)
if not known_as:
known_as = first_name
return {
'first_name': first_name,
'known_as': known_as,
'last_name': last_name,
'full_name_first': f"{first_name} {last_name}",
'full_name_known': f"{known_as} {last_name}"
}
def process_monthly_reports():
"""Process all monthly ACL reports"""
monthly_data = []
monthly_reports_path = Path('monthly_reports')
for file in monthly_reports_path.glob('ACL Monthly Report *.xlsx'):
# Extract month and year from filename
date_match = re.search(r'(\d{2})\.(\d{2})\.(\d{2})\.xlsx$', file.name)
if date_match:
day, month, year = date_match.groups()
report_date = f"20{year}-{month}-{day}"
df = pd.read_excel(file, skiprows=2)
df['report_date'] = pd.to_datetime(report_date)
# Clean column names
df.columns = [col.strip() for col in df.columns]
# Filter for Tutors and Learning Support Assistants
mask = df['Position Name'].isin(['Learning Support Assistant', 'Tutor'])
df = df[mask]
# Calculate target hours
df['working_hours'] = df['Working Hours'].fillna(0)
df['is_salaried'] = df['working_hours'] > 3
df['fte'] = df['Full-Time Equivalent'].fillna(0)
df['target_hours'] = np.where(df['is_salaried'], df['fte'] * 840, 0)
# Process names
df['first_name'] = df['First Name'].apply(clean_string)
df['known_as'] = df['Known As'].apply(clean_string)
df['last_name'] = df['Last Name'].apply(clean_string)
# Process line manager names
manager_names = df['Line Manager Name'].apply(extract_names_from_manager)
df['manager_first_name'] = [x[0] for x in manager_names]
df['manager_known_as'] = [x[1] for x in manager_names]
df['manager_last_name'] = [x[2] for x in manager_names]
monthly_data.append(df)
if monthly_data:
combined_df = pd.concat(monthly_data, ignore_index=True)
combined_df.to_csv('processed_data/processed_monthly_reports.csv', index=False)
return combined_df
else:
raise Exception("No monthly reports found in the specified directory")
def process_tutor_pay_report(file_path='TutorPayReport_New 23.24 FY 22.11.24.xlsx'):
"""Process the Tutor Pay Report"""
df = pd.read_excel(file_path)
# Clean column names
df.columns = [col.strip() for col in df.columns]
# Filter for relevant staff roles
df = df[df['STAFFROLE'].str.strip().isin(['Tutor', 'Learning Support'])]
# Clean names
df['PEOPLENAME'] = df['PEOPLENAME'].apply(clean_string)
df['PEOPLESURNAME'] = df['PEOPLESURNAME'].apply(clean_string)
# Group assessment activities
assessment_types = [
'Access Assessments', 'Assessments', 'Beauty OL Assess',
'CFL PA Vols Assess', 'Creative Assessments', 'E&M ApprenticeAssess',
'English FS Assess', 'English GCSE Assess', 'ESOL F2F Assess',
'ESOL Online Assess', 'Hair/Barb F2F Assess', 'Maths Assessments',
'SWiS Assessments'
]
df['ACTIVITY_GROUP'] = np.where(
df['ACTIVITYTYPE'].isin(assessment_types),
'Assessment',
df['ACTIVITYTYPE']
)
# Calculate actual hours
def convert_ms_to_hours(ms):
if pd.isna(ms):
return 0
return abs(float(ms)) / (1000 * 60 * 60)
df['early_adjust'] = df['TIMEADJUSTEARLY'].apply(convert_ms_to_hours)
df['late_adjust'] = df['TIMEADJUSTLATE'].apply(convert_ms_to_hours)
df['actual_hours'] = df['Potential Hours'] - (df['early_adjust'] + df['late_adjust'])
df['actual_hours'] = df['actual_hours'].clip(lower=0)
# Convert event date to datetime
df['EVENTDATE'] = pd.to_datetime(df['EVENTDATE'])
df.to_csv('processed_data/processed_tutor_pay_report.csv', index=False)
return df
def process_qualification_references():
"""Process both Approved and Archived qualification lists"""
# Read Approved List
approved_df = pd.read_excel('Approved List for Terms Caseloading FY2324.xlsx')
approved_df = approved_df[['QualificationName', 'QualificationNumber']].copy()
approved_df['source'] = 'approved'
# Read Archived List
archived_df = pd.read_excel('Archived List for Terms Caseloading FY2324.xlsx')
archived_df = archived_df[['QualificationName', 'QualificationNumber']].copy()
archived_df['source'] = 'archived'
# Combine dataframes
combined_df = pd.concat([approved_df, archived_df], ignore_index=True)
# Clean qualification numbers
def clean_qual_number(x):
if pd.isna(x):
return x
x = str(x)
if x.endswith('P') or x.endswith('N'):
return x[:-1]
return x
combined_df['QualificationNumber_clean'] = combined_df['QualificationNumber'].apply(clean_qual_number)
combined_df.to_csv('processed_data/processed_qualification_refs.csv', index=False)
return combined_df
def process_terms_caseloading(qual_ref_df):
"""Process Terms caseloading data with qualification reference lookup"""
df = pd.read_excel('Terms caseloading 23.24 FY.xlsx')
# Create qualification name lookup dictionary
qual_lookup = dict(zip(
qual_ref_df['QualificationNumber_clean'],
qual_ref_df['QualificationName']
))
# Clean funding references and lookup qualification names
df['funding_ref_clean'] = df['Funding Reference'].astype(str).apply(
lambda x: x[:-1] if x.endswith(('P', 'N')) else x
)
# Apply qualification name lookup, fall back to Activity Name
df['QualificationName'] = df['funding_ref_clean'].map(qual_lookup).fillna(df['Activity Name'])
# Convert dates
date_columns = ['Start Date', 'End Date']
for col in date_columns:
df[col] = pd.to_datetime(df[col])
# Clean staff names
df['Primary Staff Name'] = df['Primary Staff Name'].apply(clean_string)
df['Primary Staff Surname'] = df['Primary Staff Surname'].apply(clean_string)
# Calculate learner counts
learner_counts = df.groupby([
'Primary Staff Name', 'Primary Staff Surname',
'QualificationName', 'Level', 'Activity Group',
pd.Grouper(key='Start Date', freq='M')
])['Name'].count().reset_index()
learner_counts.rename(columns={'Name': 'learner_count'}, inplace=True)
# Save processed data
df.to_csv('processed_data/processed_terms_caseloading.csv', index=False)
learner_counts.to_csv('processed_data/terms_learner_counts.csv', index=False)
return df, learner_counts
def process_pics_hours_lookup():
"""Process PICS Hours lookup table"""
df = pd.read_excel('PICS Hours for Assessor Look up.xlsx', skiprows=2)
def time_to_decimal(time_str):
if pd.isna(time_str):
return 0
try:
hours, minutes = str(time_str).split(':')
return float(hours) + float(minutes) / 60
except:
return 0
df['weighted_hours'] = df['Weighted Monthly Hours (1.6)'].apply(time_to_decimal)
df.to_csv('processed_data/processed_pics_hours_lookup.csv', index=False)
return df
def process_pics_caseload(hours_lookup_df):
"""Process PICS caseload data with hours lookup"""
df = pd.read_excel('PICS caseload for PBI.xlsx')
# Convert dates
date_cols = ['Apprenticeship Achieved Date', 'Start Date',
'Learning Expected End', 'Actual End']
for col in date_cols:
df[col] = pd.to_datetime(df[col], errors='coerce')
# Create mapping for weighted hours
hours_map = dict(zip(
hours_lookup_df['Standard title'],
hours_lookup_df['weighted_hours']
))
# Add weighted hours to each apprenticeship
df['weighted_hours'] = df['Apprenticeship Standard Title'].map(hours_map)
# Calculate active periods
df['end_date'] = df['Actual End'].fillna(df['Learning Expected End'])
def get_monthly_student_counts(group):
if len(group) == 0:
return pd.DataFrame()
start_date = group['Start Date'].min()
end_date = group['end_date'].max()
if pd.isna(start_date) or pd.isna(end_date):
return pd.DataFrame()
date_range = pd.date_range(start_date, end_date, freq='M')
counts = []
for date in date_range:
active_mask = (
(group['Start Date'] <= date) &
(group['end_date'] >= date)
)
active_students = group[active_mask].shape[0]
total_weighted_hours = group[active_mask]['weighted_hours'].sum()
counts.append({
'month': date,
'student_count': active_students,
'total_weighted_hours': total_weighted_hours
})
return pd.DataFrame(counts)
monthly_summary = df.groupby('Assessor Full Name').apply(
get_monthly_student_counts
).reset_index()
# Save processed data
df.to_csv('processed_data/processed_pics_caseload.csv', index=False)
monthly_summary.to_csv('processed_data/pics_monthly_summary.csv', index=False)
return df, monthly_summary
def process_all_data():
"""Process all data sources and create final outputs"""
try:
print("Processing monthly reports...")
monthly_data = process_monthly_reports()
print("Processing tutor pay report...")
tutor_pay_data = process_tutor_pay_report()
print("Processing qualification references...")
qual_ref_data = process_qualification_references()
print("Processing Terms caseloading...")
terms_data, learner_counts = process_terms_caseloading(qual_ref_data)
print("Processing PICS hours lookup...")
pics_hours_lookup = process_pics_hours_lookup()
print("Processing PICS caseload...")
pics_data, pics_summary = process_pics_caseload(pics_hours_lookup)
print("All data processed successfully!")
return True
except Exception as e:
print(f"Error processing data: {str(e)}")
return False
# Call the main processing function
if __name__ == "__main__":
process_all_data()Editor is loading...
Leave a Comment