o1-1
user_1718919
plain_text
a year ago
20 kB
7
Indexable
# Cell 1: Imports and Setup
import os
import glob
import pandas as pd
import numpy as np
# Ensure we are in the "Staff Utilisation" directory
# If not, you may need to adjust paths accordingly.
# We'll assume the notebook is placed in the "Staff Utilisation" folder.
# Create processed_data directory if it doesn't exist
os.makedirs('processed_data', exist_ok=True)
# Cell 2: Define Helper Functions
def extract_month_year_from_filename(filename):
"""
Extract month and year from filename like 'ACL Monthly Report 17.04.23.xlsx'.
We'll assume the format is always '... DD.MM.YY.xlsx'.
Example: 'ACL Monthly Report 17.04.23.xlsx' -> April 2023
We'll return a pandas Period or (month, year) tuple.
"""
# filename might be something like: 'ACL Monthly Report 17.04.23.xlsx'
# We split by space and get the last part before .xlsx
base = os.path.basename(filename)
name_part = os.path.splitext(base)[0] # 'ACL Monthly Report 17.04.23'
parts = name_part.split() # ['ACL', 'Monthly', 'Report', '17.04.23']
date_str = parts[-1] # '17.04.23'
# The middle part is '04' for month, '23' for year.
# Day might not be needed but let's parse anyway.
# Format: DD.MM.YY
day, month, year = date_str.split('.')
# We can store this as a YYYY-MM string (financial year presumably 2023 means year=2023)
# We'll assume 20xx for year:
year_full = 2000 + int(year)
month_int = int(month)
return pd.Period(f'{year_full}-{month_int}', freq='M') # Monthly period
def clean_name(name_str):
"""
Standardize a name string by stripping spaces and making title case.
"""
if pd.isna(name_str):
return ''
return ' '.join(str(name_str).strip().split()).title() # Remove extra spaces and title-case
def parse_line_manager_name(full_name):
"""
Line Manager Name is in format 'FirstName KnownAs, LastName'.
Example: 'Joanne Jo, Loss'
We want two formats:
1) FirstName LastName
2) KnownAs LastName
We'll assume first part before comma is "FirstName KnownAs"
and after comma is LastName.
If it's something like "Joanne Jo, Loss":
FirstName might be 'Joanne'
KnownAs = 'Jo'
LastName = 'Loss'
If there's only one first name and no known-as, known-as = first name.
We'll try to split by comma first, then split the left part by spaces.
"""
if pd.isna(full_name) or full_name.strip() == '':
return pd.Series(["", ""])
parts = full_name.split(',')
last_name = clean_name(parts[-1]) if len(parts) > 1 else ''
first_known = parts[0].strip().split()
if len(first_known) == 1:
# Only one name given, treat as both first name and known as
first_name = clean_name(first_known[0])
known_as = first_name
else:
# Assume last item in the first_known array is known-as
# and the first is the formal first name
# If there's more complexity, we may need a more robust approach
first_name = clean_name(first_known[0])
known_as = clean_name(first_known[-1])
# Return as a Series: [FirstName LastName, KnownAs LastName]
return pd.Series([f"{first_name} {last_name}", f"{known_as} {last_name}"])
# Cell 3: Process the 12 Monthly ACL Reports
# We assume the folder 'monthly_reports' has 12 files named similarly to 'ACL Monthly Report DD.MM.YY.xlsx'.
# We'll read them all, clean them, and combine into a single DataFrame with a 'Month' column.
monthly_files = glob.glob('monthly_reports/*.xlsx')
all_monthly_dfs = []
for f in monthly_files:
month_period = extract_month_year_from_filename(f) # Get the month-year period
# Monthly files start from 3rd row with column names
# According to instructions:
# 1st row = headline, 2nd row = empty, column names start from 3rd row
# So skip the first two rows.
df = pd.read_excel(f, skiprows=2)
# Clean column names (if needed)
df.columns = [str(c).strip() for c in df.columns]
# Add a column for the period
df['Report_Month'] = month_period
all_monthly_dfs.append(df)
acl_monthly_df = pd.concat(all_monthly_dfs, ignore_index=True)
# Useful columns reference from instructions:
# Assignment Number (A), Title (B), First Name (C), Known As (D), Last Name (E)
# Position Name (J), Working Hours (K), Full-Time Equivalent (Q), Line Manager Name (W)
# We'll rename columns to something standardized if needed.
# Let's store original column names to variables:
orig_cols = acl_monthly_df.columns
# We know from instructions the approximate column positions:
# Let's align with instructions (Adjust if needed based on actual data):
# A: Assignment Number
# B: Title
# C: First Name
# D: Known As
# E: Last Name
# J: Position Name
# K: Working Hours
# Q: Full-Time Equivalent
# W: Line Manager Name
acl_monthly_df = acl_monthly_df.rename(columns={
'Assignment Number': 'Assignment_Number',
'Title': 'Title',
'First Name': 'First_Name',
'Known As': 'Known_As',
'Last Name': 'Last_Name',
'Position Name': 'Position_Name',
'Working Hours': 'Working_Hours',
'Full-Time Equivalent': 'FTE',
'Line Manager Name': 'Line_Manager_Name'
}, errors='ignore') # in case some columns differ in case/spaces
# Filter only Tutors and LSAs
# Position_Name must contain 'Tutor' or 'Learning Support Assistant'
acl_monthly_df = acl_monthly_df[acl_monthly_df['Position_Name'].isin(['Tutor', 'Learning Support Assistant'])]
# Clean names
acl_monthly_df['First_Name'] = acl_monthly_df['First_Name'].apply(clean_name)
acl_monthly_df['Known_As'] = acl_monthly_df['Known_As'].apply(clean_name)
acl_monthly_df['Last_Name'] = acl_monthly_df['Last_Name'].apply(clean_name)
acl_monthly_df['Line_Manager_Name'] = acl_monthly_df['Line_Manager_Name'].apply(clean_name)
# Parse line manager into two formats
acl_monthly_df[['LM_FirstNameLastName','LM_KnownAsLastName']] = acl_monthly_df['Line_Manager_Name'].apply(parse_line_manager_name)
# Determine Contract Type:
# If Position_Name == 'Learning Support Assistant' → LSA
# If Position_Name == 'Tutor':
# If Working_Hours <= 3 → Sessional
# If Working_Hours > 3 → Salaried
def contract_type(row):
if row['Position_Name'] == 'Learning Support Assistant':
return 'LSA'
elif row['Position_Name'] == 'Tutor':
if pd.to_numeric(row['Working_Hours'], errors='coerce') is not None:
wh = float(row['Working_Hours'])
if wh > 3:
return 'Salaried Tutor'
else:
return 'Sessional Tutor'
else:
return 'Sessional Tutor' # default if parsing fails
return None
acl_monthly_df['Contract_Type'] = acl_monthly_df.apply(contract_type, axis=1)
# Calculate Target Hours only for Salaried Tutors
# Target = FTE * 840, if Salaried Tutor
acl_monthly_df['FTE'] = pd.to_numeric(acl_monthly_df['FTE'], errors='coerce').fillna(0)
acl_monthly_df['Target_Hours'] = np.where(
acl_monthly_df['Contract_Type']=='Salaried Tutor',
acl_monthly_df['FTE']*840,
0
)
# Now we have a combined monthly dataset with month info.
# This dataset can be used to track changes over time by grouping on Assignment_Number and seeing changes in Contract_Type across months.
# We'll save this processed monthly dataset
acl_monthly_df.to_csv('processed_data/acl_monthly_processed.csv', index=False)
# Cell 4: Process TutorPayReport (TERMS data)
# File: TutorPayReport_New 23.24 FY 22.11.24.xlsx
# This file has columns:
# PEOPLENAME (H), PEOPLESURNAME (I), STAFFROLE (U), CONTRACTNAME (V), ACTIVITYTYPE (S), Potential Hours (W), TIMEADJUSTEARLY (N), TIMEADJUSTLATE (O), ...
# EVENTDATE (M), Course Funding Group (C), Course Prov Mon C (E), Course Prov Mon D (F), VENUE (G)
tutor_pay_file = 'TutorPayReport_New 23.24 FY 22.11.24.xlsx'
tutor_pay_df = pd.read_excel(tutor_pay_file)
# Rename columns of interest
tutor_pay_df = tutor_pay_df.rename(columns={
'PEOPLENAME': 'PeopleName',
'PEOPLESURNAME': 'PeopleSurname',
'STAFFROLE': 'StaffRole',
'CONTRACTNAME': 'ContractName',
'ACTIVITYTYPE': 'ActivityType',
'Potential Hours': 'PotentialHours',
'TIMEADJUSTEARLY': 'TimeAdjustEarly',
'TIMEADJUSTLATE': 'TimeAdjustLate',
'Course Funding Group': 'CourseFundingGroup',
'Course Prov Mon C': 'CurriculumName',
'Course Prov Mon D': 'ToplineVenue',
'VENUE': 'Venue',
'EVENTDATE': 'EventDate'
}, errors='ignore')
# Filter STAFFROLE: We only care about 'Tutor' and 'Learning Support' (Strip extra spaces)
tutor_pay_df['StaffRole'] = tutor_pay_df['StaffRole'].str.strip()
tutor_pay_df = tutor_pay_df[tutor_pay_df['StaffRole'].isin(['Tutor','Learning Support'])]
# Focus on CONTRACTNAME: 'LSA-NB', 'SALARIED TUTOR', 'SESSIONAL TUTOR'
tutor_pay_df['ContractName'] = tutor_pay_df['ContractName'].str.strip()
# ActivityType grouping for "Assessment"
assessment_types = ['Access Assessments', 'Assessments', 'Beauty OL Assess', 'CFL PA Vols Assess', 'Creative Assessments',
'E&M ApprenticeAssess', 'English FS Assess', 'English GCSE Assess', 'ESOL F2F Assess', 'ESOL Online Assess',
'Hair/Barb F2F Assess', 'Maths Assessments', 'SWiS Assessments']
tutor_pay_df['ActivityTypeGroup'] = np.where(tutor_pay_df['ActivityType'].isin(assessment_types),
'Assessment', tutor_pay_df['ActivityType'])
# Calculate Actual Hours
# TIMEADJUSTEARLY & TIMEADJUSTLATE in ms
# Convert ms to hours by dividing by (1000*60*60)
tutor_pay_df['TimeAdjustEarly'] = pd.to_numeric(tutor_pay_df['TimeAdjustEarly'], errors='coerce').fillna(0)
tutor_pay_df['TimeAdjustLate'] = pd.to_numeric(tutor_pay_df['TimeAdjustLate'], errors='coerce').fillna(0)
tutor_pay_df['PotentialHours'] = pd.to_numeric(tutor_pay_df['PotentialHours'], errors='coerce').fillna(0)
early_hours = tutor_pay_df['TimeAdjustEarly']/(1000*60*60)
late_hours = tutor_pay_df['TimeAdjustLate']/(1000*60*60)
tutor_pay_df['ActualHours'] = tutor_pay_df['PotentialHours'] - (early_hours + late_hours)
tutor_pay_df['ActualHours'] = tutor_pay_df['ActualHours'].apply(lambda x: x if x>0 else 0)
# Clean PeopleName and PeopleSurname
tutor_pay_df['PeopleName'] = tutor_pay_df['PeopleName'].apply(clean_name)
tutor_pay_df['PeopleSurname'] = tutor_pay_df['PeopleSurname'].apply(clean_name)
# Save processed Tutor Pay Data
tutor_pay_df.to_csv('processed_data/tutor_pay_processed.csv', index=False)
# Cell 5: Approved and Archived Lists for Terms Caseloading
# We will create a dictionary/map of QualificationNumber -> QualificationName from Approved first, then Archived.
approved_file = 'Approved List for Terms Caseloading FY2324.xlsx'
archived_file = 'Archived List for Terms Caseloading FY2324.xlsx'
approved_df = pd.read_excel(approved_file)
archived_df = pd.read_excel(archived_file)
# Columns we care about: QualificationName (B), QualificationNumber (D)
approved_df = approved_df.rename(columns={'QualificationName': 'QualificationName', 'QualificationNumber': 'QualificationNumber'})
archived_df = archived_df.rename(columns={'QualificationName': 'QualificationName', 'QualificationNumber': 'QualificationNumber'})
approved_map = dict(zip(approved_df['QualificationNumber'], approved_df['QualificationName']))
archived_map = dict(zip(archived_df['QualificationNumber'], archived_df['QualificationName']))
# Cell 6: Terms Caseloading File
# File 5: Terms caseloading 23.24 FY.xlsx
# Columns of interest:
# Funding Reference (M), Activity Name (D)
# Primary Staff Name (A), Primary Staff Surname (B)
# Start Date (H), End Date (I), Level (N), Activity Group (S)
terms_caseload_file = 'Terms caseloading 23.24 FY.xlsx'
terms_df = pd.read_excel(terms_caseload_file)
# Rename columns
terms_df = terms_df.rename(columns={
'Funding Reference': 'FundingReference',
'Activity Name': 'ActivityName',
'Primary Staff Name': 'PrimaryStaffName',
'Primary Staff Surname': 'PrimaryStaffSurname',
'Start Date': 'StartDate',
'End Date': 'EndDate',
'Level': 'Level',
'Activity Group': 'ActivityGroup'
}, errors='ignore')
# Clean staff names
terms_df['PrimaryStaffName'] = terms_df['PrimaryStaffName'].apply(clean_name)
terms_df['PrimaryStaffSurname'] = terms_df['PrimaryStaffSurname'].apply(clean_name)
# Process FundingReference:
# If ends in 'P' or 'N', remove that character
def clean_funding_ref(ref):
if pd.isna(ref):
return ''
ref = str(ref).strip()
if ref.endswith('P') or ref.endswith('N'):
ref = ref[:-1] # remove last char
return ref
terms_df['FundingReference'] = terms_df['FundingReference'].apply(clean_funding_ref)
def get_qualification_name(ref):
# If ref is BLANK or '', use ActivityName
if ref == '' or ref.upper() == 'BLANK':
return None
# Try approved_map first
if ref in approved_map:
return approved_map[ref]
# If not found, try archived_map
if ref in archived_map:
return archived_map[ref]
return None
terms_df['QualificationName'] = terms_df['FundingReference'].apply(get_qualification_name)
terms_df['QualificationName'] = terms_df.apply(
lambda x: x['QualificationName'] if pd.notna(x['QualificationName']) else x['ActivityName'],
axis=1
)
# We now have a dataset with tutors and their learners, courses, etc.
# We'll save the processed terms data
terms_df.to_csv('processed_data/terms_caseload_processed.csv', index=False)
# Cell 7: PICS caseload for PBI (File 3)
# Columns of interest:
# Assessor Full Name (K), Programme (D), Apprenticeship Standard Title (N), Apprenticeship Achieved Date (Q), Start Date (G), Learning Expected End (H), Actual End (I)
pics_caseload_file = 'Pics caseload for PBI.xlsx'
pics_df = pd.read_excel(pics_caseload_file)
pics_df = pics_df.rename(columns={
'Assessor Full Name': 'AssessorFullName',
'Programme': 'Programme',
'Apprenticeship Standard Title': 'ApprenticeshipStandardTitle',
'Apprenticeship Achieved Date': 'ApprenticeshipAchievedDate',
'Start Date': 'StartDate',
'Learning Expected End': 'LearningExpectedEnd',
'Actual End': 'ActualEnd'
}, errors='ignore')
# Clean assessor names
pics_df['AssessorFullName'] = pics_df['AssessorFullName'].apply(clean_name)
# Convert date columns if needed
date_cols = ['ApprenticeshipAchievedDate', 'StartDate', 'LearningExpectedEnd', 'ActualEnd']
for dc in date_cols:
pics_df[dc] = pd.to_datetime(pics_df[dc], errors='coerce')
# We'll count active learners per assessor and month in Power BI later using these date fields.
# Save processed PICS caseload
pics_df.to_csv('processed_data/pics_caseload_processed.csv', index=False)
# Cell 8: PICS Hours for Assessor Look up (File 4)
# Columns start on row 3
# Columns: B 'Standard title', F 'Weighted Monthly Hours (1.6)'
pics_hours_file = 'PICS Hours for Assessor Look up.xlsx'
pics_hours_df = pd.read_excel(pics_hours_file, skiprows=2)
pics_hours_df = pics_hours_df.rename(columns={'Standard title': 'StandardTitle', 'Weighted Monthly Hours (1.6)': 'WeightedMonthlyHours'})
# Hours in format H:MM, convert to decimal hours
def time_str_to_hours(t):
if pd.isna(t):
return 0.0
# assume format H:MM
parts = str(t).split(':')
if len(parts) == 2:
h = float(parts[0])
m = float(parts[1])
return h + m/60.0
else:
# If something else, try to convert directly
return pd.to_numeric(t, errors='coerce').fillna(0)
pics_hours_df['WeightedMonthlyHours'] = pics_hours_df['WeightedMonthlyHours'].apply(time_str_to_hours)
# Save processed
pics_hours_df.to_csv('processed_data/pics_hours_lookup_processed.csv', index=False)
# Cell 9: Additional Data Cleaning / Name Matching
# We have multiple datasets with staff names (ACL monthly, TutorPay, Terms, PICS).
# We will create a master staff mapping. This is optional but recommended.
# Extract unique staff from ACL monthly (tutors and LSAs)
acl_staff = acl_monthly_df[['First_Name','Known_As','Last_Name','Contract_Type']].drop_duplicates()
# Extract from Tutor Pay
tutor_pay_staff = tutor_pay_df[['PeopleName','PeopleSurname','StaffRole','ContractName']].drop_duplicates()
tutor_pay_staff = tutor_pay_staff.rename(columns={
'PeopleName':'First_Name',
'PeopleSurname':'Last_Name'
})
# Extract from Terms caseloading (tutors)
terms_staff = terms_df[['PrimaryStaffName','PrimaryStaffSurname']].drop_duplicates()
terms_staff = terms_staff.rename(columns={
'PrimaryStaffName':'First_Name',
'PrimaryStaffSurname':'Last_Name'
})
# Extract from PICS caseload (assessors)
pics_staff = pics_df[['AssessorFullName']].drop_duplicates()
# Split assessor full name into first/last if possible:
def split_full_name(fullname):
# simplistic split by space
if pd.isna(fullname) or fullname.strip()=='':
return pd.Series(['',''])
parts = fullname.strip().split()
first, last = parts[0], parts[-1]
return pd.Series([first.title(), last.title()])
pics_staff[['First_Name','Last_Name']] = pics_staff['AssessorFullName'].apply(split_full_name)
pics_staff = pics_staff.drop(columns=['AssessorFullName'])
# In reality, you might need a more robust name matching strategy (fuzzy matching).
# For now, we assume clean_name and standardization suffices.
# Combine all staff references
all_staff = pd.concat([acl_staff[['First_Name','Known_As','Last_Name']],
tutor_pay_staff[['First_Name','Last_Name']],
terms_staff[['First_Name','Last_Name']],
pics_staff[['First_Name','Last_Name']]], ignore_index=True)
all_staff = all_staff.fillna('')
all_staff = all_staff.drop_duplicates()
# This master staff list can be refined manually if needed,
# or used as a lookup to ensure consistency in Power BI relationships.
all_staff.to_csv('processed_data/all_staff_master.csv', index=False)
# Cell 10: Final Notes and Outputs
# We have now created processed datasets for:
# - Monthly ACL reports: acl_monthly_processed.csv
# - TutorPay (TERMS) data: tutor_pay_processed.csv
# - Terms caseloading: terms_caseload_processed.csv
# - PICS caseload: pics_caseload_processed.csv
# - PICS hours lookup: pics_hours_lookup_processed.csv
# - Master staff list: all_staff_master.csv
# These processed datasets can now be loaded into Power BI.
# In Power BI, we will:
# - Create relationships between these tables based on staff names (or ideally a unique staff ID if available).
# - Implement filters/slicers for Line Manager, Month, Week, Funding Group, Curriculum, Venues, Levels.
# - Create measures for utilization percentage = (Actual Hours / Target Hours) for salaried tutors.
# - Create drill-through pages to show detail tables.
# - Implement a toggle to show Known As vs First Name by using a slicer or bookmark that switches a measure or a calculated column in visuals.
# End of code.
print("Data processing completed. Processed files available in 'processed_data' folder.")
Editor is loading...
Leave a Comment