o1-1
user_1718919
plain_text
4 months ago
20 kB
4
Indexable
# Cell 1: Imports and Setup import os import glob import pandas as pd import numpy as np # Ensure we are in the "Staff Utilisation" directory # If not, you may need to adjust paths accordingly. # We'll assume the notebook is placed in the "Staff Utilisation" folder. # Create processed_data directory if it doesn't exist os.makedirs('processed_data', exist_ok=True) # Cell 2: Define Helper Functions def extract_month_year_from_filename(filename): """ Extract month and year from filename like 'ACL Monthly Report 17.04.23.xlsx'. We'll assume the format is always '... DD.MM.YY.xlsx'. Example: 'ACL Monthly Report 17.04.23.xlsx' -> April 2023 We'll return a pandas Period or (month, year) tuple. """ # filename might be something like: 'ACL Monthly Report 17.04.23.xlsx' # We split by space and get the last part before .xlsx base = os.path.basename(filename) name_part = os.path.splitext(base)[0] # 'ACL Monthly Report 17.04.23' parts = name_part.split() # ['ACL', 'Monthly', 'Report', '17.04.23'] date_str = parts[-1] # '17.04.23' # The middle part is '04' for month, '23' for year. # Day might not be needed but let's parse anyway. # Format: DD.MM.YY day, month, year = date_str.split('.') # We can store this as a YYYY-MM string (financial year presumably 2023 means year=2023) # We'll assume 20xx for year: year_full = 2000 + int(year) month_int = int(month) return pd.Period(f'{year_full}-{month_int}', freq='M') # Monthly period def clean_name(name_str): """ Standardize a name string by stripping spaces and making title case. """ if pd.isna(name_str): return '' return ' '.join(str(name_str).strip().split()).title() # Remove extra spaces and title-case def parse_line_manager_name(full_name): """ Line Manager Name is in format 'FirstName KnownAs, LastName'. Example: 'Joanne Jo, Loss' We want two formats: 1) FirstName LastName 2) KnownAs LastName We'll assume first part before comma is "FirstName KnownAs" and after comma is LastName. If it's something like "Joanne Jo, Loss": FirstName might be 'Joanne' KnownAs = 'Jo' LastName = 'Loss' If there's only one first name and no known-as, known-as = first name. We'll try to split by comma first, then split the left part by spaces. """ if pd.isna(full_name) or full_name.strip() == '': return pd.Series(["", ""]) parts = full_name.split(',') last_name = clean_name(parts[-1]) if len(parts) > 1 else '' first_known = parts[0].strip().split() if len(first_known) == 1: # Only one name given, treat as both first name and known as first_name = clean_name(first_known[0]) known_as = first_name else: # Assume last item in the first_known array is known-as # and the first is the formal first name # If there's more complexity, we may need a more robust approach first_name = clean_name(first_known[0]) known_as = clean_name(first_known[-1]) # Return as a Series: [FirstName LastName, KnownAs LastName] return pd.Series([f"{first_name} {last_name}", f"{known_as} {last_name}"]) # Cell 3: Process the 12 Monthly ACL Reports # We assume the folder 'monthly_reports' has 12 files named similarly to 'ACL Monthly Report DD.MM.YY.xlsx'. # We'll read them all, clean them, and combine into a single DataFrame with a 'Month' column. monthly_files = glob.glob('monthly_reports/*.xlsx') all_monthly_dfs = [] for f in monthly_files: month_period = extract_month_year_from_filename(f) # Get the month-year period # Monthly files start from 3rd row with column names # According to instructions: # 1st row = headline, 2nd row = empty, column names start from 3rd row # So skip the first two rows. df = pd.read_excel(f, skiprows=2) # Clean column names (if needed) df.columns = [str(c).strip() for c in df.columns] # Add a column for the period df['Report_Month'] = month_period all_monthly_dfs.append(df) acl_monthly_df = pd.concat(all_monthly_dfs, ignore_index=True) # Useful columns reference from instructions: # Assignment Number (A), Title (B), First Name (C), Known As (D), Last Name (E) # Position Name (J), Working Hours (K), Full-Time Equivalent (Q), Line Manager Name (W) # We'll rename columns to something standardized if needed. # Let's store original column names to variables: orig_cols = acl_monthly_df.columns # We know from instructions the approximate column positions: # Let's align with instructions (Adjust if needed based on actual data): # A: Assignment Number # B: Title # C: First Name # D: Known As # E: Last Name # J: Position Name # K: Working Hours # Q: Full-Time Equivalent # W: Line Manager Name acl_monthly_df = acl_monthly_df.rename(columns={ 'Assignment Number': 'Assignment_Number', 'Title': 'Title', 'First Name': 'First_Name', 'Known As': 'Known_As', 'Last Name': 'Last_Name', 'Position Name': 'Position_Name', 'Working Hours': 'Working_Hours', 'Full-Time Equivalent': 'FTE', 'Line Manager Name': 'Line_Manager_Name' }, errors='ignore') # in case some columns differ in case/spaces # Filter only Tutors and LSAs # Position_Name must contain 'Tutor' or 'Learning Support Assistant' acl_monthly_df = acl_monthly_df[acl_monthly_df['Position_Name'].isin(['Tutor', 'Learning Support Assistant'])] # Clean names acl_monthly_df['First_Name'] = acl_monthly_df['First_Name'].apply(clean_name) acl_monthly_df['Known_As'] = acl_monthly_df['Known_As'].apply(clean_name) acl_monthly_df['Last_Name'] = acl_monthly_df['Last_Name'].apply(clean_name) acl_monthly_df['Line_Manager_Name'] = acl_monthly_df['Line_Manager_Name'].apply(clean_name) # Parse line manager into two formats acl_monthly_df[['LM_FirstNameLastName','LM_KnownAsLastName']] = acl_monthly_df['Line_Manager_Name'].apply(parse_line_manager_name) # Determine Contract Type: # If Position_Name == 'Learning Support Assistant' → LSA # If Position_Name == 'Tutor': # If Working_Hours <= 3 → Sessional # If Working_Hours > 3 → Salaried def contract_type(row): if row['Position_Name'] == 'Learning Support Assistant': return 'LSA' elif row['Position_Name'] == 'Tutor': if pd.to_numeric(row['Working_Hours'], errors='coerce') is not None: wh = float(row['Working_Hours']) if wh > 3: return 'Salaried Tutor' else: return 'Sessional Tutor' else: return 'Sessional Tutor' # default if parsing fails return None acl_monthly_df['Contract_Type'] = acl_monthly_df.apply(contract_type, axis=1) # Calculate Target Hours only for Salaried Tutors # Target = FTE * 840, if Salaried Tutor acl_monthly_df['FTE'] = pd.to_numeric(acl_monthly_df['FTE'], errors='coerce').fillna(0) acl_monthly_df['Target_Hours'] = np.where( acl_monthly_df['Contract_Type']=='Salaried Tutor', acl_monthly_df['FTE']*840, 0 ) # Now we have a combined monthly dataset with month info. # This dataset can be used to track changes over time by grouping on Assignment_Number and seeing changes in Contract_Type across months. # We'll save this processed monthly dataset acl_monthly_df.to_csv('processed_data/acl_monthly_processed.csv', index=False) # Cell 4: Process TutorPayReport (TERMS data) # File: TutorPayReport_New 23.24 FY 22.11.24.xlsx # This file has columns: # PEOPLENAME (H), PEOPLESURNAME (I), STAFFROLE (U), CONTRACTNAME (V), ACTIVITYTYPE (S), Potential Hours (W), TIMEADJUSTEARLY (N), TIMEADJUSTLATE (O), ... # EVENTDATE (M), Course Funding Group (C), Course Prov Mon C (E), Course Prov Mon D (F), VENUE (G) tutor_pay_file = 'TutorPayReport_New 23.24 FY 22.11.24.xlsx' tutor_pay_df = pd.read_excel(tutor_pay_file) # Rename columns of interest tutor_pay_df = tutor_pay_df.rename(columns={ 'PEOPLENAME': 'PeopleName', 'PEOPLESURNAME': 'PeopleSurname', 'STAFFROLE': 'StaffRole', 'CONTRACTNAME': 'ContractName', 'ACTIVITYTYPE': 'ActivityType', 'Potential Hours': 'PotentialHours', 'TIMEADJUSTEARLY': 'TimeAdjustEarly', 'TIMEADJUSTLATE': 'TimeAdjustLate', 'Course Funding Group': 'CourseFundingGroup', 'Course Prov Mon C': 'CurriculumName', 'Course Prov Mon D': 'ToplineVenue', 'VENUE': 'Venue', 'EVENTDATE': 'EventDate' }, errors='ignore') # Filter STAFFROLE: We only care about 'Tutor' and 'Learning Support' (Strip extra spaces) tutor_pay_df['StaffRole'] = tutor_pay_df['StaffRole'].str.strip() tutor_pay_df = tutor_pay_df[tutor_pay_df['StaffRole'].isin(['Tutor','Learning Support'])] # Focus on CONTRACTNAME: 'LSA-NB', 'SALARIED TUTOR', 'SESSIONAL TUTOR' tutor_pay_df['ContractName'] = tutor_pay_df['ContractName'].str.strip() # ActivityType grouping for "Assessment" assessment_types = ['Access Assessments', 'Assessments', 'Beauty OL Assess', 'CFL PA Vols Assess', 'Creative Assessments', 'E&M ApprenticeAssess', 'English FS Assess', 'English GCSE Assess', 'ESOL F2F Assess', 'ESOL Online Assess', 'Hair/Barb F2F Assess', 'Maths Assessments', 'SWiS Assessments'] tutor_pay_df['ActivityTypeGroup'] = np.where(tutor_pay_df['ActivityType'].isin(assessment_types), 'Assessment', tutor_pay_df['ActivityType']) # Calculate Actual Hours # TIMEADJUSTEARLY & TIMEADJUSTLATE in ms # Convert ms to hours by dividing by (1000*60*60) tutor_pay_df['TimeAdjustEarly'] = pd.to_numeric(tutor_pay_df['TimeAdjustEarly'], errors='coerce').fillna(0) tutor_pay_df['TimeAdjustLate'] = pd.to_numeric(tutor_pay_df['TimeAdjustLate'], errors='coerce').fillna(0) tutor_pay_df['PotentialHours'] = pd.to_numeric(tutor_pay_df['PotentialHours'], errors='coerce').fillna(0) early_hours = tutor_pay_df['TimeAdjustEarly']/(1000*60*60) late_hours = tutor_pay_df['TimeAdjustLate']/(1000*60*60) tutor_pay_df['ActualHours'] = tutor_pay_df['PotentialHours'] - (early_hours + late_hours) tutor_pay_df['ActualHours'] = tutor_pay_df['ActualHours'].apply(lambda x: x if x>0 else 0) # Clean PeopleName and PeopleSurname tutor_pay_df['PeopleName'] = tutor_pay_df['PeopleName'].apply(clean_name) tutor_pay_df['PeopleSurname'] = tutor_pay_df['PeopleSurname'].apply(clean_name) # Save processed Tutor Pay Data tutor_pay_df.to_csv('processed_data/tutor_pay_processed.csv', index=False) # Cell 5: Approved and Archived Lists for Terms Caseloading # We will create a dictionary/map of QualificationNumber -> QualificationName from Approved first, then Archived. approved_file = 'Approved List for Terms Caseloading FY2324.xlsx' archived_file = 'Archived List for Terms Caseloading FY2324.xlsx' approved_df = pd.read_excel(approved_file) archived_df = pd.read_excel(archived_file) # Columns we care about: QualificationName (B), QualificationNumber (D) approved_df = approved_df.rename(columns={'QualificationName': 'QualificationName', 'QualificationNumber': 'QualificationNumber'}) archived_df = archived_df.rename(columns={'QualificationName': 'QualificationName', 'QualificationNumber': 'QualificationNumber'}) approved_map = dict(zip(approved_df['QualificationNumber'], approved_df['QualificationName'])) archived_map = dict(zip(archived_df['QualificationNumber'], archived_df['QualificationName'])) # Cell 6: Terms Caseloading File # File 5: Terms caseloading 23.24 FY.xlsx # Columns of interest: # Funding Reference (M), Activity Name (D) # Primary Staff Name (A), Primary Staff Surname (B) # Start Date (H), End Date (I), Level (N), Activity Group (S) terms_caseload_file = 'Terms caseloading 23.24 FY.xlsx' terms_df = pd.read_excel(terms_caseload_file) # Rename columns terms_df = terms_df.rename(columns={ 'Funding Reference': 'FundingReference', 'Activity Name': 'ActivityName', 'Primary Staff Name': 'PrimaryStaffName', 'Primary Staff Surname': 'PrimaryStaffSurname', 'Start Date': 'StartDate', 'End Date': 'EndDate', 'Level': 'Level', 'Activity Group': 'ActivityGroup' }, errors='ignore') # Clean staff names terms_df['PrimaryStaffName'] = terms_df['PrimaryStaffName'].apply(clean_name) terms_df['PrimaryStaffSurname'] = terms_df['PrimaryStaffSurname'].apply(clean_name) # Process FundingReference: # If ends in 'P' or 'N', remove that character def clean_funding_ref(ref): if pd.isna(ref): return '' ref = str(ref).strip() if ref.endswith('P') or ref.endswith('N'): ref = ref[:-1] # remove last char return ref terms_df['FundingReference'] = terms_df['FundingReference'].apply(clean_funding_ref) def get_qualification_name(ref): # If ref is BLANK or '', use ActivityName if ref == '' or ref.upper() == 'BLANK': return None # Try approved_map first if ref in approved_map: return approved_map[ref] # If not found, try archived_map if ref in archived_map: return archived_map[ref] return None terms_df['QualificationName'] = terms_df['FundingReference'].apply(get_qualification_name) terms_df['QualificationName'] = terms_df.apply( lambda x: x['QualificationName'] if pd.notna(x['QualificationName']) else x['ActivityName'], axis=1 ) # We now have a dataset with tutors and their learners, courses, etc. # We'll save the processed terms data terms_df.to_csv('processed_data/terms_caseload_processed.csv', index=False) # Cell 7: PICS caseload for PBI (File 3) # Columns of interest: # Assessor Full Name (K), Programme (D), Apprenticeship Standard Title (N), Apprenticeship Achieved Date (Q), Start Date (G), Learning Expected End (H), Actual End (I) pics_caseload_file = 'Pics caseload for PBI.xlsx' pics_df = pd.read_excel(pics_caseload_file) pics_df = pics_df.rename(columns={ 'Assessor Full Name': 'AssessorFullName', 'Programme': 'Programme', 'Apprenticeship Standard Title': 'ApprenticeshipStandardTitle', 'Apprenticeship Achieved Date': 'ApprenticeshipAchievedDate', 'Start Date': 'StartDate', 'Learning Expected End': 'LearningExpectedEnd', 'Actual End': 'ActualEnd' }, errors='ignore') # Clean assessor names pics_df['AssessorFullName'] = pics_df['AssessorFullName'].apply(clean_name) # Convert date columns if needed date_cols = ['ApprenticeshipAchievedDate', 'StartDate', 'LearningExpectedEnd', 'ActualEnd'] for dc in date_cols: pics_df[dc] = pd.to_datetime(pics_df[dc], errors='coerce') # We'll count active learners per assessor and month in Power BI later using these date fields. # Save processed PICS caseload pics_df.to_csv('processed_data/pics_caseload_processed.csv', index=False) # Cell 8: PICS Hours for Assessor Look up (File 4) # Columns start on row 3 # Columns: B 'Standard title', F 'Weighted Monthly Hours (1.6)' pics_hours_file = 'PICS Hours for Assessor Look up.xlsx' pics_hours_df = pd.read_excel(pics_hours_file, skiprows=2) pics_hours_df = pics_hours_df.rename(columns={'Standard title': 'StandardTitle', 'Weighted Monthly Hours (1.6)': 'WeightedMonthlyHours'}) # Hours in format H:MM, convert to decimal hours def time_str_to_hours(t): if pd.isna(t): return 0.0 # assume format H:MM parts = str(t).split(':') if len(parts) == 2: h = float(parts[0]) m = float(parts[1]) return h + m/60.0 else: # If something else, try to convert directly return pd.to_numeric(t, errors='coerce').fillna(0) pics_hours_df['WeightedMonthlyHours'] = pics_hours_df['WeightedMonthlyHours'].apply(time_str_to_hours) # Save processed pics_hours_df.to_csv('processed_data/pics_hours_lookup_processed.csv', index=False) # Cell 9: Additional Data Cleaning / Name Matching # We have multiple datasets with staff names (ACL monthly, TutorPay, Terms, PICS). # We will create a master staff mapping. This is optional but recommended. # Extract unique staff from ACL monthly (tutors and LSAs) acl_staff = acl_monthly_df[['First_Name','Known_As','Last_Name','Contract_Type']].drop_duplicates() # Extract from Tutor Pay tutor_pay_staff = tutor_pay_df[['PeopleName','PeopleSurname','StaffRole','ContractName']].drop_duplicates() tutor_pay_staff = tutor_pay_staff.rename(columns={ 'PeopleName':'First_Name', 'PeopleSurname':'Last_Name' }) # Extract from Terms caseloading (tutors) terms_staff = terms_df[['PrimaryStaffName','PrimaryStaffSurname']].drop_duplicates() terms_staff = terms_staff.rename(columns={ 'PrimaryStaffName':'First_Name', 'PrimaryStaffSurname':'Last_Name' }) # Extract from PICS caseload (assessors) pics_staff = pics_df[['AssessorFullName']].drop_duplicates() # Split assessor full name into first/last if possible: def split_full_name(fullname): # simplistic split by space if pd.isna(fullname) or fullname.strip()=='': return pd.Series(['','']) parts = fullname.strip().split() first, last = parts[0], parts[-1] return pd.Series([first.title(), last.title()]) pics_staff[['First_Name','Last_Name']] = pics_staff['AssessorFullName'].apply(split_full_name) pics_staff = pics_staff.drop(columns=['AssessorFullName']) # In reality, you might need a more robust name matching strategy (fuzzy matching). # For now, we assume clean_name and standardization suffices. # Combine all staff references all_staff = pd.concat([acl_staff[['First_Name','Known_As','Last_Name']], tutor_pay_staff[['First_Name','Last_Name']], terms_staff[['First_Name','Last_Name']], pics_staff[['First_Name','Last_Name']]], ignore_index=True) all_staff = all_staff.fillna('') all_staff = all_staff.drop_duplicates() # This master staff list can be refined manually if needed, # or used as a lookup to ensure consistency in Power BI relationships. all_staff.to_csv('processed_data/all_staff_master.csv', index=False) # Cell 10: Final Notes and Outputs # We have now created processed datasets for: # - Monthly ACL reports: acl_monthly_processed.csv # - TutorPay (TERMS) data: tutor_pay_processed.csv # - Terms caseloading: terms_caseload_processed.csv # - PICS caseload: pics_caseload_processed.csv # - PICS hours lookup: pics_hours_lookup_processed.csv # - Master staff list: all_staff_master.csv # These processed datasets can now be loaded into Power BI. # In Power BI, we will: # - Create relationships between these tables based on staff names (or ideally a unique staff ID if available). # - Implement filters/slicers for Line Manager, Month, Week, Funding Group, Curriculum, Venues, Levels. # - Create measures for utilization percentage = (Actual Hours / Target Hours) for salaried tutors. # - Create drill-through pages to show detail tables. # - Implement a toggle to show Known As vs First Name by using a slicer or bookmark that switches a measure or a calculated column in visuals. # End of code. print("Data processing completed. Processed files available in 'processed_data' folder.")
Editor is loading...
Leave a Comment