LC
import pandas as pd import numpy as np import os import re from datetime import datetime, timedelta # Create processed_data folder if not exists if not os.path.exists('processed_data'): os.makedirs('processed_data') def clean_name(name): if pd.isnull(name): return "" return " ".join(name.split()).strip() def parse_snapshot_date_from_filename(fname): # Filename pattern: 'ACL Monthly Report DD.MM.YY.xlsx' match = re.search(r'(\d{2}\.\d{2}\.\d{2})', fname) if not match: raise ValueError(f"Could not find a date pattern in filename {fname}") date_str = match.group(1) # Parse date - assume '23' means 2023, etc. return datetime.strptime(date_str, '%d.%m.%y') def convert_ms_to_hours(ms): # TIMEADJUSTEARLY, TIMEADJUSTLATE in milliseconds return ms / (1000*3600) def convert_h_mm_to_hours(h_mm_str): if pd.isnull(h_mm_str) or h_mm_str == '': return np.nan parts = h_mm_str.split(':') if len(parts) != 2: return np.nan h = int(parts[0]) m = int(parts[1]) return h + m/60 # Process Monthly ACL Reports (File 7) monthly_data = [] monthly_reports_path = 'monthly_reports' monthly_files = [f for f in os.listdir(monthly_reports_path) if f.endswith('.xlsx')] for f in monthly_files: snapshot_date = parse_snapshot_date_from_filename(f) df = pd.read_excel(os.path.join(monthly_reports_path, f), header=2) df.columns = [c.strip() for c in df.columns] required_cols = ["Assignment Number","Title","First Name","Known As","Last Name", "Position Name","Working Hours","Full-Time Equivalent","Line Manager Name"] for col in required_cols: if col not in df.columns: raise ValueError(f"Column {col} not found in {f}") # Filter Tutors and LSA df = df[(df["Position Name"] == "Tutor") | (df["Position Name"] == "Learning Support Assistant")] df["Working Hours"] = pd.to_numeric(df["Working Hours"], errors='coerce').fillna(0) df["Full-Time Equivalent"] = pd.to_numeric(df["Full-Time Equivalent"], errors='coerce').fillna(0) def determine_contract_type(row): pos = row["Position Name"] wh = row["Working Hours"] if pos == "Tutor": if wh > 3: return "Salaried" else: return "Sessional" elif pos == "Learning Support Assistant": return "LSA" else: return None df["Contract Type"] = df.apply(determine_contract_type, axis=1) df["Target Hours"] = np.where(df["Contract Type"] == "Salaried", df["Full-Time Equivalent"]*840, 0) df["First Name"] = df["First Name"].apply(clean_name) df["Known As"] = df["Known As"].apply(clean_name) df["Last Name"] = df["Last Name"].apply(clean_name) def parse_line_manager(lm): if pd.isnull(lm): return ("","") lm = clean_name(lm) parts = lm.split(',') if len(parts) != 2: # Unexpected format return (lm, lm) first_part = parts[0].strip() last_part = parts[1].strip() fn_parts = first_part.split() if len(fn_parts) > 1: lm_first_name = fn_parts[0] lm_known_as = fn_parts[1] else: lm_first_name = fn_parts[0] lm_known_as = fn_parts[0] lm_first_last = lm_first_name + " " + last_part lm_knownas_last = lm_known_as + " " + last_part return (lm_first_last, lm_knownas_last) df["Line Manager FirstLast"], df["Line Manager KnownAsLast"] = zip(*df["Line Manager Name"].apply(parse_line_manager)) df["TutorName_FirstLast"] = df["First Name"] + " " + df["Last Name"] df["TutorName_KnownAsLast"] = df["Known As"] + " " + df["Last Name"] df["Snapshot Date"] = snapshot_date monthly_data.append(df) acl_monthly_combined = pd.concat(monthly_data, ignore_index=True) # Derive contract start/end dates acl_monthly_combined.sort_values(["Assignment Number","Snapshot Date"], inplace=True) global_snapshots = sorted(acl_monthly_combined["Snapshot Date"].unique()) contract_records = [] for assign_num, group in acl_monthly_combined.groupby("Assignment Number"): group = group.sort_values("Snapshot Date") snapshots = group["Snapshot Date"].unique() start_date = snapshots[0] last_snapshot = snapshots[-1] idx = global_snapshots.index(last_snapshot) if idx < len(global_snapshots)-1: next_snapshot_date = global_snapshots[idx+1] # Check if contract appears in next snapshot appears_in_next = (assign_num in acl_monthly_combined[acl_monthly_combined["Snapshot Date"] == next_snapshot_date]["Assignment Number"].values) if not appears_in_next: contract_end_date = next_snapshot_date - timedelta(days=1) else: contract_end_date = pd.NaT else: contract_end_date = pd.NaT first_row = group.iloc[0] contract_type = first_row["Contract Type"] target_hours = first_row["Target Hours"] pos_name = first_row["Position Name"] fn = first_row["First Name"] kn = first_row["Known As"] ln = first_row["Last Name"] tfl = first_row["TutorName_FirstLast"] tkl = first_row["TutorName_KnownAsLast"] lm_fl = first_row["Line Manager FirstLast"] lm_kl = first_row["Line Manager KnownAsLast"] fte = first_row["Full-Time Equivalent"] working_hours = first_row["Working Hours"] title = first_row["Title"] contract_records.append({ "Assignment Number": assign_num, "First Name": fn, "Known As": kn, "Last Name": ln, "TutorName_FirstLast": tfl, "TutorName_KnownAsLast": tkl, "Line Manager FirstLast": lm_fl, "Line Manager KnownAsLast": lm_kl, "Position Name": pos_name, "Contract Type": contract_type, "FTE": fte, "Working Hours": working_hours, "Target Hours": target_hours, "Title": title, "Contract Start Date": start_date, "Contract End Date": contract_end_date }) contracts_df = pd.DataFrame(contract_records) contracts_df.to_csv("processed_data/contracts_info.csv", index=False) acl_monthly_combined.to_csv("processed_data/acl_monthly_combined.csv", index=False) # Process TutorPayReport (File 6) file_6 = "TutorPayReport_New 23.24 FY 22.11.24.xlsx" df6 = pd.read_excel(file_6, header=0) df6.columns = [c.strip() for c in df6.columns] required_cols_6 = ["Course Funding Group", "Course Prov Mon C", "Course Prov Mon D", "VENUE", "PEOPLENAME", "PEOPLESURNAME", "EVENTDATE", "TIMEADJUSTEARLY", "TIMEADJUSTLATE", "ACTIVITYTYPE", "STAFFROLE", "CONTRACTNAME", "Potential Hours"] for col in required_cols_6: if col not in df6.columns: raise ValueError(f"Column {col} not found in File 6") df6["STAFFROLE"] = df6["STAFFROLE"].apply(lambda x: clean_name(str(x))) df6["CONTRACTNAME"] = df6["CONTRACTNAME"].apply(clean_name) df6 = df6[df6["STAFFROLE"].isin(["Tutor","Learning Support"])] df6["EVENTDATE"] = pd.to_datetime(df6["EVENTDATE"], format='%d %b %Y', errors='coerce') df6["TIMEADJUSTEARLY"] = pd.to_numeric(df6["TIMEADJUSTEARLY"], errors='coerce').fillna(0) df6["TIMEADJUSTLATE"] = pd.to_numeric(df6["TIMEADJUSTLATE"], errors='coerce').fillna(0) df6["Early_Adjust_Hours"] = df6["TIMEADJUSTEARLY"].apply(convert_ms_to_hours) df6["Late_Adjust_Hours"] = df6["TIMEADJUSTLATE"].apply(convert_ms_to_hours) df6["Potential Hours"] = pd.to_numeric(df6["Potential Hours"], errors='coerce').fillna(0) df6["Actual Hours"] = df6["Potential Hours"] - (df6["Early_Adjust_Hours"] + df6["Late_Adjust_Hours"]) df6.loc[df6["Actual Hours"] < 0, "Actual Hours"] = 0 assessment_values = ['Access Assessments', 'Assessments', 'Beauty OL Assess', 'CFL PA Vols Assess', 'Creative Assessments', 'E&M ApprenticeAssess', 'English FS Assess', 'English GCSE Assess', 'ESOL F2F Assess', 'ESOL Online Assess', 'Hair/Barb F2F Assess', 'Maths Assessments', 'SWiS Assessments'] df6["ActivityType_Grouped"] = df6["ACTIVITYTYPE"].apply(lambda x: "Assessment" if x in assessment_values else x) # Process Approved (File 1) and Archived (File 2) for Qualification Lookup file_1 = "Approved List for Terms Caseloading FY2324.xlsx" df1 = pd.read_excel(file_1, header=0) df1.columns = [c.strip() for c in df1.columns] req_cols_1 = ["QualificationName","QualificationNumber"] for c in req_cols_1: if c not in df1.columns: raise ValueError("Required column missing in File 1") file_2 = "Archived List for Terms Caseloading FY2324.xlsx" df2 = pd.read_excel(file_2, header=0) df2.columns = [c.strip() for c in df2.columns] for c in req_cols_1: if c not in df2.columns: raise ValueError("Required column missing in File 2") approved_lookup = dict(zip(df1["QualificationNumber"], df1["QualificationName"])) archived_lookup = dict(zip(df2["QualificationNumber"], df2["QualificationName"])) # Process Terms Caseloading (File 5) file_5 = "Terms caseloading 23.24 FY.xlsx" df5 = pd.read_excel(file_5, header=0) df5.columns = [c.strip() for c in df5.columns] req_cols_5 = ["Primary Staff Name","Primary Staff Surname","Activity Name","Funding Reference", "Start Date","End Date","Level","Activity Group"] for c in req_cols_5: if c not in df5.columns: raise ValueError(f"Required column {c} missing in File 5") df5["Primary Staff Name"] = df5["Primary Staff Name"].apply(clean_name) df5["Primary Staff Surname"] = df5["Primary Staff Surname"].apply(clean_name) def clean_funding_ref(fr): if pd.isnull(fr): return None # Convert to string first fr = str(fr).strip() if fr.endswith("P") or fr.endswith("N"): fr = fr[:-1] if fr.upper() == "BLANK": fr = "BLANK" return fr df5["Funding Reference"] = df5["Funding Reference"].apply(clean_funding_ref) def get_qualification_name(fref, aname): if fref is None or fref == "BLANK": return aname if fref in approved_lookup: return approved_lookup[fref] if fref in archived_lookup: return archived_lookup[fref] return aname df5["QualificationName"] = df5.apply(lambda row: get_qualification_name(row["Funding Reference"], row["Activity Name"]), axis=1) df5["Start Date"] = pd.to_datetime(df5["Start Date"], errors='coerce') df5["End Date"] = pd.to_datetime(df5["End Date"], errors='coerce') from datetime import datetime, timedelta, time def convert_h_mm_to_hours(h_mm_value): if pd.isnull(h_mm_value) or h_mm_value == '': return np.nan # If it's a time object if isinstance(h_mm_value, time): h = h_mm_value.hour m = h_mm_value.minute return h + m/60 # Otherwise, treat as string h_mm_str = str(h_mm_value).strip() if ':' not in h_mm_str: return np.nan parts = h_mm_str.split(':') if len(parts) != 2: return np.nan h = int(parts[0]) m = int(parts[1]) return h + m/60 # Process PICS Caseload (File 3) and Hours Lookup (File 4) file_3 = "Pics caseload for PBI.xlsx" df3 = pd.read_excel(file_3, header=0) df3.columns = [c.strip() for c in df3.columns] req_cols_3 = ["Assessor Full Name","Programme","Apprenticeship Standard Title", "Apprenticeship Achieved Date","Start Date","Learning Expected End","Actual End"] for c in req_cols_3: if c not in df3.columns: raise ValueError(f"Required column {c} missing in File 3") df3["Start Date"] = pd.to_datetime(df3["Start Date"], errors='coerce') df3["Learning Expected End"] = pd.to_datetime(df3["Learning Expected End"], errors='coerce') df3["Actual End"] = pd.to_datetime(df3["Actual End"], errors='coerce') df3["Apprenticeship Achieved Date"] = pd.to_datetime(df3["Apprenticeship Achieved Date"], errors='coerce') df3.to_csv("processed_data/pics_caseload_processed.csv", index=False) file_4 = "PICS Hours for Assessor Look up.xlsx" df4 = pd.read_excel(file_4, header=2) df4.columns = [c.strip() for c in df4.columns] req_cols_4 = ["Standard title","Weighted Monthly Hours (1.6)"] for c in req_cols_4: if c not in df4.columns: raise ValueError(f"Required column {c} missing in File 4") df4["Weighted Monthly Hours (1.6)"] = df4["Weighted Monthly Hours (1.6)"].apply(convert_h_mm_to_hours) df4.to_csv("processed_data/pics_hours_lookup.csv", index=False) # Create Master Staff List from contracts staff_df = contracts_df[["First Name","Known As","Last Name","TutorName_FirstLast","TutorName_KnownAsLast"]].drop_duplicates() staff_df.to_csv("processed_data/master_staff_list.csv", index=False) # Create a mapping from TutorName_FirstLast -> (FirstName, KnownAs, LastName) staff_map = {} for idx,row in staff_df.iterrows(): staff_map[row["TutorName_FirstLast"]] = (row["First Name"], row["Known As"], row["Last Name"]) # Match names in TutorPayReport (df6) df6["Staff_FirstLast"] = df6["PEOPLENAME"].apply(clean_name) + " " + df6["PEOPLESURNAME"].apply(clean_name) df6["Matched_FirstName"] = df6["Staff_FirstLast"].apply(lambda x: staff_map[x][0] if x in staff_map else np.nan) df6["Matched_KnownAs"] = df6["Staff_FirstLast"].apply(lambda x: staff_map[x][1] if x in staff_map else np.nan) df6["Matched_LastName"] = df6["Staff_FirstLast"].apply(lambda x: staff_map[x][2] if x in staff_map else np.nan) # Match names in Terms caseloading (df5) df5["Staff_FirstLast"] = df5["Primary Staff Name"] + " " + df5["Primary Staff Surname"] df5["Staff_FirstLast"] = df5["Staff_FirstLast"].apply(clean_name) df5["Matched_FirstName"] = df5["Staff_FirstLast"].apply(lambda x: staff_map[x][0] if x in staff_map else np.nan) df5["Matched_KnownAs"] = df5["Staff_FirstLast"].apply(lambda x: staff_map[x][1] if x in staff_map else np.nan) df5["Matched_LastName"] = df5["Staff_FirstLast"].apply(lambda x: staff_map[x][2] if x in staff_map else np.nan) # Validation check for unmatched names in File 6 unmatched_6 = df6[df6["Matched_FirstName"].isna()][["PEOPLENAME","PEOPLESURNAME","Staff_FirstLast"]].drop_duplicates() # Validation check for unmatched names in File 5 unmatched_5 = df5[df5["Matched_FirstName"].isna()][["Primary Staff Name","Primary Staff Surname","Staff_FirstLast"]].drop_duplicates() # Save unmatched names with more intuitive file names unmatched_5.to_csv("processed_data/unmatched_names_terms_caseloading.csv", index=False) unmatched_6.to_csv("processed_data/unmatched_names_tutorpayreport.csv", index=False) # Calculate summary for Terms caseloading (File 5) total_names_5 = df5["Staff_FirstLast"].nunique() unmatched_count_5 = len(unmatched_5) matched_count_5 = total_names_5 - unmatched_count_5 # Identify matched names for File 5 unmatched_names_5_set = set(unmatched_5["Staff_FirstLast"]) all_names_5_set = set(df5["Staff_FirstLast"].unique()) matched_names_5 = list(all_names_5_set - unmatched_names_5_set) # Create a summary DataFrame for File 5 summary_5 = pd.DataFrame([{ "Total Unique Names": total_names_5, "Unmatched Count": unmatched_count_5, "Matched Count": matched_count_5 }]) summary_5.to_csv("processed_data/terms_caseloading_name_matching_summary.csv", index=False) # Save matched names for File 5 matched_5_df = pd.DataFrame(matched_names_5, columns=["Matched Staff_FirstLast"]) matched_5_df.to_csv("processed_data/matched_names_terms_caseloading.csv", index=False) # Calculate summary for TutorPayReport (File 6) total_names_6 = df6["Staff_FirstLast"].nunique() unmatched_count_6 = len(unmatched_6) matched_count_6 = total_names_6 - unmatched_count_6 # Identify matched names for File 6 unmatched_names_6_set = set(unmatched_6["Staff_FirstLast"]) all_names_6_set = set(df6["Staff_FirstLast"].unique()) matched_names_6 = list(all_names_6_set - unmatched_names_6_set) # Create a summary DataFrame for File 6 summary_6 = pd.DataFrame([{ "Total Unique Names": total_names_6, "Unmatched Count": unmatched_count_6, "Matched Count": matched_count_6 }]) summary_6.to_csv("processed_data/tutorpayreport_name_matching_summary.csv", index=False) # Save matched names for File 6 matched_6_df = pd.DataFrame(matched_names_6, columns=["Matched Staff_FirstLast"]) matched_6_df.to_csv("processed_data/matched_names_tutorpayreport.csv", index=False) # Save final processed versions df6.to_csv("processed_data/tutorpayreport_processed.csv", index=False) df5.to_csv("processed_data/terms_caseloading_processed.csv", index=False) print("All processing complete. Check 'processed_data' folder for output files.")
Leave a Comment