import pandas as pd
import numpy as np
import os
import re
from datetime import datetime, timedelta, time

# Create processed_data folder if not exists
if not os.path.exists('processed_data'):
    os.makedirs('processed_data')

def clean_name(name):
    # Collapse internal whitespace and trim; missing values become empty strings.
    if pd.isnull(name):
        return ""
    return " ".join(str(name).split())

def parse_snapshot_date_from_filename(fname):
    # Filename pattern: 'ACL Monthly Report DD.MM.YY.xlsx'
    match = re.search(r'(\d{2}\.\d{2}\.\d{2})', fname)
    if not match:
        raise ValueError(f"Could not find a date pattern in filename {fname}")
    date_str = match.group(1)
    # Parse date - assume '23' means 2023, etc.
    return datetime.strptime(date_str, '%d.%m.%y')

def convert_ms_to_hours(ms):
    # TIMEADJUSTEARLY, TIMEADJUSTLATE in milliseconds
    return ms / (1000*3600)
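
# Quick sanity check of the millisecond assumption above (not part of the original
# pipeline): 5,400,000 ms should convert to 1.5 hours.
assert abs(convert_ms_to_hours(5_400_000) - 1.5) < 1e-9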

def convert_h_mm_to_hours(h_mm_value):
    # Excel "H:MM" durations may arrive as datetime.time objects or as strings.
    if pd.isnull(h_mm_value) or h_mm_value == '':
        return np.nan
    if isinstance(h_mm_value, time):
        return h_mm_value.hour + h_mm_value.minute / 60
    parts = str(h_mm_value).strip().split(':')
    if len(parts) != 2:
        return np.nan
    try:
        return int(parts[0]) + int(parts[1]) / 60
    except ValueError:
        return np.nan
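
# Quick sanity checks for the H:MM parser: "7:30" -> 7.5 hours; unparseable values -> NaN.
assert abs(convert_h_mm_to_hours("7:30") - 7.5) < 1e-9
assert np.isnan(convert_h_mm_to_hours("n/a"))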

# Process Monthly ACL Reports (File 7)
monthly_data = []
monthly_reports_path = 'monthly_reports'
monthly_files = [f for f in os.listdir(monthly_reports_path) if f.endswith('.xlsx')]

for f in monthly_files:
    snapshot_date = parse_snapshot_date_from_filename(f)
    df = pd.read_excel(os.path.join(monthly_reports_path, f), header=2)
    df.columns = [c.strip() for c in df.columns]
    
    required_cols = ["Assignment Number","Title","First Name","Known As","Last Name",
                     "Position Name","Working Hours","Full-Time Equivalent","Line Manager Name"]
    for col in required_cols:
        if col not in df.columns:
            raise ValueError(f"Column {col} not found in {f}")
    
    # Filter Tutors and LSA
    df = df[(df["Position Name"] == "Tutor") | (df["Position Name"] == "Learning Support Assistant")]
    
    df["Working Hours"] = pd.to_numeric(df["Working Hours"], errors='coerce').fillna(0)
    df["Full-Time Equivalent"] = pd.to_numeric(df["Full-Time Equivalent"], errors='coerce').fillna(0)
    
    def determine_contract_type(row):
        # Tutors on more than 3 working hours are treated as Salaried, otherwise Sessional;
        # Learning Support Assistants get their own "LSA" contract type.
        pos = row["Position Name"]
        wh = row["Working Hours"]
        if pos == "Tutor":
            if wh > 3:
                return "Salaried"
            else:
                return "Sessional"
        elif pos == "Learning Support Assistant":
            return "LSA"
        else:
            return None
    
    df["Contract Type"] = df.apply(determine_contract_type, axis=1)
    df["Target Hours"] = np.where(df["Contract Type"] == "Salaried", df["Full-Time Equivalent"]*840, 0)
    
    df["First Name"] = df["First Name"].apply(clean_name)
    df["Known As"] = df["Known As"].apply(clean_name)
    df["Last Name"] = df["Last Name"].apply(clean_name)
    
    def parse_line_manager(lm):
        # Expected format: "FirstName [KnownAs], LastName". Returns two variants of the
        # manager's name: "First Last" and "KnownAs Last". Anything that does not match
        # the comma-separated pattern is returned unchanged for both variants.
        if pd.isnull(lm):
            return ("", "")
        lm = clean_name(lm)
        parts = lm.split(',')
        if len(parts) != 2:
            # Unexpected format - fall back to the cleaned raw string
            return (lm, lm)
        first_part = parts[0].strip()
        last_part = parts[1].strip()
        
        fn_parts = first_part.split()
        if len(fn_parts) > 1:
            lm_first_name = fn_parts[0]
            lm_known_as = fn_parts[1]
        else:
            lm_first_name = fn_parts[0]
            lm_known_as = fn_parts[0]
        
        lm_first_last = lm_first_name + " " + last_part
        lm_knownas_last = lm_known_as + " " + last_part
        return (lm_first_last, lm_knownas_last)
    
    df["Line Manager FirstLast"], df["Line Manager KnownAsLast"] = zip(*df["Line Manager Name"].apply(parse_line_manager))
    df["TutorName_FirstLast"] = df["First Name"] + " " + df["Last Name"]
    df["TutorName_KnownAsLast"] = df["Known As"] + " " + df["Last Name"]
    df["Snapshot Date"] = snapshot_date
    
    monthly_data.append(df)

acl_monthly_combined = pd.concat(monthly_data, ignore_index=True)

# Derive contract start/end dates
acl_monthly_combined.sort_values(["Assignment Number","Snapshot Date"], inplace=True)

global_snapshots = sorted(acl_monthly_combined["Snapshot Date"].unique())
contract_records = []

for assign_num, group in acl_monthly_combined.groupby("Assignment Number"):
    group = group.sort_values("Snapshot Date")
    snapshots = group["Snapshot Date"].unique()
    start_date = snapshots[0]
    last_snapshot = snapshots[-1]
    idx = global_snapshots.index(last_snapshot)
    if idx < len(global_snapshots)-1:
        # last_snapshot is this assignment's final appearance, so by construction it is
        # absent from every later snapshot; treat the contract as ending the day before
        # the next global snapshot.
        next_snapshot_date = global_snapshots[idx+1]
        contract_end_date = next_snapshot_date - timedelta(days=1)
    else:
        # Still present in the latest snapshot - contract is treated as ongoing.
        contract_end_date = pd.NaT
    
    first_row = group.iloc[0]
    contract_type = first_row["Contract Type"]
    target_hours = first_row["Target Hours"]
    pos_name = first_row["Position Name"]
    fn = first_row["First Name"]
    kn = first_row["Known As"]
    ln = first_row["Last Name"]
    tfl = first_row["TutorName_FirstLast"]
    tkl = first_row["TutorName_KnownAsLast"]
    lm_fl = first_row["Line Manager FirstLast"]
    lm_kl = first_row["Line Manager KnownAsLast"]
    fte = first_row["Full-Time Equivalent"]
    working_hours = first_row["Working Hours"]
    title = first_row["Title"]
    
    contract_records.append({
        "Assignment Number": assign_num,
        "First Name": fn,
        "Known As": kn,
        "Last Name": ln,
        "TutorName_FirstLast": tfl,
        "TutorName_KnownAsLast": tkl,
        "Line Manager FirstLast": lm_fl,
        "Line Manager KnownAsLast": lm_kl,
        "Position Name": pos_name,
        "Contract Type": contract_type,
        "FTE": fte,
        "Working Hours": working_hours,
        "Target Hours": target_hours,
        "Title": title,
        "Contract Start Date": start_date,
        "Contract End Date": contract_end_date
    })

contracts_df = pd.DataFrame(contract_records)
contracts_df.to_csv("processed_data/contracts_info.csv", index=False)
acl_monthly_combined.to_csv("processed_data/acl_monthly_combined.csv", index=False)

# Process TutorPayReport (File 6)
file_6 = "TutorPayReport_New 23.24 FY 22.11.24.xlsx"
df6 = pd.read_excel(file_6, header=0)
df6.columns = [c.strip() for c in df6.columns]

required_cols_6 = ["Course Funding Group", "Course Prov Mon C", "Course Prov Mon D", "VENUE",
                   "PEOPLENAME", "PEOPLESURNAME", "EVENTDATE", "TIMEADJUSTEARLY", "TIMEADJUSTLATE",
                   "ACTIVITYTYPE", "STAFFROLE", "CONTRACTNAME", "Potential Hours"]
for col in required_cols_6:
    if col not in df6.columns:
        raise ValueError(f"Column {col} not found in File 6")

df6["STAFFROLE"] = df6["STAFFROLE"].apply(lambda x: clean_name(str(x)))
df6["CONTRACTNAME"] = df6["CONTRACTNAME"].apply(clean_name)
df6 = df6[df6["STAFFROLE"].isin(["Tutor","Learning Support"])]

df6["EVENTDATE"] = pd.to_datetime(df6["EVENTDATE"], format='%d %b %Y', errors='coerce')
df6["TIMEADJUSTEARLY"] = pd.to_numeric(df6["TIMEADJUSTEARLY"], errors='coerce').fillna(0)
df6["TIMEADJUSTLATE"] = pd.to_numeric(df6["TIMEADJUSTLATE"], errors='coerce').fillna(0)
df6["Early_Adjust_Hours"] = df6["TIMEADJUSTEARLY"].apply(convert_ms_to_hours)
df6["Late_Adjust_Hours"] = df6["TIMEADJUSTLATE"].apply(convert_ms_to_hours)
df6["Potential Hours"] = pd.to_numeric(df6["Potential Hours"], errors='coerce').fillna(0)

df6["Actual Hours"] = df6["Potential Hours"] - (df6["Early_Adjust_Hours"] + df6["Late_Adjust_Hours"])
df6.loc[df6["Actual Hours"] < 0, "Actual Hours"] = 0

assessment_values = ['Access Assessments', 'Assessments', 'Beauty OL Assess', 'CFL PA Vols Assess',
                     'Creative Assessments', 'E&M ApprenticeAssess', 'English FS Assess', 'English GCSE Assess',
                     'ESOL F2F Assess', 'ESOL Online Assess', 'Hair/Barb F2F Assess', 'Maths Assessments',
                     'SWiS Assessments']
df6["ActivityType_Grouped"] = df6["ACTIVITYTYPE"].apply(lambda x: "Assessment" if x in assessment_values else x)

# Process Approved (File 1) and Archived (File 2) for Qualification Lookup
file_1 = "Approved List for Terms Caseloading FY2324.xlsx"
df1 = pd.read_excel(file_1, header=0)
df1.columns = [c.strip() for c in df1.columns]
req_cols_1 = ["QualificationName","QualificationNumber"]
for c in req_cols_1:
    if c not in df1.columns:
        raise ValueError("Required column missing in File 1")

file_2 = "Archived List for Terms Caseloading FY2324.xlsx"
df2 = pd.read_excel(file_2, header=0)
df2.columns = [c.strip() for c in df2.columns]
for c in req_cols_1:
    if c not in df2.columns:
        raise ValueError("Required column missing in File 2")

approved_lookup = dict(zip(df1["QualificationNumber"], df1["QualificationName"]))
archived_lookup = dict(zip(df2["QualificationNumber"], df2["QualificationName"]))
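
# Assumption: QualificationNumber may be read back as numeric or padded text depending on
# the workbook, while Funding Reference is cleaned to a stripped string below. Normalising
# the lookup keys to stripped strings keeps both sides of the lookup consistent.
approved_lookup = {str(k).strip(): v for k, v in approved_lookup.items()}
archived_lookup = {str(k).strip(): v for k, v in archived_lookup.items()}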

# Process Terms Caseloading (File 5)
file_5 = "Terms caseloading 23.24 FY.xlsx"
df5 = pd.read_excel(file_5, header=0)
df5.columns = [c.strip() for c in df5.columns]

req_cols_5 = ["Primary Staff Name","Primary Staff Surname","Activity Name","Funding Reference",
              "Start Date","End Date","Level","Activity Group"]
for c in req_cols_5:
    if c not in df5.columns:
        raise ValueError(f"Required column {c} missing in File 5")

df5["Primary Staff Name"] = df5["Primary Staff Name"].apply(clean_name)
df5["Primary Staff Surname"] = df5["Primary Staff Surname"].apply(clean_name)

def clean_funding_ref(fr):
    if pd.isnull(fr):
        return None
    # Convert to string, trim, and drop a trailing 'P'/'N' suffix.
    fr = str(fr).strip()
    if fr.endswith("P") or fr.endswith("N"):
        fr = fr[:-1]
    # Normalise any case variant of "blank" to the literal "BLANK".
    if fr.upper() == "BLANK":
        fr = "BLANK"
    return fr

df5["Funding Reference"] = df5["Funding Reference"].apply(clean_funding_ref)

def get_qualification_name(fref, aname):
    if fref is None or fref == "BLANK":
        return aname
    if fref in approved_lookup:
        return approved_lookup[fref]
    if fref in archived_lookup:
        return archived_lookup[fref]
    return aname

df5["QualificationName"] = df5.apply(lambda row: get_qualification_name(row["Funding Reference"], row["Activity Name"]), axis=1)

df5["Start Date"] = pd.to_datetime(df5["Start Date"], errors='coerce')
df5["End Date"] = pd.to_datetime(df5["End Date"], errors='coerce')


# Process PICS Caseload (File 3) and Hours Lookup (File 4)
file_3 = "Pics caseload for PBI.xlsx"
df3 = pd.read_excel(file_3, header=0)
df3.columns = [c.strip() for c in df3.columns]

req_cols_3 = ["Assessor Full Name","Programme","Apprenticeship Standard Title",
              "Apprenticeship Achieved Date","Start Date","Learning Expected End","Actual End"]
for c in req_cols_3:
    if c not in df3.columns:
        raise ValueError(f"Required column {c} missing in File 3")

df3["Start Date"] = pd.to_datetime(df3["Start Date"], errors='coerce')
df3["Learning Expected End"] = pd.to_datetime(df3["Learning Expected End"], errors='coerce')
df3["Actual End"] = pd.to_datetime(df3["Actual End"], errors='coerce')
df3["Apprenticeship Achieved Date"] = pd.to_datetime(df3["Apprenticeship Achieved Date"], errors='coerce')

df3.to_csv("processed_data/pics_caseload_processed.csv", index=False)

file_4 = "PICS Hours for Assessor Look up.xlsx"
df4 = pd.read_excel(file_4, header=2)
df4.columns = [c.strip() for c in df4.columns]

req_cols_4 = ["Standard title","Weighted Monthly Hours (1.6)"]
for c in req_cols_4:
    if c not in df4.columns:
        raise ValueError(f"Required column {c} missing in File 4")

df4["Weighted Monthly Hours (1.6)"] = df4["Weighted Monthly Hours (1.6)"].apply(convert_h_mm_to_hours)
df4.to_csv("processed_data/pics_hours_lookup.csv", index=False)

# Create Master Staff List from contracts
staff_df = contracts_df[["First Name","Known As","Last Name","TutorName_FirstLast","TutorName_KnownAsLast"]].drop_duplicates()
staff_df.to_csv("processed_data/master_staff_list.csv", index=False)

# Create a mapping from TutorName_FirstLast -> (FirstName, KnownAs, LastName)
staff_map = {}
for idx,row in staff_df.iterrows():
    staff_map[row["TutorName_FirstLast"]] = (row["First Name"], row["Known As"], row["Last Name"])

# Match names in TutorPayReport (df6)
df6["Staff_FirstLast"] = df6["PEOPLENAME"].apply(clean_name) + " " + df6["PEOPLESURNAME"].apply(clean_name)
df6["Matched_FirstName"] = df6["Staff_FirstLast"].apply(lambda x: staff_map[x][0] if x in staff_map else np.nan)
df6["Matched_KnownAs"] = df6["Staff_FirstLast"].apply(lambda x: staff_map[x][1] if x in staff_map else np.nan)
df6["Matched_LastName"] = df6["Staff_FirstLast"].apply(lambda x: staff_map[x][2] if x in staff_map else np.nan)

# Match names in Terms caseloading (df5)
df5["Staff_FirstLast"] = df5["Primary Staff Name"] + " " + df5["Primary Staff Surname"]
df5["Staff_FirstLast"] = df5["Staff_FirstLast"].apply(clean_name)
df5["Matched_FirstName"] = df5["Staff_FirstLast"].apply(lambda x: staff_map[x][0] if x in staff_map else np.nan)
df5["Matched_KnownAs"] = df5["Staff_FirstLast"].apply(lambda x: staff_map[x][1] if x in staff_map else np.nan)
df5["Matched_LastName"] = df5["Staff_FirstLast"].apply(lambda x: staff_map[x][2] if x in staff_map else np.nan)

# Validation check for unmatched names in File 6
unmatched_6 = df6[df6["Matched_FirstName"].isna()][["PEOPLENAME","PEOPLESURNAME","Staff_FirstLast"]].drop_duplicates()

# Validation check for unmatched names in File 5
unmatched_5 = df5[df5["Matched_FirstName"].isna()][["Primary Staff Name","Primary Staff Surname","Staff_FirstLast"]].drop_duplicates()


# Save unmatched names with more intuitive file names
unmatched_5.to_csv("processed_data/unmatched_names_terms_caseloading.csv", index=False)
unmatched_6.to_csv("processed_data/unmatched_names_tutorpayreport.csv", index=False)

# Calculate summary for Terms caseloading (File 5)
total_names_5 = df5["Staff_FirstLast"].nunique()
unmatched_count_5 = len(unmatched_5)
matched_count_5 = total_names_5 - unmatched_count_5

# Identify matched names for File 5
unmatched_names_5_set = set(unmatched_5["Staff_FirstLast"])
all_names_5_set = set(df5["Staff_FirstLast"].unique())
matched_names_5 = list(all_names_5_set - unmatched_names_5_set)

# Create a summary DataFrame for File 5
summary_5 = pd.DataFrame([{
    "Total Unique Names": total_names_5,
    "Unmatched Count": unmatched_count_5,
    "Matched Count": matched_count_5
}])
summary_5.to_csv("processed_data/terms_caseloading_name_matching_summary.csv", index=False)

# Save matched names for File 5
matched_5_df = pd.DataFrame(matched_names_5, columns=["Matched Staff_FirstLast"])
matched_5_df.to_csv("processed_data/matched_names_terms_caseloading.csv", index=False)

# Calculate summary for TutorPayReport (File 6)
total_names_6 = df6["Staff_FirstLast"].nunique()
unmatched_count_6 = len(unmatched_6)
matched_count_6 = total_names_6 - unmatched_count_6

# Identify matched names for File 6
unmatched_names_6_set = set(unmatched_6["Staff_FirstLast"])
all_names_6_set = set(df6["Staff_FirstLast"].unique())
matched_names_6 = list(all_names_6_set - unmatched_names_6_set)

# Create a summary DataFrame for File 6
summary_6 = pd.DataFrame([{
    "Total Unique Names": total_names_6,
    "Unmatched Count": unmatched_count_6,
    "Matched Count": matched_count_6
}])
summary_6.to_csv("processed_data/tutorpayreport_name_matching_summary.csv", index=False)

# Save matched names for File 6
matched_6_df = pd.DataFrame(matched_names_6, columns=["Matched Staff_FirstLast"])
matched_6_df.to_csv("processed_data/matched_names_tutorpayreport.csv", index=False)

# Save final processed versions
df6.to_csv("processed_data/tutorpayreport_processed.csv", index=False)
df5.to_csv("processed_data/terms_caseloading_processed.csv", index=False)

print("All processing complete. Check 'processed_data' folder for output files.")