o1-1

# Cell 1: Imports and Setup
import os
import glob
import pandas as pd
import numpy as np

# Ensure we are in the "Staff Utilisation" directory
# If not, you may need to adjust paths accordingly.
# We'll assume the notebook is placed in the "Staff Utilisation" folder.

# Create processed_data directory if it doesn't exist
os.makedirs('processed_data', exist_ok=True)






# Cell 2: Define Helper Functions

def extract_month_year_from_filename(filename):
    """
    Extract month and year from filename like 'ACL Monthly Report 17.04.23.xlsx'.
    We'll assume the format is always '... DD.MM.YY.xlsx'.
    Example: 'ACL Monthly Report 17.04.23.xlsx' -> April 2023
    We'll return a pandas Period or (month, year) tuple.
    """
    # filename might be something like: 'ACL Monthly Report 17.04.23.xlsx'
    # We split by space and get the last part before .xlsx
    base = os.path.basename(filename)
    name_part = os.path.splitext(base)[0]  # 'ACL Monthly Report 17.04.23'
    parts = name_part.split()  # ['ACL', 'Monthly', 'Report', '17.04.23']
    date_str = parts[-1]  # '17.04.23'
    
    # The middle part is '04' for month, '23' for year.
    # Day might not be needed but let's parse anyway.
    # Format: DD.MM.YY
    day, month, year = date_str.split('.')
    
    # We can store this as a YYYY-MM string (financial year presumably 2023 means year=2023)
    # We'll assume 20xx for year:
    year_full = 2000 + int(year)
    month_int = int(month)
    
    return pd.Period(f'{year_full}-{month_int}', freq='M')  # Monthly period

def clean_name(name_str):
    """
    Standardize a name string by stripping spaces and making title case.
    """
    if pd.isna(name_str):
        return ''
    return ' '.join(str(name_str).strip().split()).title()  # Remove extra spaces and title-case

def parse_line_manager_name(full_name):
    """
    Line Manager Name is in format 'FirstName KnownAs, LastName'.
    Example: 'Joanne Jo, Loss'
    We want two formats:
      1) FirstName LastName
      2) KnownAs LastName
    We'll assume first part before comma is "FirstName KnownAs"
    and after comma is LastName.
    If it's something like "Joanne Jo, Loss":
       FirstName might be 'Joanne'
       KnownAs = 'Jo'
       LastName = 'Loss'
    If there's only one first name and no known-as, known-as = first name.
    
    We'll try to split by comma first, then split the left part by spaces.
    """
    if pd.isna(full_name) or full_name.strip() == '':
        return pd.Series(["", ""])
    
    parts = full_name.split(',')
    last_name = clean_name(parts[-1]) if len(parts) > 1 else ''
    first_known = parts[0].strip().split()
    if len(first_known) == 1:
        # Only one name given, treat as both first name and known as
        first_name = clean_name(first_known[0])
        known_as = first_name
    else:
        # Assume last item in the first_known array is known-as
        # and the first is the formal first name
        # If there's more complexity, we may need a more robust approach
        first_name = clean_name(first_known[0])
        known_as = clean_name(first_known[-1])
    
    # Return as a Series: [FirstName LastName, KnownAs LastName]
    return pd.Series([f"{first_name} {last_name}", f"{known_as} {last_name}"])






# Cell 3: Process the 12 Monthly ACL Reports

# We assume the folder 'monthly_reports' has 12 files named similarly to 'ACL Monthly Report DD.MM.YY.xlsx'.
# We'll read them all, clean them, and combine into a single DataFrame with a 'Month' column.

monthly_files = glob.glob('monthly_reports/*.xlsx')
all_monthly_dfs = []

for f in monthly_files:
    month_period = extract_month_year_from_filename(f)  # Get the month-year period
    # Monthly files start from 3rd row with column names
    # According to instructions: 
    # 1st row = headline, 2nd row = empty, column names start from 3rd row
    # So skip the first two rows.
    df = pd.read_excel(f, skiprows=2)
    
    # Clean column names (if needed)
    df.columns = [str(c).strip() for c in df.columns]
    
    # Add a column for the period
    df['Report_Month'] = month_period
    
    all_monthly_dfs.append(df)

acl_monthly_df = pd.concat(all_monthly_dfs, ignore_index=True)

# Useful columns reference from instructions:
# Assignment Number (A), Title (B), First Name (C), Known As (D), Last Name (E)
# Position Name (J), Working Hours (K), Full-Time Equivalent (Q), Line Manager Name (W)
# We'll rename columns to something standardized if needed.

# Let's store original column names to variables:
orig_cols = acl_monthly_df.columns
# We know from instructions the approximate column positions:
# Let's align with instructions (Adjust if needed based on actual data):
# A: Assignment Number
# B: Title
# C: First Name
# D: Known As
# E: Last Name
# J: Position Name
# K: Working Hours
# Q: Full-Time Equivalent
# W: Line Manager Name
acl_monthly_df = acl_monthly_df.rename(columns={
    'Assignment Number': 'Assignment_Number',
    'Title': 'Title',
    'First Name': 'First_Name',
    'Known As': 'Known_As',
    'Last Name': 'Last_Name',
    'Position Name': 'Position_Name',
    'Working Hours': 'Working_Hours',
    'Full-Time Equivalent': 'FTE',
    'Line Manager Name': 'Line_Manager_Name'
}, errors='ignore')  # in case some columns differ in case/spaces

# Filter only Tutors and LSAs
# Position_Name must contain 'Tutor' or 'Learning Support Assistant'
acl_monthly_df = acl_monthly_df[acl_monthly_df['Position_Name'].isin(['Tutor', 'Learning Support Assistant'])]

# Clean names
acl_monthly_df['First_Name'] = acl_monthly_df['First_Name'].apply(clean_name)
acl_monthly_df['Known_As'] = acl_monthly_df['Known_As'].apply(clean_name)
acl_monthly_df['Last_Name'] = acl_monthly_df['Last_Name'].apply(clean_name)
acl_monthly_df['Line_Manager_Name'] = acl_monthly_df['Line_Manager_Name'].apply(clean_name)

# Parse line manager into two formats
acl_monthly_df[['LM_FirstNameLastName','LM_KnownAsLastName']] = acl_monthly_df['Line_Manager_Name'].apply(parse_line_manager_name)

# Determine Contract Type:
# If Position_Name == 'Learning Support Assistant' → LSA
# If Position_Name == 'Tutor':
#   If Working_Hours <= 3 → Sessional
#   If Working_Hours > 3 → Salaried

def contract_type(row):
    if row['Position_Name'] == 'Learning Support Assistant':
        return 'LSA'
    elif row['Position_Name'] == 'Tutor':
        if pd.to_numeric(row['Working_Hours'], errors='coerce') is not None:
            wh = float(row['Working_Hours'])
            if wh > 3:
                return 'Salaried Tutor'
            else:
                return 'Sessional Tutor'
        else:
            return 'Sessional Tutor' # default if parsing fails
    return None

acl_monthly_df['Contract_Type'] = acl_monthly_df.apply(contract_type, axis=1)

# Calculate Target Hours only for Salaried Tutors
# Target = FTE * 840, if Salaried Tutor
acl_monthly_df['FTE'] = pd.to_numeric(acl_monthly_df['FTE'], errors='coerce').fillna(0)
acl_monthly_df['Target_Hours'] = np.where(
    acl_monthly_df['Contract_Type']=='Salaried Tutor',
    acl_monthly_df['FTE']*840,
    0
)

# Now we have a combined monthly dataset with month info.
# This dataset can be used to track changes over time by grouping on Assignment_Number and seeing changes in Contract_Type across months.
# We'll save this processed monthly dataset
acl_monthly_df.to_csv('processed_data/acl_monthly_processed.csv', index=False)





# Cell 4: Process TutorPayReport (TERMS data)
# File: TutorPayReport_New 23.24 FY 22.11.24.xlsx
# This file has columns:
# PEOPLENAME (H), PEOPLESURNAME (I), STAFFROLE (U), CONTRACTNAME (V), ACTIVITYTYPE (S), Potential Hours (W), TIMEADJUSTEARLY (N), TIMEADJUSTLATE (O), ...
# EVENTDATE (M), Course Funding Group (C), Course Prov Mon C (E), Course Prov Mon D (F), VENUE (G)

tutor_pay_file = 'TutorPayReport_New 23.24 FY 22.11.24.xlsx'
tutor_pay_df = pd.read_excel(tutor_pay_file)

# Rename columns of interest
tutor_pay_df = tutor_pay_df.rename(columns={
    'PEOPLENAME': 'PeopleName',
    'PEOPLESURNAME': 'PeopleSurname',
    'STAFFROLE': 'StaffRole',
    'CONTRACTNAME': 'ContractName',
    'ACTIVITYTYPE': 'ActivityType',
    'Potential Hours': 'PotentialHours',
    'TIMEADJUSTEARLY': 'TimeAdjustEarly',
    'TIMEADJUSTLATE': 'TimeAdjustLate',
    'Course Funding Group': 'CourseFundingGroup',
    'Course Prov Mon C': 'CurriculumName',
    'Course Prov Mon D': 'ToplineVenue',
    'VENUE': 'Venue',
    'EVENTDATE': 'EventDate'
}, errors='ignore')

# Filter STAFFROLE: We only care about 'Tutor' and 'Learning Support' (Strip extra spaces)
tutor_pay_df['StaffRole'] = tutor_pay_df['StaffRole'].str.strip()
tutor_pay_df = tutor_pay_df[tutor_pay_df['StaffRole'].isin(['Tutor','Learning Support'])]

# Focus on CONTRACTNAME: 'LSA-NB', 'SALARIED TUTOR', 'SESSIONAL TUTOR'
tutor_pay_df['ContractName'] = tutor_pay_df['ContractName'].str.strip()

# ActivityType grouping for "Assessment"
assessment_types = ['Access Assessments', 'Assessments', 'Beauty OL Assess', 'CFL PA Vols Assess', 'Creative Assessments', 
                    'E&M ApprenticeAssess', 'English FS Assess', 'English GCSE Assess', 'ESOL F2F Assess', 'ESOL Online Assess', 
                    'Hair/Barb F2F Assess', 'Maths Assessments', 'SWiS Assessments']
tutor_pay_df['ActivityTypeGroup'] = np.where(tutor_pay_df['ActivityType'].isin(assessment_types), 
                                             'Assessment', tutor_pay_df['ActivityType'])

# Calculate Actual Hours
# TIMEADJUSTEARLY & TIMEADJUSTLATE in ms
# Convert ms to hours by dividing by (1000*60*60)
tutor_pay_df['TimeAdjustEarly'] = pd.to_numeric(tutor_pay_df['TimeAdjustEarly'], errors='coerce').fillna(0)
tutor_pay_df['TimeAdjustLate'] = pd.to_numeric(tutor_pay_df['TimeAdjustLate'], errors='coerce').fillna(0)
tutor_pay_df['PotentialHours'] = pd.to_numeric(tutor_pay_df['PotentialHours'], errors='coerce').fillna(0)

early_hours = tutor_pay_df['TimeAdjustEarly']/(1000*60*60)
late_hours = tutor_pay_df['TimeAdjustLate']/(1000*60*60)

tutor_pay_df['ActualHours'] = tutor_pay_df['PotentialHours'] - (early_hours + late_hours)
tutor_pay_df['ActualHours'] = tutor_pay_df['ActualHours'].apply(lambda x: x if x>0 else 0)

# Clean PeopleName and PeopleSurname
tutor_pay_df['PeopleName'] = tutor_pay_df['PeopleName'].apply(clean_name)
tutor_pay_df['PeopleSurname'] = tutor_pay_df['PeopleSurname'].apply(clean_name)

# Save processed Tutor Pay Data
tutor_pay_df.to_csv('processed_data/tutor_pay_processed.csv', index=False)





# Cell 5: Approved and Archived Lists for Terms Caseloading
# We will create a dictionary/map of QualificationNumber -> QualificationName from Approved first, then Archived.

approved_file = 'Approved List for Terms Caseloading FY2324.xlsx'
archived_file = 'Archived List for Terms Caseloading FY2324.xlsx'

approved_df = pd.read_excel(approved_file)
archived_df = pd.read_excel(archived_file)

# Columns we care about: QualificationName (B), QualificationNumber (D)
approved_df = approved_df.rename(columns={'QualificationName': 'QualificationName', 'QualificationNumber': 'QualificationNumber'})
archived_df = archived_df.rename(columns={'QualificationName': 'QualificationName', 'QualificationNumber': 'QualificationNumber'})

approved_map = dict(zip(approved_df['QualificationNumber'], approved_df['QualificationName']))
archived_map = dict(zip(archived_df['QualificationNumber'], archived_df['QualificationName']))





# Cell 6: Terms Caseloading File
# File 5: Terms caseloading 23.24 FY.xlsx
# Columns of interest:
# Funding Reference (M), Activity Name (D)
# Primary Staff Name (A), Primary Staff Surname (B)
# Start Date (H), End Date (I), Level (N), Activity Group (S)

terms_caseload_file = 'Terms caseloading 23.24 FY.xlsx'
terms_df = pd.read_excel(terms_caseload_file)

# Rename columns
terms_df = terms_df.rename(columns={
    'Funding Reference': 'FundingReference',
    'Activity Name': 'ActivityName',
    'Primary Staff Name': 'PrimaryStaffName',
    'Primary Staff Surname': 'PrimaryStaffSurname',
    'Start Date': 'StartDate',
    'End Date': 'EndDate',
    'Level': 'Level',
    'Activity Group': 'ActivityGroup'
}, errors='ignore')

# Clean staff names
terms_df['PrimaryStaffName'] = terms_df['PrimaryStaffName'].apply(clean_name)
terms_df['PrimaryStaffSurname'] = terms_df['PrimaryStaffSurname'].apply(clean_name)

# Process FundingReference:
# If ends in 'P' or 'N', remove that character
def clean_funding_ref(ref):
    if pd.isna(ref):
        return ''
    ref = str(ref).strip()
    if ref.endswith('P') or ref.endswith('N'):
        ref = ref[:-1]  # remove last char
    return ref

terms_df['FundingReference'] = terms_df['FundingReference'].apply(clean_funding_ref)

def get_qualification_name(ref):
    # If ref is BLANK or '', use ActivityName
    if ref == '' or ref.upper() == 'BLANK':
        return None
    # Try approved_map first
    if ref in approved_map:
        return approved_map[ref]
    # If not found, try archived_map
    if ref in archived_map:
        return archived_map[ref]
    return None

terms_df['QualificationName'] = terms_df['FundingReference'].apply(get_qualification_name)
terms_df['QualificationName'] = terms_df.apply(
    lambda x: x['QualificationName'] if pd.notna(x['QualificationName']) else x['ActivityName'],
    axis=1
)

# We now have a dataset with tutors and their learners, courses, etc.
# We'll save the processed terms data
terms_df.to_csv('processed_data/terms_caseload_processed.csv', index=False)





# Cell 7: PICS caseload for PBI (File 3)
# Columns of interest:
# Assessor Full Name (K), Programme (D), Apprenticeship Standard Title (N), Apprenticeship Achieved Date (Q), Start Date (G), Learning Expected End (H), Actual End (I)

pics_caseload_file = 'Pics caseload for PBI.xlsx'
pics_df = pd.read_excel(pics_caseload_file)

pics_df = pics_df.rename(columns={
    'Assessor Full Name': 'AssessorFullName',
    'Programme': 'Programme',
    'Apprenticeship Standard Title': 'ApprenticeshipStandardTitle',
    'Apprenticeship Achieved Date': 'ApprenticeshipAchievedDate',
    'Start Date': 'StartDate',
    'Learning Expected End': 'LearningExpectedEnd',
    'Actual End': 'ActualEnd'
}, errors='ignore')

# Clean assessor names
pics_df['AssessorFullName'] = pics_df['AssessorFullName'].apply(clean_name)

# Convert date columns if needed
date_cols = ['ApprenticeshipAchievedDate', 'StartDate', 'LearningExpectedEnd', 'ActualEnd']
for dc in date_cols:
    pics_df[dc] = pd.to_datetime(pics_df[dc], errors='coerce')

# We'll count active learners per assessor and month in Power BI later using these date fields.

# Save processed PICS caseload
pics_df.to_csv('processed_data/pics_caseload_processed.csv', index=False)




# Cell 8: PICS Hours for Assessor Look up (File 4)
# Columns start on row 3
# Columns: B 'Standard title', F 'Weighted Monthly Hours (1.6)'
pics_hours_file = 'PICS Hours for Assessor Look up.xlsx'

pics_hours_df = pd.read_excel(pics_hours_file, skiprows=2)
pics_hours_df = pics_hours_df.rename(columns={'Standard title': 'StandardTitle', 'Weighted Monthly Hours (1.6)': 'WeightedMonthlyHours'})

# Hours in format H:MM, convert to decimal hours
def time_str_to_hours(t):
    if pd.isna(t):
        return 0.0
    # assume format H:MM
    parts = str(t).split(':')
    if len(parts) == 2:
        h = float(parts[0])
        m = float(parts[1])
        return h + m/60.0
    else:
        # If something else, try to convert directly
        return pd.to_numeric(t, errors='coerce').fillna(0)

pics_hours_df['WeightedMonthlyHours'] = pics_hours_df['WeightedMonthlyHours'].apply(time_str_to_hours)

# Save processed
pics_hours_df.to_csv('processed_data/pics_hours_lookup_processed.csv', index=False)




# Cell 9: Additional Data Cleaning / Name Matching
# We have multiple datasets with staff names (ACL monthly, TutorPay, Terms, PICS).
# We will create a master staff mapping. This is optional but recommended.

# Extract unique staff from ACL monthly (tutors and LSAs)
acl_staff = acl_monthly_df[['First_Name','Known_As','Last_Name','Contract_Type']].drop_duplicates()

# Extract from Tutor Pay
tutor_pay_staff = tutor_pay_df[['PeopleName','PeopleSurname','StaffRole','ContractName']].drop_duplicates()
tutor_pay_staff = tutor_pay_staff.rename(columns={
    'PeopleName':'First_Name',
    'PeopleSurname':'Last_Name'
})

# Extract from Terms caseloading (tutors)
terms_staff = terms_df[['PrimaryStaffName','PrimaryStaffSurname']].drop_duplicates()
terms_staff = terms_staff.rename(columns={
    'PrimaryStaffName':'First_Name',
    'PrimaryStaffSurname':'Last_Name'
})

# Extract from PICS caseload (assessors)
pics_staff = pics_df[['AssessorFullName']].drop_duplicates()
# Split assessor full name into first/last if possible:
def split_full_name(fullname):
    # simplistic split by space
    if pd.isna(fullname) or fullname.strip()=='':
        return pd.Series(['',''])
    parts = fullname.strip().split()
    first, last = parts[0], parts[-1]
    return pd.Series([first.title(), last.title()])
pics_staff[['First_Name','Last_Name']] = pics_staff['AssessorFullName'].apply(split_full_name)
pics_staff = pics_staff.drop(columns=['AssessorFullName'])

# In reality, you might need a more robust name matching strategy (fuzzy matching).
# For now, we assume clean_name and standardization suffices.

# Combine all staff references
all_staff = pd.concat([acl_staff[['First_Name','Known_As','Last_Name']],
                       tutor_pay_staff[['First_Name','Last_Name']],
                       terms_staff[['First_Name','Last_Name']],
                       pics_staff[['First_Name','Last_Name']]], ignore_index=True)

all_staff = all_staff.fillna('')
all_staff = all_staff.drop_duplicates()

# This master staff list can be refined manually if needed,
# or used as a lookup to ensure consistency in Power BI relationships.
all_staff.to_csv('processed_data/all_staff_master.csv', index=False)



# Cell 10: Final Notes and Outputs

# We have now created processed datasets for:
# - Monthly ACL reports: acl_monthly_processed.csv
# - TutorPay (TERMS) data: tutor_pay_processed.csv
# - Terms caseloading: terms_caseload_processed.csv
# - PICS caseload: pics_caseload_processed.csv
# - PICS hours lookup: pics_hours_lookup_processed.csv
# - Master staff list: all_staff_master.csv

# These processed datasets can now be loaded into Power BI.
# In Power BI, we will:
# - Create relationships between these tables based on staff names (or ideally a unique staff ID if available).
# - Implement filters/slicers for Line Manager, Month, Week, Funding Group, Curriculum, Venues, Levels.
# - Create measures for utilization percentage = (Actual Hours / Target Hours) for salaried tutors.
# - Create drill-through pages to show detail tables.
# - Implement a toggle to show Known As vs First Name by using a slicer or bookmark that switches a measure or a calculated column in visuals.

# End of code.
print("Data processing completed. Processed files available in 'processed_data' folder.")
Editor is loading...