Untitled

import pandas as pd
import numpy as np
import glob
import re
from calendar import month_name

# Helper function to parse line manager name
def parse_line_manager(name):
    if pd.isna(name):
        return pd.Series([None, None])
    parts = name.split(',')
    if len(parts) == 2:
        first_known = parts[0].strip()  # e.g. "Joanne Jo"
        last_name = parts[1].strip()
        fk_parts = first_known.split()
        if len(fk_parts) == 2:
            first_name = fk_parts[0].strip()
            known_as = fk_parts[1].strip()
            return pd.Series([first_name + " " + last_name, known_as + " " + last_name])
    return pd.Series([None, None])

# Read all ACL monthly report files
files = glob.glob("ACL Monthly Report *.xlsx")

all_dataframes = []
for file in files:
    # Extract month/year from filename
    match = re.search(r'ACL Monthly Report (\d{2})\.(\d{2})\.(\d{2})\.xlsx', file)
    if match:
        day_str = match.group(1)
        month_str = match.group(2)
        year_str = match.group(3)

        month_int = int(month_str)
        full_year = int("20" + year_str)
        month_name_str = month_name[month_int]

        # Read the Excel file with correct header skipping (adjust if needed)
        df = pd.read_excel(file, skiprows=2, header=0)
        df.columns = df.columns.str.strip().str.replace(' ', '_')

        # Add Month/Year columns
        df['Month'] = month_name_str
        df['MonthNumber'] = month_int
        df['Year'] = full_year

        all_dataframes.append(df)

final_df = pd.concat(all_dataframes, ignore_index=True)

# Filter to only Tutors and LSAs
final_df = final_df[final_df['Position_Name'].isin(['Tutor', 'Learning Support Assistant'])]

# Determine Contract Type (Sessional vs Salaried)
final_df['Working_Hours'] = pd.to_numeric(final_df['Working_Hours'], errors='coerce')
final_df['ContractType'] = final_df['Working_Hours'].apply(lambda x: "Sessional" if x <= 3 else "Salaried")

# Calculate Target Hours
final_df['Full-Time_Equivalent'] = pd.to_numeric(final_df['Full-Time_Equivalent'], errors='coerce')
final_df['Target_Hours'] = final_df['Full-Time_Equivalent'] * 840

# Create Employee Name Variations
final_df['FullName'] = final_df['First_Name'].str.strip() + " " + final_df['Last_Name'].str.strip()
final_df['KnownAsFullName'] = final_df['Known_As'].str.strip() + " " + final_df['Last_Name'].str.strip()

# Parse Line Manager Names
final_df[['LineManager_FullName', 'LineManager_KnownAsFullName']] = final_df['Line_Manager_Name'].apply(parse_line_manager)

# Identify changes month-to-month
final_df['YearMonth'] = final_df['Year']*100 + final_df['MonthNumber']
final_df = final_df.sort_values(by=['Assignment_Number', 'YearMonth'])

grouped = final_df.groupby('Assignment_Number', group_keys=False)
final_df['Prev_Position_Name'] = grouped['Position_Name'].shift(1)
final_df['Prev_ContractType'] = grouped['ContractType'].shift(1)
final_df['Prev_YearMonth'] = grouped['YearMonth'].shift(1)

final_df['Is_New_Contract_This_Month'] = final_df['Prev_YearMonth'].isna()

final_df['Next_YearMonth'] = grouped['YearMonth'].shift(-1)
final_df['Will_End_After_This_Month'] = final_df['Next_YearMonth'].isna()

final_df['Position_Changed'] = (final_df['Position_Name'] != final_df['Prev_Position_Name']) & final_df['Prev_Position_Name'].notna()
final_df['ContractType_Changed'] = (final_df['ContractType'] != final_df['Prev_ContractType']) & final_df['Prev_ContractType'].notna()

# Select only necessary columns (adjust if needed)
final_columns = [
    'Assignment_Number', 'Title', 'First_Name', 'Known_As', 'Last_Name',
    'Position_Name', 'Working_Hours', 'Full-Time_Equivalent', 'Month',
    'MonthNumber', 'Year', 'Line_Manager_Name', 'ContractType', 'Target_Hours',
    'FullName', 'KnownAsFullName', 'LineManager_FullName', 'LineManager_KnownAsFullName',
    'YearMonth', 'Is_New_Contract_This_Month', 'Will_End_After_This_Month',
    'Position_Changed', 'ContractType_Changed'
]

# Ensure all these columns exist before subsetting:
existing_cols = final_df.columns
missing_cols = [c for c in final_columns if c not in existing_cols]
if missing_cols:
    print("Missing columns:", missing_cols)
else:
    final_df = final_df[final_columns]

# Extract unique employees from final_df
unique_employees_previous = final_df[['First_Name', 'Known_As', 'Last_Name', 'FullName', 'KnownAsFullName']].drop_duplicates()

print("final_df shape:", final_df.shape)
print("unique_employees_previous shape:", unique_employees_previous.shape)
print("Sample of final_df:")
print(final_df.head())
Editor is loading...