Untitled

 avatar
user_1718919
plain_text
10 months ago
5.4 kB
4
Indexable
unique_employees_previous = final_df[['First_Name', 'Known_As', 'Last_Name', 'FullName', 'KnownAsFullName']].drop_duplicates()


import pandas as pd

# Read the TutorPayReport data
df_new = pd.read_excel("TutorPayReport_New 23.24 FY 22.11.24.xlsx")

# Clean column names: remove spaces, etc. (Adjust if needed)
df_new.columns = df_new.columns.str.strip().str.replace(' ', '_')

# Keep only the columns we care about:
# From your instructions:
# C: Course_Funding_Group
# E: Course_Prov_Mon_C (Curriculum name)
# F: Course_Prov_Mon_D (Topline venue name)
# G: VENUE (Actual venue name)
# H: PEOPLENAME
# I: PEOPLESURNAME
# M: EVENTDATE
# N: TIMEADJUSTEARLY
# O: TIMEADJUSTLATE
# S: ACTIVITYTYPE
# U: STAFFROLE
# V: CONTRACTNAME
# W: Potential_Hours

# Rename columns according to their letters if they didn't come in clean:
# Check actual headers after loading. Assuming they match exactly after cleaning:
# For example, if original columns were "Course Funding Group", "Course Prov Mon C", etc.
# This step depends on the exact original names. Let's assume they are now:
# "Course_Funding_Group", "Course_Prov_Mon_C", "Course_Prov_Mon_D", "VENUE",
# "PEOPLENAME", "PEOPLESURNAME", "EVENTDATE", "TIMEADJUSTEARLY", "TIMEADJUSTLATE",
# "ACTIVITYTYPE", "STAFFROLE", "CONTRACTNAME", "Potential_Hours"

df_new = df_new[[
    'Course_Funding_Group', 'Course_Prov_Mon_C', 'Course_Prov_Mon_D',
    'VENUE', 'PEOPLENAME', 'PEOPLESURNAME', 'EVENTDATE', 'TIMEADJUSTEARLY',
    'TIMEADJUSTLATE', 'ACTIVITYTYPE', 'STAFFROLE', 'CONTRACTNAME',
    'Potential_Hours'
]]

# Filter by STAFFROLE and CONTRACTNAME
df_new = df_new[df_new['STAFFROLE'].isin(['Tutor', 'Learning Support'])]
df_new = df_new[df_new['CONTRACTNAME'].isin(['LSA-NB', 'SALARIED TUTOR', 'SESSIONAL TUTOR'])]

# At this point, we have a filtered df_new but we have not yet grouped activities or calculated actual hours.
# Now we want to extract unique employees from df_new to attempt the name matching.

unique_employees_new = df_new[['PEOPLENAME', 'PEOPLESURNAME']].drop_duplicates()




# Merge on last names first
merged = unique_employees_new.merge(
    unique_employees_previous,
    left_on='PEOPLESURNAME',
    right_on='Last_Name',
    how='left'
)

# Prepare for name comparison
merged['peoplename_lower'] = merged['PEOPLENAME'].str.lower().str.strip()
merged['first_name_lower'] = merged['First_Name'].str.lower().str.strip()
merged['known_as_lower'] = merged['Known_As'].str.lower().str.strip()

# Identify matches where PEOPLENAME matches either First_Name or Known_As
matches = merged[
    (merged['first_name_lower'] == merged['peoplename_lower']) |
    (merged['known_as_lower'] == merged['peoplename_lower'])
]

match_count = matches.groupby(['PEOPLENAME','PEOPLESURNAME']).size().reset_index(name='Matches_Found')

# Check how many matches each new tutor got
detailed_matches = matches.merge(match_count, on=['PEOPLENAME','PEOPLESURNAME'])

# Now you can analyze `detailed_matches`:
# - If Matches_Found == 1: perfect single match.
# - If Matches_Found > 1: multiple matches, need manual review.
# - If no entry in match_count for a given tutor in unique_employees_new: no matches found, also needs review.

# Let's identify unmatched employees
unmatched = unique_employees_new.merge(match_count, on=['PEOPLENAME','PEOPLESURNAME'], how='left')
unmatched = unmatched[unmatched['Matches_Found'].isna()]
# 'unmatched' shows tutors from the new dataset with no matches in previous dataset



# Suppose `employee_mapping` has columns: PEOPLENAME, PEOPLESURNAME, Matched_FullName, Matched_LastName, etc.
# You can merge it back:
df_new = df_new.merge(employee_mapping, on=['PEOPLENAME','PEOPLESURNAME'], how='left')

# Grouping ActivityType
assessment_values = [
    'Access Assessments', 'Assessments', 'Beauty OL Assess', 'CFL PA Vols Assess',
    'Creative Assessments', 'E&M ApprenticeAssess', 'English FS Assess',
    'English GCSE Assess', 'ESOL F2F Assess', 'ESOL Online Assess',
    'Hair/Barb F2F Assess', 'Maths Assessments', 'SWiS Assessments'
]

def map_activity_type(x):
    x = x.strip()
    if x in assessment_values:
        return 'Assessment'
    elif x == 'Community Engagement':
        return 'Community Engagement'
    elif x == 'Tutorials/Drop Ins':
        return 'Tutorials/Drop Ins'
    else:
        return None  # or filter these out

df_new['Activity_Category'] = df_new['ACTIVITYTYPE'].apply(map_activity_type)
df_new = df_new[df_new['Activity_Category'].notna()]  # Keep only rows we care about

# Calculate Actual Hours
# TIMEADJUSTEARLY and TIMEADJUSTLATE are in milliseconds, and negative or zero.
# Convert to positive hours:
df_new['TIMEADJUSTEARLY_Hours'] = (df_new['TIMEADJUSTEARLY'].abs() / (1000*60*60))
df_new['TIMEADJUSTLATE_Hours'] = (df_new['TIMEADJUSTLATE'].abs() / (1000*60*60))

df_new['Actual_Hours'] = df_new['Potential_Hours'] - (df_new['TIMEADJUSTEARLY_Hours'] + df_new['TIMEADJUSTLATE_Hours'])
df_new['Actual_Hours'] = df_new['Actual_Hours'].apply(lambda x: x if x > 0 else 0)

# Date Handling: Convert EVENTDATE from "19 Sept 2023" to DD/MM/YYYY
df_new['EVENTDATE'] = pd.to_datetime(df_new['EVENTDATE'], format='%d %b %Y', errors='coerce')
df_new['EVENTDATE'] = df_new['EVENTDATE'].dt.strftime('%d/%m/%Y')

# Now df_new is fully processed.
Editor is loading...
Leave a Comment