import pandas as pd

# `final_df` comes from the earlier processing step (the previous staff list);
# pull out the unique employees so we can match names against the new report.
unique_employees_previous = final_df[
    ['First_Name', 'Known_As', 'Last_Name', 'FullName', 'KnownAsFullName']
].drop_duplicates()

# Read the TutorPayReport data
df_new = pd.read_excel("TutorPayReport_New 23.24 FY 22.11.24.xlsx")

# Clean column names: strip whitespace and replace spaces with underscores (adjust if needed)
df_new.columns = df_new.columns.str.strip().str.replace(' ', '_')

# Keep only the columns we care about (letters refer to the spreadsheet columns):
# C: Course_Funding_Group
# E: Course_Prov_Mon_C (Curriculum name)
# F: Course_Prov_Mon_D (Topline venue name)
# G: VENUE (Actual venue name)
# H: PEOPLENAME
# I: PEOPLESURNAME
# M: EVENTDATE
# N: TIMEADJUSTEARLY
# O: TIMEADJUSTLATE
# S: ACTIVITYTYPE
# U: STAFFROLE
# V: CONTRACTNAME
# W: Potential_Hours
#
# Check the actual headers after loading: if the originals were e.g. "Course Funding Group",
# "Course Prov Mon C", etc., the cleaning step above should leave them exactly as listed below.
df_new = df_new[[
    'Course_Funding_Group', 'Course_Prov_Mon_C', 'Course_Prov_Mon_D', 'VENUE',
    'PEOPLENAME', 'PEOPLESURNAME', 'EVENTDATE', 'TIMEADJUSTEARLY', 'TIMEADJUSTLATE',
    'ACTIVITYTYPE', 'STAFFROLE', 'CONTRACTNAME', 'Potential_Hours'
]]

# Filter by STAFFROLE and CONTRACTNAME
df_new = df_new[df_new['STAFFROLE'].isin(['Tutor', 'Learning Support'])]
df_new = df_new[df_new['CONTRACTNAME'].isin(['LSA-NB', 'SALARIED TUTOR', 'SESSIONAL TUTOR'])]

# At this point df_new is filtered, but activities have not yet been grouped
# and actual hours have not yet been calculated.

# Extract unique employees from df_new to attempt the name matching.
unique_employees_new = df_new[['PEOPLENAME', 'PEOPLESURNAME']].drop_duplicates()

# Merge on last names first
merged = unique_employees_new.merge(
    unique_employees_previous,
    left_on='PEOPLESURNAME',
    right_on='Last_Name',
    how='left'
)

# Prepare for case-insensitive name comparison
merged['peoplename_lower'] = merged['PEOPLENAME'].str.lower().str.strip()
merged['first_name_lower'] = merged['First_Name'].str.lower().str.strip()
merged['known_as_lower'] = merged['Known_As'].str.lower().str.strip()

# Identify matches where PEOPLENAME matches either First_Name or Known_As
matches = merged[
    (merged['first_name_lower'] == merged['peoplename_lower']) |
    (merged['known_as_lower'] == merged['peoplename_lower'])
]

# Count how many matches each new tutor got
match_count = (
    matches.groupby(['PEOPLENAME', 'PEOPLESURNAME'])
    .size()
    .reset_index(name='Matches_Found')
)
detailed_matches = matches.merge(match_count, on=['PEOPLENAME', 'PEOPLESURNAME'])

# Now you can analyze `detailed_matches`:
# - Matches_Found == 1: perfect single match.
# - Matches_Found > 1: multiple matches, needs manual review.
# - No entry in match_count for a tutor in unique_employees_new: no match found, also needs review.

# Identify unmatched employees
unmatched = unique_employees_new.merge(match_count, on=['PEOPLENAME', 'PEOPLESURNAME'], how='left')
unmatched = unmatched[unmatched['Matches_Found'].isna()]
# `unmatched` lists tutors from the new dataset with no match in the previous dataset.

# Suppose `employee_mapping` has columns: PEOPLENAME, PEOPLESURNAME, Matched_FullName, Matched_LastName, etc.
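# --- Illustration only (not part of the original pipeline): a minimal sketch of how such an
# --- `employee_mapping` could be assembled from the unambiguous single-match rows, assuming
# --- the ambiguous and unmatched tutors are resolved manually and appended afterwards.
auto_mapping = (
    detailed_matches[detailed_matches['Matches_Found'] == 1]
    .loc[:, ['PEOPLENAME', 'PEOPLESURNAME', 'FullName', 'Last_Name']]
    .rename(columns={'FullName': 'Matched_FullName', 'Last_Name': 'Matched_LastName'})
    .drop_duplicates()
)
# employee_mapping = pd.concat([auto_mapping, manually_reviewed_mapping], ignore_index=True)
# (`manually_reviewed_mapping` is a hypothetical DataFrame covering the tutors flagged above.)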
# You can merge it back:
df_new = df_new.merge(employee_mapping, on=['PEOPLENAME', 'PEOPLESURNAME'], how='left')

# Grouping ACTIVITYTYPE into categories
assessment_values = [
    'Access Assessments', 'Assessments', 'Beauty OL Assess', 'CFL PA Vols Assess',
    'Creative Assessments', 'E&M ApprenticeAssess', 'English FS Assess',
    'English GCSE Assess', 'ESOL F2F Assess', 'ESOL Online Assess',
    'Hair/Barb F2F Assess', 'Maths Assessments', 'SWiS Assessments'
]

def map_activity_type(x):
    x = x.strip()
    if x in assessment_values:
        return 'Assessment'
    elif x == 'Community Engagement':
        return 'Community Engagement'
    elif x == 'Tutorials/Drop Ins':
        return 'Tutorials/Drop Ins'
    else:
        return None  # anything else is filtered out below

df_new['Activity_Category'] = df_new['ACTIVITYTYPE'].apply(map_activity_type)
df_new = df_new[df_new['Activity_Category'].notna()]  # keep only the rows we care about

# Calculate Actual Hours.
# TIMEADJUSTEARLY and TIMEADJUSTLATE are in milliseconds and are negative or zero,
# so take the absolute value and convert to hours before subtracting from Potential_Hours.
df_new['TIMEADJUSTEARLY_Hours'] = df_new['TIMEADJUSTEARLY'].abs() / (1000 * 60 * 60)
df_new['TIMEADJUSTLATE_Hours'] = df_new['TIMEADJUSTLATE'].abs() / (1000 * 60 * 60)

df_new['Actual_Hours'] = df_new['Potential_Hours'] - (
    df_new['TIMEADJUSTEARLY_Hours'] + df_new['TIMEADJUSTLATE_Hours']
)
df_new['Actual_Hours'] = df_new['Actual_Hours'].apply(lambda x: x if x > 0 else 0)

# Date handling: convert EVENTDATE (e.g. "19 Sep 2023") to DD/MM/YYYY.
# Note: %b expects three-letter month abbreviations ("Sep"); values like "Sept" may be
# coerced to NaT, so normalise such abbreviations first if they appear in the data.
df_new['EVENTDATE'] = pd.to_datetime(df_new['EVENTDATE'], format='%d %b %Y', errors='coerce')
df_new['EVENTDATE'] = df_new['EVENTDATE'].dt.strftime('%d/%m/%Y')

# Now df_new is fully processed.
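# --- Illustration only (not part of the original pipeline): a quick sanity check on the
# --- processed frame, assuming the column names produced above.
summary = (
    df_new.groupby(['PEOPLENAME', 'PEOPLESURNAME', 'Activity_Category'])['Actual_Hours']
    .sum()
    .reset_index()
    .sort_values('Actual_Hours', ascending=False)
)
print(summary.head(10))                  # top tutor/category combinations by total hours
print(df_new['EVENTDATE'].isna().sum())  # count of rows whose dates failed to parse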