C04
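The functions below call four helpers (safe_read_excel, clean_qualification_number, validate_processed_data, standardize_names) that are referenced but not defined in this paste. The following is a minimal sketch of what they might look like: the signatures are inferred from the call sites, and the bodies (especially the cleaning rules) are illustrative assumptions, not the original implementations.

# Hypothetical helper sketches; signatures inferred from how they are called below,
# bodies are assumptions only.
import logging
import re
from typing import List, Optional, Tuple

import pandas as pd


def safe_read_excel(path: str) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
    """Read an Excel file, returning (DataFrame, None) on success or (None, error message) on failure."""
    try:
        return pd.read_excel(path), None
    except Exception as exc:
        return None, str(exc)


def clean_qualification_number(value) -> Optional[str]:
    """Normalise a qualification number: drop whitespace and slashes, upper-case (assumed rule)."""
    if pd.isna(value):
        return None
    return re.sub(r'[\s/]', '', str(value)).upper()


def validate_processed_data(df: pd.DataFrame, required_columns: List[str], name: str) -> None:
    """Raise ValueError if required columns are missing or the frame is empty (assumed checks)."""
    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        raise ValueError(f"{name}: missing columns {missing}")
    if df.empty:
        raise ValueError(f"{name}: no rows found")


def standardize_names(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
    """Trim and title-case the given name columns (assumed cleaning rule)."""
    for col in columns:
        df[col] = df[col].fillna('').astype(str).str.strip().str.title()
    return df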
import logging
from typing import Dict

import pandas as pd


def process_qualification_lists(
    approved_file: str,
    archived_file: str
) -> pd.DataFrame:
    """
    Process approved and archived qualification lists

    Args:
        approved_file: Path to approved qualifications file
        archived_file: Path to archived qualifications file

    Returns:
        DataFrame with combined qualification reference data
    """
    logging.info("Processing qualification lists...")

    required_columns = ['QualificationName', 'QualificationNumber']

    # Read approved list
    approved_df, error = safe_read_excel(approved_file)
    if error:
        logging.error(f"Failed to read approved qualifications: {error}")
        return pd.DataFrame(columns=required_columns + ['Source'])
    approved_df = approved_df[required_columns].copy()
    approved_df['Source'] = 'Approved'

    # Read archived list
    archived_df, error = safe_read_excel(archived_file)
    if error:
        logging.error(f"Failed to read archived qualifications: {error}")
        return pd.DataFrame(columns=required_columns + ['Source'])
    archived_df = archived_df[required_columns].copy()
    archived_df['Source'] = 'Archived'

    # Validate data
    try:
        validate_processed_data(approved_df, required_columns, 'approved_qualifications')
        validate_processed_data(archived_df, required_columns, 'archived_qualifications')
    except ValueError as e:
        logging.error(f"Validation failed: {str(e)}")
        return pd.DataFrame(columns=required_columns + ['Source'])

    # Combine lists and clean qualification numbers
    qual_df = pd.concat([approved_df, archived_df], ignore_index=True)
    qual_df['QualificationNumber'] = qual_df['QualificationNumber'].apply(clean_qualification_number)

    # Check for duplicates
    duplicates = qual_df.groupby('QualificationNumber').size()
    duplicate_quals = duplicates[duplicates > 1]
    if len(duplicate_quals) > 0:
        logging.warning(f"Found {len(duplicate_quals)} duplicate qualification numbers")
        for qual_num in duplicate_quals.index:
            dupes = qual_df[qual_df['QualificationNumber'] == qual_num]
            logging.warning(f"Duplicate entries for {qual_num}:")
            for _, row in dupes.iterrows():
                logging.warning(f" {row['Source']}: {row['QualificationName']}")

    # Remove duplicates, keeping the 'Approved' version if it exists
    # ('Approved' sorts before 'Archived', so an ascending sort puts it first)
    qual_df = qual_df.sort_values('Source').drop_duplicates(
        'QualificationNumber', keep='first'
    )

    # Log summary statistics
    logging.info(f"Processed {len(qual_df)} unique qualifications:")
    logging.info(f" - Approved: {len(qual_df[qual_df['Source'] == 'Approved'])}")
    logging.info(f" - Archived: {len(qual_df[qual_df['Source'] == 'Archived'])}")

    return qual_df


def process_terms_caseloading(
    terms_file: str,
    qualification_df: pd.DataFrame,
    contract_history: pd.DataFrame
) -> Dict[str, pd.DataFrame]:
    """
    Process Terms caseloading data and link with qualification information

    Args:
        terms_file: Path to Terms caseloading file
        qualification_df: Qualification reference DataFrame
        contract_history: Contract history DataFrame

    Returns:
        Dictionary of processed DataFrames
    """
    logging.info("Processing Terms caseloading data...")

    # Read Terms caseloading data with error handling
    terms_df, error = safe_read_excel(terms_file)
    if error:
        logging.error(f"Failed to read Terms caseloading data: {error}")
        raise ValueError(f"Cannot process Terms caseloading data: {error}")

    # Select and rename relevant columns
    cols_to_keep = {
        'Primary Staff Name': 'Staff_First_Name',
        'Primary Staff Surname': 'Staff_Last_Name',
        'Start Date': 'Start_Date',
        'End Date': 'End_Date',
        'Activity Name': 'Activity_Name',
        'Funding Reference': 'Qualification_Number',
        'Level': 'Level',
        'Activity Group': 'Activity_Group',
        'Name': 'Learner_First_Name',
        'Surname': 'Learner_Last_Name'
    }
    try:
        terms_df = terms_df[list(cols_to_keep.keys())].copy()
        terms_df.rename(columns=cols_to_keep, inplace=True)
    except KeyError as e:
        logging.error(f"Missing required columns: {str(e)}")
        raise ValueError(f"Required columns missing from Terms caseloading data: {str(e)}")

    # Clean qualification numbers
    terms_df['Qualification_Number'] = terms_df['Qualification_Number'].apply(clean_qualification_number)

    # Create staff standard name format and clean names
    name_columns = ['Staff_First_Name', 'Staff_Last_Name']
    terms_df = standardize_names(terms_df, name_columns)
    terms_df['Staff_Standard'] = terms_df.apply(
        lambda x: f"{x['Staff_First_Name']} {x['Staff_Last_Name']}",
        axis=1
    )

    # Convert dates
    date_columns = ['Start_Date', 'End_Date']
    for col in date_columns:
        terms_df[col] = pd.to_datetime(terms_df[col], errors='coerce')
        invalid_dates = terms_df[col].isna().sum()
        if invalid_dates > 0:
            logging.warning(f"Found {invalid_dates} invalid dates in {col}")

    # Look up qualification names with detailed logging
    def get_qualification_name(row):
        if pd.isna(row['Qualification_Number']):
            logging.debug(f"No qualification number for activity: {row['Activity_Name']}")
            return row['Activity_Name']

        qual_match = qualification_df[
            qualification_df['QualificationNumber'] == row['Qualification_Number']
        ]
        if len(qual_match) > 0:
            source = qual_match.iloc[0]['Source']
            qual_name = qual_match.iloc[0]['QualificationName']
            logging.debug(f"Found qualification {row['Qualification_Number']} in {source} list: {qual_name}")
            return qual_name

        logging.debug(f"No match found for qualification {row['Qualification_Number']}, using activity name: {row['Activity_Name']}")
        return row['Activity_Name']

    terms_df['Qualification_Name'] = terms_df.apply(get_qualification_name, axis=1)

    # Add contract info based on start date
    def get_contract_info(row):
        if pd.isna(row['Start_Date']):
            return pd.Series({
                'Staff_Position': None,
                'Staff_Contract': None,
                'Staff_Target_Hours': None,
                'Multiple_Contracts': False
            })

        staff_contracts = contract_history[
            contract_history['Staff_Name'] == row['Staff_Standard']
        ]
        relevant_contracts = staff_contracts[
            (staff_contracts['Start_Date'] <= row['Start_Date']) &
            (staff_contracts['End_Date'] >= row['Start_Date'])
        ]

        if len(relevant_contracts) > 0:
            # Handle multiple contracts
            positions = ', '.join(relevant_contracts['Position'].unique())
            contract_types = ', '.join(relevant_contracts['Contract_Type'].unique())
            total_target = relevant_contracts['Target_Hours'].sum()
            return pd.Series({
                'Staff_Position': positions,
                'Staff_Contract': contract_types,
                'Staff_Target_Hours': total_target,
                'Multiple_Contracts': len(relevant_contracts) > 1
            })

        return pd.Series({
            'Staff_Position': None,
            'Staff_Contract': None,
            'Staff_Target_Hours': None,
            'Multiple_Contracts': False
        })

    # Add contract details
    logging.info("Adding contract information...")
    contract_details = terms_df.apply(get_contract_info, axis=1)
    terms_df = pd.concat([terms_df, contract_details], axis=1)

    # Create monthly learner counts
    logging.info("Calculating monthly learner counts...")
    monthly_data = []
    dates = pd.date_range(start='2023-04-01', end='2024-03-31', freq='M')  # month-end dates, Apr 2023 - Mar 2024

    for date in dates:
        month_start = date.replace(day=1)
        month_end = date

        # Get active learners for this month
        month_mask = (
            (terms_df['Start_Date'] <= month_end) &
            ((terms_df['End_Date'].isna()) | (terms_df['End_Date'] >= month_start))
        )
        month_data = terms_df[month_mask].copy()

        if not month_data.empty:
            month_data['Year'] = date.year
            month_data['Month'] = date.month
            monthly_data.append(month_data)
            logging.debug(
                f"Month {date.strftime('%Y-%m')}: {len(month_data)} active learners"
            )

    monthly_df = pd.concat(monthly_data, ignore_index=True) if monthly_data else pd.DataFrame()

    if monthly_df.empty:
        logging.warning("No monthly data generated!")
        return {}

    # Create summary dataframes
    logging.info("Creating summary dataframes...")

    # Learner summary
    learner_summary = monthly_df.groupby(
        ['Staff_Standard', 'Year', 'Month']
    ).agg({
        'Learner_First_Name': 'count',
        'Staff_Position': 'first',
        'Staff_Contract': 'first',
        'Staff_Target_Hours': 'first',
        'Multiple_Contracts': 'first'
    }).reset_index()
    learner_summary.rename(columns={
        'Learner_First_Name': 'Total_Learners'
    }, inplace=True)

    # Qualification summary
    qual_summary = monthly_df.groupby(
        ['Staff_Standard', 'Year', 'Month', 'Qualification_Name', 'Level',
         'Staff_Position', 'Staff_Contract', 'Activity_Group']
    ).agg({
        'Learner_First_Name': 'count',
        'Staff_Target_Hours': 'first',
        'Multiple_Contracts': 'first'
    }).reset_index()
    qual_summary.rename(columns={
        'Learner_First_Name': 'Learners_In_Qualification'
    }, inplace=True)

    # Level summary
    level_summary = monthly_df.groupby(
        ['Staff_Standard', 'Year', 'Month', 'Level']
    ).agg({
        'Learner_First_Name': 'count',
        'Multiple_Contracts': 'first'
    }).reset_index()
    level_summary.rename(columns={
        'Learner_First_Name': 'Learners_In_Level'
    }, inplace=True)

    # Activity group summary
    activity_summary = monthly_df.groupby(
        ['Staff_Standard', 'Year', 'Month', 'Activity_Group']
    ).agg({
        'Learner_First_Name': 'count',
        'Multiple_Contracts': 'first'
    }).reset_index()
    activity_summary.rename(columns={
        'Learner_First_Name': 'Learners_In_Activity'
    }, inplace=True)

    logging.info("Terms caseloading processing completed!")

    return {
        'terms_detailed': monthly_df,
        'learner_summary': learner_summary,
        'qualification_summary': qual_summary,
        'level_summary': level_summary,
        'activity_summary': activity_summary,
        'qualification_reference': qualification_df
    }
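For context, here is a hedged sketch of how the two functions might be wired together. The file paths, the contract-history loading step, and the output workbook name are placeholders introduced for illustration; they are not part of the original script, which may be driven differently.

# Illustrative driver only; all paths below are placeholder assumptions.
import logging

import pandas as pd

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    # Build the qualification reference from the two register extracts
    qual_ref = process_qualification_lists(
        approved_file='approved_qualifications.xlsx',   # placeholder path
        archived_file='archived_qualifications.xlsx'    # placeholder path
    )

    # Contract history is assumed to be prepared elsewhere with the columns used above:
    # Staff_Name, Start_Date, End_Date, Position, Contract_Type, Target_Hours
    contract_history = pd.read_excel('contract_history.xlsx')  # placeholder path
    contract_history[['Start_Date', 'End_Date']] = contract_history[
        ['Start_Date', 'End_Date']
    ].apply(pd.to_datetime)  # dates must be datetimes for the contract-matching comparisons

    outputs = process_terms_caseloading(
        terms_file='terms_caseloading.xlsx',  # placeholder path
        qualification_df=qual_ref,
        contract_history=contract_history
    )

    # Write each returned DataFrame to its own sheet of one workbook
    with pd.ExcelWriter('caseloading_summaries.xlsx') as writer:  # placeholder output path
        for sheet_name, df in outputs.items():
            df.to_excel(writer, sheet_name=sheet_name[:31], index=False)  # Excel caps sheet names at 31 chars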