C-4
user_1718919
plain_text
4 months ago
12 kB
4
Indexable
def process_qualification_lists( approved_file: str, archived_file: str ) -> pd.DataFrame: """ Process approved and archived qualification lists """ logging.info("Processing qualification lists...") required_columns = ['QualificationName', 'QualificationNumber'] # Read both files with validation approved_df, error = safe_read_excel(approved_file) if error: logging.error(f"Failed to read approved qualifications: {error}") return pd.DataFrame(columns=required_columns + ['Source']) archived_df, error = safe_read_excel(archived_file) if error: logging.error(f"Failed to read archived qualifications: {error}") return pd.DataFrame(columns=required_columns + ['Source']) # Process approved list approved_df = approved_df[required_columns].copy() approved_df['Source'] = 'Approved' # Process archived list archived_df = archived_df[required_columns].copy() archived_df['Source'] = 'Archived' # Validate data try: validate_processed_data(approved_df, required_columns, 'approved_qualifications') validate_processed_data(archived_df, required_columns, 'archived_qualifications') except ValueError as e: logging.error(f"Validation failed: {str(e)}") return pd.DataFrame(columns=required_columns + ['Source']) # Clean qualification numbers in both datasets for df in [approved_df, archived_df]: df['QualificationNumber'] = df['QualificationNumber'].apply(clean_qualification_number) df['QualificationName'] = df['QualificationName'].str.strip() # Check for duplicates all_quals = pd.concat([approved_df, archived_df], ignore_index=True) duplicates = all_quals.groupby('QualificationNumber').size() duplicate_quals = duplicates[duplicates > 1] if len(duplicate_quals) > 0: logging.warning(f"Found {len(duplicate_quals)} duplicate qualification numbers") for qual_num in duplicate_quals.index: dupes = all_quals[all_quals['QualificationNumber'] == qual_num] logging.warning(f"Duplicate entries for {qual_num}:") for _, row in dupes.iterrows(): logging.warning(f" {row['Source']}: {row['QualificationName']}") # Remove duplicates, keeping 'Approved' version if exists qual_df = all_quals.sort_values( ['Source', 'QualificationNumber'], ascending=[False, True] # False for Source puts 'Approved' first ).drop_duplicates('QualificationNumber', keep='first') # Log summary statistics logging.info(f"Processed {len(qual_df)} unique qualifications:") logging.info(f" - Approved: {len(qual_df[qual_df['Source'] == 'Approved'])}") logging.info(f" - Archived: {len(qual_df[qual_df['Source'] == 'Archived'])}") return qual_df def process_terms_caseloading( terms_file: str, qualification_df: pd.DataFrame, contract_history: pd.DataFrame ) -> Dict[str, pd.DataFrame]: """ Process Terms caseloading data and link with qualification information """ logging.info("Processing Terms caseloading data...") # Read Terms caseloading data with error handling terms_df, error = safe_read_excel(terms_file) if error: logging.error(f"Failed to read Terms caseloading data: {error}") raise ValueError(f"Cannot process Terms caseloading data: {error}") # Select and rename relevant columns cols_to_keep = { 'Primary Staff Name': 'Staff_First_Name', 'Primary Staff Surname': 'Staff_Last_Name', 'Start Date': 'Start_Date', 'End Date': 'End_Date', 'Activity Name': 'Activity_Name', 'Funding Reference': 'Qualification_Number', 'Level': 'Level', 'Activity Group': 'Activity_Group', 'Name': 'Learner_First_Name', 'Surname': 'Learner_Last_Name' } try: terms_df = terms_df[cols_to_keep.keys()].copy() terms_df.rename(columns=cols_to_keep, inplace=True) except KeyError as e: logging.error(f"Missing required columns: {str(e)}") raise ValueError(f"Required columns missing from Terms caseloading data: {str(e)}") # Clean qualification numbers and create staff names terms_df['Qualification_Number'] = terms_df['Qualification_Number'].apply(clean_qualification_number) # Clean name columns name_columns = ['Staff_First_Name', 'Staff_Last_Name', 'Learner_First_Name', 'Learner_Last_Name'] terms_df = standardize_names(terms_df, name_columns) terms_df['Staff_Standard'] = terms_df.apply( lambda x: f"{x['Staff_First_Name']} {x['Staff_Last_Name']}", axis=1 ) # Convert dates with validation date_columns = ['Start_Date', 'End_Date'] for col in date_columns: terms_df[col] = pd.to_datetime(terms_df[col], errors='coerce') invalid_dates = terms_df[col].isna().sum() if invalid_dates > 0: logging.warning(f"Found {invalid_dates} invalid dates in {col}") logging.debug("Sample of rows with invalid dates:") sample_invalid = terms_df[terms_df[col].isna()].head() for _, row in sample_invalid.iterrows(): logging.debug(f" Staff: {row['Staff_Standard']}, Activity: {row['Activity_Name']}") # Look up qualification names def get_qualification_name(row): if pd.isna(row['Qualification_Number']): logging.debug(f"No qualification number for activity: {row['Activity_Name']}") return row['Activity_Name'] qual_match = qualification_df[ qualification_df['QualificationNumber'] == row['Qualification_Number'] ] if len(qual_match) > 0: source = qual_match.iloc[0]['Source'] qual_name = qual_match.iloc[0]['QualificationName'] logging.debug(f"Found qualification {row['Qualification_Number']} in {source} list: {qual_name}") return qual_name logging.debug(f"No match found for qualification {row['Qualification_Number']}, " f"using activity name: {row['Activity_Name']}") return row['Activity_Name'] terms_df['Qualification_Name'] = terms_df.apply(get_qualification_name, axis=1) # Add contract info based on start date with handling of multiple contracts def get_contract_info(row): if pd.isna(row['Start_Date']): return pd.Series({ 'Staff_Position': None, 'Staff_Contract': None, 'Staff_Target_Hours': None, 'Multiple_Contracts': False }) staff_contracts = contract_history[ contract_history['Staff_Name'] == row['Staff_Standard'] ] relevant_contracts = staff_contracts[ (staff_contracts['Start_Date'] <= row['Start_Date']) & (staff_contracts['End_Date'] >= row['Start_Date']) ] if len(relevant_contracts) > 0: positions = relevant_contracts['Position'].unique() contract_types = relevant_contracts['Contract_Type'].unique() total_target = relevant_contracts['Target_Hours'].sum() return pd.Series({ 'Staff_Position': ' & '.join(positions), 'Staff_Contract': ' & '.join(contract_types), 'Staff_Target_Hours': total_target, 'Multiple_Contracts': len(relevant_contracts) > 1 }) return pd.Series({ 'Staff_Position': None, 'Staff_Contract': None, 'Staff_Target_Hours': None, 'Multiple_Contracts': False }) # Add contract details logging.info("Adding contract information...") contract_details = terms_df.apply(get_contract_info, axis=1) terms_df = pd.concat([terms_df, contract_details], axis=1) # Create monthly learner counts logging.info("Calculating monthly learner counts...") monthly_data = [] dates = pd.date_range(start='2023-04-01', end='2024-03-31', freq='M') for date in dates: month_start = date.replace(day=1) month_end = date # Get active learners for this month month_mask = ( (terms_df['Start_Date'] <= month_end) & ((terms_df['End_Date'].isna()) | (terms_df['End_Date'] >= month_start)) ) month_data = terms_df[month_mask].copy() if not month_data.empty: month_data['Year'] = date.year month_data['Month'] = date.month monthly_data.append(month_data) logging.debug( f"Month {date.strftime('%Y-%m')}: {len(month_data)} active learners" ) # Combine monthly data with empty DataFrame handling monthly_df = pd.concat(monthly_data, ignore_index=True) if monthly_data else pd.DataFrame() if monthly_df.empty: logging.warning("No monthly data generated!") return {} # Create summary dataframes logging.info("Creating summary dataframes...") # Learner summary learner_summary = monthly_df.groupby( ['Staff_Standard', 'Year', 'Month'] ).agg({ 'Learner_First_Name': 'count', 'Staff_Position': 'first', 'Staff_Contract': 'first', 'Staff_Target_Hours': 'first', 'Multiple_Contracts': 'first' }).reset_index() learner_summary.rename(columns={ 'Learner_First_Name': 'Total_Learners' }, inplace=True) # Qualification summary qual_summary = monthly_df.groupby( ['Staff_Standard', 'Year', 'Month', 'Qualification_Name', 'Level', 'Staff_Position', 'Staff_Contract', 'Activity_Group'] ).agg({ 'Learner_First_Name': 'count', 'Staff_Target_Hours': 'first', 'Multiple_Contracts': 'first' }).reset_index() qual_summary.rename(columns={ 'Learner_First_Name': 'Learners_In_Qualification' }, inplace=True) # Level summary level_summary = monthly_df.groupby( ['Staff_Standard', 'Year', 'Month', 'Level'] ).agg({ 'Learner_First_Name': 'count', 'Multiple_Contracts': 'first' }).reset_index() level_summary.rename(columns={ 'Learner_First_Name': 'Learners_In_Level' }, inplace=True) # Activity group summary activity_summary = monthly_df.groupby( ['Staff_Standard', 'Year', 'Month', 'Activity_Group'] ).agg({ 'Learner_First_Name': 'count', 'Multiple_Contracts': 'first' }).reset_index() activity_summary.rename(columns={ 'Learner_First_Name': 'Learners_In_Activity' }, inplace=True) logging.info("Terms caseloading processing completed!") return { 'terms_detailed': monthly_df, 'learner_summary': learner_summary, 'qualification_summary': qual_summary, 'level_summary': level_summary, 'activity_summary': activity_summary, 'qualification_reference': qualification_df }
Editor is loading...
Leave a Comment