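# Imports inferred from the code below (the paste does not include them).
# The helpers safe_read_excel, clean_qualification_number,
# validate_processed_data and standardize_names are called but not defined
# here; they are assumed to live elsewhere in the same module.
import logging
from typing import Dict

import pandas as pd

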
def process_qualification_lists(
approved_file: str,
archived_file: str
) -> pd.DataFrame:
"""
Process approved and archived qualification lists
Args:
approved_file: Path to approved qualifications file
archived_file: Path to archived qualifications file
Returns:
DataFrame with combined qualification reference data
"""
logging.info("Processing qualification lists...")
required_columns = ['QualificationName', 'QualificationNumber']
# Read approved list
approved_df, error = safe_read_excel(approved_file)
if error:
logging.error(f"Failed to read approved qualifications: {error}")
return pd.DataFrame(columns=required_columns + ['Source'])
approved_df = approved_df[required_columns].copy()
approved_df['Source'] = 'Approved'
# Read archived list
archived_df, error = safe_read_excel(archived_file)
if error:
logging.error(f"Failed to read archived qualifications: {error}")
return pd.DataFrame(columns=required_columns + ['Source'])
archived_df = archived_df[required_columns].copy()
archived_df['Source'] = 'Archived'
# Validate data
try:
validate_processed_data(approved_df, required_columns, 'approved_qualifications')
validate_processed_data(archived_df, required_columns, 'archived_qualifications')
except ValueError as e:
logging.error(f"Validation failed: {str(e)}")
return pd.DataFrame(columns=required_columns + ['Source'])
# Combine lists and clean qualification numbers
qual_df = pd.concat([approved_df, archived_df], ignore_index=True)
qual_df['QualificationNumber'] = qual_df['QualificationNumber'].apply(clean_qualification_number)
# Check for duplicates
duplicates = qual_df.groupby('QualificationNumber').size()
duplicate_quals = duplicates[duplicates > 1]
if len(duplicate_quals) > 0:
logging.warning(f"Found {len(duplicate_quals)} duplicate qualification numbers")
for qual_num in duplicate_quals.index:
dupes = qual_df[qual_df['QualificationNumber'] == qual_num]
logging.warning(f"Duplicate entries for {qual_num}:")
for _, row in dupes.iterrows():
logging.warning(f" {row['Source']}: {row['QualificationName']}")
    # Remove duplicates, keeping the 'Approved' version if one exists
    # ('Approved' sorts before 'Archived', so an ascending sort followed by
    # keep='first' prefers the approved entry)
    qual_df = qual_df.sort_values('Source').drop_duplicates(
        'QualificationNumber', keep='first'
    )
# Log summary statistics
logging.info(f"Processed {len(qual_df)} unique qualifications:")
logging.info(f" - Approved: {len(qual_df[qual_df['Source'] == 'Approved'])}")
logging.info(f" - Archived: {len(qual_df[qual_df['Source'] == 'Archived'])}")
return qual_df
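

# The frame returned above maps each cleaned qualification number to its name
# and its source list ('Approved' or 'Archived'), keeping the approved entry
# when a number appears in both lists. A lookup sketch (the number shown is a
# made-up placeholder):
#
#     match = qual_ref[qual_ref['QualificationNumber'] == 'some-cleaned-number']
#     name = match.iloc[0]['QualificationName'] if not match.empty else None

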
def process_terms_caseloading(
terms_file: str,
qualification_df: pd.DataFrame,
contract_history: pd.DataFrame
) -> Dict[str, pd.DataFrame]:
"""
Process Terms caseloading data and link with qualification information
Args:
terms_file: Path to Terms caseloading file
qualification_df: Qualification reference DataFrame
contract_history: Contract history DataFrame
Returns:
Dictionary of processed DataFrames
"""
logging.info("Processing Terms caseloading data...")
# Read Terms caseloading data with error handling
terms_df, error = safe_read_excel(terms_file)
if error:
logging.error(f"Failed to read Terms caseloading data: {error}")
raise ValueError(f"Cannot process Terms caseloading data: {error}")
# Select and rename relevant columns
cols_to_keep = {
'Primary Staff Name': 'Staff_First_Name',
'Primary Staff Surname': 'Staff_Last_Name',
'Start Date': 'Start_Date',
'End Date': 'End_Date',
'Activity Name': 'Activity_Name',
'Funding Reference': 'Qualification_Number',
'Level': 'Level',
'Activity Group': 'Activity_Group',
'Name': 'Learner_First_Name',
'Surname': 'Learner_Last_Name'
}
try:
        terms_df = terms_df[list(cols_to_keep)].copy()
terms_df.rename(columns=cols_to_keep, inplace=True)
except KeyError as e:
logging.error(f"Missing required columns: {str(e)}")
raise ValueError(f"Required columns missing from Terms caseloading data: {str(e)}")
# Clean qualification numbers
terms_df['Qualification_Number'] = terms_df['Qualification_Number'].apply(clean_qualification_number)
# Create staff standard name format and clean names
name_columns = ['Staff_First_Name', 'Staff_Last_Name']
terms_df = standardize_names(terms_df, name_columns)
terms_df['Staff_Standard'] = terms_df.apply(
lambda x: f"{x['Staff_First_Name']} {x['Staff_Last_Name']}", axis=1
)
# Convert dates
date_columns = ['Start_Date', 'End_Date']
for col in date_columns:
terms_df[col] = pd.to_datetime(terms_df[col], errors='coerce')
invalid_dates = terms_df[col].isna().sum()
if invalid_dates > 0:
logging.warning(f"Found {invalid_dates} invalid dates in {col}")
# Look up qualification names with detailed logging
def get_qualification_name(row):
if pd.isna(row['Qualification_Number']):
logging.debug(f"No qualification number for activity: {row['Activity_Name']}")
return row['Activity_Name']
qual_match = qualification_df[
qualification_df['QualificationNumber'] == row['Qualification_Number']
]
if len(qual_match) > 0:
source = qual_match.iloc[0]['Source']
qual_name = qual_match.iloc[0]['QualificationName']
logging.debug(f"Found qualification {row['Qualification_Number']} in {source} list: {qual_name}")
return qual_name
logging.debug(f"No match found for qualification {row['Qualification_Number']}, using activity name: {row['Activity_Name']}")
return row['Activity_Name']
terms_df['Qualification_Name'] = terms_df.apply(get_qualification_name, axis=1)
# Add contract info based on start date
    def get_contract_info(row):
        """Look up the contract(s) active on the learner's start date."""
        no_contract = pd.Series({
            'Staff_Position': None,
            'Staff_Contract': None,
            'Staff_Target_Hours': None,
            'Multiple_Contracts': False
        })
        if pd.isna(row['Start_Date']):
            return no_contract
        staff_contracts = contract_history[
            contract_history['Staff_Name'] == row['Staff_Standard']
        ]
        # Contracts active on the learner's start date; any contract rows with
        # a missing End_Date would be excluded by this comparison
        relevant_contracts = staff_contracts[
            (staff_contracts['Start_Date'] <= row['Start_Date']) &
            (staff_contracts['End_Date'] >= row['Start_Date'])
        ]
        if len(relevant_contracts) > 0:
            # Combine details when more than one contract is active
            positions = ', '.join(relevant_contracts['Position'].unique())
            contract_types = ', '.join(relevant_contracts['Contract_Type'].unique())
            total_target = relevant_contracts['Target_Hours'].sum()
            return pd.Series({
                'Staff_Position': positions,
                'Staff_Contract': contract_types,
                'Staff_Target_Hours': total_target,
                'Multiple_Contracts': len(relevant_contracts) > 1
            })
        return no_contract
# Add contract details
logging.info("Adding contract information...")
contract_details = terms_df.apply(get_contract_info, axis=1)
terms_df = pd.concat([terms_df, contract_details], axis=1)
# Create monthly learner counts
logging.info("Calculating monthly learner counts...")
monthly_data = []
    # Month-end dates for each month of the 2023/24 reporting year
    # (freq='M' yields month-end timestamps; newer pandas spells this 'ME')
    dates = pd.date_range(start='2023-04-01', end='2024-03-31', freq='M')
for date in dates:
month_start = date.replace(day=1)
month_end = date
# Get active learners for this month
month_mask = (
(terms_df['Start_Date'] <= month_end) &
((terms_df['End_Date'].isna()) | (terms_df['End_Date'] >= month_start))
)
month_data = terms_df[month_mask].copy()
if not month_data.empty:
month_data['Year'] = date.year
month_data['Month'] = date.month
monthly_data.append(month_data)
logging.debug(
f"Month {date.strftime('%Y-%m')}: {len(month_data)} active learners"
)
monthly_df = pd.concat(monthly_data, ignore_index=True) if monthly_data else pd.DataFrame()
if monthly_df.empty:
logging.warning("No monthly data generated!")
return {}
# Create summary dataframes
logging.info("Creating summary dataframes...")
# Learner summary
learner_summary = monthly_df.groupby(
['Staff_Standard', 'Year', 'Month']
).agg({
'Learner_First_Name': 'count',
'Staff_Position': 'first',
'Staff_Contract': 'first',
'Staff_Target_Hours': 'first',
'Multiple_Contracts': 'first'
}).reset_index()
learner_summary.rename(columns={
'Learner_First_Name': 'Total_Learners'
}, inplace=True)
# Qualification summary
qual_summary = monthly_df.groupby(
['Staff_Standard', 'Year', 'Month', 'Qualification_Name',
'Level', 'Staff_Position', 'Staff_Contract', 'Activity_Group']
).agg({
'Learner_First_Name': 'count',
'Staff_Target_Hours': 'first',
'Multiple_Contracts': 'first'
}).reset_index()
qual_summary.rename(columns={
'Learner_First_Name': 'Learners_In_Qualification'
}, inplace=True)
# Level summary
level_summary = monthly_df.groupby(
['Staff_Standard', 'Year', 'Month', 'Level']
).agg({
'Learner_First_Name': 'count',
'Multiple_Contracts': 'first'
}).reset_index()
level_summary.rename(columns={
'Learner_First_Name': 'Learners_In_Level'
}, inplace=True)
# Activity group summary
activity_summary = monthly_df.groupby(
['Staff_Standard', 'Year', 'Month', 'Activity_Group']
).agg({
'Learner_First_Name': 'count',
'Multiple_Contracts': 'first'
}).reset_index()
activity_summary.rename(columns={
'Learner_First_Name': 'Learners_In_Activity'
}, inplace=True)
logging.info("Terms caseloading processing completed!")
return {
'terms_detailed': monthly_df,
'learner_summary': learner_summary,
'qualification_summary': qual_summary,
'level_summary': level_summary,
'activity_summary': activity_summary,
'qualification_reference': qualification_df
    }
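

# Minimal driver sketch showing how the two functions above fit together.
# This is not part of the original paste: the file paths are illustrative
# placeholders, and contract_history is assumed to already contain
# Staff_Name, Start_Date, End_Date, Position, Contract_Type and Target_Hours
# columns with parsed dates, produced by a step not shown here.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    qual_ref = process_qualification_lists(
        approved_file="approved_qualifications.xlsx",
        archived_file="archived_qualifications.xlsx",
    )

    # Placeholder source for the contract history (see assumptions above)
    contract_history = pd.read_excel("contract_history.xlsx")

    outputs = process_terms_caseloading(
        terms_file="terms_caseloading.xlsx",
        qualification_df=qual_ref,
        contract_history=contract_history,
    )

    # Write each returned DataFrame to its own sheet of a single workbook
    if outputs:
        with pd.ExcelWriter("caseloading_summaries.xlsx") as writer:
            for sheet_name, df in outputs.items():
                df.to_excel(writer, sheet_name=sheet_name[:31], index=False)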