C12
user_1718919
plain_text
10 months ago
6.0 kB
6
Indexable
# --- PART 4: PICS DATA PROCESSING ---
def convert_time_to_hours(time_str):
"""Convert time string in format HH:MM to decimal hours"""
if pd.isna(time_str):
return 0
try:
hours, minutes = map(int, str(time_str).split(':'))
return hours + minutes/60
except:
return 0
def process_pics_data(caseload_file, hours_lookup_file):
"""Process PICS caseload data and hours lookup"""
print("\nProcessing PICS data...")
# Read hours lookup first
print("Reading hours lookup data...")
hours_df = pd.read_excel(hours_lookup_file, skiprows=2) # Skip first two rows
# Process hours lookup
hours_lookup = hours_df[[
'Standard title', 'Weighted Monthly Hours (1.6)'
]].copy()
# Convert time format to decimal hours
hours_lookup['Monthly_Hours'] = hours_lookup['Weighted Monthly Hours (1.6)'].apply(
convert_time_to_hours
)
# Clean and standardize titles
hours_lookup['Standard_Title'] = hours_lookup['Standard title'].str.strip()
print(f"Processed {len(hours_lookup)} standard titles with hours")
# Read caseload data
print("\nReading PICS caseload data...")
caseload_df = pd.read_excel(caseload_file)
# Select and rename relevant columns
cols_to_keep = {
'Assessor Full Name': 'Assessor_Name',
'Programme': 'Programme_Level',
'Apprenticeship Standard Title': 'Standard_Title',
'Apprenticeship Achieved Date': 'Achieved_Date',
'Start Date': 'Start_Date',
'Learning Expected End': 'Expected_End',
'Actual End': 'Actual_End'
}
df = caseload_df[cols_to_keep.keys()].copy()
df.rename(columns=cols_to_keep, inplace=True)
# Convert dates
date_columns = ['Achieved_Date', 'Start_Date', 'Expected_End', 'Actual_End']
for col in date_columns:
df[col] = pd.to_datetime(df[col], errors='coerce')
# Clean titles for matching with lookup
df['Standard_Title'] = df['Standard_Title'].str.strip()
# Add hours from lookup
df = df.merge(
hours_lookup[['Standard_Title', 'Monthly_Hours']],
on='Standard_Title',
how='left'
)
# Fill missing hours with 0
df['Monthly_Hours'] = df['Monthly_Hours'].fillna(0)
# Function to check if a student was active in a given month
def was_active_in_month(row, check_date):
start_date = row['Start_Date']
end_date = row['Actual_End'] if pd.notna(row['Actual_End']) else row['Expected_End']
if pd.isna(start_date) or pd.isna(end_date):
return False
return start_date <= check_date <= end_date
# Create monthly snapshots for FY23/24
monthly_data = []
# Generate dates for FY23/24
dates = pd.date_range(start='2023-04-01', end='2024-03-31', freq='M')
print("\nGenerating monthly snapshots...")
for date in dates:
# Filter for active students in this month
month_mask = df.apply(lambda row: was_active_in_month(row, date), axis=1)
month_data = df[month_mask].copy()
# Add month info
month_data['Snapshot_Date'] = date
month_data['Year'] = date.year
month_data['Month'] = date.month
monthly_data.append(month_data)
monthly_df = pd.concat(monthly_data, ignore_index=True)
# Create summaries
# 1. Monthly summary per assessor
monthly_summary = monthly_df.groupby(
['Assessor_Name', 'Year', 'Month']
).agg({
'Standard_Title': 'count', # Number of active students
'Monthly_Hours': 'sum' # Total hours needed
}).reset_index()
monthly_summary.rename(columns={
'Standard_Title': 'Active_Students'
}, inplace=True)
# 2. Programme level summary
programme_summary = monthly_df.groupby(
['Assessor_Name', 'Programme_Level', 'Year', 'Month']
).agg({
'Standard_Title': 'count'
}).reset_index()
programme_summary.rename(columns={
'Standard_Title': 'Students_In_Programme'
}, inplace=True)
# 3. Standard title summary
standard_summary = monthly_df.groupby(
['Assessor_Name', 'Standard_Title', 'Year', 'Month']
).agg({
'Standard_Title': 'count',
'Monthly_Hours': 'sum'
}).reset_index()
standard_summary.rename(columns={
'Standard_Title': 'Students_In_Standard',
'Monthly_Hours': 'Required_Hours'
}, inplace=True)
print("PICS data processing completed!")
return {
'detailed_monthly': monthly_df,
'monthly_summary': monthly_summary,
'programme_summary': programme_summary,
'standard_summary': standard_summary,
'hours_reference': hours_lookup
}
# Execute processing
if __name__ == "__main__":
print("\nStarting PICS data processing...")
# Define input files
caseload_file = "PICS caseload for PBI.xlsx"
hours_lookup_file = "PICS Hours for Assessor Look up.xlsx"
try:
# Process PICS data
pics_data = process_pics_data(caseload_file, hours_lookup_file)
# Save results
output_folder = "processed_data" # Using same output folder as before
if not os.path.exists(output_folder):
os.makedirs(output_folder)
# Save all dataframes
for name, df in pics_data.items():
output_file = os.path.join(output_folder, f'pics_{name}.csv')
df.to_csv(output_file, index=False)
print(f"Saved {name} to {output_file}")
print("\nPICS data processing completed successfully!")
except Exception as e:
print(f"\nError during processing: {str(e)}")
raiseEditor is loading...
Leave a Comment