# --- PART 4: PICS DATA PROCESSING ---
import os

import pandas as pd


def convert_time_to_hours(time_str):
    """Convert a time value such as 'HH:MM' (or 'HH:MM:SS') to decimal hours."""
    if pd.isna(time_str):
        return 0
    try:
        parts = str(time_str).split(':')
        return int(parts[0]) + int(parts[1]) / 60
    except (ValueError, IndexError):
        return 0
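# Illustrative conversions (hypothetical values, assuming the formats noted above):
#   convert_time_to_hours("07:30")       -> 7.5
#   convert_time_to_hours("12:45")       -> 12.75
#   convert_time_to_hours(float("nan"))  -> 0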

def process_pics_data(caseload_file, hours_lookup_file):
    """Process PICS caseload data and hours lookup"""
    print("\nProcessing PICS data...")
    
    # Read hours lookup first
    print("Reading hours lookup data...")
    hours_df = pd.read_excel(hours_lookup_file, skiprows=2)  # Skip first two rows
    
    # Process hours lookup
    hours_lookup = hours_df[[
        'Standard title', 'Weighted Monthly Hours (1.6)'
    ]].copy()
    
    # Convert time format to decimal hours
    hours_lookup['Monthly_Hours'] = hours_lookup['Weighted Monthly Hours (1.6)'].apply(
        convert_time_to_hours
    )
    
    # Clean and standardize titles
    hours_lookup['Standard_Title'] = hours_lookup['Standard title'].str.strip()
    
    print(f"Processed {len(hours_lookup)} standard titles with hours")
    
    # Read caseload data
    print("\nReading PICS caseload data...")
    caseload_df = pd.read_excel(caseload_file)
    
    # Select and rename relevant columns
    cols_to_keep = {
        'Assessor Full Name': 'Assessor_Name',
        'Programme': 'Programme_Level',
        'Apprenticeship Standard Title': 'Standard_Title',
        'Apprenticeship Achieved Date': 'Achieved_Date',
        'Start Date': 'Start_Date',
        'Learning Expected End': 'Expected_End',
        'Actual End': 'Actual_End'
    }
    
    df = caseload_df[list(cols_to_keep.keys())].copy()
    df.rename(columns=cols_to_keep, inplace=True)
    
    # Convert dates
    date_columns = ['Achieved_Date', 'Start_Date', 'Expected_End', 'Actual_End']
    for col in date_columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')
    
    # Clean titles for matching with lookup
    df['Standard_Title'] = df['Standard_Title'].str.strip()
    
    # Add hours from lookup
    df = df.merge(
        hours_lookup[['Standard_Title', 'Monthly_Hours']], 
        on='Standard_Title', 
        how='left'
    )
    
    # Fill missing hours with 0
    df['Monthly_Hours'] = df['Monthly_Hours'].fillna(0)
    
    # Function to check if a student was active in a given month
    def was_active_in_month(row, check_date):
        start_date = row['Start_Date']
        end_date = row['Actual_End'] if pd.notna(row['Actual_End']) else row['Expected_End']
        
        if pd.isna(start_date) or pd.isna(end_date):
            return False
            
        return start_date <= check_date <= end_date
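    # Example (hypothetical dates): a learner with Start_Date 2023-06-15 and
    # Expected_End 2024-01-10 (no Actual_End) counts as active for the month-end
    # snapshots 2023-06-30 through 2023-12-31, but not for 2024-01-31.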
    
    # Create monthly snapshots for FY23/24
    monthly_data = []
    
    # Generate month-end snapshot dates for FY23/24
    # ('M' = month-end frequency; newer pandas versions prefer the alias 'ME')
    dates = pd.date_range(start='2023-04-01', end='2024-03-31', freq='M')
    
    print("\nGenerating monthly snapshots...")
    for date in dates:
        # Filter for active students in this month
        month_mask = df.apply(lambda row: was_active_in_month(row, date), axis=1)
        month_data = df[month_mask].copy()
        
        # Add month info
        month_data['Snapshot_Date'] = date
        month_data['Year'] = date.year
        month_data['Month'] = date.month
        
        monthly_data.append(month_data)
    
    monthly_df = pd.concat(monthly_data, ignore_index=True)
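    # Performance note (optional sketch, not used above): the per-row apply in the
    # loop could be replaced with a vectorized mask per snapshot date, assuming the
    # same column names:
    #     end = df['Actual_End'].fillna(df['Expected_End'])
    #     month_mask = (df['Start_Date'] <= date) & (date <= end)
    # Rows with missing dates compare as False (NaT), so they drop out automatically.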
    
    # Create summaries
    
    # 1. Monthly summary per assessor
    monthly_summary = monthly_df.groupby(
        ['Assessor_Name', 'Year', 'Month']
    ).agg({
        'Standard_Title': 'count',  # Number of active students
        'Monthly_Hours': 'sum'      # Total hours needed
    }).reset_index()
    
    monthly_summary.rename(columns={
        'Standard_Title': 'Active_Students'
    }, inplace=True)
    
    # 2. Programme level summary
    programme_summary = monthly_df.groupby(
        ['Assessor_Name', 'Programme_Level', 'Year', 'Month']
    ).agg({
        'Standard_Title': 'count'
    }).reset_index()
    
    programme_summary.rename(columns={
        'Standard_Title': 'Students_In_Programme'
    }, inplace=True)
    
    # 3. Standard title summary
    # Standard_Title is also a grouping key, so count rows via a non-key column
    # to avoid a name clash when reset_index() re-inserts the key columns.
    standard_summary = monthly_df.groupby(
        ['Assessor_Name', 'Standard_Title', 'Year', 'Month']
    ).agg(
        Students_In_Standard=('Monthly_Hours', 'size'),
        Required_Hours=('Monthly_Hours', 'sum')
    ).reset_index()
    
    print("PICS data processing completed!")
    
    return {
        'detailed_monthly': monthly_df,
        'monthly_summary': monthly_summary,
        'programme_summary': programme_summary,
        'standard_summary': standard_summary,
        'hours_reference': hours_lookup
    }

# Execute processing
if __name__ == "__main__":
    print("\nStarting PICS data processing...")
    
    # Define input files
    caseload_file = "PICS caseload for PBI.xlsx"
    hours_lookup_file = "PICS Hours for Assessor Look up.xlsx"
    
    try:
        # Process PICS data
        pics_data = process_pics_data(caseload_file, hours_lookup_file)
        
        # Save results
        output_folder = "processed_data"  # Using same output folder as before
        os.makedirs(output_folder, exist_ok=True)
        
        # Save all dataframes
        for name, df in pics_data.items():
            output_file = os.path.join(output_folder, f'pics_{name}.csv')
            df.to_csv(output_file, index=False)
            print(f"Saved {name} to {output_file}")
        
        print("\nPICS data processing completed successfully!")
        
    except Exception as e:
        print(f"\nError during processing: {str(e)}")
        raise
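# The function can also be imported and run from another script, e.g.
# (hypothetical module name):
#   from pics_processing import process_pics_data
#   results = process_pics_data("PICS caseload for PBI.xlsx",
#                               "PICS Hours for Assessor Look up.xlsx")
#   results['monthly_summary'].head()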