# Cell 4: PICS Data Processing and Main Execution

# Imports used below (repeated here in case earlier cells were not run).
import datetime
import logging
from typing import Dict

import pandas as pd

# Helpers `safe_read_excel`, `validate_processed_data` and `clean_name` are
# assumed to be defined in earlier cells.

def process_pics_hours_lookup(hours_df: pd.DataFrame) -> pd.DataFrame:
    """Process the PICS hours lookup data"""
    hours_lookup = hours_df[[
        'Standard title', 'Weighted Monthly Hours (1.6)'
    ]].copy()
    
    def parse_weighted_hours(time_str):
        """Convert 'HH:MM' strings, time objects, or plain numbers to decimal hours."""
        if pd.isna(time_str):
            return 0.0
        try:
            if isinstance(time_str, str) and ':' in time_str:
                hours, minutes = map(int, time_str.split(':')[:2])
                return hours + (minutes / 60)
            elif isinstance(time_str, (datetime.time, pd.Timestamp)):
                return time_str.hour + (time_str.minute / 60)
            return float(time_str)
        except Exception as e:
            logging.warning(f"Error parsing weighted hours '{time_str}': {e}")
            return 0.0

    hours_lookup['Monthly_Hours'] = hours_lookup['Weighted Monthly Hours (1.6)'].apply(parse_weighted_hours)
    hours_lookup['Standard_Title'] = hours_lookup['Standard title'].str.strip()

    # Log zero hours entries
    zero_hours = hours_lookup[hours_lookup['Monthly_Hours'] == 0]
    if not zero_hours.empty:
        logging.warning(f"Found {len(zero_hours)} standards with zero hours:")
        for _, row in zero_hours.iterrows():
            logging.warning(f"  - {row['Standard_Title']}: {row['Weighted Monthly Hours (1.6)']}")

    return hours_lookup[['Standard_Title', 'Monthly_Hours']]
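
# A minimal usage sketch for the lookup above (the tiny frame is an assumption
# for illustration; the real workbook is read inside process_pics_data):
# >>> demo = pd.DataFrame({
# ...     'Standard title': ['Data Technician '],
# ...     'Weighted Monthly Hours (1.6)': ['7:30'],
# ... })
# >>> process_pics_hours_lookup(demo)
#     Standard_Title  Monthly_Hours
# 0  Data Technician            7.5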

def process_pics_data(
    caseload_file: str,
    hours_lookup_file: str,
    contract_history: pd.DataFrame
) -> Dict[str, pd.DataFrame]:
    """
    Process PICS caseload data and hours lookup
    """
    logging.info("Processing PICS data...")
    
    # Read hours lookup first
    logging.info("Reading hours lookup data...")
    hours_df, error = safe_read_excel(hours_lookup_file, skiprows=2)
    if error:
        logging.error(f"Failed to read hours lookup: {error}")
        raise ValueError(f"Cannot process PICS data: {error}")
        
    # Process hours lookup
    hours_lookup = process_pics_hours_lookup(hours_df)
    logging.info(f"Processed {len(hours_lookup)} standard titles with hours")
    
    # Read and validate caseload data
    logging.info("Reading PICS caseload data...")
    caseload_df, error = safe_read_excel(caseload_file)
    if error:
        logging.error(f"Failed to read caseload data: {error}")
        raise ValueError(f"Cannot process PICS data: {error}")
        
    required_columns = [
        'Assessor Full Name', 'Programme', 'Apprenticeship Standard Title',
        'Apprenticeship Achieved Date', 'Start Date', 'Learning Expected End',
        'Actual End'
    ]
    
    # Validate and select columns
    try:
        validate_processed_data(caseload_df, required_columns, 'PICS_caseload')
        df = caseload_df[required_columns].copy()
    except ValueError as e:
        logging.error(f"Validation failed: {str(e)}")
        raise
    
    # Rename columns for consistency
    column_mapping = {
        'Assessor Full Name': 'Assessor_Name',
        'Programme': 'Programme_Level',
        'Apprenticeship Standard Title': 'Standard_Title',
        'Apprenticeship Achieved Date': 'Achieved_Date',
        'Start Date': 'Start_Date',
        'Learning Expected End': 'Expected_End',
        'Actual End': 'Actual_End'
    }
    df.rename(columns=column_mapping, inplace=True)
    
    # Clean assessor names
    df['Assessor_Name'] = df['Assessor_Name'].apply(clean_name)
    
    # Convert dates with validation and proper error handling
    date_columns = ['Achieved_Date', 'Start_Date', 'Expected_End', 'Actual_End']
    for col in date_columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')
        invalid_dates = df[col].isna()
        invalid_count = invalid_dates.sum()
        if invalid_count > 0:
            logging.warning(f"Found {invalid_count} invalid dates in {col}")
            if col not in ['Achieved_Date', 'Actual_End']:  # These can legitimately be null
                logging.warning("Sample of rows with invalid dates:")
                sample_invalid = df[invalid_dates].head()
                for _, row in sample_invalid.iterrows():
                    logging.warning(f"Invalid {col} for {row['Assessor_Name']}: {row['Standard_Title']}")
    
    # Clean titles and match with hours lookup
    df['Standard_Title'] = df['Standard_Title'].str.strip()
    
    # Add hours from lookup with validation
    df = df.merge(
        hours_lookup,
        on='Standard_Title',
        how='left',
        validate='m:1'
    )
    
    # Check for unmatched standards
    unmatched = df[df['Monthly_Hours'].isna()]['Standard_Title'].unique()
    if len(unmatched) > 0:
        logging.warning(f"Found {len(unmatched)} standards without matching hours:")
        for title in unmatched:
            logging.warning(f"  - {title}")
    
    df['Monthly_Hours'] = df['Monthly_Hours'].fillna(0)
    
    # Create monthly snapshots with improved date handling
    monthly_data = []
    # Month-end snapshot dates, April 2023 to March 2024
    # (note: freq='M' is deprecated in favour of 'ME' on pandas >= 2.2)
    date_range = pd.date_range(start='2023-04-01', end='2024-03-31', freq='M')
    
    logging.info("Generating monthly snapshots...")
    for date in date_range:
        month_start = date.replace(day=1)
        month_end = date
        
        logging.info(f"Processing month: {month_start.strftime('%B %Y')}")
        
        # Active = started on or before month end, and not finished before the
        # month started (a missing Actual_End or Expected_End counts as ongoing)
        month_mask = (
            (df['Start_Date'].notna() & (df['Start_Date'] <= month_end)) &
            ((df['Actual_End'].isna() | (df['Actual_End'] >= month_start)) |
             (df['Expected_End'].isna() | (df['Expected_End'] >= month_start)))
        )
        
        month_data = df[month_mask].copy()
        
        if not month_data.empty:
            month_data['Snapshot_Date'] = month_end
            month_data['Year'] = month_end.year
            month_data['Month'] = month_end.month
            
            # Add contract info for this month; defined inside the loop so it
            # closes over the current month_start/month_end
            def get_assessor_contract(row):
                staff_contracts = contract_history[
                    contract_history['Staff_Name'] == row['Assessor_Name']
                ]
                
                relevant_contracts = staff_contracts[
                    (staff_contracts['Start_Date'] <= month_end) &
                    (staff_contracts['End_Date'] >= month_start)
                ]
                
                if len(relevant_contracts) > 0:
                    positions = relevant_contracts['Position'].unique()
                    contract_types = relevant_contracts['Contract_Type'].unique()
                    
                    # Calculate total target hours only for salaried contracts
                    total_target = relevant_contracts[
                        relevant_contracts['Contract_Type'].str.contains('Salaried', na=False)
                    ]['Target_Hours'].sum()
                    
                    return pd.Series({
                        'Assessor_Position': ' & '.join(positions),
                        'Assessor_Contract': ' & '.join(contract_types),
                        'Assessor_Target_Hours': total_target if total_target > 0 else None,
                        'Multiple_Contracts': len(relevant_contracts) > 1
                    })
                
                return pd.Series({
                    'Assessor_Position': None,
                    'Assessor_Contract': None,
                    'Assessor_Target_Hours': None,
                    'Multiple_Contracts': False
                })
            
            # Add contract details for the month
            contract_details = month_data.apply(get_assessor_contract, axis=1)
            month_data = pd.concat([month_data, contract_details], axis=1)
            
            monthly_data.append(month_data)
            logging.info(f"Active students in {month_start.strftime('%B %Y')}: {len(month_data)}")
    
    # Combine monthly data with empty DataFrame handling
    monthly_df = pd.concat(monthly_data, ignore_index=True) if monthly_data else pd.DataFrame()
    
    if monthly_df.empty:
        logging.warning("No monthly data generated!")
        return {}
    
    # Create summaries with proper grouping
    logging.info("Creating summary views...")
    
    # Monthly summary per assessor
    monthly_summary = monthly_df.groupby(
        ['Assessor_Name', 'Year', 'Month', 'Assessor_Position',
         'Assessor_Contract', 'Multiple_Contracts'],
        dropna=False  # keep assessors whose contract fields are None
    ).agg({
        'Standard_Title': 'count',  # one row per active learner record
        'Monthly_Hours': 'sum',
        'Assessor_Target_Hours': 'first'
    }).reset_index()
    
    monthly_summary.rename(columns={
        'Standard_Title': 'Active_Students',
        'Monthly_Hours': 'Required_Hours'
    }, inplace=True)
    
    # Programme level summary
    programme_summary = monthly_df.groupby(
        ['Assessor_Name', 'Programme_Level', 'Year', 'Month']
    ).agg({
        'Standard_Title': 'count',  # one row per learner in the programme
        'Assessor_Position': 'first',
        'Assessor_Contract': 'first',
        'Multiple_Contracts': 'first',
        'Assessor_Target_Hours': 'first'
    }).reset_index()
    
    programme_summary.rename(columns={
        'Standard_Title': 'Students_In_Programme'
    }, inplace=True)
    
    # Standard title summary
    # Named aggregation: 'Assessor_Name' is a groupby key, so it cannot also be
    # aggregated through a dict (that raises KeyError on modern pandas); count
    # rows via another column instead
    standard_summary = monthly_df.groupby(
        ['Assessor_Name', 'Standard_Title', 'Year', 'Month']
    ).agg(
        Required_Hours=('Monthly_Hours', 'sum'),
        Students_In_Standard=('Monthly_Hours', 'count'),
        Assessor_Position=('Assessor_Position', 'first'),
        Assessor_Contract=('Assessor_Contract', 'first'),
        Multiple_Contracts=('Multiple_Contracts', 'first'),
        Assessor_Target_Hours=('Assessor_Target_Hours', 'first')
    ).reset_index()
    
    logging.info("PICS data processing completed!")
    
    return {
        'detailed_monthly': monthly_df,
        'monthly_summary': monthly_summary,
        'programme_summary': programme_summary,
        'standard_summary': standard_summary,
        'hours_reference': hours_lookup
    }
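
# --- Main execution: a minimal sketch. The file paths below are placeholder
# --- assumptions (not confirmed elsewhere), and `contract_history` is expected
# --- to come from the contract-processing cell earlier in the notebook.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    results = process_pics_data(
        caseload_file='pics_caseload.xlsx',          # hypothetical path
        hours_lookup_file='pics_hours_lookup.xlsx',  # hypothetical path
        contract_history=contract_history            # built in an earlier cell
    )
    for name, frame in results.items():
        logging.info(f"{name}: {len(frame)} rows")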