Untitled

mail@pastecode.io avatar
unknown
plain_text
a month ago
4.4 kB
0
Indexable
Never
import pandas as pd
import pandas_gbq

def do_load(df):
    project_id = 'bmas-eu-ng-rva-uat-data'
    table_id = 'bmas-eu-ng-rva-data-dev.WEATHER_TEST.Program_Mapping'
    pandas_gbq.to_gbq(df, table_id, project_id)
    return df

def preprocess(SQL_Query, project_id):
    test = pd.read_gbq(SQL_Query, project_id=project_id)
    if 'Summary' in test.columns:
        test['Summary'] = test['Summary'].fillna('')
    print('Preprocess completed')
    return test

def do_import(processed_df):
    df = pd.read_gbq()
    return df

def do_process(finaldf, project_mappingdf):
    print(finaldf.head())

    # Ensure required columns exist
    required_columns = ['Label', 'Project name', 'Service description', 'SKU description', 'Credit type', 'Program', 'Region', 'Environment']
    for col in required_columns:
        if col not in finaldf.columns:
            finaldf[col] = ''

    # Fill NaN labels with the Project name
    finaldf.loc[finaldf['Label'].isna(), 'Label'] = finaldf['Project name']
    finaldf['Label'] = finaldf['Label'].str.replace('project:', '')

    # Initial conditions
    finaldf.loc[
        (finaldf['Project name'] == '[Charges not specific to a project]') & (finaldf['Service description'] == 'Support'),
        ['Program', 'Region', 'Environment']
    ] = ['All', '', '']

    finaldf.loc[
        (finaldf['Project name'] == '[Charges not specific to a project]') & 
        (finaldf['Service description'] != 'Support'),
        'Program'
    ] = 'All'

    finaldf.loc[
        (finaldf['Project name'] == '[Charges not specific to a project]') & 
        (finaldf['Service description'] != 'Support'),
        'Environment'
    ] = ''

    finaldf.loc[
        (finaldf['Project name'] == '[Charges not specific to a project]') & 
        (finaldf['Service description'] != 'Support'),
        'Region'
    ] = finaldf['SKU description'].apply(lambda x: 'EU' if 'EU' in str(x) else ('IND' if 'IND' in str(x) else ('AUS' if 'AUS' in str(x) else ('US' if 'US' in str(x) else ''))))

    def map_region(description):
        if 'EU' in description:
            return 'EU'
        elif 'IND' in description:
            return 'IND'
        elif 'AUS' in description:
            return 'AUS'
        elif 'US' in description:
            return 'US'
        else:
            return ''

    finaldf.loc[finaldf['Project name'] == 'bq-admin-ericsson', 'Program'] = 'All'
    finaldf.loc[finaldf['Project name'] == 'bq-admin-ericsson', 'Environment'] = ''
    finaldf.loc[finaldf['Project name'] == 'bq-admin-ericsson', 'Region'] = finaldf.loc[finaldf['Label'] == 'bq-admin-ericsson', 'SKU description'].apply(map_region)

    finaldf.loc[finaldf['Credit type'] == 'PROMOTION', 'Program'] = 'PROMOTION'

    # Ensure there are no missing Program, Region, Environment before mapping
    finaldf['Program'] = finaldf['Program'].astype(str)
    finaldf['Region'] = finaldf['Region'].astype(str)
    finaldf['Environment'] = finaldf['Environment'].astype(str)

    # Merge to fill in missing Program, Region, Environment
    merge_cols = ['Labels', 'Program', 'Region', 'Environment']
    merged_df = finaldf.merge(project_mappingdf[merge_cols], on='Label', suffixes=('', '_mapped'), how='left')

    # Update missing values with mapped values
    finaldf['Program'] = finaldf['Program'].mask(finaldf['Program'] == '', merged_df['Program_mapped'])
    finaldf['Region'] = finaldf['Region'].mask(finaldf['Region'] == '', merged_df['Region_mapped'])
    finaldf['Environment'] = finaldf['Environment'].mask(finaldf['Environment'] == '', merged_df['Environment_mapped'])

    # Saving output
    finaldf.to_csv("project_mapping.csv", index=False)
    print(finaldf.head())
    return finaldf

if __name__ == '__main__':
    finaldf = pd.read_csv(r"C:\Users\ensaani\OneDrive - Ericsson\Desktop\NGRVA Projects\GCP Billing\June data\BCSS_Cost table, 2024-06-01 — 2024-06-30.csv")
    project_mappingdf = pd.read_excel(r"C:\Users\ensaani\Downloads\BCSS, 2024-06-01 — 2024-06-30_P 2.xlsx", sheet_name='Project-Program-Mapping')
    SQL_Query = 'SELECT * FROM `bmas-eu-ng-rva-data-dev.WEATHER_TEST.Project_Program_Mapping` LIMIT 1000'
    processed_df = preprocess(SQL_Query, project_id='bmas-eu-ng-rva-uat-data')
    finaldf = do_process(finaldf, project_mappingdf)
    print("Done")
Leave a Comment