Untitled

 avatar
unknown
python
2 years ago
11 kB
12
Indexable
import os
import json
import pandas as pd
import psycopg2
import configparser
from pymongo import MongoClient
import boto3
from sqlalchemy import create_engine
import io
import re
import numpy as np


config = configparser.RawConfigParser()
config.read(os.path.join(os.getcwd() , '../config/config.property'))

#Database Connection
MONGODB = dict(config.items('MONGODB'))
MONGODB_HOST = MONGODB['host']
MONGODB_PORT = int(MONGODB['port'])
MONGODB_DATABASE = MONGODB['dbname']
MONGODB_COLLECTION1 = MONGODB["hospital_structure_details"]

MONGO_CLIENT = MongoClient(MONGODB_HOST, MONGODB_PORT)
MONGODB_DB = MONGO_CLIENT[MONGODB_DATABASE]
HOSPITAL_STRUCTURE_DETAILS= MONGODB_DB[MONGODB_COLLECTION1]     #Evaluation Batch Detail


#S3 credentials
S3BUCKET = dict(config.items("S3BUCKET"))
ACCESS_KEY_ID=S3BUCKET["keyid"]
AWS_SECRET_ACCESS_KEY=S3BUCKET["accesskey"]
REGION=S3BUCKET["region"] 
BUCKET=S3BUCKET['bucket_name']
FOLDER = S3BUCKET['folder_name']


#POSTGRES credentials
POSTGRES = dict(config.items("POSTGRES"))
HOST=POSTGRES["host"]
PORT=POSTGRES["port"]
DATABASE=POSTGRES["database"] 
USER=POSTGRES['user']
PASSWORD = POSTGRES['password']



#HOSPITAL_ID
HOSPITAL = dict(config.items("HOSPITAL_ID"))
HOSPITAL_ID = HOSPITAL['hospital_id']



ref_dict = {
    'hospital_id':'',
    'page':'',
    'table':'',
    's3url':'',
    'service_code':'STRUCT011',
    'category':'STRUCT008',
    'subcategory':'STRUCT009',
    'service_description':'STRUCT003',
    'bounding_box_sd':'',
    'type_of_room':'STRUCT001',
    'price_per_unit':'STRUCT002',
    'remarks':'STRUCT004',
    'los':'STRUCT007',
    'inclusions':'STRUCT005',
    'exclusions':'STRUCT006',
    'start_date':'',
    'end_date':'',
    'tariff_type':'',
    'room_mapping':'',
    'confidence_score_sd':'',
    'standard_term':''
}

def create_postgres_data(json_file):
    json_response = json_file['response']
    col_dict = column_dict(json_response)
    print('coldict : ', col_dict)
    row_dict = row_dictionary(json_response)    
    print('rowdict : ', row_dict)
    df = populate_df(json_file, row_dict, col_dict) 
    # df.to_excel('/home/rishabh/Desktop/SOC_UI/downloadHospital/data/out3.xlsx', index=False)
    return df


def row_dictionary(json_response):
    row_dict = {}
    des_list=[]
    for i in json_response:
        if i['label']=='STRUCT003':
            des_list.append(i['value'])
    description_list = list(set(des_list))
    # print(description_list)
    for description in description_list:
        row=[]
        for res in json_response:
            if res['value']==description:
                row_number = res['fieldType'].split('R')[-1]
                row = row+[row_number]
        row_dict[description]=row
    row_dict = dict(sorted(row_dict.items(), key=lambda x: int(x[1][0])))
    # print(row_dict)
    return row_dict


def column_dict(json_response):
    col_dict = {}
    
    for ref in ref_dict:
        val = ref_dict[ref]
        if val !='':
            for i in range(0,len(json_response)):
                if val==json_response[i]['label']:
                    column = (json_response[i]['fieldType'].split('R')[0]).split('C')[-1]
                    col_dict[val]=str(column)
    return col_dict


def populate_df(json_file, row_dict, col_dict):
    json_response = json_file['response']
    data=[]
    for service_desc in row_dict:  
        for row_num in row_dict[service_desc]:
            post_dict = {}
            post_dict['hospital_id']=json_file['hospitalId']
            post_dict['page']=json_file['page']
            post_dict['table']=json_file['table']
            post_dict['s3url']=json_file['pageViewURL']
            try:
                post_dict['service_code']=str(cell_value(json_response, row_num, col_dict[ref_dict['service_code']]))
            except:
                post_dict['service_code']=''
            try:
                post_dict['category']=str(cell_value(json_response, row_num, col_dict[ref_dict['category']]))
            except:
                post_dict['category']=''
            try:
                post_dict['subcategory']=str(cell_value(json_response, row_num, col_dict[ref_dict['subcategory']]))
            except:
                post_dict['subcategory']=''
            try:
                post_dict['type_of_room']=str(cell_value(json_response, row_num, col_dict[ref_dict['type_of_room']]))
            except:
                post_dict['type_of_room']=''
            try:
                post_dict['price_per_unit']=str(cell_value(json_response, row_num, col_dict[ref_dict['price_per_unit']]))
                post_dict['price_per_unit'] = post_dict['price_per_unit'].replace(",", "")
                post_dict['price_per_unit'] = int(post_dict['price_per_unit'])
            except:
                post_dict['price_per_unit']=0
            try:
                post_dict['inclusions']=str(cell_value(json_response, row_num, col_dict[ref_dict['inclusions']]))
            except:
                post_dict['inclusions']=''
            try:
                post_dict['exclusions']=str(cell_value(json_response, row_num, col_dict[ref_dict['exclusions']]))
            except:
                post_dict['exclusions']=''
            try:
                post_dict['remarks']=str(cell_value(json_response, row_num, col_dict[ref_dict['remarks']]))
            except:
                post_dict['remarks']=''
            try:
                 post_dict['los']=str(cell_value(json_response, row_num, col_dict[ref_dict['los']]))
            except:
                 post_dict['los']=''
            post_dict['service_description']=str(service_desc)
            post_dict['bounding_box_sd']=''
            # post_dict['start_date']=get_date_and_tariff('start_date', json_file['pageViewURL'])
            post_dict['start_date']=''
            post_dict['end_date']='2091/12/31'
            post_dict['tariff_type']=get_date_and_tariff('tariff_type', json_file['pageViewURL'])
            # post_dict['tariff_type']=''
            post_dict['room_mapping']=''            
            post_dict['confidence_score_sd']=''
            post_dict['standard_term']=''
            data= data + [post_dict]
    df = pd.DataFrame(data)
    return df

def cell_value(json_response, row_number, column_number):
    field_type = 'C'+column_number+'R'+row_number
    for i in json_response:
        if i['fieldType'] == field_type:
            # print(i['value'])
            return i['value']

def get_date_and_tariff(pattern_type, pageViewURL):
    if pattern_type=='start_date':
        date_pattern = r'(?<!\d)(?:\s*)(\d{4}[-/]\d{2}[-/]\d{2}|\d{2}[-/]\d{2}[-/]\d{4}|\d{4}[/]\d{2}[/]\d{2}|\d{2}[/]\d{2}[/]\d{4})(?:\s*|\b)'
        date_matches = re.findall(date_pattern, pageViewURL)
        start_date=''
        if date_matches:
            start_date = date_matches[0]
            return start_date
        else:
            return start_date
    
    if pattern_type=='tariff_type':
        url_lower = pageViewURL.lower()
        if 'soc' in url_lower or 'open' in url_lower:
            return 'OPEN'
        elif 'agreed' in url_lower:
            return 'AGREED'
        else:
            return 'OPEN'

        
def move_columns(df, columns_to_move, target_column):
    columns_not_to_move = [col for col in df.columns if col not in columns_to_move]
    target_index = columns_not_to_move.index(target_column)
    new_order = columns_not_to_move[:target_index + 1] + columns_to_move + columns_not_to_move[target_index + 1:]
    df = df.reindex(columns=new_order)
    return df


def wide_format(df_long):
    df2 = df_long.pivot_table(index = ['hospital_id','page','table','s3url', 'service_code','category','subcategory','service_description', 'bounding_box_sd','remarks','los','inclusions','exclusions','start_date','end_date','tariff_type','room_mapping','confidence_score_sd','standard_term'],columns='type_of_room', values='price_per_unit').reset_index()
    print('This is df2:', df2)
    columns_to_drop = ['bounding_box_sd', 'standard_term']
    df2.drop(columns_to_drop, axis=1, inplace=True)
    column_index_last = df2.columns.get_loc('confidence_score_sd')
    # print(column_index_last)
    selected_columns = df2.iloc[:, column_index_last+1:]
    col_names = list(selected_columns)
    target_column = 'service_description'
    df2 = move_columns(df2, col_names, target_column)
    df_wide = df2.dropna(axis=1, how="all")
    # df_wide = df_wide.drop(['confidence_score_sd'], axis=1, inplace=True)
    return df_wide

    
def upload_to_S3(df2, hospital_id):
    s3_client = boto3.client("s3", aws_access_key_id=ACCESS_KEY_ID, aws_secret_access_key=AWS_SECRET_ACCESS_KEY)
    name = hospital_id+'.xlsx'
    with io.BytesIO() as excel_buffer:
        df2.to_excel(excel_buffer, index=False)
        response = s3_client.put_object(
        Bucket='drishti-soc-data', Key=FOLDER+'/'+name, Body=excel_buffer.getvalue()
    )
        
    status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")
    
    if status == 200:
        print(f"Successful S3 put_object response. Status - {status}")
    else:
        print(f"Unsuccessful S3 put_object response. Status - {status}")


def upload_to_postgres(df_long):
    conn_string = 'postgresql://' +USER+':'+PASSWORD+'@'+HOST+'/'+DATABASE
    db = create_engine(conn_string)
    conn = db.connect()
    df_long.to_sql(HOSPITAL_ID, con=conn, if_exists='replace', index=False)
    print('Pushed to postgres')


if __name__ == '__main__':

    hospital_id = HOSPITAL_ID
    records = HOSPITAL_STRUCTURE_DETAILS.find({'hospitalId':hospital_id},{'_id':0})
    records_list = list(records)
    
    column_headers = list(ref_dict.keys())
    df_final = pd.DataFrame(columns=column_headers)
    list_df = []

    for i in range(0, len(records_list)):
        a = json.dumps(records_list[i])
        json_file = json.loads(a)
        df = create_postgres_data(json_file)
        # print('yes')
        print(df.head())
        df_final = pd.concat([df_final, df], ignore_index=True)
    
    df_long = df_final[df_final['price_per_unit'] != '0']
    df_long = df_long[df_long['service_description'] != '']

    df_long.reset_index(drop=True,inplace=True)
    upload_to_postgres(df_long)
    
    # df_long.to_excel('/home/rishabh/Desktop/SOC_UI/downloadHospital/data/data024/final_long.xlsx', index=False)
    print("This is df long:", df_long.head())
    
    df_wide = wide_format(df_long)
    # df_wide.to_excel('/home/rishabh/Desktop/SOC_UI/downloadHospital/data/data024/final_wide.xlsx')
    upload_to_S3(df_wide, hospital_id)
Editor is loading...