Untitled
unknown
python
2 years ago
11 kB
12
Indexable
import os import json import pandas as pd import psycopg2 import configparser from pymongo import MongoClient import boto3 from sqlalchemy import create_engine import io import re import numpy as np config = configparser.RawConfigParser() config.read(os.path.join(os.getcwd() , '../config/config.property')) #Database Connection MONGODB = dict(config.items('MONGODB')) MONGODB_HOST = MONGODB['host'] MONGODB_PORT = int(MONGODB['port']) MONGODB_DATABASE = MONGODB['dbname'] MONGODB_COLLECTION1 = MONGODB["hospital_structure_details"] MONGO_CLIENT = MongoClient(MONGODB_HOST, MONGODB_PORT) MONGODB_DB = MONGO_CLIENT[MONGODB_DATABASE] HOSPITAL_STRUCTURE_DETAILS= MONGODB_DB[MONGODB_COLLECTION1] #Evaluation Batch Detail #S3 credentials S3BUCKET = dict(config.items("S3BUCKET")) ACCESS_KEY_ID=S3BUCKET["keyid"] AWS_SECRET_ACCESS_KEY=S3BUCKET["accesskey"] REGION=S3BUCKET["region"] BUCKET=S3BUCKET['bucket_name'] FOLDER = S3BUCKET['folder_name'] #POSTGRES credentials POSTGRES = dict(config.items("POSTGRES")) HOST=POSTGRES["host"] PORT=POSTGRES["port"] DATABASE=POSTGRES["database"] USER=POSTGRES['user'] PASSWORD = POSTGRES['password'] #HOSPITAL_ID HOSPITAL = dict(config.items("HOSPITAL_ID")) HOSPITAL_ID = HOSPITAL['hospital_id'] ref_dict = { 'hospital_id':'', 'page':'', 'table':'', 's3url':'', 'service_code':'STRUCT011', 'category':'STRUCT008', 'subcategory':'STRUCT009', 'service_description':'STRUCT003', 'bounding_box_sd':'', 'type_of_room':'STRUCT001', 'price_per_unit':'STRUCT002', 'remarks':'STRUCT004', 'los':'STRUCT007', 'inclusions':'STRUCT005', 'exclusions':'STRUCT006', 'start_date':'', 'end_date':'', 'tariff_type':'', 'room_mapping':'', 'confidence_score_sd':'', 'standard_term':'' } def create_postgres_data(json_file): json_response = json_file['response'] col_dict = column_dict(json_response) print('coldict : ', col_dict) row_dict = row_dictionary(json_response) print('rowdict : ', row_dict) df = populate_df(json_file, row_dict, col_dict) # df.to_excel('/home/rishabh/Desktop/SOC_UI/downloadHospital/data/out3.xlsx', index=False) return df def row_dictionary(json_response): row_dict = {} des_list=[] for i in json_response: if i['label']=='STRUCT003': des_list.append(i['value']) description_list = list(set(des_list)) # print(description_list) for description in description_list: row=[] for res in json_response: if res['value']==description: row_number = res['fieldType'].split('R')[-1] row = row+[row_number] row_dict[description]=row row_dict = dict(sorted(row_dict.items(), key=lambda x: int(x[1][0]))) # print(row_dict) return row_dict def column_dict(json_response): col_dict = {} for ref in ref_dict: val = ref_dict[ref] if val !='': for i in range(0,len(json_response)): if val==json_response[i]['label']: column = (json_response[i]['fieldType'].split('R')[0]).split('C')[-1] col_dict[val]=str(column) return col_dict def populate_df(json_file, row_dict, col_dict): json_response = json_file['response'] data=[] for service_desc in row_dict: for row_num in row_dict[service_desc]: post_dict = {} post_dict['hospital_id']=json_file['hospitalId'] post_dict['page']=json_file['page'] post_dict['table']=json_file['table'] post_dict['s3url']=json_file['pageViewURL'] try: post_dict['service_code']=str(cell_value(json_response, row_num, col_dict[ref_dict['service_code']])) except: post_dict['service_code']='' try: post_dict['category']=str(cell_value(json_response, row_num, col_dict[ref_dict['category']])) except: post_dict['category']='' try: post_dict['subcategory']=str(cell_value(json_response, row_num, col_dict[ref_dict['subcategory']])) except: post_dict['subcategory']='' try: post_dict['type_of_room']=str(cell_value(json_response, row_num, col_dict[ref_dict['type_of_room']])) except: post_dict['type_of_room']='' try: post_dict['price_per_unit']=str(cell_value(json_response, row_num, col_dict[ref_dict['price_per_unit']])) post_dict['price_per_unit'] = post_dict['price_per_unit'].replace(",", "") post_dict['price_per_unit'] = int(post_dict['price_per_unit']) except: post_dict['price_per_unit']=0 try: post_dict['inclusions']=str(cell_value(json_response, row_num, col_dict[ref_dict['inclusions']])) except: post_dict['inclusions']='' try: post_dict['exclusions']=str(cell_value(json_response, row_num, col_dict[ref_dict['exclusions']])) except: post_dict['exclusions']='' try: post_dict['remarks']=str(cell_value(json_response, row_num, col_dict[ref_dict['remarks']])) except: post_dict['remarks']='' try: post_dict['los']=str(cell_value(json_response, row_num, col_dict[ref_dict['los']])) except: post_dict['los']='' post_dict['service_description']=str(service_desc) post_dict['bounding_box_sd']='' # post_dict['start_date']=get_date_and_tariff('start_date', json_file['pageViewURL']) post_dict['start_date']='' post_dict['end_date']='2091/12/31' post_dict['tariff_type']=get_date_and_tariff('tariff_type', json_file['pageViewURL']) # post_dict['tariff_type']='' post_dict['room_mapping']='' post_dict['confidence_score_sd']='' post_dict['standard_term']='' data= data + [post_dict] df = pd.DataFrame(data) return df def cell_value(json_response, row_number, column_number): field_type = 'C'+column_number+'R'+row_number for i in json_response: if i['fieldType'] == field_type: # print(i['value']) return i['value'] def get_date_and_tariff(pattern_type, pageViewURL): if pattern_type=='start_date': date_pattern = r'(?<!\d)(?:\s*)(\d{4}[-/]\d{2}[-/]\d{2}|\d{2}[-/]\d{2}[-/]\d{4}|\d{4}[/]\d{2}[/]\d{2}|\d{2}[/]\d{2}[/]\d{4})(?:\s*|\b)' date_matches = re.findall(date_pattern, pageViewURL) start_date='' if date_matches: start_date = date_matches[0] return start_date else: return start_date if pattern_type=='tariff_type': url_lower = pageViewURL.lower() if 'soc' in url_lower or 'open' in url_lower: return 'OPEN' elif 'agreed' in url_lower: return 'AGREED' else: return 'OPEN' def move_columns(df, columns_to_move, target_column): columns_not_to_move = [col for col in df.columns if col not in columns_to_move] target_index = columns_not_to_move.index(target_column) new_order = columns_not_to_move[:target_index + 1] + columns_to_move + columns_not_to_move[target_index + 1:] df = df.reindex(columns=new_order) return df def wide_format(df_long): df2 = df_long.pivot_table(index = ['hospital_id','page','table','s3url', 'service_code','category','subcategory','service_description', 'bounding_box_sd','remarks','los','inclusions','exclusions','start_date','end_date','tariff_type','room_mapping','confidence_score_sd','standard_term'],columns='type_of_room', values='price_per_unit').reset_index() print('This is df2:', df2) columns_to_drop = ['bounding_box_sd', 'standard_term'] df2.drop(columns_to_drop, axis=1, inplace=True) column_index_last = df2.columns.get_loc('confidence_score_sd') # print(column_index_last) selected_columns = df2.iloc[:, column_index_last+1:] col_names = list(selected_columns) target_column = 'service_description' df2 = move_columns(df2, col_names, target_column) df_wide = df2.dropna(axis=1, how="all") # df_wide = df_wide.drop(['confidence_score_sd'], axis=1, inplace=True) return df_wide def upload_to_S3(df2, hospital_id): s3_client = boto3.client("s3", aws_access_key_id=ACCESS_KEY_ID, aws_secret_access_key=AWS_SECRET_ACCESS_KEY) name = hospital_id+'.xlsx' with io.BytesIO() as excel_buffer: df2.to_excel(excel_buffer, index=False) response = s3_client.put_object( Bucket='drishti-soc-data', Key=FOLDER+'/'+name, Body=excel_buffer.getvalue() ) status = response.get("ResponseMetadata", {}).get("HTTPStatusCode") if status == 200: print(f"Successful S3 put_object response. Status - {status}") else: print(f"Unsuccessful S3 put_object response. Status - {status}") def upload_to_postgres(df_long): conn_string = 'postgresql://' +USER+':'+PASSWORD+'@'+HOST+'/'+DATABASE db = create_engine(conn_string) conn = db.connect() df_long.to_sql(HOSPITAL_ID, con=conn, if_exists='replace', index=False) print('Pushed to postgres') if __name__ == '__main__': hospital_id = HOSPITAL_ID records = HOSPITAL_STRUCTURE_DETAILS.find({'hospitalId':hospital_id},{'_id':0}) records_list = list(records) column_headers = list(ref_dict.keys()) df_final = pd.DataFrame(columns=column_headers) list_df = [] for i in range(0, len(records_list)): a = json.dumps(records_list[i]) json_file = json.loads(a) df = create_postgres_data(json_file) # print('yes') print(df.head()) df_final = pd.concat([df_final, df], ignore_index=True) df_long = df_final[df_final['price_per_unit'] != '0'] df_long = df_long[df_long['service_description'] != ''] df_long.reset_index(drop=True,inplace=True) upload_to_postgres(df_long) # df_long.to_excel('/home/rishabh/Desktop/SOC_UI/downloadHospital/data/data024/final_long.xlsx', index=False) print("This is df long:", df_long.head()) df_wide = wide_format(df_long) # df_wide.to_excel('/home/rishabh/Desktop/SOC_UI/downloadHospital/data/data024/final_wide.xlsx') upload_to_S3(df_wide, hospital_id)
Editor is loading...