import pandas as pd
import logging
from pandas import read_sql as pd_read_sql
import sys
from sqlalchemy import create_engine
import argparse
from base64 import b64decode as base_b64decode
from json import load as json_load
import os
import datetime
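
# Replicates PPM Service Catalog data ("changes only") from a source schema to a
# replication target, driven by the etl_ppm_replication_master control table.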
# Define Python user-defined exceptions
class Error(Exception):
    """Base class for other exceptions"""
    pass

class ETL_PPM_REPLICATION_MASTER_ERROR(Error):
    """Raised when etl_ppm_replication_master has no enabled SC rows"""
    pass

class DB_CONNECTION_ERROR(Error):
    """Raised when a required database connection cannot be established"""
    pass
def is_datetime(value):
    """Return True if value parses as a 'YYYY-MM-DD HH:MM:SS' datetime."""
    try:
        datetime.datetime.strptime(str(value), '%Y-%m-%d %H:%M:%S')
        return True
    except ValueError:
        return False
# Function to write SQL query to the log file
def write_sql(sql_log_file, query_info):
sql_log_file.write(query_info)
sql_log_file.write('\n')
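
# Opens a SQLAlchemy connection for the named server entry in the credentials
# JSON; returns (engine, connection, schema, db_type), with the first three set
# to None when the connection attempt fails.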
def setDbConnection(logging, json_data, serverInfo, encoding):
try:
cnx = cursor = schema = db_type = None
encrypt = json_data.get(serverInfo, {}).get('ENCRYPT')
host = json_data.get(serverInfo, {}).get('DB_HOST')
port = json_data.get(serverInfo, {}).get('DB_PORT')
user = json_data.get(serverInfo, {}).get('DB_USER')
db_type = json_data.get(serverInfo, {}).get('DB_TYPE')
schema = json_data.get(serverInfo, {}).get('DB_SCHEMA')
if encrypt == 'Y':
password = base_b64decode(json_data.get(serverInfo, {}).get('DB_PASSWORD')).decode('utf-8')
else:
password = json_data.get(serverInfo, {}).get('DB_PASSWORD')
        if db_type in ('MYSQL', 'MARIA'):
            connection_text = ('mysql+mysqlconnector://%s:%s@%s:%s/%s' % (user
                                                                          , password
                                                                          , host
                                                                          , port
                                                                          , schema))
        elif db_type == 'ORACLE':
            connection_text = ('oracle://%s:%s@%s:%s/%s' % (user
                                                            , password
                                                            , host
                                                            , port
                                                            , json_data.get(serverInfo, {}).get('DB_SID')
                                                            ))
        else:
            # Fail fast on an unrecognized DB_TYPE instead of hitting a NameError below
            raise ValueError(f"Unsupported DB_TYPE '{db_type}' for server '{serverInfo}'")
        cnx = create_engine(connection_text, encoding=encoding)
        cursor = cnx.connect()
logging.info(f"Connected to database server {serverInfo}: {host}:{port}/{schema}")
except Exception as dbexp:
logging.error("DATABASE CONNECTION EXCEPTION")
logging.error("Error - {} . Line No - {} ".format(str(dbexp), str(sys.exc_info()[-1].tb_lineno)))
cnx = cursor = schema = None
return cnx, cursor, schema, db_type
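# Inserts rows that exist only in the source and updates rows whose values
# differ, one eprm-listed table at a time; every generated statement is also
# written to the per-job .sql log.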
def process_changes_only(pdb_ppm_sc, pdb_source, pdb_target, ps_rep_env, ps_op_id, ps_bu_id, ps_rep_job_id, pl_logging):
try:
pl_logging.info("Started")
        replicationJobId = ps_rep_job_id
return_flag = True
STATUS = "Success"
STATUS_MESSAGE = "Service Catalogue Replication Successful."
failed_entities = []
opId=ps_op_id
buId=ps_bu_id
pl_logging.info("Variable declaration")
total_events_logged = 0
source_read_count = 0
target_read_count = 0
inserted_count = 0
updated_count = 0
insert_failed_count = 0
update_failed_count = 0
query_count=0
pl_logging.info("Master query")
sql_log_file = open(f"/app/scripts/PPM_Release_Management/Service_Catalog_ETL/logs/{replicationJobId}_ppm_sc_changesonly.sql","w")
primary_query = f"SELECT * FROM {schema_ppm}.etl_ppm_replication_master WHERE eprm_catalog='SC' AND eprm_enabled_flg='Y'"
write_sql(sql_log_file,f"-- ++++++++++++++++++++++++++++++++ Primary Query")
write_sql(sql_log_file,primary_query)
        primary_df = pd_read_sql(primary_query, con=pdb_ppm_sc)
        if primary_df.empty:
            # No control rows to process; surfaced via the dedicated handler below
            raise ETL_PPM_REPLICATION_MASTER_ERROR
for index, row in primary_df.iterrows():
# Fetch primary key column name and table name
table_name = row['eprm_table_name'].lower()
eprm_table_col_pk = row['eprm_table_col_pk']
pk = eprm_table_col_pk.lower()
            where_clause = row['eprm_join_cols_reim'] % (ps_op_id, ps_bu_id)
query_count+=1
table_alias=row['eprm_table_alias']
write_sql(sql_log_file,
f"-- ++++++++++++++++++++++++++++++++ Seq# {query_count}| entity# {pk} | {table_name}")
try:
source_query = f"SELECT /*+ PARALLEL(2) */ * FROM {schema_source}.{table_name} {table_alias} where {where_clause}"
write_sql(sql_log_file,source_query)
pl_logging.info("Reading data from source")
source_df = pd.read_sql(source_query, connection_source)
pl_logging.info("source_df: %s" % len(source_df))
write_sql(sql_log_file,f"source: {table_name}: {source_read_count}")
source_read_count+=len(source_df)
target_query = f"SELECT /*+ PARALLEL(2) */ * FROM {schema_ext}.{table_name} {table_alias} where {where_clause}"
write_sql(sql_log_file,target_query)
pl_logging.info("Reading data from target")
target_df = pd.read_sql(target_query, connection_target)
write_sql(sql_log_file, f"target: {table_name}: {target_read_count}")
target_read_count+=len(target_df)
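                # Row-level diff: rows missing from the target are INSERTed;
                # rows present in both are compared column by column and UPDATEd.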
                for _, source_row in source_df.iterrows():
pk_value = source_row[pk]
# Check if the primary key exists in the target DataFrame
if pk_value not in target_df[pk].values:
                        # Build the INSERT statement dynamically, formatting each value
                        insert_columns = []
                        insert_values = []
                        for column_name, source_val in source_row.items():
                            if source_val is None or str(source_val) in ('None', 'NaT', 'nan'):
                                # Null-like values become a true NULL without quotes
                                insert_values.append('NULL')
                            elif isinstance(source_val, str) and source_val.startswith('TO_DATE'):
                                # Already a TO_DATE expression; pass it through unchanged
                                insert_values.append(source_val)
                            elif is_datetime(source_val):
                                # Format datetime values with the function for the target DB type
                                if db_type_ext == 'ORACLE':
                                    insert_values.append(f"TO_DATE('{source_val}', 'YYYY-MM-DD HH24:MI:SS')")
                                elif db_type_ext in ('MYSQL', 'MARIA'):
                                    insert_values.append(f"STR_TO_DATE('{source_val}', '%Y-%m-%d %H:%i:%s')")
                                else:
                                    insert_values.append(f"'{source_val}'")
                            elif column_name == 'extended_rule_code' and len(str(source_val).split('==')) == 2:
                                parts = str(source_val).split('==')
                                insert_values.append(f"'{parts[0]}=='{parts[1]}''")
                            else:
                                # Escape embedded single quotes, then enclose in quotes
                                escaped_val = str(source_val).replace("'", "''")
                                insert_values.append(f"'{escaped_val}'")
                            insert_columns.append(column_name)  # one column per value, kept in sync
                        # Construct the INSERT query with column names
                        insert_query = f"INSERT INTO {schema_ext}.{table_name} ({', '.join(insert_columns)}) VALUES ({', '.join(insert_values)})"
                        # Execute the INSERT query; count it only after it succeeds
                        try:
                            cursor_target.execute(insert_query)
                            inserted_count += 1
                            write_sql(sql_log_file, "INSERTION: " + insert_query + ";\n")
                        except Exception as e:
                            print(insert_query)
                            pl_logging.error(
                                "Error - {} . Line No - {} ".format(str(e), str(sys.exc_info()[-1].tb_lineno)))
                            write_sql(sql_log_file, f"INSERT FAILED: {insert_query}")
                            insert_failed_count += 1
else:
# Fetch the corresponding row from the target DataFrame based on the primary key
target_row = target_df[target_df[pk] == pk_value].iloc[0]
if not source_row.equals(target_row):
columns_to_update = []
                            for column_name, source_val in source_row.items():
                                target_val = target_row[column_name]
                                # Treat null-vs-null as equal so NaN columns don't force updates
                                if pd.isna(source_val) and pd.isna(target_val):
                                    continue
                                if source_val != target_val:
                                    if source_val is None or str(source_val) in ('None', 'NaT', 'nan'):
                                        # Null-like values become a true NULL without quotes
                                        update_value = 'NULL'
                                    elif is_datetime(source_val):
                                        # Format datetime values with the function for the target DB type
                                        if db_type_ext == 'ORACLE':
                                            update_value = f"TO_DATE('{source_val}', 'YYYY-MM-DD HH24:MI:SS')"
                                        elif db_type_ext in ('MYSQL', 'MARIA'):
                                            update_value = f"STR_TO_DATE('{source_val}', '%Y-%m-%d %H:%i:%s')"
                                        else:
                                            update_value = f"'{source_val}'"
                                    elif column_name == 'extended_rule_code' and len(str(source_val).split('==')) == 2:
                                        parts = str(source_val).split('==')
                                        update_value = f"'{parts[0]}=='{parts[1]}'"
                                    else:
                                        # Escape embedded single quotes, then enclose in quotes
                                        escaped_val = str(source_val).replace("'", "''")
                                        update_value = f"'{escaped_val}'"
                                    # Add the column and its formatted value to the SET clause
                                    columns_to_update.append(f"{column_name} = {update_value}")
# Generate an update query dynamically
if columns_to_update:
update_query = f"UPDATE {schema_ext}.{table_name} SET "
update_query += ", ".join(columns_to_update)
update_query += f" WHERE {eprm_table_col_pk} = '{pk_value}' AND OP_ID='{opId}' AND BU_ID='{buId}'"
                                try:
                                    cursor_target.execute(update_query)
                                    updated_count += 1
                                    write_sql(sql_log_file, f"UPDATE: {update_query}")
except Exception as e:
pl_logging.error(
"Error - {} . Line No - {} ".format(str(e), str(sys.exc_info()[-1].tb_lineno)))
write_sql(sql_log_file, f"UPDATE FAILED: {update_query}")
update_failed_count += 1
except Exception as e:
error_msg=f"Error querying table {schema_ext}.{table_name}: {str(e)} skipping the table and proceeding for next table"
pl_logging.error(error_msg)
failed_entities.append((table_name,error_msg))
continue
# Calculate the inserted and updated counts
total_events_logged=inserted_count+updated_count
pl_logging.info(f"Total Events Logged: {total_events_logged}")
pl_logging.info(f"Source Read Count: {source_read_count}")
pl_logging.info(f"Target Read Count: {target_read_count}")
pl_logging.info(f"Inserted Count: {inserted_count}")
pl_logging.info(f"Updated Count: {updated_count}")
pl_logging.info(f"Failed Inserted Count: {insert_failed_count}")
pl_logging.info(f"Failed Updated Count: {update_failed_count}")
pl_logging.info(f"-- ++++++++++++++++++++++++++++++++++ BACK UPDATE STATUS FOR UI ++++++++++++++++++ \n")
if len(failed_entities) > 0:
return_flag = False
STATUS = "Error"
STATUS_MESSAGE = str(failed_entities).replace("'", '').replace('"', '')
pl_logging.info("STATUS: %s" % STATUS)
pl_logging.info("STATUS_MESSAGE: %s" % STATUS_MESSAGE)
query_update = f"UPDATE {schema_source}.ppm_replication_status SET status='" + STATUS + "'" \
+ ", status_message='" + STATUS_MESSAGE + "'" \
+ ", error_description=NULL" \
+ ", updated_by='" + replicationJobId + "' /**/" \
+ " WHERE replication_job_id='"+replicationJobId+"'"
if db_type_source == 'ORACLE':
query_update = query_update.replace('/**/', ', updated_date=SYSDATE')
elif db_type_source in ('MARIA', 'MYSQL'):
query_update = query_update.replace('/**/', ', updated_date=NOW()')
pl_logging.info("-- + ppm_replication_status - UPDATE \n")
pl_logging.info(query_update + ";\n")
pl_logging.info(f"-- ++++++++++++++++++++++++++++++++++ FIN ++++++++++++++++++++++++++++++++++++++++ \n")
        res = cursor_source.execute(query_update)
        pl_logging.info("query_update: %s" % res)
        sql_log_file.close()
except ETL_PPM_REPLICATION_MASTER_ERROR:
STATUS_MESSAGE = "NO RECORDS PRESENT IN etl_ppm_replication_master TABLE"
pl_logging.error("EXCEPTION:" + STATUS_MESSAGE)
return_flag = False
except Exception as e:
pl_logging.error("Error - {} . Line No - {} ".format(str(e), str(sys.exc_info()[-1].tb_lineno)))
return_flag = False
return return_flag
if __name__ == '__main__':
    from configparser import ConfigParser as conf_ConfigParser
try:
# python3 ppm_sc_changesonly.py --operation ChangesOnly --replicationTarget SIT --opId HOB --buId DEFAULT --replicationJobId SC_JOB_999
parser = argparse.ArgumentParser(description="PPM Service Catalog Replication Script")
parser.add_argument('--operation', required=True, help="Operation Type")
parser.add_argument('--replicationTarget', required=True, help="Replication Target")
parser.add_argument('--opId', required=True, help="Operation ID")
parser.add_argument('--buId', required=True, help="Business Unit ID")
parser.add_argument('--replicationJobId', required=True, help="Replication Job ID")
args = parser.parse_args()
replicationJobId = args.replicationJobId
json_file_path = "/app/scripts/PPM_Release_Management/Service_Catalog_ETL/config/dna.json"
conf_file_path = "/app/scripts/PPM_Release_Management/Service_Catalog_ETL/config/ppm_sc_replication.conf"
sql_log_file = f"/app/scripts/PPM_Release_Management/Service_Catalog_ETL/logs/{replicationJobId}_ppm_sc_changesonly.sql"
log_file = f'/app/scripts/PPM_Release_Management/Service_Catalog_ETL/logs/{replicationJobId}_ppm_sc_changesonly.log'
# Set up logging
CONFIG = conf_ConfigParser()
CONFIG.read(conf_file_path)
logging.basicConfig(filename=log_file
, level=CONFIG.get('CONFIG_LOGGING', 'LOG_LEVEL', raw=True)
, format=CONFIG.get('CONFIG_LOG_FORMAT', 'LOG_FORMAT_DISP', raw=True)
, datefmt=CONFIG.get('CONFIG_LOG_FORMAT', 'LOG_FORMAT_DATE', raw=True)
)
logging.info('LOGGER initiated')
encoding = CONFIG.get('CONFIG_GENERIC', 'DB_CHARSET', raw=True)
# Read JSON data from file
if not os.path.exists(json_file_path):
logging.error("CREDENTIAL FILE MISSING")
logging.error("CREDENTIAL FILE: %s" % json_file_path)
raise FileNotFoundError("CREDENTIAL FILE MISSING")
with open(json_file_path) as json_file:
json_data = json_load(json_file)
# Connect to PPM_PC database
connection_ppm, cursor_ppm, schema_ppm, db_type_ppm = setDbConnection(logging, json_data, 'PPM_PC', encoding)
# Connect to source database
connection_source, cursor_source, schema_source, db_type_source = setDbConnection(logging, json_data, 'SOURCE',
encoding)
# Connect to target_ext database
connection_target, cursor_target, schema_ext, db_type_ext = setDbConnection(logging, json_data,
args.replicationTarget,
encoding)
if not (connection_ppm and connection_source and connection_target):
raise DB_CONNECTION_ERROR
if process_changes_only(pdb_ppm_sc=connection_ppm
, pdb_source=connection_source
, pdb_target=connection_target
, ps_rep_env=args.replicationTarget
, ps_op_id=args.opId
, ps_bu_id=args.buId
, ps_rep_job_id=args.replicationJobId
, pl_logging=logging):
print("Update successful")
else:
print("Update FAILED")
except DB_CONNECTION_ERROR:
logging.error("EXCEPTION: DB CONNECTION ERROR")
print("EXCEPTION: DB CONNECTION ERROR")
except FileNotFoundError as ferr:
print("Error - {} . Line No - {} ".format(str(ferr), str(sys.exc_info()[-1].tb_lineno)))
except Exception as err:
print("Error - {} . Line No - {} ".format(str(err), str(sys.exc_info()[-1].tb_lineno)))
In the above INSERT logic, can you update and optimize it so that, when a source pk_value is not present in the target, the record is inserted directly into the target using to_sql, removing all of the per-column checks?
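
One possible rewrite (a minimal sketch, not verified against this environment): pandas' DataFrame.to_sql can bulk-append the rows whose primary key is missing from the target, and it converts NaN/NaT to SQL NULL itself, so the per-column quoting, escaping, and date-formatting checks are only needed if you rely on the TO_DATE passthrough or the extended_rule_code quoting. Assuming pdb_target is a SQLAlchemy engine/connection and source_df, target_df, pk, schema_ext, and table_name are as defined in the loop above:

# Rows present in the source but not in the target, selected by primary key
missing_rows = source_df[~source_df[pk].isin(target_df[pk])]
if not missing_rows.empty:
    try:
        missing_rows.to_sql(name=table_name,
                            con=pdb_target,
                            schema=schema_ext,
                            if_exists='append',   # append to the existing table
                            index=False,          # don't write the DataFrame index
                            chunksize=1000)       # batch the inserts
        inserted_count += len(missing_rows)
    except Exception as e:
        insert_failed_count += len(missing_rows)
        pl_logging.error("Bulk insert into %s.%s failed: %s" % (schema_ext, table_name, e))

The update path would still need the column-level comparison; this only replaces the insert branch.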