Untitled

 avatar
unknown
plain_text
a year ago
2.8 kB
1
Indexable
import pandas as pd
import datetime

# Define a default datetime value
default_datetime = datetime.datetime(2023, 1, 1, 0, 0, 0)

# Initialize counters for INSERT and UPDATE operations
insert_count = 0
update_count = 0

# Assuming you have source_df and target_df loaded

# Iterate over the primary key values in source_df
for pk_value in source_df[pk].unique():
    source_subset = source_df[source_df[pk] == pk_value]
    target_subset = target_df[target_df[pk] == pk_value]

    if pk_value not in target_subset[pk].values:
        # Insert the entire row from source_df, including the primary key
        insert_data = source_subset.copy()
        
        # Replace 'NaT' values in date columns with default_datetime
        for date_col in ['created_on', 'updated_on']:
            if date_col in insert_data.columns:
                insert_data[date_col] = insert_data[date_col].fillna(default_datetime)
        
        # Convert date columns to string format suitable for insertion
        for date_col in ['created_on', 'updated_on']:
            if date_col in insert_data.columns:
                insert_data[date_col] = insert_data[date_col].dt.strftime('%Y-%m-%d %H:%M:%S')
        
        insert_data.to_sql(table_name, connection_ext, if_exists='append', index=False)
        insert_count += len(insert_data)  # Increment the insert count

    else:
        # Check for updates and generate and execute UPDATE query if needed
        mask = (source_subset != target_subset).any(axis=1)
        if mask.any():
            updates = source_subset.loc[mask].copy()
            
            # Replace 'NaT' values in date columns with default_datetime
            for date_col in ['created_on', 'updated_on']:
                if date_col in updates.columns:
                    updates[date_col] = updates[date_col].fillna(default_datetime)
            
            # Convert date columns to datetime data type if not already
            for date_col in ['created_on', 'updated_on']:
                if date_col in updates.columns:
                    updates[date_col] = pd.to_datetime(updates[date_col])
            
            # Convert date columns to string format suitable for update
            for date_col in ['created_on', 'updated_on']:
                if date_col in updates.columns:
                    updates[date_col] = updates[date_col].dt.strftime('%Y-%m-%d %H:%M:%S')
            
            for column_name, source_val in updates.items():
                target_df.loc[target_df[pk] == pk_value, column_name] = source_val
            update_count += len(updates)  # Increment the update count

# Commit changes to the database (if necessary)
connection_ext.commit()

# Print the counts and queries
print(f"Total INSERT operations: {insert_count}")
print(f"Total UPDATE operations: {update_count}")