import pandas as pd
import datetime
# Define a default datetime value
default_datetime = datetime.datetime(2023, 1, 1, 0, 0, 0)
# Initialize counters for INSERT and UPDATE operations
insert_count = 0
update_count = 0
# Assuming you have source_df and target_df loaded
# Iterate over the primary key values in source_df
for pk_value in source_df[pk].unique():
source_subset = source_df[source_df[pk] == pk_value]
target_subset = target_df[target_df[pk] == pk_value]
if pk_value not in target_subset[pk].values:
# Insert the entire row from source_df, including the primary key
insert_data = source_subset.copy()
# Replace 'NaT' values in date columns with default_datetime
for date_col in ['created_on', 'updated_on']:
if date_col in insert_data.columns:
insert_data[date_col] = insert_data[date_col].fillna(default_datetime)
# Convert date columns to string format suitable for insertion
for date_col in ['created_on', 'updated_on']:
if date_col in insert_data.columns:
insert_data[date_col] = insert_data[date_col].dt.strftime('%Y-%m-%d %H:%M:%S')
insert_data.to_sql(table_name, connection_ext, if_exists='append', index=False)
insert_count += len(insert_data) # Increment the insert count
else:
# Check for updates and generate and execute UPDATE query if needed
mask = (source_subset != target_subset).any(axis=1)
if mask.any():
updates = source_subset.loc[mask].copy()
# Replace 'NaT' values in date columns with default_datetime
for date_col in ['created_on', 'updated_on']:
if date_col in updates.columns:
updates[date_col] = updates[date_col].fillna(default_datetime)
# Convert date columns to datetime data type if not already
for date_col in ['created_on', 'updated_on']:
if date_col in updates.columns:
updates[date_col] = pd.to_datetime(updates[date_col])
# Convert date columns to string format suitable for update
for date_col in ['created_on', 'updated_on']:
if date_col in updates.columns:
updates[date_col] = updates[date_col].dt.strftime('%Y-%m-%d %H:%M:%S')
for column_name, source_val in updates.items():
target_df.loc[target_df[pk] == pk_value, column_name] = source_val
update_count += len(updates) # Increment the update count
# Commit changes to the database (if necessary)
connection_ext.commit()
# Print the counts and queries
print(f"Total INSERT operations: {insert_count}")
print(f"Total UPDATE operations: {update_count}")