Untitled

mail@pastecode.io avatar
unknown
python
a month ago
2.4 kB
1
Indexable
Never
import os
import gzip
import pandas as pd
from tqdm import tqdm

def read_gzipped_csv(filepath, delimiter='|'):
    with gzip.open(filepath, 'rt', newline='', encoding='utf-8') as f:
        return pd.read_csv(f, delimiter=delimiter)

def find_differences(df_before, df_after):
    # Identify unique rows in both dataframes
    df_all = pd.concat([df_before.assign(_version='before'), df_after.assign(_version='after')])
    df_diff = df_all.drop_duplicates(subset=df_before.columns, keep=False)
    
    # Ensure alternating rows for before and after versions of differing records
    df_diff['sort_key'] = df_diff.index
    df_diff = df_diff.sort_values(by=['sort_key', '_version'])
    df_diff.drop(columns=['sort_key'], inplace=True)
    
    return df_diff

def compare_and_output_differences(df_before, df_after, output_file_path):
    before_count, after_count = len(df_before), len(df_after)
    df_diff = find_differences(df_before, df_after)

    with open(output_file_path, 'w') as f:
        f.write(f"Record Count (Before),{before_count}\n")
        f.write(f"Record Count (After),{after_count}\n")
        f.write("Records with Different Data,\n")

        if not df_diff.empty:
            for index, row in df_diff.iterrows():
                f.write(f"{row['_version']}, " + ','.join(row.drop('_version').astype(str)) + "\n")

def compare_tables(table_names):
    base_path = 'arc'
    output_folder = 'output'
    os.makedirs(output_folder, exist_ok=True)

    for table_name in tqdm(table_names, desc="Comparing tables"):
        before_path = os.path.join(base_path, 'before', table_name)
        after_path = os.path.join(base_path, 'after', table_name)

        before_file = next(os.path.join(before_path, f) for f in os.listdir(before_path) if f.endswith('.csv.gz'))
        after_file = next(os.path.join(after_path, f) for f in os.listdir(after_path) if f.endswith('.csv.gz'))

        df_before = read_gzipped_csv(before_file, delimiter='|')
        df_after = read_gzipped_csv(after_file, delimiter='|')

        output_file_path = os.path.join(output_folder, f"{table_name}.csv")
        compare_and_output_differences(df_before, df_after, output_file_path)
        tqdm.write(f"Comparison completed for table {table_name}. Results are available in {output_file_path}")

# Example usage
table_names = ['dim_table1', 'dim_table2']
compare_tables(table_names)
Leave a Comment