Untitled

mail@pastecode.io avatar
unknown
python
a month ago
3.0 kB
1
Indexable
Never
def read_gzipped_csv(filepath, delimiter='|'):
    with gzip.open(filepath, 'rt', newline='', encoding='utf-8') as f:
        return pd.read_csv(f, delimiter=delimiter)

def prepare_differences(df_before, df_after):
    # Combine and tag dataframes for comparison
    df_before['_version'] = 'before'
    df_after['_version'] = 'after'
    df_combined = pd.concat([df_before, df_after], ignore_index=True)
    
    # Find rows that are unique to either 'before' or 'after' by dropping duplicates
    df_diff = df_combined.drop_duplicates(subset=df_combined.columns.difference(['_version']), keep='first').sort_values(by=list(df_combined.columns[:-1]))
    
    # Filter out only alternating rows (before-after pairs)
    alternating_diffs = []
    for _, group in df_diff.groupby(list(df_combined.columns[:-2])):
        if len(group) == 2:  # Ensure we have both before and after
            alternating_diffs.append(group)
    df_alternating_diffs = pd.concat(alternating_diffs, ignore_index=True) if alternating_diffs else pd.DataFrame()

    return df_alternating_diffs

def compare_and_output_differences(df_before, df_after, output_file_path):
    before_count, after_count = len(df_before), len(df_after)
    record_diff = pd.DataFrame({
        'Description': ['Record Count (Before)', 'Record Count (After)', 'Records with Different Data'],
        'Count': [before_count, after_count, '']
    })

    df_diff = prepare_differences(df_before, df_after)
    
    if not df_diff.empty:
        record_diff.at[2, 'Count'] = len(df_diff) // 2  # Divide by 2 to count each pair as one difference
        df_diff['_output'] = df_diff.apply(lambda x: f"{x['_version']}, " + ','.join(x.drop('_version').astype(str)), axis=1)
        diff_details = pd.DataFrame(df_diff['_output'], columns=['Differences'])
        output_df = pd.concat([record_diff, pd.DataFrame([''] * 3, columns=['Differences']), diff_details], ignore_index=True)
    else:
        output_df = record_diff

    output_df.to_csv(output_file_path, index=False, header=False)

def compare_tables(table_names):
    base_path = 'arc'
    output_folder = 'output'
    os.makedirs(output_folder, exist_ok=True)

    for table_name in tqdm(table_names, desc="Comparing tables"):
        before_path = os.path.join(base_path, 'before', table_name)
        after_path = os.path.join(base_path, 'after', table_name)

        before_file = next(os.path.join(before_path, f) for f in os.listdir(before_path) if f.endswith('.csv.gz'))
        after_file = next(os.path.join(after_path, f) for f in os.listdir(after_path) if f.endswith('.csv.gz'))

        df_before = read_gzipped_csv(before_file, delimiter='|')
        df_after = read_gzipped_csv(after_file, delimiter='|')

        output_file_path = os.path.join(output_folder, f"{table_name}.csv")
        compare_and_output_differences(df_before, df_after, output_file_path)
        tqdm.write(f"Comparison completed for table {table_name}. Results are available in {output_file_path}")
Leave a Comment