Untitled

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import openpyxl
import csv
import hashlib
import logging
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
from openpyxl.drawing.image import Image
from openpyxl.cell.rich_text import CellRichText
import datetime

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')

# Suppress debug logs from PIL and other libraries
for key in logging.Logger.manager.loggerDict.keys():
    if 'PIL' in key or 'Pillow' in key or 'openpyxl' in key:
        logging.getLogger(key).setLevel(logging.WARNING)

DEBUG_MODE = False

def set_debug_mode(enabled):
    global DEBUG_MODE
    DEBUG_MODE = enabled
    if DEBUG_MODE:
        logging.getLogger().setLevel(logging.DEBUG)
    else:
        logging.getLogger().setLevel(logging.INFO)

# Function to generate MD5 checksum for file
def md5(fname):
    hash_md5 = hashlib.md5()
    try:
        with open(fname, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
    except FileNotFoundError:
        logging.error(f"File not found: {fname}")
        sys.exit(1)
    return hash_md5.hexdigest()

# Function to write mismatches to CSV
def writer_csv(output_list, output_file="DCW_Compare_Mismatches.csv"):
    col_labels = [
        'Sheet_Name', 'Cell_Reference', 'Difference_Type', 'Old_Value', 'New_Value', 'Details'
    ]
    try:
        with open(output_file, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter=',', quotechar='"')
            writer.writerow(col_labels)
            for item in output_list:
                writer.writerow([
                    item['sheet_name'],
                    item['cell_reference'],
                    item['difference_type'],
                    item['old_value'],
                    item['new_value'],
                    item['details']
                ])
        logging.info(f'Mismatch results have been written to {output_file} - please review')
    except IOError:
        logging.error(f"Could not write to file: {output_file}")
        sys.exit(1)

# Function to extract cell details
def get_cell_details(cell):
    cell_details = {
        'value': cell.value,
        'data_type': cell.data_type,
        'font': cell.font.__dict__,
        'border': cell.border.__dict__,
        'fill': cell.fill.__dict__,
        'number_format': cell.number_format,
        'alignment': cell.alignment.__dict__,
        'protection': cell.protection.__dict__,
        'hyperlink': cell.hyperlink,
        'comment': cell.comment,
        'rich_text': cell.value if isinstance(cell.value, CellRichText) else None,
        'formula': cell.value if cell.data_type == 'f' else None  # Ensure formula is included if present
    }
    return cell_details

# Function to extract image details
def get_image_details(ws):
    image_details = []
    for image in ws._images:
        img_details = {
            'name': image.path if hasattr(image, 'path') else None,
            'size': (image.width, image.height),
            'position': (image.anchor._from.col, image.anchor._from.row)
        }
        image_details.append(img_details)
    return image_details

def compare_images(images_old, images_new, sheet_name):
    mismatches = []

    old_image_positions = {(img['position'], img['size'], img['name']): img for img in images_old}
    new_image_positions = {(img['position'], img['size'], img['name']): img for img in images_new}

    old_positions_set = set(old_image_positions.keys())
    new_positions_set = set(new_image_positions.keys())

    added_images = new_positions_set - old_positions_set
    removed_images = old_positions_set - new_positions_set

    for position in added_images:
        img_new = new_image_positions[position]
        mismatches.append({
            'sheet_name': sheet_name,
            'cell_reference': f"Image at {img_new['position']}",
            'difference_type': 'Image Added',
            'old_value': 'N/A',
            'new_value': img_new['name'],
            'details': f"New size: {img_new['size']}; Position: {img_new['position']}"
        })

    for position in removed_images:
        img_old = old_image_positions[position]
        mismatches.append({
            'sheet_name': sheet_name,
            'cell_reference': f"Image at {img_old['position']}",
            'difference_type': 'Image Removed',
            'old_value': img_old['name'],
            'new_value': 'N/A',
            'details': f"Old size: {img_old['size']}; Position: {img_old['position']}"
        })

    for position in old_positions_set & new_positions_set:
        img_old = old_image_positions[position]
        img_new = new_image_positions[position]
        if img_old['name'] != img_new['name'] or img_old['size'] != img_new['size']:
            mismatches.append({
                'sheet_name': sheet_name,
                'cell_reference': f"Image at {img_old['position']}",
                'difference_type': 'Image Details Mismatch',
                'old_value': img_old['name'],
                'new_value': img_new['name'],
                'details': f"Old size: {img_old['size']}, New size: {img_new['size']}; Position: {img_old['position']}"
            })

    return mismatches

def compare_rich_text_objects(old_rich_text, new_rich_text):
    def extract_text(rich_text):
        if hasattr(rich_text, 'text'):  # Check if 'text' attribute exists
            return rich_text.text
        elif isinstance(rich_text, str):  # Handle plain strings
            return rich_text
        return ""

    old_text = extract_text(old_rich_text)
    new_text = extract_text(new_rich_text)

    return old_text != new_text

# Function to extract and compare formatting properties
def extract_and_compare_formats(old_format, new_format):
    def compare_dicts(dict1, dict2):
        differences = []
        for key in set(dict1.keys()).union(set(dict2.keys())):
            value1 = dict1.get(key)
            value2 = dict2.get(key)
            if value1 != value2:
                differences.append((key, 'diff'))
        return differences

    differences = compare_dicts(old_format, new_format)
    return differences

def compare_values(old_value, new_value):
    # Check if both values are dates
    if isinstance(old_value, datetime.datetime) and isinstance(new_value, datetime.datetime):
        return old_value != new_value
    # For other types, compare as strings to ensure accuracy
    return str(old_value) != str(new_value)

# Function to check sheets
def sheet_checker(sheet_name, ws_new, ws_old):
    output_list = []

    # Determine the max row and column to cover all cells
    max_row = max(ws_new.max_row, ws_old.max_row)
    max_column = max(ws_new.max_column, ws_old.max_column)

    # Compare cell values and formatting
    for row in range(1, max_row + 1):
        for col in range(1, max_column + 1):
            cell_new = ws_new.cell(row=row, column=col)
            cell_old = ws_old.cell(row=row, column=col)

            old_details = get_cell_details(cell_old)
            new_details = get_cell_details(cell_new)

            value_mismatch = compare_values(old_details['value'], new_details['value'])
            rich_text_mismatch = compare_rich_text_objects(old_details['rich_text'], new_details['rich_text'])
            rich_text_type_mismatch = isinstance(old_details['rich_text'], CellRichText) != isinstance(
                new_details['rich_text'], CellRichText)
            font_mismatch = old_details['font'] != new_details['font']
            formula_mismatch = old_details['formula'] != new_details['formula']  # Compare formulas

            if value_mismatch or rich_text_mismatch or rich_text_type_mismatch or font_mismatch or formula_mismatch:
                logging.debug(f"Mismatch at {cell_old.coordinate} in sheet {sheet_name}")
                logging.debug(f"Old cell details: {old_details}")
                logging.debug(f"New cell details: {new_details}")

                differences = []

                if value_mismatch:
                    logging.debug(f"Value mismatch: old_value={old_details['value']}, new_value={new_details['value']}")
                    differences.append(
                        f"Value mismatch: old_value={old_details['value']}, new_value={new_details['value']}")

                if rich_text_mismatch or rich_text_type_mismatch:
                    logging.debug(f"Rich text differences")
                    differences.append(f"Rich text differences")

                if font_mismatch:
                    format_differences = extract_and_compare_formats(old_details['font'], new_details['font'])
                    logging.debug(f"Format differences: {format_differences}")
                    differences.append(f"Format differences")

                if formula_mismatch:
                    logging.debug(
                        f"Formula mismatch: old_formula={old_details['formula']}, new_formula={new_details['formula']}")
                    differences.append(
                        f"Formula mismatch: old_formula={old_details['formula']}, new_formula={new_details['formula']}")

                details = "; ".join(differences)

                mismatch_dict = {
                    'sheet_name': sheet_name,
                    'cell_reference': str(cell_old.coordinate),
                    'difference_type': 'Mismatch',
                    'old_value': old_details['value'],
                    'new_value': new_details['value'],
                    'details': details
                }
                output_list.append(mismatch_dict)

    # Compare images
    images_old = get_image_details(ws_old)
    images_new = get_image_details(ws_new)
    image_mismatches = compare_images(images_old, images_new, sheet_name)
    output_list.extend(image_mismatches)

    return output_list

def main(new_file, old_file):
    if md5(new_file) != md5(old_file):
        logging.info('Oh dear... your files do not match')
        logging.info(f'new MD5 Checksum: {md5(new_file)}')
        logging.info(f'old MD5 Checksum: {md5(old_file)}')
        logging.info("Check the Output Error files for details of the cells that don't match")

        try:
            wb_new = openpyxl.load_workbook(new_file, data_only=False, rich_text=True)
            wb_old = openpyxl.load_workbook(old_file, data_only=False, rich_text=True)
        except Exception as e:
            logging.error(f"Error loading workbooks: {e}")
            sys.exit(1)

        new_sheets = set(wb_new.sheetnames)
        old_sheets = set(wb_old.sheetnames)

        common_sheets = new_sheets.intersection(old_sheets)
        missing_in_new = old_sheets - new_sheets
        missing_in_old = new_sheets - old_sheets

        if missing_in_new:
            logging.warning(f"Sheets missing in new file: {missing_in_new}")
        if missing_in_old:
            logging.warning(f"Sheets missing in old file: {missing_in_old}")

        output_list = []
        with ThreadPoolExecutor() as executor:
            futures = [executor.submit(sheet_checker, sheet_name, wb_new[sheet_name], wb_old[sheet_name]) for sheet_name in common_sheets]

            for future in as_completed(futures):
                output_list.extend(future.result())

        writer_csv(output_list)
    else:
        logging.info('Congratulations... your files match!')
        logging.info(f'new MD5 Checksum: {md5(new_file)}')
        logging.info(f'old MD5 Checksum: {md5(old_file)}')

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Compare two Excel files.")
    parser.add_argument("new_file", help="Path to the new Excel file.")
    parser.add_argument("old_file", help="Path to the old Excel file.")
    parser.add_argument("--debug", action="store_true", help="Enable debug mode")

    args = parser.parse_args()
    set_debug_mode(args.debug)
    main(args.new_file, args.old_file)
Editor is loading...