Untitled
unknown
plain_text
a year ago
13 kB
5
Indexable
#!/usr/bin/env python # -*- coding: utf-8 -*- import openpyxl import csv import hashlib import logging import sys from concurrent.futures import ThreadPoolExecutor, as_completed from openpyxl.drawing.image import Image from openpyxl.cell.rich_text import CellRichText import datetime # Configure logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s') # Suppress debug logs from PIL and other libraries for key in logging.Logger.manager.loggerDict.keys(): if 'PIL' in key or 'Pillow' in key or 'openpyxl' in key: logging.getLogger(key).setLevel(logging.WARNING) DEBUG_MODE = False def set_debug_mode(enabled): global DEBUG_MODE DEBUG_MODE = enabled if DEBUG_MODE: logging.getLogger().setLevel(logging.DEBUG) else: logging.getLogger().setLevel(logging.INFO) # Function to generate MD5 checksum for file def md5(fname): hash_md5 = hashlib.md5() try: with open(fname, "rb") as f: for chunk in iter(lambda: f.read(4096), b""): hash_md5.update(chunk) except FileNotFoundError: logging.error(f"File not found: {fname}") sys.exit(1) return hash_md5.hexdigest() # Function to write mismatches to CSV def writer_csv(output_list, output_file="DCW_Compare_Mismatches.csv"): col_labels = [ 'Sheet_Name', 'Cell_Reference', 'Difference_Type', 'Old_Value', 'New_Value', 'Details' ] try: with open(output_file, 'w', newline='') as csvfile: writer = csv.writer(csvfile, delimiter=',', quotechar='"') writer.writerow(col_labels) for item in output_list: writer.writerow([ item['sheet_name'], item['cell_reference'], item['difference_type'], item['old_value'], item['new_value'], item['details'] ]) logging.info(f'Mismatch results have been written to {output_file} - please review') except IOError: logging.error(f"Could not write to file: {output_file}") sys.exit(1) # Function to extract cell details def get_cell_details(cell): cell_details = { 'value': cell.value, 'data_type': cell.data_type, 'font': cell.font.__dict__, 'border': cell.border.__dict__, 'fill': cell.fill.__dict__, 'number_format': cell.number_format, 'alignment': cell.alignment.__dict__, 'protection': cell.protection.__dict__, 'hyperlink': cell.hyperlink, 'comment': cell.comment, 'rich_text': cell.value if isinstance(cell.value, CellRichText) else None, 'formula': cell.value if cell.data_type == 'f' else None # Ensure formula is included if present } return cell_details # Function to extract image details def get_image_details(sheet): images = [] for drawing in sheet._images: if isinstance(drawing, Image): images.append({ 'name': drawing.path if hasattr(drawing, 'path') else None, 'size': (drawing.width, drawing.height), 'position': (drawing.anchor._from.col, drawing.anchor._from.row) }) return images def compare_images(images_old, images_new, sheet_name): mismatches = [] old_image_names_positions = {(img['name'], img['position']) for img in images_old} new_image_names_positions = {(img['name'], img['position']) for img in images_new} added_images = new_image_names_positions - old_image_names_positions removed_images = old_image_names_positions - new_image_names_positions for img_name, img_position in added_images: img_new = next(img for img in images_new if img['name'] == img_name and img['position'] == img_position) mismatches.append({ 'sheet_name': sheet_name, 'cell_reference': f"Image at {img_new['position']}", 'difference_type': 'Image Added', 'old_value': 'N/A', 'new_value': img_new['name'], 'details': f"New size: {img_new['size']}; Position: {img_new['position']}" }) for img_name, img_position in removed_images: img_old = next(img for img in images_old if img['name'] == img_name and img['position'] == img_position) mismatches.append({ 'sheet_name': sheet_name, 'cell_reference': f"Image at {img_old['position']}", 'difference_type': 'Image Removed', 'old_value': img_old['name'], 'new_value': 'N/A', 'details': f"Old size: {img_old['size']}; Position: {img_old['position']}" }) for img_name, img_position in old_image_names_positions & new_image_names_positions: img_old = next(img for img in images_old if img['name'] == img_name and img['position'] == img_position) img_new = next(img for img in images_new if img['name'] == img_name and img['position'] == img_position) if img_old['position'] != img_new['position']: mismatches.append({ 'sheet_name': sheet_name, 'cell_reference': f"Image at {img_old['position']} to {img_new['position']}", 'difference_type': 'Image Position Changed', 'old_value': img_old['position'], 'new_value': img_new['position'], 'details': f"Image: {img_old['name']}; Old position: {img_old['position']}; New position: {img_new['position']}" }) if img_old['size'] != img_new['size']: mismatches.append({ 'sheet_name': sheet_name, 'cell_reference': f"Image at {img_old['position']}", 'difference_type': 'Image Size Changed', 'old_value': img_old['size'], 'new_value': img_new['size'], 'details': f"Image: {img_old['name']}; Old size: {img_old['size']}; New size: {img_new['size']}" }) return mismatches def compare_rich_text_objects(old_rich_text, new_rich_text): def extract_text(rich_text): if hasattr(rich_text, 'text'): # Check if 'text' attribute exists return rich_text.text elif isinstance(rich_text, str): # Handle plain strings return rich_text return "" old_text = extract_text(old_rich_text) new_text = extract_text(new_rich_text) return old_text != new_text # Function to extract and compare formatting properties def extract_and_compare_formats(old_format, new_format): def compare_dicts(dict1, dict2): differences = [] for key in set(dict1.keys()).union(set(dict2.keys())): value1 = dict1.get(key) value2 = dict2.get(key) if value1 != value2: differences.append((key, 'diff')) return differences differences = compare_dicts(old_format, new_format) return differences def compare_values(old_value, new_value): # Check if both values are dates if isinstance(old_value, datetime.datetime) and isinstance(new_value, datetime.datetime): return old_value != new_value # For other types, compare as strings to ensure accuracy return str(old_value) != str(new_value) # Function to check sheets # Function to check sheets def sheet_checker(sheet_name, ws_new, ws_old): output_list = [] # Determine the max row and column to cover all cells max_row = max(ws_new.max_row, ws_old.max_row) max_column = max(ws_new.max_column, ws_old.max_column) # Compare cell values and formatting for row in range(1, max_row + 1): for col in range(1, max_column + 1): cell_new = ws_new.cell(row=row, column=col) cell_old = ws_old.cell(row=row, column=col) old_details = get_cell_details(cell_old) new_details = get_cell_details(cell_new) value_mismatch = compare_values(old_details['value'], new_details['value']) rich_text_mismatch = compare_rich_text_objects(old_details['rich_text'], new_details['rich_text']) rich_text_type_mismatch = isinstance(old_details['rich_text'], CellRichText) != isinstance( new_details['rich_text'], CellRichText) font_mismatch = old_details['font'] != new_details['font'] formula_mismatch = old_details['formula'] != new_details['formula'] # Compare formulas if value_mismatch or rich_text_mismatch or rich_text_type_mismatch or font_mismatch or formula_mismatch: logging.debug(f"Mismatch at {cell_old.coordinate} in sheet {sheet_name}") logging.debug(f"Old cell details: {old_details}") logging.debug(f"New cell details: {new_details}") differences = [] if value_mismatch: logging.debug(f"Value mismatch: old_value={old_details['value']}, new_value={new_details['value']}") differences.append( f"Value mismatch: old_value={old_details['value']}, new_value={new_details['value']}") if rich_text_mismatch or rich_text_type_mismatch: logging.debug(f"Rich text differences") differences.append(f"Rich text differences") if font_mismatch: format_differences = extract_and_compare_formats(old_details['font'], new_details['font']) logging.debug(f"Format differences: {format_differences}") differences.append(f"Format differences") if formula_mismatch: logging.debug( f"Formula mismatch: old_formula={old_details['formula']}, new_formula={new_details['formula']}") differences.append( f"Formula mismatch: old_formula={old_details['formula']}, new_formula={new_details['formula']}") details = "; ".join(differences) mismatch_dict = { 'sheet_name': sheet_name, 'cell_reference': str(cell_old.coordinate), 'difference_type': 'Mismatch', 'old_value': old_details['value'], 'new_value': new_details['value'], 'details': details } output_list.append(mismatch_dict) # Compare images images_old = get_image_details(ws_old) images_new = get_image_details(ws_new) logging.info(f"Figures Check in sheet {sheet_name} of old file: {images_old}") logging.info(f"Figures Check in sheet {sheet_name} of new file: {images_new}") image_mismatches = compare_images(images_old, images_new, sheet_name) output_list.extend(image_mismatches) return output_list def main(new_file, old_file): if md5(new_file) != md5(old_file): logging.info('Oh dear... your files do not match') logging.info(f'new MD5 Checksum: {md5(new_file)}') logging.info(f'old MD5 Checksum: {md5(old_file)}') logging.info("Check the Output Error files for details of the cells that don't match") try: wb_new = openpyxl.load_workbook(new_file, data_only=False, rich_text=True) wb_old = openpyxl.load_workbook(old_file, data_only=False, rich_text=True) except Exception as e: logging.error(f"Error loading workbooks: {e}") sys.exit(1) new_sheets = set(wb_new.sheetnames) old_sheets = set(wb_old.sheetnames) common_sheets = new_sheets.intersection(old_sheets) missing_in_new = old_sheets - new_sheets missing_in_old = new_sheets - old_sheets if missing_in_new: logging.warning(f"Sheets missing in new file: {missing_in_new}") if missing_in_old: logging.warning(f"Sheets missing in old file: {missing_in_old}") output_list = [] with ThreadPoolExecutor() as executor: futures = [executor.submit(sheet_checker, sheet_name, wb_new[sheet_name], wb_old[sheet_name]) for sheet_name in common_sheets] for future in as_completed(futures): output_list.extend(future.result()) writer_csv(output_list) else: logging.info('Congratulations... your files match!') logging.info(f'new MD5 Checksum: {md5(new_file)}') logging.info(f'old MD5 Checksum: {md5(old_file)}') if __name__ == "__main__": import argparse parser = argparse.ArgumentParser(description="Compare two Excel files.") parser.add_argument("new_file", help="Path to the new Excel file.") parser.add_argument("old_file", help="Path to the old Excel file.") parser.add_argument("--debug", action="store_true", help="Enable debug mode") args = parser.parse_args() set_debug_mode(args.debug) main(args.new_file, args.old_file)
Editor is loading...
Leave a Comment