Untitled
unknown
plain_text
a year ago
12 kB
7
Indexable
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import openpyxl
import csv
import hashlib
import logging
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
from openpyxl.drawing.image import Image
from openpyxl.cell.rich_text import CellRichText
import datetime
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
# Suppress debug logs from PIL and other libraries
for key in logging.Logger.manager.loggerDict.keys():
if 'PIL' in key or 'Pillow' in key or 'openpyxl' in key:
logging.getLogger(key).setLevel(logging.WARNING)
DEBUG_MODE = False
def set_debug_mode(enabled):
global DEBUG_MODE
DEBUG_MODE = enabled
if DEBUG_MODE:
logging.getLogger().setLevel(logging.DEBUG)
else:
logging.getLogger().setLevel(logging.INFO)
# Function to generate MD5 checksum for file
def md5(fname):
hash_md5 = hashlib.md5()
try:
with open(fname, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
hash_md5.update(chunk)
except FileNotFoundError:
logging.error(f"File not found: {fname}")
sys.exit(1)
return hash_md5.hexdigest()
# Function to write mismatches to CSV
def writer_csv(output_list, output_file="DCW_Compare_Mismatches.csv"):
col_labels = [
'Sheet_Name', 'Cell_Reference', 'Difference_Type', 'Old_Value', 'New_Value', 'Details'
]
try:
with open(output_file, 'w', newline='') as csvfile:
writer = csv.writer(csvfile, delimiter=',', quotechar='"')
writer.writerow(col_labels)
for item in output_list:
writer.writerow([
item['sheet_name'],
item['cell_reference'],
item['difference_type'],
item['old_value'],
item['new_value'],
item['details']
])
logging.info(f'Mismatch results have been written to {output_file} - please review')
except IOError:
logging.error(f"Could not write to file: {output_file}")
sys.exit(1)
# Function to extract cell details
def get_cell_details(cell):
cell_details = {
'value': cell.value,
'data_type': cell.data_type,
'font': cell.font.__dict__,
'border': cell.border.__dict__,
'fill': cell.fill.__dict__,
'number_format': cell.number_format,
'alignment': cell.alignment.__dict__,
'protection': cell.protection.__dict__,
'hyperlink': cell.hyperlink,
'comment': cell.comment,
'rich_text': cell.value if isinstance(cell.value, CellRichText) else None,
'formula': cell.value if cell.data_type == 'f' else None # Ensure formula is included if present
}
return cell_details
# Function to extract image details
def get_image_details(ws):
image_details = []
for image in ws._images:
img_details = {
'name': image.path if hasattr(image, 'path') else None,
'size': (image.width, image.height),
'position': (image.anchor._from.col, image.anchor._from.row)
}
image_details.append(img_details)
return image_details
def compare_images(images_old, images_new, sheet_name):
mismatches = []
old_image_positions = {(img['position'], img['size'], img['name']): img for img in images_old}
new_image_positions = {(img['position'], img['size'], img['name']): img for img in images_new}
old_positions_set = set(old_image_positions.keys())
new_positions_set = set(new_image_positions.keys())
added_images = new_positions_set - old_positions_set
removed_images = old_positions_set - new_positions_set
for position in added_images:
img_new = new_image_positions[position]
mismatches.append({
'sheet_name': sheet_name,
'cell_reference': f"Image at {img_new['position']}",
'difference_type': 'Image Added',
'old_value': 'N/A',
'new_value': img_new['name'],
'details': f"New size: {img_new['size']}; Position: {img_new['position']}"
})
for position in removed_images:
img_old = old_image_positions[position]
mismatches.append({
'sheet_name': sheet_name,
'cell_reference': f"Image at {img_old['position']}",
'difference_type': 'Image Removed',
'old_value': img_old['name'],
'new_value': 'N/A',
'details': f"Old size: {img_old['size']}; Position: {img_old['position']}"
})
for position in old_positions_set & new_positions_set:
img_old = old_image_positions[position]
img_new = new_image_positions[position]
if img_old['name'] != img_new['name'] or img_old['size'] != img_new['size']:
mismatches.append({
'sheet_name': sheet_name,
'cell_reference': f"Image at {img_old['position']}",
'difference_type': 'Image Details Mismatch',
'old_value': img_old['name'],
'new_value': img_new['name'],
'details': f"Old size: {img_old['size']}, New size: {img_new['size']}; Position: {img_old['position']}"
})
return mismatches
def compare_rich_text_objects(old_rich_text, new_rich_text):
def extract_text(rich_text):
if hasattr(rich_text, 'text'): # Check if 'text' attribute exists
return rich_text.text
elif isinstance(rich_text, str): # Handle plain strings
return rich_text
return ""
old_text = extract_text(old_rich_text)
new_text = extract_text(new_rich_text)
return old_text != new_text
# Function to extract and compare formatting properties
def extract_and_compare_formats(old_format, new_format):
def compare_dicts(dict1, dict2):
differences = []
for key in set(dict1.keys()).union(set(dict2.keys())):
value1 = dict1.get(key)
value2 = dict2.get(key)
if value1 != value2:
differences.append((key, 'diff'))
return differences
differences = compare_dicts(old_format, new_format)
return differences
def compare_values(old_value, new_value):
# Check if both values are dates
if isinstance(old_value, datetime.datetime) and isinstance(new_value, datetime.datetime):
return old_value != new_value
# For other types, compare as strings to ensure accuracy
return str(old_value) != str(new_value)
# Function to check sheets
def sheet_checker(sheet_name, ws_new, ws_old):
output_list = []
# Determine the max row and column to cover all cells
max_row = max(ws_new.max_row, ws_old.max_row)
max_column = max(ws_new.max_column, ws_old.max_column)
# Compare cell values and formatting
for row in range(1, max_row + 1):
for col in range(1, max_column + 1):
cell_new = ws_new.cell(row=row, column=col)
cell_old = ws_old.cell(row=row, column=col)
old_details = get_cell_details(cell_old)
new_details = get_cell_details(cell_new)
value_mismatch = compare_values(old_details['value'], new_details['value'])
rich_text_mismatch = compare_rich_text_objects(old_details['rich_text'], new_details['rich_text'])
rich_text_type_mismatch = isinstance(old_details['rich_text'], CellRichText) != isinstance(
new_details['rich_text'], CellRichText)
font_mismatch = old_details['font'] != new_details['font']
formula_mismatch = old_details['formula'] != new_details['formula'] # Compare formulas
if value_mismatch or rich_text_mismatch or rich_text_type_mismatch or font_mismatch or formula_mismatch:
logging.debug(f"Mismatch at {cell_old.coordinate} in sheet {sheet_name}")
logging.debug(f"Old cell details: {old_details}")
logging.debug(f"New cell details: {new_details}")
differences = []
if value_mismatch:
logging.debug(f"Value mismatch: old_value={old_details['value']}, new_value={new_details['value']}")
differences.append(
f"Value mismatch: old_value={old_details['value']}, new_value={new_details['value']}")
if rich_text_mismatch or rich_text_type_mismatch:
logging.debug(f"Rich text differences")
differences.append(f"Rich text differences")
if font_mismatch:
format_differences = extract_and_compare_formats(old_details['font'], new_details['font'])
logging.debug(f"Format differences: {format_differences}")
differences.append(f"Format differences")
if formula_mismatch:
logging.debug(
f"Formula mismatch: old_formula={old_details['formula']}, new_formula={new_details['formula']}")
differences.append(
f"Formula mismatch: old_formula={old_details['formula']}, new_formula={new_details['formula']}")
details = "; ".join(differences)
mismatch_dict = {
'sheet_name': sheet_name,
'cell_reference': str(cell_old.coordinate),
'difference_type': 'Mismatch',
'old_value': old_details['value'],
'new_value': new_details['value'],
'details': details
}
output_list.append(mismatch_dict)
# Compare images
images_old = get_image_details(ws_old)
images_new = get_image_details(ws_new)
image_mismatches = compare_images(images_old, images_new, sheet_name)
output_list.extend(image_mismatches)
return output_list
def main(new_file, old_file):
if md5(new_file) != md5(old_file):
logging.info('Oh dear... your files do not match')
logging.info(f'new MD5 Checksum: {md5(new_file)}')
logging.info(f'old MD5 Checksum: {md5(old_file)}')
logging.info("Check the Output Error files for details of the cells that don't match")
try:
wb_new = openpyxl.load_workbook(new_file, data_only=False, rich_text=True)
wb_old = openpyxl.load_workbook(old_file, data_only=False, rich_text=True)
except Exception as e:
logging.error(f"Error loading workbooks: {e}")
sys.exit(1)
new_sheets = set(wb_new.sheetnames)
old_sheets = set(wb_old.sheetnames)
common_sheets = new_sheets.intersection(old_sheets)
missing_in_new = old_sheets - new_sheets
missing_in_old = new_sheets - old_sheets
if missing_in_new:
logging.warning(f"Sheets missing in new file: {missing_in_new}")
if missing_in_old:
logging.warning(f"Sheets missing in old file: {missing_in_old}")
output_list = []
with ThreadPoolExecutor() as executor:
futures = [executor.submit(sheet_checker, sheet_name, wb_new[sheet_name], wb_old[sheet_name]) for sheet_name in common_sheets]
for future in as_completed(futures):
output_list.extend(future.result())
writer_csv(output_list)
else:
logging.info('Congratulations... your files match!')
logging.info(f'new MD5 Checksum: {md5(new_file)}')
logging.info(f'old MD5 Checksum: {md5(old_file)}')
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="Compare two Excel files.")
parser.add_argument("new_file", help="Path to the new Excel file.")
parser.add_argument("old_file", help="Path to the old Excel file.")
parser.add_argument("--debug", action="store_true", help="Enable debug mode")
args = parser.parse_args()
set_debug_mode(args.debug)
main(args.new_file, args.old_file)
Editor is loading...
Leave a Comment