Untitled
unknown
plain_text
14 days ago
1.7 kB
3
Indexable
import csv
import re
from collections import OrderedDict


def _find_email_column(rows, email_regex):
    """Return the index of the first cell matching *email_regex*, or None.

    Rows are scanned in order, so a first record with a blank or malformed
    email does not prevent detection from a later record.
    """
    for row in rows:
        for index, value in enumerate(row):
            if email_regex.match(value.strip()):
                return index
    return None


def remove_duplicates(input_file='522914.txt', output_file='522914_cleaned.txt'):
    """Copy a CSV file to *output_file*, dropping duplicate records.

    The first line of *input_file* is treated as a header and is written
    through unchanged. Two data rows are duplicates when their first column,
    second column and email column are equal after stripping surrounding
    whitespace and lower-casing. The first occurrence of each record is kept
    and the original row order is preserved.

    The email column is not configured; it is detected as the first cell in
    the data rows whose content looks like an email address.

    Args:
        input_file: Path of the UTF-8 CSV file to read.
        output_file: Path of the UTF-8 CSV file to write (overwritten).

    Returns:
        A ``(duplicate_count, skipped_count)`` tuple: the number of rows
        dropped as duplicates and the number of rows dropped because they
        were too short to build the comparison key (blank lines included).

    Raises:
        ValueError: If the input file is completely empty, or if it has data
            rows but none of them contains an email-like value.
    """
    email_regex = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')

    with open(input_file, 'r', newline='', encoding='utf-8') as infile:
        reader = csv.reader(infile)
        # A default avoids leaking a bare StopIteration on an empty file.
        header = next(reader, None)
        if header is None:
            raise ValueError("Input file is empty.")
        rows = list(reader)

    # A header-only file has nothing to deduplicate, so the email column is
    # only required when there are data rows to compare.
    email_column = None
    if rows:
        email_column = _find_email_column(rows, email_regex)
        if email_column is None:
            raise ValueError("No email column found in the input file.")

    unique_rows = OrderedDict()
    duplicate_count = 0
    skipped_count = 0

    for row in rows:
        try:
            identifier = (
                row[0].strip().lower(),
                row[1].strip().lower(),
                row[email_column].strip().lower(),
            )
        except IndexError:
            # Skip rows with missing fields
            skipped_count += 1
            continue

        if identifier in unique_rows:
            duplicate_count += 1
        else:
            unique_rows[identifier] = row

    with open(output_file, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(header)
        writer.writerows(unique_rows.values())

    print(f"Duplicates removed: {duplicate_count}")
    print(f"Rows skipped due to missing fields: {skipped_count}")
    return duplicate_count, skipped_count


# Only run with the default paths when executed as a script, not on import.
if __name__ == "__main__":
    remove_duplicates()
Editor is loading...
Leave a Comment