Untitled
unknown
plain_text
8 months ago
1.7 kB
5
Indexable
import csv
import re
from collections import OrderedDict
def remove_duplicates(input_file='522914.txt', output_file='522914_cleaned.txt'):
    """Copy a CSV file to ``output_file`` with duplicate data rows removed.

    The first line of ``input_file`` is treated as the header and is written
    through unchanged. Two data rows are duplicates when their first column,
    second column and email column are equal after stripping surrounding
    whitespace and lower-casing; the first occurrence is kept and input
    order is preserved. Rows too short to supply all three key fields are
    left out of the output and counted as skipped. Both counts are printed
    to stdout.

    The email column is the index of the first cell that matches the email
    pattern, taken from the first data row that contains such a cell.

    Args:
        input_file: Path of the UTF-8 CSV file to read.
        output_file: Path of the UTF-8 CSV file to write (overwritten).

    Raises:
        ValueError: If the input file is empty, or if it has data rows but
            none of them contains an email-like value.
    """
    email_regex = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')

    with open(input_file, 'r', newline='', encoding='utf-8') as infile:
        reader = csv.reader(infile)
        # next() with a default avoids leaking StopIteration on an empty file.
        header = next(reader, None)
        if header is None:
            raise ValueError("Input file is empty.")
        rows = list(reader)

    # Locate the email column. Scanning past rows without an email keeps a
    # blank line or a missing address in the first data row from aborting.
    email_column = None
    for row in rows:
        email_column = next(
            (i for i, value in enumerate(row) if email_regex.match(value.strip())),
            None,
        )
        if email_column is not None:
            break
    # A header-only file has nothing to de-duplicate, so it is not an error.
    if rows and email_column is None:
        raise ValueError("No email column found in the input file.")

    rows_dict = OrderedDict()
    duplicate_count = 0
    skipped_count = 0
    for row in rows:
        try:
            identifier = (
                row[0].strip().lower(),
                row[1].strip().lower(),
                row[email_column].strip().lower(),
            )
        except IndexError:
            # Skip rows with missing fields
            skipped_count += 1
            continue
        if identifier in rows_dict:
            duplicate_count += 1
        else:
            rows_dict[identifier] = row

    with open(output_file, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(header)
        writer.writerows(rows_dict.values())

    print(f"Duplicates removed: {duplicate_count}")
    print(f"Rows skipped due to missing fields: {skipped_count}")
# Run the de-duplication with the default file names only when this file is
# executed as a script, so that importing it does not trigger any file I/O.
if __name__ == "__main__":
    remove_duplicates()
Editor is loading...
Leave a Comment