Untitled

 avatar
unknown
plain_text
14 days ago
1.7 kB
3
Indexable
import csv
import re
from collections import OrderedDict

# Loose "local@domain.tld" shape check; used only to locate the email column.
_EMAIL_RE = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')


def _find_email_column(rows):
    """Return the index of the first field that looks like an email, or None.

    Rows are scanned in order, so a blank or malformed leading record does
    not prevent the column from being detected from a later record.
    """
    for row in rows:
        for index, value in enumerate(row):
            if _EMAIL_RE.match(value.strip()):
                return index
    return None


def remove_duplicates(input_file='522914.txt', output_file='522914_cleaned.txt'):
    """Copy a CSV file to ``output_file`` with duplicate records removed.

    The first line of ``input_file`` is treated as a header and is written
    through unchanged. Two records are duplicates when their first column,
    second column and email column are equal after stripping surrounding
    whitespace and lower-casing. The first occurrence of each record is kept,
    in its original order and with its original text.

    The email column is the position of the first field, in the first data
    row containing one, that matches a simple email pattern.

    Rows that lack any of the three key columns (including blank lines) are
    dropped and counted as skipped. The number of duplicates removed and rows
    skipped is printed to standard output.

    Args:
        input_file: Path of the UTF-8 CSV file to read.
        output_file: Path of the UTF-8 CSV file to write.

    Raises:
        ValueError: If the input file is completely empty, or if it has data
            rows but none of them contains an email-like field. In both cases
            nothing is written.
    """
    with open(input_file, 'r', newline='', encoding='utf-8') as infile:
        reader = csv.reader(infile)
        # A default avoids leaking a bare StopIteration on an empty file.
        header = next(reader, None)
        if header is None:
            raise ValueError("Input file is empty.")
        rows = list(reader)

    rows_dict = OrderedDict()
    duplicate_count = 0
    skipped_count = 0

    # A header-only file has nothing to de-duplicate; it is passed through.
    if rows:
        email_column = _find_email_column(rows)
        if email_column is None:
            raise ValueError("No email column found in the input file.")

        for row in rows:
            try:
                identifier = (
                    row[0].strip().lower(),
                    row[1].strip().lower(),
                    row[email_column].strip().lower(),
                )
            except IndexError:
                # Row is too short to contain every key column.
                skipped_count += 1
                continue

            if identifier in rows_dict:
                duplicate_count += 1
            else:
                rows_dict[identifier] = row

    with open(output_file, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(header)
        writer.writerows(rows_dict.values())

    print(f"Duplicates removed: {duplicate_count}")
    print(f"Rows skipped due to missing fields: {skipped_count}")

# Run the de-duplication with the default input/output paths. This call is at
# module level, so it executes whenever the file is run or imported.
remove_duplicates()
Editor is loading...
Leave a Comment