Untitled

 avatar
unknown
plain_text
6 days ago
4.3 kB
3
Indexable
import csv
import itertools
import os
import re
from collections import OrderedDict

# Configuration
TABLE_NAME = "users"  # Change to your table name
LINES_PER_FILE = 100000

def remove_duplicates(input_file, output_file, table_name=None):
    """Deduplicate CSV rows and write one SQL INSERT per unique row.

    Two rows are duplicates when (column 0, column 1, email column) match
    case-insensitively, after stripping whitespace. The email column is
    auto-detected by matching the first data row against an email regex.
    The first surviving occurrence of each key wins (insertion order is
    preserved via OrderedDict).

    Args:
        input_file: Path to the source CSV; first row is the header.
        output_file: Path of the .sql file to create (overwritten).
        table_name: Target SQL table; defaults to the module-level
            TABLE_NAME when None (backward compatible).

    Prints diagnostics instead of raising: this is a best-effort batch
    step, and a failed chunk should not abort the whole run.
    """
    if table_name is None:
        table_name = TABLE_NAME
    email_regex = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')
    rows_dict = OrderedDict()
    duplicate_count = 0
    skipped_count = 0

    try:
        with open(input_file, 'r', newline='', encoding='utf-8') as infile:
            reader = csv.reader(infile)

            # Read header
            try:
                header = next(reader)
            except StopIteration:
                print(f"Empty file: {input_file}")
                return

            # Peek at the first data row to locate the email column.
            try:
                first_row = next(reader)
            except StopIteration:
                print(f"No data rows in {input_file}")
                return

            email_column = None
            for i, value in enumerate(first_row):
                if email_regex.match(value.strip()):
                    email_column = i
                    break

            if email_column is None:
                print(f"No email column found in {input_file}")
                return

            # Stream the remaining rows instead of materializing the whole
            # file in a list — the caller feeds us chunks of large files
            # precisely to keep memory bounded.
            for row in itertools.chain([first_row], reader):
                # A row whose width differs from the header would emit a
                # malformed INSERT (column/value count mismatch): skip it.
                if len(row) != len(header):
                    skipped_count += 1
                    continue
                try:
                    identifier = (
                        row[0].strip().lower(),
                        row[1].strip().lower(),
                        row[email_column].strip().lower(),
                    )
                except (IndexError, AttributeError):
                    skipped_count += 1
                    continue
                if identifier not in rows_dict:
                    rows_dict[identifier] = row
                else:
                    duplicate_count += 1

        # Write SQL output
        with open(output_file, 'w', encoding='utf-8') as outfile:
            # Strip header cells so stray whitespace in the CSV header
            # cannot corrupt the column list.
            columns = ", ".join(col.strip() for col in header)
            for row in rows_dict.values():
                # Escape embedded single quotes per SQL convention ('' ).
                values_str = ", ".join(
                    "'" + value.strip().replace("'", "''") + "'"
                    for value in row
                )
                outfile.write(
                    f"INSERT INTO {table_name} ({columns}) VALUES ({values_str});\n"
                )

        print(f"Processed {input_file}: "
              f"{duplicate_count} duplicates, {skipped_count} skipped")

    except Exception as e:
        # Top-level boundary: report and let the caller continue with
        # the remaining chunks.
        print(f"Error processing {input_file}: {str(e)}")

def split_and_clean(input_file):
    """Split large file into SQL chunks and deduplicate."""
    chunk_index = 1
    pending = []

    with open(input_file, 'r', newline='', encoding='utf-8') as source:
        rows = csv.reader(source)

        # The header must exist; otherwise there is nothing to do.
        try:
            header = next(rows)
        except StopIteration:
            print("Empty input file")
            return

        pending.append(header)

        # Accumulate data rows; every LINES_PER_FILE rows, hand the
        # buffer (header included) off for dedup + SQL generation.
        for row_count, record in enumerate(rows, 1):
            pending.append(record)
            if row_count % LINES_PER_FILE == 0:
                process_chunk(pending, header, chunk_index)
                chunk_index += 1
                pending = [header]

        # Flush whatever remains after the last full chunk (skip if the
        # buffer holds only the header).
        if len(pending) > 1:
            process_chunk(pending, header, chunk_index)

def process_chunk(buffer, header, file_counter):
    """Handle temporary chunk processing.

    Writes *buffer* to a numbered temp CSV, runs the dedup/SQL step on
    it, and always removes the temp file afterwards. (*header* is kept
    for interface compatibility; the buffer already contains it.)
    """
    temp_input = f"temp_{file_counter:04d}.csv"
    output_file = f"output_{file_counter:04d}.sql"

    try:
        # Persist the chunk so the dedup step can re-read it as CSV.
        with open(temp_input, 'w', newline='', encoding='utf-8') as temp_file:
            csv.writer(temp_file).writerows(buffer)

        remove_duplicates(temp_input, output_file)
    finally:
        # Never leave temp files behind, even on failure.
        if os.path.exists(temp_input):
            os.remove(temp_input)

if __name__ == '__main__':
    # Entry point: splits the input into LINES_PER_FILE-row chunks and
    # emits one deduplicated .sql file per chunk. Edit the filename and
    # TABLE_NAME at the top of the file before running.
    split_and_clean("your_large_file.txt")  # Change to your input file
Editor is loading...
Leave a Comment