import csv
import re
import os
from collections import OrderedDict

# Configuration
TABLE_NAME = "users"  # Change to your table name
LINES_PER_FILE = 100000


def remove_duplicates(input_file, output_file):
    """Remove duplicates and generate SQL INSERT statements."""
    email_regex = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')
    rows_dict = OrderedDict()
    duplicate_count = 0
    skipped_count = 0

    try:
        with open(input_file, 'r', newline='', encoding='utf-8') as infile:
            reader = csv.reader(infile)

            # Read header
            try:
                header = next(reader)
            except StopIteration:
                print(f"Empty file: {input_file}")
                return

            # Find email column from first data row
            try:
                first_row = next(reader)
            except StopIteration:
                print(f"No data rows in {input_file}")
                return

            email_column = None
            for i, value in enumerate(first_row):
                if email_regex.match(value.strip()):
                    email_column = i
                    break

            if email_column is None:
                print(f"No email column found in {input_file}")
                return

            # Process rows
            rows = [first_row] + list(reader)
            for row in rows:
                try:
                    identifier = (
                        row[0].strip().lower(),
                        row[1].strip().lower(),
                        row[email_column].strip().lower()
                    )
                    if identifier not in rows_dict:
                        rows_dict[identifier] = row
                    else:
                        duplicate_count += 1
                except (IndexError, AttributeError):
                    skipped_count += 1
                    continue

        # Write SQL output
        with open(output_file, 'w', encoding='utf-8') as outfile:
            columns = ", ".join(header)
            for row in rows_dict.values():
                values = []
                for value in row:
                    escaped_value = value.strip().replace("'", "''")
                    values.append(f"'{escaped_value}'")
                values_str = ", ".join(values)
                outfile.write(
                    f"INSERT INTO {TABLE_NAME} ({columns}) VALUES ({values_str});\n"
                )

        print(f"Processed {input_file}: "
              f"{duplicate_count} duplicates, {skipped_count} skipped")

    except Exception as e:
        print(f"Error processing {input_file}: {str(e)}")


def split_and_clean(input_file):
    """Split large file into SQL chunks and deduplicate."""
    file_counter = 1
    buffer = []

    with open(input_file, 'r', newline='', encoding='utf-8') as infile:
        reader = csv.reader(infile)
        try:
            header = next(reader)
        except StopIteration:
            print("Empty input file")
            return

        buffer.append(header)
        for line_number, row in enumerate(reader, 1):
            buffer.append(row)
            if line_number % LINES_PER_FILE == 0:
                process_chunk(buffer, header, file_counter)
                file_counter += 1
                buffer = [header]

    # Process final chunk
    if len(buffer) > 1:
        process_chunk(buffer, header, file_counter)


def process_chunk(buffer, header, file_counter):
    """Handle temporary chunk processing."""
    temp_input = f"temp_{file_counter:04d}.csv"
    output_file = f"output_{file_counter:04d}.sql"
    try:
        # Write temporary CSV
        with open(temp_input, 'w', newline='', encoding='utf-8') as temp_file:
            writer = csv.writer(temp_file)
            writer.writerows(buffer)

        # Deduplicate and create SQL
        remove_duplicates(temp_input, output_file)
    finally:
        # Cleanup temporary file
        if os.path.exists(temp_input):
            os.remove(temp_input)


if __name__ == '__main__':
    split_and_clean("your_large_file.txt")  # Change to your input file
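
For reference, a minimal usage sketch. It assumes the script above is saved as dedupe_to_sql.py; that module name and the sample file names are assumptions for illustration, not part of the original script:

# demo.py -- a minimal sketch; assumes the script above was saved as
# dedupe_to_sql.py (hypothetical module name).
import csv

from dedupe_to_sql import split_and_clean

# Build a tiny sample file with one duplicate. The dedup key is the
# first two columns plus the auto-detected email column, all lowercased.
rows = [
    ["first_name", "last_name", "email"],
    ["Ada", "Lovelace", "ada@example.com"],
    ["ada", "LOVELACE", "ADA@example.com"],  # duplicate of the row above
    ["Alan", "Turing", "alan@example.com"],
]
with open("sample.csv", "w", newline="", encoding="utf-8") as f:
    csv.writer(f).writerows(rows)

split_and_clean("sample.csv")
# Writes output_0001.sql containing two statements:
#   INSERT INTO users (first_name, last_name, email) VALUES ('Ada', 'Lovelace', 'ada@example.com');
#   INSERT INTO users (first_name, last_name, email) VALUES ('Alan', 'Turing', 'alan@example.com');

Note that deduplication happens per chunk: each LINES_PER_FILE-row slice is deduplicated independently, so a duplicate whose copies land in different chunks will survive into the generated SQL.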