import csv
import re
import os
from collections import OrderedDict

# Configuration
TABLE_NAME = "users"        # Change to your table name
LINES_PER_FILE = 100000     # Data rows per output chunk


def remove_duplicates(input_file, output_file):
    """Remove duplicate rows and generate SQL INSERT statements."""
    email_regex = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')
    rows_dict = OrderedDict()
    duplicate_count = 0
    skipped_count = 0

    try:
        with open(input_file, 'r', newline='', encoding='utf-8') as infile:
            reader = csv.reader(infile)

            # Read header
            try:
                header = next(reader)
            except StopIteration:
                print(f"Empty file: {input_file}")
                return

            # Detect the email column by matching the first data row
            try:
                first_row = next(reader)
            except StopIteration:
                print(f"No data rows in {input_file}")
                return

            email_column = None
            for i, value in enumerate(first_row):
                if email_regex.match(value.strip()):
                    email_column = i
                    break

            if email_column is None:
                print(f"No email column found in {input_file}")
                return

            # Deduplicate on (first column, second column, email), case-insensitively
            rows = [first_row] + list(reader)
            for row in rows:
                try:
                    identifier = (
                        row[0].strip().lower(),
                        row[1].strip().lower(),
                        row[email_column].strip().lower()
                    )
                    if identifier not in rows_dict:
                        rows_dict[identifier] = row
                    else:
                        duplicate_count += 1
                except IndexError:
                    # Row is too short to carry all identifier columns
                    skipped_count += 1
                    continue

        # Write SQL output
        with open(output_file, 'w', encoding='utf-8') as outfile:
            columns = ", ".join(col.strip() for col in header)
            for row in rows_dict.values():
                values = []
                for value in row:
                    # Double single quotes for a standard SQL string literal
                    escaped_value = value.strip().replace("'", "''")
                    values.append(f"'{escaped_value}'")
                values_str = ", ".join(values)
                outfile.write(
                    f"INSERT INTO {TABLE_NAME} ({columns}) VALUES ({values_str});\n"
                )

        print(f"Processed {input_file}: "
              f"{duplicate_count} duplicates, {skipped_count} skipped")
    except Exception as e:
        print(f"Error processing {input_file}: {e}")


def split_and_clean(input_file):
    """Split a large CSV into chunks and deduplicate each into a SQL file."""
    file_counter = 1
    buffer = []

    with open(input_file, 'r', newline='', encoding='utf-8') as infile:
        reader = csv.reader(infile)
        try:
            header = next(reader)
        except StopIteration:
            print("Empty input file")
            return

        buffer.append(header)
        for line_number, row in enumerate(reader, 1):
            buffer.append(row)
            if line_number % LINES_PER_FILE == 0:
                process_chunk(buffer, header, file_counter)
                file_counter += 1
                buffer = [header]

    # Process the final, partially filled chunk (buffer holds only the
    # header when the row count is an exact multiple of LINES_PER_FILE)
    if len(buffer) > 1:
        process_chunk(buffer, header, file_counter)
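

# Caveat: remove_duplicates() runs once per chunk, so a row repeated across
# two different chunks is kept twice. A minimal sketch of a cross-chunk
# filter under that assumption; iter_unique_rows is a hypothetical helper,
# not part of the original script:
def iter_unique_rows(rows, seen):
    """Yield only rows whose normalized tuple has not appeared before."""
    for row in rows:
        key = tuple(cell.strip().lower() for cell in row)
        if key not in seen:
            seen.add(key)
            yield row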


def process_chunk(buffer, header, file_counter):
    """Write a chunk to a temporary CSV, deduplicate it, and clean up."""
    # `header` is unused here: buffer already carries it as its first row
    temp_input = f"temp_{file_counter:04d}.csv"
    output_file = f"output_{file_counter:04d}.sql"
    try:
        # Write temporary CSV
        with open(temp_input, 'w', newline='', encoding='utf-8') as temp_file:
            writer = csv.writer(temp_file)
            writer.writerows(buffer)

        # Deduplicate and create SQL
        remove_duplicates(temp_input, output_file)
    finally:
        # Clean up the temporary file
        if os.path.exists(temp_input):
            os.remove(temp_input)
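

# Alternative sketch (an assumption, not the original approach): the stdlib
# tempfile module avoids name collisions when several runs share a working
# directory; process_chunk_tempfile is a hypothetical variant.
def process_chunk_tempfile(buffer, file_counter):
    """Like process_chunk, but with a collision-safe temporary file."""
    import tempfile  # stdlib; imported here to keep the sketch self-contained
    output_file = f"output_{file_counter:04d}.sql"
    with tempfile.NamedTemporaryFile('w', newline='', suffix='.csv',
                                     delete=False, encoding='utf-8') as temp_file:
        csv.writer(temp_file).writerows(buffer)
        temp_input = temp_file.name
    try:
        remove_duplicates(temp_input, output_file)
    finally:
        os.remove(temp_input)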


if __name__ == '__main__':
    split_and_clean("your_large_file.txt")  # Change to your input file