# Untitled
# yeiad
# plain_text
# 10 months ago
# 2.0 kB
# 6
# Indexable
import csv
import os
# Source CSV (read below; must carry a 'video_id' column) and the CSV that
# receives the cleaned IDs.
input_file = "unique_video_ids.csv"
output_file = "clean_video_ids.csv"

# Accumulator for cleaned IDs. It is a set, so repeated IDs collapse to one.
clean_video_ids = set()
# Announce which file is about to be read.
print(f"Processing file: {input_file}")

# Read the input CSV. 'utf-8-sig' transparently drops a leading byte-order
# mark (as written by Excel and similar tools); without it the first header
# cell would be '\ufeffvideo_id' and the column lookup below would fail.
with open(input_file, 'r', newline='', encoding='utf-8-sig') as csvfile:
    csv_reader = csv.reader(csvfile)

    # The first row is the header; None means the file is completely empty.
    header = next(csv_reader, None)

    # Abort when the expected column is missing.
    if header is None or 'video_id' not in header:
        print(f"Error: Input file '{input_file}' does not have the expected header with 'video_id' column")
        # Raise SystemExit directly: the exit() helper is injected by the
        # `site` module for interactive use and is not guaranteed to exist.
        raise SystemExit(1)

    # Position of the video_id column within each row.
    video_id_index = header.index('video_id')

    for row in csv_reader:
        # Short or blank rows have no cell at that position; skip them.
        if len(row) <= video_id_index:
            continue

        # Keep only the text before the first pipe (|). partition() returns
        # the whole string when no pipe is present.
        clean_id = row[video_id_index].partition('|')[0].strip()

        # An empty cell is not an ID; don't let it be stored or counted.
        if clean_id:
            clean_video_ids.add(clean_id)

print(f"Found {len(clean_video_ids)} unique clean video IDs")
# Emit the results: a one-column CSV with a 'video_id' header row followed by
# the unique IDs in sorted order.
with open(output_file, 'w', newline='') as out_handle:
    writer = csv.writer(out_handle)
    writer.writerow(['video_id'])
    # One single-cell row per ID.
    writer.writerows([video_id] for video_id in sorted(clean_video_ids))

print(f"CSV file '{output_file}' has been created with {len(clean_video_ids)} clean video IDs")
# Optional convenience for Google Colab: offer the result as a browser
# download. When the google.colab package cannot be imported, report where
# the file was written instead.
try:
    from google.colab import files
except ImportError:
    print(f"File saved at: {os.path.abspath(output_file)}")
else:
    # Runs only when the import succeeded, so an error raised by the download
    # itself is not mistaken for "google.colab is unavailable".
    files.download(output_file)
    print(f"Download initiated for {output_file}")
# Leave a Comment