# Untitled

import csv
import os

# Input and output file names (relative paths, so they are resolved against
# the current working directory)
input_file = 'unique_video_ids.csv'
output_file = 'clean_video_ids.csv'

# Set (not a dictionary) of unique clean video IDs; adding an ID that is
# already present is a no-op, which is what de-duplicates the output
clean_video_ids = set()

print(f"Processing file: {input_file}")

# Read the input CSV file and collect the unique "clean" video IDs, i.e. the
# part of each video_id value that precedes the first pipe character (|).
with open(input_file, 'r', newline='') as csvfile:
    csv_reader = csv.reader(csvfile)

    # Read the header row; next() yields None when the file is completely empty
    header = next(csv_reader, None)

    # Check if CSV has the expected structure
    if header is None or 'video_id' not in header:
        print(f"Error: Input file '{input_file}' does not have the expected header with 'video_id' column")
        # Raise SystemExit directly instead of calling exit(): that helper is
        # injected by the `site` module for interactive use, is missing under
        # `python -S`, and in notebook kernels may not stop the running cell,
        # which would let execution fall through to header.index() below.
        raise SystemExit(1)

    # Find the index of the video_id column
    video_id_index = header.index('video_id')

    # Process each row in the CSV
    for row in csv_reader:
        # Skip blank or short rows that have no cell in the video_id column
        if len(row) <= video_id_index:
            continue

        full_id = row[video_id_index]

        # partition() returns the whole string as its first element when no
        # pipe is present, so no separate "is there a pipe?" check is needed
        clean_id = full_id.partition('|')[0]

        # Add to our set of unique clean IDs (duplicates are ignored)
        clean_video_ids.add(clean_id)

print(f"Found {len(clean_video_ids)} unique clean video IDs")

# Persist the collected IDs as a single-column CSV, one ID per row, in
# sorted order so the output is deterministic
with open(output_file, 'w', newline='') as out_handle:
    writer = csv.writer(out_handle)

    # Column header first
    writer.writerow(['video_id'])

    # Then every clean ID wrapped as a one-cell row
    writer.writerows([video_id] for video_id in sorted(clean_video_ids))

print(f"CSV file '{output_file}' has been created with {len(clean_video_ids)} clean video IDs")

# Optional: if the google.colab package is importable (i.e. presumably running
# in Google Colab), hand the output file to its download helper; otherwise
# just report where the file was written.
try:
    from google.colab import files
except ImportError:
    # Not in Colab (package unavailable): the file stays on the local disk
    print(f"File saved at: {os.path.abspath(output_file)}")
else:
    # Kept outside the try body so that an ImportError raised while
    # downloading is not mistaken for "not running in Colab"
    files.download(output_file)
    print(f"Download initiated for {output_file}")
# Editor is loading...