Untitled
yeiad
plain_text
2 months ago
2.0 kB
3
Indexable
"""Deduplicate video IDs from a CSV file, dropping any '|'-suffixed part.

Reads the ``video_id`` column of ``INPUT_FILE``, keeps only the text before
the first pipe character of each value, and writes the sorted unique results
to ``OUTPUT_FILE`` under a single ``video_id`` header.
"""

import csv
import os
import sys
from typing import Iterable, Set

# Input and output file names
INPUT_FILE = 'unique_video_ids.csv'
OUTPUT_FILE = 'clean_video_ids.csv'

# Name of the column that holds the raw video IDs
VIDEO_ID_COLUMN = 'video_id'


def extract_clean_id(full_id: str) -> str:
    """Return the part of *full_id* before the first '|' character.

    If there is no pipe, the full string is returned unchanged.
    """
    # partition() yields the whole string as the head when the separator is
    # absent, so no explicit "'|' in full_id" check is needed.
    return full_id.partition('|')[0]


def read_clean_video_ids(path: str) -> Set[str]:
    """Collect the unique clean video IDs found in the CSV file at *path*.

    Args:
        path: Location of a CSV file whose header row contains a
            ``video_id`` column.

    Returns:
        The set of distinct IDs after stripping everything from the first
        pipe character onward.

    Raises:
        ValueError: If the file is empty or its header has no ``video_id``
            column.
        OSError: If the file cannot be opened.
    """
    # 'utf-8-sig' transparently drops a leading BOM (common in spreadsheet
    # exports); without it the first header cell would be '\ufeffvideo_id'
    # and the column lookup below would fail.
    with open(path, 'r', newline='', encoding='utf-8-sig') as csvfile:
        csv_reader = csv.reader(csvfile)

        header = next(csv_reader, None)
        if header is None or VIDEO_ID_COLUMN not in header:
            raise ValueError(
                f"Input file '{path}' does not have the expected header "
                f"with '{VIDEO_ID_COLUMN}' column"
            )

        video_id_index = header.index(VIDEO_ID_COLUMN)

        # Rows too short to contain the video_id column are skipped.
        return {
            extract_clean_id(row[video_id_index])
            for row in csv_reader
            if len(row) > video_id_index
        }


def write_clean_video_ids(path: str, clean_ids: Iterable[str]) -> None:
    """Write *clean_ids*, sorted, to a one-column CSV file at *path*."""
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow([VIDEO_ID_COLUMN])
        csv_writer.writerows([clean_id] for clean_id in sorted(clean_ids))


def main() -> None:
    """Run the clean-up from INPUT_FILE to OUTPUT_FILE and report progress."""
    print(f"Processing file: {INPUT_FILE}")

    try:
        clean_video_ids = read_clean_video_ids(INPUT_FILE)
    except FileNotFoundError:
        print(f"Error: Input file '{INPUT_FILE}' was not found", file=sys.stderr)
        sys.exit(1)
    except ValueError as err:
        print(f"Error: {err}", file=sys.stderr)
        sys.exit(1)

    print(f"Found {len(clean_video_ids)} unique clean video IDs")

    write_clean_video_ids(OUTPUT_FILE, clean_video_ids)
    print(
        f"CSV file '{OUTPUT_FILE}' has been created with "
        f"{len(clean_video_ids)} clean video IDs"
    )

    # Optional: If running in Google Colab, provide a download link
    try:
        from google.colab import files
    except ImportError:
        print(f"File saved at: {os.path.abspath(OUTPUT_FILE)}")
    else:
        files.download(OUTPUT_FILE)
        print(f"Download initiated for {OUTPUT_FILE}")


if __name__ == "__main__":
    main()
Editor is loading...
Leave a Comment