Untitled

 avatar
unknown
plain_text
a year ago
1.7 kB
6
Indexable
# Two options: 
# Option 1. Filtering by substring: This means if we include "cig" in the set of keywords, titles with words like "e-cig" or "cigarettes" will be matched.
# Option 2. Filtering by exact word: only titles containing the exact word "cig" are matched, regardless of case. Plurals and other inflected forms are not matched automatically, so each form must be listed as its own keyword, for example both "cigarette" and "cigarettes".

# # Option 1
# # Count videos that contain at least one keyword in the title
# videos_with_keyword = video_df[video_df['title'].str.contains('|'.join(keywords), case=False, na=False)]

# Option 2
# Build a case-insensitive whole-word pattern from the keywords. Every keyword
# is escaped so that regex metacharacters inside it are matched literally.
escaped_keywords = [re.escape(keyword) for keyword in keywords]
if escaped_keywords:
    pattern = r'\b(?:' + '|'.join(escaped_keywords) + r')\b'
else:
    # With no keywords the alternation would be empty, and r'\b(?:)\b' matches
    # at every word boundary, i.e. in almost every title. Use a pattern that
    # can never match so that an empty keyword list selects no videos.
    pattern = r'(?!)'

# Keep only the videos whose title contains at least one keyword; missing
# titles (NaN) are treated as non-matching.
videos_with_keyword = video_df[video_df['title'].str.contains(pattern, case=False, na=False)]

## Finally
# Custom list of words/phrases whose presence in a title disqualifies a video.
# NOTE(review): these are matched as whole words too, so "lyric" does not
# exclude a title that only contains "lyrics" -- add plurals here if they
# should be excluded as well.
exclude_words = [
    "podcast", "music", "time", "lyric", "Andaman", "Nicobar", "island", "Alan Walker",
    "Music Video", "type beat", "Lucid Dream", "Juice WRLD", "WRLD", "rugby", "Mobile Legend",
    "drink", "asmr", "ft", "feat", "Weed", "Food", "vlog", "cbd", "acer", "Joe Rogan",
]

# Whole-word, case-insensitive pattern for the words to exclude
exclude_pattern = r'\b(?:' + '|'.join(re.escape(word) for word in exclude_words) + r')\b'

# Drop every keyword match whose title also contains an excluded word
videos_with_keyword = videos_with_keyword[~videos_with_keyword['title'].str.contains(exclude_pattern, case=False, na=False)]

print("Total number of videos:", len(video_df))
print("Number of videos with at least one keyword in the title:", len(videos_with_keyword))
Editor is loading...
Leave a Comment