import pandas as pd
from transformers import AutoModel, AutoTokenizer
import torch
from sklearn.cluster import KMeans
import numpy as np
import spacy

# Load the small spaCy English pipeline (tokenizer, tagger, parser, NER;
# note that en_core_web_sm ships without word vectors)
nlp = spacy.load("en_core_web_sm")

# Initialize tokenizer and model from Hugging Face Transformers
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")


def preprocess_and_normalize_tweets(tweets):
    normalized_tweets = []
    for tweet in tweets:
        # Placeholder for slang normalization (a sketch is given at the end of this file)
        # tweet = normalize_slang(tweet)

        # spaCy document processing
        doc = nlp(tweet)
        # Lemmatize the tweet and join back into a string
        lemmatized_tweet = " ".join(token.lemma_ for token in doc)
        normalized_tweets.append(lemmatized_tweet)
    return normalized_tweets


# Function to encode tweets into sentence embeddings
def encode_tweets(tweets):
    with torch.no_grad():
        encoded_input = tokenizer(tweets, padding=True, truncation=True, return_tensors="pt")
        model_output = model(**encoded_input)
        # Mean-pool the token embeddings, weighted by the attention mask, as
        # recommended for sentence-transformers checkpoints (the original used
        # pooler_output, which this model was not trained to produce sentence
        # embeddings from)
        token_embeddings = model_output.last_hidden_state
        mask = encoded_input["attention_mask"].unsqueeze(-1).float()
        return (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)


# Load the dataset
df = pd.read_csv("tweets.csv", nrows=1000)

# Filter tweets where language is 'en' and convert to a list of strings
# (astype(str) guards against NaN entries in the content column)
tweets = df[df["language"] == "en"]["content"].astype(str).tolist()

# Preprocess and normalize tweets
normalized_tweets = preprocess_and_normalize_tweets(tweets)

# Encode tweets
tweet_embeddings = encode_tweets(normalized_tweets).numpy()

# Perform KMeans clustering (n_init set explicitly to avoid the sklearn
# default-change warning)
kmeans = KMeans(n_clusters=10, random_state=0, n_init=10).fit(tweet_embeddings)
labels = kmeans.labels_

# Print the first 20 tweets with their cluster assignments
for i, tweet in enumerate(normalized_tweets[:20]):
    print(f"Tweet: {tweet}\nCluster: {labels[i]}\n")
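
# ---------------------------------------------------------------------------
# The normalize_slang call above is only a commented-out placeholder. What
# follows is a minimal dictionary-based sketch of how it could be implemented;
# the SLANG_MAP entries and the function body are illustrative assumptions,
# not part of the original script.
SLANG_MAP = {
    "u": "you",
    "r": "are",
    "gr8": "great",
    "idk": "i do not know",
}

def normalize_slang(tweet):
    # Replace each whitespace-separated token found in SLANG_MAP; tokens with
    # attached punctuation are left unchanged in this simple sketch
    return " ".join(SLANG_MAP.get(word.lower(), word) for word in tweet.split())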