import pandas as pd
from transformers import AutoModel, AutoTokenizer
import torch
from sklearn.cluster import KMeans
import numpy as np
import spacy
# Load the small English spaCy pipeline (tokenizer, tagger, parser, lemmatizer, NER)
nlp = spacy.load("en_core_web_sm")
# Initialize tokenizer and model from Hugging Face Transformers
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
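# Hypothetical helper for the slang-normalization step used below.
# A minimal sketch, assuming a small hand-written slang map: SLANG_MAP and its
# entries are illustrative assumptions, not part of the original script; a real
# pipeline might load a curated lexicon instead.
SLANG_MAP = {"u": "you", "r": "are", "gr8": "great", "idk": "i do not know"}  # assumed examples

def normalize_slang(tweet):
    # Replace each whitespace-separated token that appears in the slang map
    return " ".join(SLANG_MAP.get(tok.lower(), tok) for tok in tweet.split())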
# Preprocess tweets: optional slang normalization, then spaCy lemmatization
def preprocess_and_normalize_tweets(tweets):
    normalized_tweets = []
    for tweet in tweets:
        # Optional slang normalization (left disabled; see the sketch above)
        # tweet = normalize_slang(tweet)
        # Process the tweet with spaCy
        doc = nlp(tweet)
        # Lemmatize each token and join back into a single string
        lemmatized_tweet = " ".join([token.lemma_ for token in doc])
        normalized_tweets.append(lemmatized_tweet)
    return normalized_tweets
# Encode tweets into sentence embeddings. Note: all-MiniLM-L6-v2's pooler_output
# is not trained; its model card recommends attention-mask-weighted mean pooling
# over the last hidden state, as done here.
def encode_tweets(tweets):
    encoded_input = tokenizer(tweets, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**encoded_input)
    token_embeddings = model_output.last_hidden_state
    # Zero out padded positions before averaging
    mask = encoded_input['attention_mask'].unsqueeze(-1).expand(token_embeddings.size()).float()
    return (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
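# Encoding all tweets in one call can exhaust memory on larger datasets. A
# minimal batched variant, sketched under the assumption that a batch size of
# 64 suits the available hardware (the batch size is not from the original):
def encode_tweets_batched(tweets, batch_size=64):
    chunks = []
    for start in range(0, len(tweets), batch_size):
        # Encode one batch at a time; per-batch padding lengths may differ,
        # but the returned sentence embeddings share the same hidden size
        chunks.append(encode_tweets(tweets[start:start + batch_size]))
    return torch.cat(chunks, dim=0)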
# Load the dataset
df = pd.read_csv("tweets.csv", nrows=1000)
# Keep English tweets only; drop missing values and coerce to strings
tweets = df.loc[df['language'] == 'en', 'content'].dropna().astype(str).tolist()
# Preprocess and normalize tweets
normalized_tweets = preprocess_and_normalize_tweets(tweets)
# Encode tweets
tweet_embeddings = encode_tweets(normalized_tweets).numpy()
# Perform KMeans clustering (n_init set explicitly; sklearn's default changed in 1.4)
kmeans = KMeans(n_clusters=10, n_init=10, random_state=0).fit(tweet_embeddings)
labels = kmeans.labels_
# Print the first 20 tweets with their cluster assignments
for i, tweet in enumerate(normalized_tweets[:20]):
    print(f"Tweet: {tweet}\nCluster: {labels[i]}\n")