# Pasted from pastecode.io (uploader: mail@pastecode.io; posted 21 days
# before capture; size 1.8 kB).
import pandas as pd
from transformers import AutoModel, AutoTokenizer
import torch
from sklearn.cluster import KMeans
import numpy as np
import spacy

# spaCy's small English pipeline; the preprocessing step uses it to lemmatize.
nlp = spacy.load("en_core_web_sm")

# Tokenizer and encoder are loaded from the same Hugging Face checkpoint so
# that the token ids produced by one match the vocabulary of the other.
_EMBEDDING_CHECKPOINT = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(_EMBEDDING_CHECKPOINT)
model = AutoModel.from_pretrained(_EMBEDDING_CHECKPOINT)

def preprocess_and_normalize_tweets(tweets):
    """Lemmatize each tweet with spaCy and return the normalized strings.

    Args:
        tweets: Iterable of tweet texts (strings).

    Returns:
        A list with one entry per input tweet, in input order. Each entry is
        the tweet's token lemmas joined by single spaces.
    """
    # TODO: slang normalization (a ``normalize_slang`` step) was planned to run
    # before lemmatization but is not implemented yet.
    # nlp.pipe streams the texts through the pipeline in batches, which yields
    # the same Docs as calling nlp() per tweet but with less overhead.
    return [
        " ".join(token.lemma_ for token in doc)
        for doc in nlp.pipe(tweets)
    ]

# Function to encode tweets into embeddings
def encode_tweets(tweets, batch_size=64):
    """Encode tweets into sentence embeddings with the module-level model.

    Each embedding is the attention-mask-weighted mean of the token
    embeddings (``last_hidden_state``). This is the pooling documented for
    using the ``sentence-transformers/all-MiniLM-L6-v2`` checkpoint through
    plain ``transformers``; ``pooler_output`` is not the documented sentence
    representation for this checkpoint.

    Args:
        tweets: Sequence of strings to embed.
        batch_size: Number of tweets per forward pass. Bounds peak memory;
            because padding positions are masked out of the mean, it does not
            change the result beyond floating-point noise.

    Returns:
        A ``torch.Tensor`` with one row per tweet and ``hidden_size`` columns.
        An empty input yields a tensor with zero rows.
    """
    tweets = list(tweets)
    if not tweets:
        # The tokenizer cannot batch an empty list; return an empty matrix
        # with the right width instead of failing inside the tokenizer.
        return torch.empty((0, model.config.hidden_size))

    pooled_batches = []
    with torch.no_grad():
        for start in range(0, len(tweets), batch_size):
            encoded_input = tokenizer(
                tweets[start:start + batch_size],
                padding=True,
                truncation=True,
                return_tensors='pt',
            )
            token_embeddings = model(**encoded_input).last_hidden_state
            # Broadcast the mask over the hidden dimension so that padding
            # tokens contribute nothing to the sum.
            mask = encoded_input['attention_mask'].unsqueeze(-1)
            mask = mask.to(token_embeddings.dtype)
            summed = (token_embeddings * mask).sum(dim=1)
            # clamp guards against division by zero for an all-padding row.
            token_counts = mask.sum(dim=1).clamp(min=1e-9)
            pooled_batches.append(summed / token_counts)
    return torch.cat(pooled_batches, dim=0)

# Load the dataset (only the first 1000 rows are read)
df = pd.read_csv("tweets.csv", nrows=1000)

# Keep English tweets only. Rows with missing content are dropped: pandas
# represents them as float NaN, which the text pipeline cannot process.
english_content = df.loc[df['language'] == 'en', 'content']
tweets = english_content.dropna().astype(str).tolist()

# Preprocess and normalize tweets
normalized_tweets = preprocess_and_normalize_tweets(tweets)
if not normalized_tweets:
    # Neither the encoder nor KMeans can work with zero samples; stop with a
    # clear message instead of failing deep inside a library call.
    raise SystemExit("No tweets to cluster after preprocessing.")

# Encode tweets
tweet_embeddings = encode_tweets(normalized_tweets).numpy()

# Perform KMeans clustering. KMeans raises when n_clusters exceeds the number
# of samples, so cap it for small inputs.
n_clusters = min(10, len(normalized_tweets))
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(tweet_embeddings)
labels = kmeans.labels_

# Print the first 20 tweets together with their cluster labels
for tweet, label in zip(normalized_tweets[:20], labels):
    print(f"Tweet: {tweet}\nCluster: {label}\n")
# (pastecode.io page footer: "Leave a Comment")