import gensim
import numpy as np


def get_tfidf_weights(models, sentence):
    """Return a {word: tf-idf weight} dict for a tokenized sentence."""
    w2v_model = models['w2v']
    tfidf_model = models['tfidf']
    dct = models['dct']

    # Keep only words that are in the word2vec vocabulary
    words_in_vocab = [word for word in sentence if word in w2v_model.wv]

    # Look up each word's idf weight from the tf-idf model; words the
    # dictionary has never seen get a weight of 0
    tfidf_weights = {
        word: tfidf_model.idfs.get(dct.token2id[word], 0.0)
        if word in dct.token2id else 0.0
        for word in words_in_vocab
    }
    return tfidf_weights


def embed_sentence(models, sentence):
    """Embed a sentence as a plain average and a tf-idf-weighted average of its word vectors."""
    w2v_model = models['w2v']

    # Tokenize the sentence
    tokenized_sentence = gensim.utils.simple_preprocess(sentence)

    # Initialize the two sentence vectors
    w2v_embedding = np.zeros(w2v_model.vector_size)
    tfidf_embedding = np.zeros(w2v_model.vector_size)
    total_tfidf_weight = 0.0
    words_in_vocab = 0

    # tf-idf weights for every in-vocabulary word of the sentence
    tfidf_weights = get_tfidf_weights(models, tokenized_sentence)

    for word in tokenized_sentence:
        # Skip words the word2vec model has never seen
        if word not in w2v_model.wv:
            continue
        word_vector = w2v_model.wv[word]

        # Unweighted sum of word vectors
        w2v_embedding += word_vector
        words_in_vocab += 1

        # tf-idf-weighted sum of word vectors
        tfidf_weight = tfidf_weights[word]
        tfidf_embedding += tfidf_weight * word_vector
        total_tfidf_weight += tfidf_weight

    # Average over the in-vocabulary words (guard against empty or all-OOV sentences)
    w2v_embedding /= words_in_vocab if words_in_vocab > 0 else 1
    # Weighted average (guard against an all-zero weight total)
    tfidf_embedding /= total_tfidf_weight if total_tfidf_weight > 0 else 1

    return {
        'w2v_embedding': w2v_embedding.tolist(),
        'tfidf_embedding': tfidf_embedding.tolist(),
    }


# Load the pretrained models
w2v_model = gensim.models.Word2Vec.load('models/w2v')
tfidf_model = gensim.models.TfidfModel.load('models/tfidf')
dct = gensim.corpora.Dictionary.load('models/dct')

# Bundle them into a single dictionary
models = {'w2v': w2v_model, 'tfidf': tfidf_model, 'dct': dct}

# Example usage
sentence = "I have a dream"
result = embed_sentence(models, sentence)
print(result)
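
The script above assumes three pretrained artifacts saved under models/. A minimal sketch of how such models could be trained and saved is shown below; the corpus name corpus_tokens and the example documents are hypothetical placeholders, not part of the original paste.

import gensim

# Placeholder corpus for illustration only; replace with real tokenized documents
corpus_tokens = [
    gensim.utils.simple_preprocess(doc)
    for doc in ["I have a dream", "a dream of running code", "running gensim models"]
]

# Train and save the word2vec model
w2v = gensim.models.Word2Vec(sentences=corpus_tokens, vector_size=100, min_count=1)
w2v.save('models/w2v')

# Build the dictionary and the tf-idf model over the same corpus, then save both
dct = gensim.corpora.Dictionary(corpus_tokens)
dct.save('models/dct')

bow_corpus = [dct.doc2bow(tokens) for tokens in corpus_tokens]
tfidf = gensim.models.TfidfModel(bow_corpus)
tfidf.save('models/tfidf')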