# Pastebin export metadata (not part of the program), preserved as a comment:
# "Untitled" / unknown / plain_text / 2 years ago / 2.2 kB / 5 / Indexable
import gensim
import numpy as np
def get_tfidf_weights(models, sentence):
    """Return a ``{word: tf-idf weight}`` mapping for the tokens of *sentence*.

    Only words present in the word2vec vocabulary are included in the
    result; an in-vocabulary word that the dictionary does not know
    gets weight 0.

    Args:
        models: dict with keys ``'w2v'`` (word2vec model), ``'tfidf'``
            (gensim ``TfidfModel``) and ``'dct'`` (gensim ``Dictionary``).
        sentence: iterable of already-tokenized words.

    Returns:
        dict mapping each in-vocabulary word to its tf-idf weight.
    """
    w2v_model = models['w2v']
    tfidf_model = models['tfidf']
    dct = models['dct']
    # Keep only words the word2vec model can embed.
    words_in_vocab = [word for word in sentence if word in w2v_model.wv.vocab]
    # BUGFIX: TfidfModel.__getitem__ expects a bag-of-words document
    # (a list of (token_id, count) pairs), not a bare token id as the
    # original `tfidf_model[dct.token2id[word]]` passed. Transform the
    # document once, then look weights up by token id.
    bow = dct.doc2bow(words_in_vocab)
    id_to_weight = dict(tfidf_model[bow])
    return {
        # token2id.get(word) is None for unknown words; dict.get then
        # falls back to 0, matching the original "unknown -> 0" intent.
        word: id_to_weight.get(dct.token2id.get(word), 0)
        for word in words_in_vocab
    }
def embed_sentence(models, sentence):
    """Embed a raw sentence as two word2vec-based vectors.

    Produces both a plain average of the word vectors and a tf-idf
    weighted average.

    Args:
        models: dict with keys ``'w2v'``, ``'tfidf'`` and ``'dct'``
            (same contract as :func:`get_tfidf_weights`).
        sentence: raw sentence string; tokenized internally with
            ``gensim.utils.simple_preprocess``.

    Returns:
        dict with keys ``'w2v_embedding'`` and ``'tfidf_embedding'``,
        each a list of floats of length ``w2v_model.vector_size``.
    """
    w2v_model = models['w2v']
    # Tokenize the raw string into lowercase word tokens.
    tokenized_sentence = gensim.utils.simple_preprocess(sentence)
    w2v_embedding = np.zeros(w2v_model.vector_size)
    tfidf_embedding = np.zeros(w2v_model.vector_size)
    total_tfidf_weight = 0.0
    # PERF FIX: the original called get_tfidf_weights once per word
    # inside the loop; compute all weights for the sentence in one pass.
    tfidf_weights = get_tfidf_weights(models, tokenized_sentence)
    for word in tokenized_sentence:
        # Skip words the word2vec model cannot embed.
        if word in w2v_model.wv.vocab:
            vector = w2v_model.wv[word]
            w2v_embedding += vector
            weight = tfidf_weights[word]
            tfidf_embedding += weight * vector
            total_tfidf_weight += weight
    # BUGFIX: guard the average — the original raised ZeroDivisionError
    # when simple_preprocess produced no tokens (empty/punctuation input).
    if tokenized_sentence:
        w2v_embedding /= len(tokenized_sentence)
    # Weighted average; fall back to dividing by 1 when no word carried
    # any tf-idf weight, leaving the zero vector unchanged.
    tfidf_embedding /= total_tfidf_weight if total_tfidf_weight > 0 else 1
    return {'w2v_embedding': w2v_embedding.tolist(),
            'tfidf_embedding': tfidf_embedding.tolist()}
def main():
    """Load the persisted models and embed an example sentence."""
    # Load previously trained models from disk.
    w2v_model = gensim.models.Word2Vec.load('models/w2v')
    tfidf_model = gensim.models.TfidfModel.load('models/tfidf')
    dct = gensim.corpora.Dictionary.load('models/dct')
    # Create models dictionary shared by the embedding helpers.
    models = {'w2v': w2v_model, 'tfidf': tfidf_model, 'dct': dct}
    # Example usage
    sentence = "I have a dream"
    result = embed_sentence(models, sentence)
    print(result)


# Entry-point guard so importing this module does not trigger
# model loading from disk as a side effect.
if __name__ == "__main__":
    main()
# Pastebin page footer (not part of the program): "Editor is loading... / Leave a Comment"