import gensim
import numpy as np


def get_tfidf_weights(models, sentence):
    """Return a {word: tf-idf weight} dict for a tokenized sentence."""
    w2v_model = models['w2v']
    tfidf_model = models['tfidf']
    dct = models['dct']

    # Keep only words that are in the word2vec vocabulary
    words_in_vocab = [word for word in sentence if word in w2v_model.wv]

    # Look up each word's idf weight from the tf-idf model; words the
    # dictionary has never seen get a weight of 0
    tfidf_weights = {
        word: tfidf_model.idfs.get(dct.token2id[word], 0.0)
        if word in dct.token2id else 0.0
        for word in words_in_vocab
    }
    return tfidf_weights


def embed_sentence(models, sentence):
    """Embed a sentence as a plain average and a tf-idf-weighted average of its word vectors."""
    w2v_model = models['w2v']

    # Tokenize the sentence
    tokenized_sentence = gensim.utils.simple_preprocess(sentence)

    # Initialize the two sentence vectors
    w2v_embedding = np.zeros(w2v_model.vector_size)
    tfidf_embedding = np.zeros(w2v_model.vector_size)
    total_tfidf_weight = 0.0
    words_in_vocab = 0

    # tf-idf weights for every in-vocabulary word of the sentence
    tfidf_weights = get_tfidf_weights(models, tokenized_sentence)

    for word in tokenized_sentence:
        # Skip words the word2vec model has never seen
        if word not in w2v_model.wv:
            continue
        word_vector = w2v_model.wv[word]

        # Unweighted sum of word vectors
        w2v_embedding += word_vector
        words_in_vocab += 1

        # tf-idf-weighted sum of word vectors
        tfidf_weight = tfidf_weights[word]
        tfidf_embedding += tfidf_weight * word_vector
        total_tfidf_weight += tfidf_weight

    # Average over the in-vocabulary words (guard against empty or all-OOV sentences)
    w2v_embedding /= words_in_vocab if words_in_vocab > 0 else 1
    # Weighted average (guard against an all-zero weight total)
    tfidf_embedding /= total_tfidf_weight if total_tfidf_weight > 0 else 1

    return {
        'w2v_embedding': w2v_embedding.tolist(),
        'tfidf_embedding': tfidf_embedding.tolist(),
    }


# Load the pretrained models
w2v_model = gensim.models.Word2Vec.load('models/w2v')
tfidf_model = gensim.models.TfidfModel.load('models/tfidf')
dct = gensim.corpora.Dictionary.load('models/dct')

# Bundle them into a single dictionary
models = {'w2v': w2v_model, 'tfidf': tfidf_model, 'dct': dct}

# Example usage
sentence = "I have a dream"
result = embed_sentence(models, sentence)
print(result)
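
The script above assumes three pretrained artifacts saved under models/. A minimal sketch of how such models could be trained and saved is shown below; the corpus name corpus_tokens and the example documents are hypothetical placeholders, not part of the original paste.

import gensim

# Placeholder corpus for illustration only; replace with real tokenized documents
corpus_tokens = [
    gensim.utils.simple_preprocess(doc)
    for doc in ["I have a dream", "a dream of running code", "running gensim models"]
]

# Train and save the word2vec model
w2v = gensim.models.Word2Vec(sentences=corpus_tokens, vector_size=100, min_count=1)
w2v.save('models/w2v')

# Build the dictionary and the tf-idf model over the same corpus, then save both
dct = gensim.corpora.Dictionary(corpus_tokens)
dct.save('models/dct')

bow_corpus = [dct.doc2bow(tokens) for tokens in corpus_tokens]
tfidf = gensim.models.TfidfModel(bow_corpus)
tfidf.save('models/tfidf')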