Untitled

 avatar
user_3839718
python
2 months ago
1.7 kB
1
Indexable
Never
import pandas as pd
from db.elasticdb import ElasticDB
from collections import defaultdict
import string
import nltk

# Fetch the documents of the "recipenlp" index (match_all, size capped at
# 500000) through the project's ElasticDB wrapper.
# NOTE(review): `todos=True` presumably asks the wrapper for every hit rather
# than a single page -- confirm against db.elasticdb.
es = ElasticDB()
# Despite its name, `es_client` holds the search result that is fed to the
# DataFrame constructor below, not a client object.
es_client = es.search(index_name="recipenlp", todos=True, query={"size": 500000, "query": {"match_all": {}}})
# sample(frac=1) shuffles all rows; reset_index(drop=True) renumbers them.
df = pd.DataFrame(es_client).sample(frac=1).reset_index(drop=True)

# Lift pandas' display limits so printed frames are not truncated.
# Full option names are used throughout: abbreviated names only work while
# they match exactly one registered option.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 1000)

# Flatten the per-row sequences in the "directions" column into one list.
# assumes each cell is a list of instruction strings -- TODO confirm schema
directions = [item for sublist in df["directions"].tolist() for item in sublist]



# Built once at import time: remove_punctuation is called for every token,
# so rebuilding the translation table on each call is wasted work.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return *text* with every ``string.punctuation`` character removed.

    Only the ASCII punctuation listed in ``string.punctuation`` is stripped;
    other characters (letters, digits, whitespace, non-ASCII symbols) are
    left untouched. A string made up solely of punctuation yields ``""``.
    """
    return text.translate(_PUNCTUATION_TABLE)

# Tokenize each direction: split on whitespace, lowercase, strip punctuation.
# A token consisting only of punctuation (e.g. "-" or "&") collapses to ""
# once stripped; such tokens are dropped so the empty string never shows up
# as a "word" in the transition table. Its two neighbours then count as
# adjacent.
texts = [
    [
        token
        for token in (remove_punctuation(word.lower()) for word in text.split())
        if token
    ]
    for text in directions
]

# word_pairs[current_word][next_word] -> how often next_word directly
# follows current_word within a single direction.
word_pairs = defaultdict(lambda: defaultdict(int))

# Count adjacent pairs; zip(text, text[1:]) yields nothing for texts with
# fewer than two tokens.
for text in texts:
    for current_word, next_word in zip(text, text[1:]):
        word_pairs[current_word][next_word] += 1

# Turn each row of raw follower counts into a probability distribution:
# every count is divided by the total number of followers seen for that word,
# so the values stored under one word sum to 1. Only existing keys are
# reassigned, so updating the dict while iterating it is safe.
for followers in word_pairs.values():
    row_total = sum(followers.values())
    for follower, count in followers.items():
        followers[follower] = count / row_total

# Dense transition table: one row per current word, one column per next word.
# Pairs that were never observed come out as NaN and are filled with 0.
transition_df = pd.DataFrame.from_dict(word_pairs, orient='index').fillna(0)
transition_df.to_csv("transition.csv")

# Only the row lookup can legitimately miss, so it is the only statement
# guarded by the KeyError handler; a KeyError raised by the reporting code
# below would be a real bug and must not be reported as "Not found".
try:
    res = transition_df.loc["pepper"]
except KeyError:
    print("Not found")
else:
    # Tag every candidate next-word with its part of speech.
    # NOTE(review): pos_tag receives the whole column list as one token
    # sequence, so tags may be influenced by neighbouring column labels --
    # confirm this is acceptable.
    tagged = nltk.pos_tag(res.index.tolist())
    tagged_df = pd.DataFrame(tagged, columns=['word', 'POS'])
    # Same order as res.index, so probabilities line up with the tagged words.
    tagged_df["prob"] = res.tolist()
    tagged_df = tagged_df.sort_values(by="prob", ascending=False)
    # Show the ten most likely words to follow "pepper".
    print(tagged_df.head(10))
Leave a Comment