# Untitled
# user_3839718
# python
# 2 years ago
# 1.7 kB
# 6
# Indexable
import pandas as pd
from db.elasticdb import ElasticDB
from collections import defaultdict
import string
import nltk
# Fetch every document of the "recipenlp" index with one match_all search.
# NOTE(review): despite its name, `es_client` holds the value returned by
# `search`, not a client object.  The meaning of `todos=True` and the shape of
# the result are defined by the project-local ElasticDB wrapper -- presumably a
# list of records that pandas can turn into rows; confirm there.
es = ElasticDB()
es_client = es.search(
    index_name="recipenlp",
    todos=True,
    query={"size": 500000, "query": {"match_all": {}}},
)

# sample(frac=1) returns all rows in random order; reset_index renumbers them.
df = pd.DataFrame(es_client).sample(frac=1).reset_index(drop=True)

# Lift pandas' display limits so printed frames are not truncated.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('max_colwidth', 1000)

# Flatten the list: every row's "directions" value is iterated and its items
# are collected into one flat list.
# NOTE(review): assumes each row holds a list of instruction strings -- a
# missing/None value would raise here; verify against the index mapping.
directions = [item for sublist in df["directions"].tolist() for item in sublist]
# Translation table that deletes every character listed in string.punctuation.
# Built once at import time instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text: str) -> str:
    """Return *text* with every ``string.punctuation`` character removed.

    Only the ASCII punctuation characters in ``string.punctuation`` are
    stripped; all other characters (letters, digits, whitespace, non-ASCII
    symbols) are kept unchanged and in order.  A string made up solely of
    punctuation therefore yields ``""``.
    """
    return text.translate(_PUNCTUATION_TABLE)
# Tokenize the text: split each direction on whitespace, lower-case every
# token and strip punctuation from it.  A token that consisted only of
# punctuation (e.g. "-" or "&") is "" after stripping; such tokens are dropped
# so the empty string never enters the word-pair statistics as a "word".
texts = [
    [
        token
        for token in (remove_punctuation(word.lower()) for word in text.split())
        if token
    ]
    for text in directions
]
# Create a dictionary for word pairs: word_pairs[current][next] holds how often
# `next` directly follows `current`.  The counts are replaced in place by
# relative frequencies further down.
word_pairs = defaultdict(lambda: defaultdict(int))

# Count the word pairs.  zip(text, text[1:]) walks every adjacent pair and
# produces nothing for texts with fewer than two tokens.
for text in texts:
    for current_word, next_word in zip(text, text[1:]):
        word_pairs[current_word][next_word] += 1

# Convert counts to probabilities: divide each follower count by the total
# number of followers seen for that word, so every row sums to 1.  A row only
# exists once a pair was counted, so the total is never zero.
for followers in word_pairs.values():
    total_count = sum(followers.values())
    for next_word in followers:
        followers[next_word] /= total_count
# Turn the nested dict into a table: one row per current word, one column per
# next word; pairs that were never seen are filled with 0.  The table is then
# written to "transition.csv" in the working directory.
transition_df = pd.DataFrame.from_dict(word_pairs, orient='index').fillna(0)
transition_df.to_csv("transition.csv")

# Show the ten most probable successors of "pepper" with a part-of-speech tag.
# Only the row lookup is guarded: a KeyError there means the word never
# appeared as the first element of a pair.  Errors raised while tagging or
# building the report are no longer reported as "Not found".
try:
    res = transition_df.loc["pepper"]
except KeyError:
    print("Not found")
else:
    # NOTE(review): all column labels are passed to pos_tag as a single token
    # sequence, so each tag may be influenced by unrelated neighbouring
    # labels -- confirm this is intended.
    tagged = nltk.pos_tag(res.index.tolist())
    tagged_df = pd.DataFrame(tagged, columns=['word', 'POS'])
    # `tagged` was built from res.index in order, so the probabilities line up
    # row for row.
    tagged_df["prob"] = res.tolist()
    tagged_df = tagged_df.sort_values(by="prob", ascending=False)
    print(tagged_df.head(10))
# Editor is loading...
# Leave a Comment