Untitled
user_3839718
python
a year ago
1.7 kB
3
Indexable
# Word-transition (bigram) model over recipe directions.
#
# Fetches recipe documents from the "recipenlp" index, counts how often each
# word is immediately followed by each other word inside a direction step,
# normalises those counts into per-word probabilities, writes the resulting
# table to "transition.csv", and prints the ten most likely followers of one
# word together with their NLTK part-of-speech tags.

import string
from collections import defaultdict
from itertools import chain

import pandas as pd

# Translation table that deletes every character in string.punctuation.
# Built once here because it never changes between calls.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return *text* with every ``string.punctuation`` character removed."""
    return text.translate(_PUNCTUATION_TABLE)


def tokenize(text):
    """Split *text* on whitespace into lower-cased, punctuation-free words.

    Tokens that consist only of punctuation (for example ``"-"``) become
    empty after stripping; they are dropped so that the empty string is
    never counted as a word.
    """
    stripped = (remove_punctuation(word.lower()) for word in text.split())
    return [word for word in stripped if word]


def count_word_pairs(texts):
    """Count adjacent word pairs.

    Args:
        texts: iterable of token lists, one list per direction step.

    Returns:
        A nested ``defaultdict`` where ``result[current][following]`` is the
        number of times *following* came directly after *current*. Pairs
        never span two different token lists.
    """
    word_pairs = defaultdict(lambda: defaultdict(int))
    for tokens in texts:
        # zip stops at the shorter argument, so lists with fewer than two
        # tokens contribute nothing.
        for current_word, next_word in zip(tokens, tokens[1:]):
            word_pairs[current_word][next_word] += 1
    return word_pairs


def to_transition_probabilities(word_pairs):
    """Normalise pair counts so each word's follower values sum to 1.

    Args:
        word_pairs: mapping ``{current: {following: count}}``.

    Returns:
        A new plain ``dict`` of the same shape holding ``count / total``
        for each follower; the input mapping is left untouched.
    """
    probabilities = {}
    for current_word, followers in word_pairs.items():
        total_count = sum(followers.values())
        probabilities[current_word] = {
            next_word: count / total_count
            for next_word, count in followers.items()
        }
    return probabilities


def build_transition_table(directions):
    """Build the transition-probability DataFrame for *directions*.

    Args:
        directions: iterable of direction strings.

    Returns:
        A DataFrame with one row per "current" word and one column per
        "following" word; pairs that were never observed are filled with 0.
    """
    texts = [tokenize(text) for text in directions]
    probabilities = to_transition_probabilities(count_word_pairs(texts))
    return pd.DataFrame.from_dict(probabilities, orient='index').fillna(0)


def load_directions():
    """Fetch the recipes and return their direction steps as one flat list.

    Each "directions" cell is iterated and its items are later split as
    strings, so every cell is expected to be a list of strings.
    """
    # Imported here so the pure helpers above stay importable without the
    # project's Elasticsearch wrapper being available.
    from db.elasticdb import ElasticDB

    es = ElasticDB()
    # NOTE(review): `todos=True` presumably asks for all hits - confirm
    # against ElasticDB.search.
    hits = es.search(index_name="recipenlp", todos=True, query={"size": 500000, "query": {"match_all": {}}})
    # Shuffle the rows and renumber the index, as the original script did.
    df = pd.DataFrame(hits).sample(frac=1).reset_index(drop=True)
    # Flatten the per-recipe lists into a single list of steps.
    return list(chain.from_iterable(df["directions"].tolist()))


def tag_followers(followers):
    """Return a DataFrame of follower words, POS tags and probabilities.

    Args:
        followers: one row of the transition table (a Series indexed by
            following word, holding probabilities).

    Returns:
        A DataFrame with columns ``word``, ``POS`` and ``prob``, sorted by
        ``prob`` in descending order.
    """
    # Imported here so the counting helpers can be used without NLTK.
    import nltk

    tagged = nltk.pos_tag(followers.index.tolist())
    tagged_df = pd.DataFrame(tagged, columns=['word', 'POS'])
    tagged_df["prob"] = followers.tolist()
    return tagged_df.sort_values(by="prob", ascending=False)


def main(target_word="pepper"):
    """Run the pipeline: fetch, build the table, save it, print top followers."""
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    pd.set_option('max_colwidth', 1000)

    transition_df = build_transition_table(load_directions())
    transition_df.to_csv("transition.csv")

    # Only the row lookup is guarded, so a KeyError raised anywhere else is
    # not misreported as a missing word.
    try:
        followers = transition_df.loc[target_word]
    except KeyError:
        print("Not found")
        return
    print(tag_followers(followers).head(10))


if __name__ == "__main__":
    main()
Editor is loading...
Leave a Comment