Untitled
user_9601555
python
a year ago
1.6 kB
51
Indexable
Never
# Benchmark: for every sentence, add one boolean column per keyword telling
# whether the keyword occurs in the sentence. The same task is timed with
# polars, pandas (two variants) and DuckDB.
#
# The original notebook used the Jupyter cell magic ``%%timeit -n 3 -r 5``;
# here the equivalent ``timeit.repeat(number=3, repeat=5)`` is used so the
# file also runs as a plain Python script.

import functools
import random
import re
import statistics
import timeit

N_SENTENCES = 300_000      # number of generated sentences
N_KEYWORD_DRAWS = 6000     # random draws; duplicates and rejects are dropped
TIMEIT_NUMBER = 3          # loops per run (was: %%timeit -n 3)
TIMEIT_REPEAT = 5          # runs          (was: %%timeit -r 5)
SENTENCE_COLUMN = "sentence"

# A keyword must consist of ASCII letters only. It is used verbatim as a
# column name, as a str.contains pattern and inside a SQL LIKE literal.
_ALPHA_WORD = re.compile(r"[a-zA-Z]+")


def get_random_keyword(splitted_words):
    """Return one randomly chosen word, or '' if it is not a usable keyword.

    Args:
        splitted_words: sequence of tokens (a sentence split on spaces).

    Returns:
        The chosen token if it consists solely of ASCII letters, else ''.
        An empty sequence also yields ''.
    """
    if not splitted_words:
        return ""
    kw = random.choice(splitted_words)
    # fullmatch instead of '^...$': '$' would also accept a trailing newline.
    return kw if _ALPHA_WORD.fullmatch(kw) else ""


def generate_sentences(n):
    """Generate ``n`` random sentences with essential_generators."""
    # Imported lazily so the pure helpers in this file stay importable
    # without the third-party packages installed.
    from essential_generators import DocumentGenerator

    gen = DocumentGenerator()
    return [gen.sentence() for _ in range(n)]


def build_keywords(sentences, draws=N_KEYWORD_DRAWS):
    """Draw random keywords from random sentences.

    Args:
        sentences: sequence of sentence strings.
        draws: how many (sentence, word) draws to perform.

    Returns:
        Sorted list of distinct keywords. Rejected draws ('') are removed,
        and so is the word "sentence", which would collide with the name of
        the column holding the sentences.
    """
    if not sentences:
        return []
    picked = {
        get_random_keyword(random.choice(sentences).split(" "))
        for _ in range(draws)
    }
    picked -= {"", SENTENCE_COLUMN}
    return sorted(picked)


def build_duckdb_query(keywords):
    """Build the SELECT statement producing one boolean column per keyword.

    The statement reads from a relation named ``df``. Aliases are
    double-quoted so a keyword is always parsed as an identifier.
    """
    select_items = [SENTENCE_COLUMN]
    select_items.extend(
        f"case when sentence like '%{keyword}%' then TRUE else FALSE end"
        f" as \"{keyword}\""
        for keyword in keywords
    )
    return "select " + ",".join(select_items) + " from df"


# NOTE: str.contains in both polars and pandas interprets the pattern as a
# regular expression by default. Because keywords are letters-only, that is
# equivalent to a literal substring test here.


def bench_polars(sentences, keywords):
    """polars: all keyword columns added in a single with_columns call.

    Original author's measurement: 1.05 s ± 72.5 ms per loop.
    """
    import polars as pl

    df = pl.DataFrame({"sentence": sentences})
    return df.with_columns(
        [pl.col("sentence").str.contains(keyword).alias(keyword)
         for keyword in keywords]
    )


def bench_pandas_loc(sentences, keywords):
    """pandas: pre-create all-False columns, then set matches via .loc.

    Original author's measurement: > 120 s per loop.
    """
    import pandas as pd

    sentence_df = pd.DataFrame({"sentence": sentences})
    df = sentence_df.reindex(columns=["sentence"] + keywords, fill_value=False)
    for kw in keywords:
        df.loc[df["sentence"].str.contains(kw), kw] = True
    return df


def bench_pandas_assign(sentences, keywords):
    """pandas: assign one new boolean column per keyword.

    Original author's measurement: > 120 s per loop.
    """
    import pandas as pd

    df = pd.DataFrame({"sentence": sentences})
    for kw in keywords:
        df[kw] = df["sentence"].str.contains(kw)
    return df


def bench_duckdb(sentences, keywords):
    """DuckDB: one SQL statement with a CASE/LIKE expression per keyword.

    Original author's measurement: 25.7 s ± 222 ms per loop.
    """
    import duckdb
    import pandas as pd

    # Looks unused, but the SQL text refers to this DataFrame by its
    # variable name ("from df"), so the local must be called ``df``.
    df = pd.DataFrame({"sentence": sentences})
    query_sql = build_duckdb_query(keywords)
    return duckdb.sql(query_sql).to_df()


BENCHMARKS = (
    ("polars with_columns", bench_polars),
    ("pandas reindex + loc", bench_pandas_loc),
    ("pandas column assign", bench_pandas_assign),
    ("duckdb case/like", bench_duckdb),
)


def _report(label, func):
    """Time ``func`` and print a %%timeit-style summary line."""
    totals = timeit.repeat(func, number=TIMEIT_NUMBER, repeat=TIMEIT_REPEAT)
    per_loop = [total / TIMEIT_NUMBER for total in totals]
    print(
        f"{label}: {statistics.mean(per_loop):.3g} s"
        f" ± {statistics.stdev(per_loop):.3g} s per loop"
        f" (mean ± std. dev. of {TIMEIT_REPEAT} runs,"
        f" {TIMEIT_NUMBER} loops each)"
    )


def main():
    """Generate the data once, then time every benchmark on it."""
    sentences = generate_sentences(N_SENTENCES)
    keywords = build_keywords(sentences)
    print(len(keywords))  # 2389 in the original author's run
    for label, bench in BENCHMARKS:
        _report(label, functools.partial(bench, sentences, keywords))


if __name__ == "__main__":
    main()