# Generate sentences
from essential_generators import DocumentGenerator
gen = DocumentGenerator()
n = 300_000
sentences = [gen.sentence() for _ in range(n)]

# Generate keyword list
import random
import re

def get_random_keyword(words):
    # Pick one random word and keep it only if it is purely alphabetic
    # (drops punctuation, digits and anything that could act as a regex metacharacter).
    kw = random.choice(words)
    return kw if re.match(r'^[a-zA-Z]+$', kw) else ''

keywords = list(
    {get_random_keyword(random.choice(sentences).split(" ")) for _ in range(6000)} - {''}
)
print(len(keywords))  # 2389
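
# Optional sanity check (not in the original snippet): every surviving keyword is purely
# alphabetic, so none of them contains regex metacharacters or characters that would need
# escaping in the LIKE patterns used further down.
assert all(kw.isalpha() for kw in keywords)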

import polars as pl
import pandas as pd
import duckdb

# Note: %%timeit is an IPython cell magic, so the benchmark cells below were run in a Jupyter notebook.
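# Outside a notebook, a rough equivalent is the standard-library timeit module; the helper
# below is only an illustrative sketch, not part of the original benchmark.
import timeit

def time_per_loop(fn, number=3, repeat=5):
    # Best per-loop time in seconds over `repeat` runs of `number` loops each.
    return min(timeit.repeat(fn, number=number, repeat=repeat)) / number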

%%timeit -n 3 -r 5
df = pl.DataFrame({'sentence': sentences})
res_df = df.with_columns([pl.col('sentence').str.contains(keyword).alias(keyword) for keyword in keywords])
# 1.05 s ± 72.5 ms per loop (mean ± std. dev. of 5 runs, 3 loops each)
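# Note: polars' str.contains treats each keyword as a regular expression by default; the
# keywords here are purely alphabetic, so that is equivalent to a plain substring match.
# A sketch of the explicit literal variant (untimed here; may avoid regex compilation):
df = pl.DataFrame({'sentence': sentences})
res_literal_df = df.with_columns(
    [pl.col('sentence').str.contains(keyword, literal=True).alias(keyword) for keyword in keywords]
)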

%%timeit -n 3 -r 5
sentence_df = pd.DataFrame({'sentence': sentences})
df = sentence_df.reindex(columns=['sentence'] + keywords, fill_value=False)
for kw in keywords:
  df.loc[df["sentence"].str.contains(kw), kw] = True
# > 120 s per loop

%%timeit -n 3 -r 5
df = pd.DataFrame({'sentence': sentences})
for kw in keywords:
  df[kw] = df["sentence"].str.contains(kw)
# > 120 s per loop
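# pandas' str.contains also accepts regex=False for a plain substring search; a sketch of
# that variant (untimed here), in case regex matching contributes to the slowdown above:
df = pd.DataFrame({'sentence': sentences})
for kw in keywords:
    df[kw] = df["sentence"].str.contains(kw, regex=False)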

%%timeit -n 3 -r 5
df = pd.DataFrame({'sentence': sentences})
query_sql = "select sentence," + ",".join([f"case when sentence like '%{keyword}%' then TRUE else FALSE end as {keyword}" for keyword in keywords]) + " from df"
res_df = duckdb.sql(query_sql).to_df()
# 25.7 s ± 222 ms per loop (mean ± std. dev. of 5 runs, 3 loops each)
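
# Note: the column aliases in query_sql are unquoted, so a keyword that happened to be a
# SQL reserved word would break the query. A sketch of a safer variant with double-quoted
# aliases (untimed here):
df = pd.DataFrame({'sentence': sentences})
quoted_sql = (
    "select sentence,"
    + ",".join(f"case when sentence like '%{keyword}%' then TRUE else FALSE end as \"{keyword}\"" for keyword in keywords)
    + " from df"
)
res_df = duckdb.sql(quoted_sql).to_df()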