Untitled

mail@pastecode.io avatar
unknown
python
7 months ago
1.7 kB
1
Indexable
Never
from nltk.corpus import stopwords as nltk_stop_words
from constants import excluded_sw, nltk_map
from itertools import combinations


def preliminary_check(kw, sw, langage):
    """Tokenise a keyword and drop its stop words.

    The keyword is split on whitespace and every kept token is returned
    lower-cased.  A token is kept when at least one of these holds:

    * it consists only of digits,
    * it is not a stop word (neither the token nor its lower-cased form is
      in ``sw``),
    * it is listed in ``excluded_sw[langage]``, i.e. it is a stop word that
      must be preserved for this language.

    Stop-word matching is case-insensitive: the result is lower-cased anyway,
    so ``"The"`` has to be treated like ``"the"`` instead of slipping through
    the filter because of its capital letter and coming back as ``"the"``.

    Args:
        kw: Keyword string to tokenise.
        sw: Container of stop words supporting ``in`` (ideally a ``set``).
            Entries are presumably lower-case, as NLTK's stop-word lists
            are -- TODO confirm for every caller.
        langage: Key used to look up the per-language exceptions in
            ``excluded_sw``; an unknown key means "no exceptions".

    Returns:
        list[str]: The kept tokens, lower-cased, in their original order.
    """
    kept = []
    for token in kw.split():
        lowered = token.lower()
        # The original-case form is still checked so that a container with
        # mixed-case entries keeps filtering exactly what it filtered before.
        is_stop_word = lowered in sw or token in sw
        if is_stop_word and not token.isdigit():
            # Only stop words need the exception list; a stop word survives
            # solely when this language explicitly protects it.
            protected = excluded_sw.get(langage, [])
            if token not in protected and lowered not in protected:
                continue
        kept.append(lowered)
    return kept

def keyword_pair_generator(kws, sw, language):
    """Yield the cleaned token lists of every unordered pair of keywords.

    Each keyword is run through ``preliminary_check`` exactly once, up front,
    and the pairs are then built from the cleaned results.  Cleaning inside
    the pair loop would repeat the same tokenisation ``len(kws) - 1`` times
    for every keyword.

    Args:
        kws: Iterable of keyword strings.
        sw: Stop-word container forwarded to ``preliminary_check``.
        language: Language key forwarded to ``preliminary_check``.

    Yields:
        tuple[list[str], list[str]]: ``(tokens_a, tokens_b)`` for each
        2-combination of ``kws``, in ``itertools.combinations`` order.  A
        keyword's token list is shared by all the pairs it takes part in, so
        consumers must treat the lists as read-only.
    """
    cleaned = [preliminary_check(kw, sw, language) for kw in kws]
    yield from combinations(cleaned, 2)


def prepare_params_file(s3_wrap, kws, langage):
    """Collect the cleaned keyword pairs of ``kws`` into an in-memory buffer.

    The NLTK stop words for ``langage`` (corpus name resolved through
    ``nltk_map``) are passed through ``unidecode.unidecode``, de-duplicated
    into a set and handed to ``keyword_pair_generator`` together with the
    keywords.  Every pair the generator yields is appended to a local buffer.

    NOTE(review): this function is unfinished.  Nothing is written to S3
    yet: ``s3_wrap`` is unused, the buffer is neither flushed nor emptied
    once it reaches the limit, and nothing is returned.
    NOTE(review): no import of ``unidecode`` is visible in this file --
    confirm it is in scope.

    Args:
        s3_wrap: S3 wrapper object; not used yet.
        kws: Keywords to pair up.
        langage: Key into ``nltk_map``; also forwarded to the pair generator.
    """
    raw_stop_words = nltk_stop_words.words(nltk_map[langage])
    sw = {unidecode.unidecode(word) for word in raw_stop_words}

    flush_threshold = 5000
    pending_pairs = []
    pair_stream = keyword_pair_generator(kws, sw, langage)
    for kw_pair in pair_stream:
        pending_pairs.append(kw_pair)
        buffer_is_full = len(pending_pairs) >= flush_threshold
        if buffer_is_full:
            # TODO: write the buffered pairs to S3
            pass


def get_batches(args):
    """Split the keyword indices into batches and store each batch via S3.

    Reads ``kws.json`` from
    ``{environment}/{client_name}-{culture}/{step_name}/``, runs
    ``prepare_params_file`` on the keywords, then stores each slice of
    ``batch_size`` consecutive indices with ``s3_wrap.store_in_file`` under
    ``{environment}/{client_name}-{culture}/fuzzy/{i}-{i+batch_size}.json``.
    The upper bound in a file name is always ``i + batch_size``, so the last
    file's name can exceed the number of keywords even though its content
    does not.

    NOTE(review): no import of ``S3`` or ``cfg`` is visible in this file --
    confirm they are in scope.

    Args:
        args: Mapping with the keys ``"client_name"``, ``"culture"``,
            ``"environment"``, ``"step_name"``, ``"language"`` and,
            optionally, ``"batch_size"`` (converted with ``int``; 500 when
            absent).

    Returns:
        tuple: ``(list_of_files, kws_nbr)`` where ``list_of_files`` holds
        the return value of ``store_in_file`` for each batch, in order, and
        ``kws_nbr`` is the number of keywords read.
    """
    s3_wrap = S3(args["client_name"], args["culture"], args["environment"], cfg.AWS_REGION)

    if "batch_size" in args:
        batch_size = int(args["batch_size"])
    else:
        batch_size = 500

    kws = s3_wrap.read(f"{args['environment']}/{args['client_name']}-{args['culture']}/{args['step_name']}/kws.json")
    kws_nbr = len(kws)
    indices = list(range(kws_nbr))
    prepare_params_file(s3_wrap, kws, args['language'])

    list_of_files = []
    for i in range(0, kws_nbr, batch_size):
        batch_indices = indices[i:i+batch_size]
        stored = s3_wrap.store_in_file(batch_indices, f"{args['environment']}/{args['client_name']}-{args['culture']}/fuzzy/{i}-{i+batch_size}.json")
        list_of_files.append(stored)
    return list_of_files, kws_nbr