import unidecode
from itertools import combinations

from nltk.corpus import stopwords as nltk_stop_words

from constants import excluded_sw, nltk_map
# NOTE: S3 and cfg are project-local objects; their imports were missing from
# the original snippet and are intentionally left out here.


def preliminary_check(kw, sw, language):
    """Keep the words of a keyword that are digits, whitelisted for the
    language, or not stop words, lower-cased."""
    kw_words = kw.split()
    return [
        x.lower()
        for x in kw_words
        if x.isdigit() or x in excluded_sw.get(language, []) or x not in sw
    ]


def keyword_pair_generator(kws, sw, language):
    """Yield every unordered pair of keywords, each reduced by preliminary_check."""
    for first, second in combinations(kws, 2):
        yield (
            preliminary_check(first, sw, language),
            preliminary_check(second, sw, language),
        )


def prepare_params_file(s3_wrap, kws, language):
    """Accumulate cleaned keyword pairs and flush them to S3 in chunks."""
    sw = nltk_stop_words.words(nltk_map[language])
    sw = set(unidecode.unidecode(w) for w in sw)
    buffer = []
    buffer_limit = 5000
    for kw_pair in keyword_pair_generator(kws, sw, language):
        buffer.append(kw_pair)
        if len(buffer) >= buffer_limit:
            pass  # TODO: write the buffer to S3 and reset it


def get_batches(args):
    s3_wrap = S3(args["client_name"], args["culture"], args["environment"], cfg.AWS_REGION)
    batch_size = int(args["batch_size"]) if "batch_size" in args else 500
    kws = s3_wrap.read(
        f"{args['environment']}/{args['client_name']}-{args['culture']}/{args['step_name']}/kws.json"
    )
    kws_nbr = len(kws)
    indices = list(range(kws_nbr))
    prepare_params_file(s3_wrap, kws, args["language"])
    # filenames = s3_wrap.get_files_in_bucket(prefix=f"{args['environment']}/{args['client_name']}-{args['culture']}/{args['step_name']}/")
    list_of_files = [
        s3_wrap.store_in_file(
            indices[i:i + batch_size],
            f"{args['environment']}/{args['client_name']}-{args['culture']}/fuzzy/{i}-{i + batch_size}.json",
        )
        for i in range(0, kws_nbr, batch_size)
    ]
    return list_of_files, kws_nbr
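

# --- Usage sketch (not part of the original paste) ---
# A minimal, hypothetical example of how get_batches might be invoked. The
# argument values below are placeholders; S3, cfg, and the constants module
# are assumed to be importable elsewhere in the project.
if __name__ == "__main__":
    example_args = {
        "client_name": "acme",    # hypothetical client name
        "culture": "fr-FR",       # hypothetical culture code
        "environment": "dev",     # hypothetical environment
        "step_name": "fuzzy",     # S3 folder expected to contain kws.json
        "language": "french",     # must be a key of nltk_map / excluded_sw
        "batch_size": "500",      # optional; defaults to 500 when absent
    }
    files, total_keywords = get_batches(example_args)
    print(f"Wrote {len(files)} batch files covering {total_keywords} keywords")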