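# Paste-site metadata (not part of the pseudo-code):
#   title:    trevor_linkage_pseudo_code
#   author:   unknown
#   language: python
#   age:      2 years ago
#   size:     1.7 kB
#   6
#   Indexable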
# Record-linkage pipeline (pseudo-code): de-duplicate the source data,
# vectorise two copies of it, index one copy, block the other copy against
# that index, then compare the records inside each block.
#
# NOTE(review): quick_dedup, create_blocks and compare_records are defined
# further down in this file, so these statements have to execute after those
# definitions (e.g. moved to the bottom of the file or under a main guard)
# before this can run as real Python.
# NOTE(review): Faiss, HQ_SC, vectorise and columns_to_vectorise are not
# defined anywhere in this file -- presumably supplied elsewhere; confirm.

faiss = Faiss()  # https://ai.meta.com/tools/faiss/

dataset_to_deduplicate = HQ_SC
deduped = quick_dedup(dataset_to_deduplicate)

# Two copies of the de-duplicated data: src_1 is the query side, src_2 is the
# side that gets indexed.
src_1 = deduped.copy()
src_2 = deduped.copy()
src1_vectorised = vectorise(src_1, columns_to_vectorise)
src2_vectorised = vectorise(src_2, columns_to_vectorise)

# Field name -> comparison methods to apply to that field.
# The spelling "colums" is kept on purpose: compare_records reads this global
# under exactly this name.
comparison_colums = {"artnumber": ["jaro", "lev", "exact_match"]}

faiss_initialised = faiss.index(src2_vectorised)
blocks = create_blocks(src1_vectorised, faiss_initialised)
comparisons = compare_records(blocks)
def compare_records(blocks):
    """Compare every source record with each candidate record in its block.

    Args:
        blocks: Mapping of a source record to an iterable of the candidate
            (target) records it should be compared against.

    Returns:
        A list of ``(rec1, trg_record, score)`` tuples, one per compared
        pair, where ``score`` is whatever ``compare_items`` returned for
        that pair. An empty mapping (or only empty blocks) yields ``[]``.
    """
    results = []
    for rec1, items_in_block in blocks.items():
        # comparison_colums is the module-level comparison configuration.
        results.extend(
            (rec1, trg_record, compare_items(rec1, trg_record, comparison_colums))
            for trg_record in items_in_block
        )
    return results
def compare_items(rec1, trg_record, comparison_colums):
    """Score one source/target record pair on every configured field.

    Args:
        rec1: The source record.
        trg_record: The candidate (target) record to compare against.
        comparison_colums: Iterable of field names to compare; when a
            mapping is passed, its keys are used.

    Returns:
        A dict mapping each field name to the value returned by
        ``final_score_compute`` for that field. Empty configuration
        yields ``{}``.
    """
    # NOTE(review): the original assigned a string_similarity_matches(...)
    # result and overwrote it on the very next line, so that call is dropped
    # here -- confirm it has no side effects that are relied upon.
    scores = {}
    for comparison_field_type in comparison_colums:
        deter_score = deterministic_comparison(rec1, trg_record, comparison_field_type)
        jaro_w_score = jaro_score(rec1, trg_record, comparison_field_type)
        lev_score = leve_score(rec1, trg_record, comparison_field_type)
        # Pass the computed Jaro value, not the jaro_score function itself.
        scores[comparison_field_type] = final_score_compute(
            deter_score, jaro_w_score, lev_score
        )
    return scores
# Example: rec1 vs trg1 -> jaro_score: 10, lev_score: 9, deter_score: 2 => overall computation: 3
# result = {artnum_jaro: 8, artnum_lev: 6, artnum_deterministic: 10, fullname_jaro: 10, fullname_lev: 30, ...}
def create_blocks(src1_vectorised, faiss_initialised, k=20):
    """Build one candidate block per source record by querying the index.

    Args:
        src1_vectorised: Iterable of vectorised source records. Each record
            is used as a dict key, so it must be hashable
            (NOTE(review): confirm the vectorised records are hashable).
        faiss_initialised: Index object exposing ``search(record, k)``.
        k: Number of neighbours requested per record; defaults to 20, the
            value that was previously hard-coded.

    Returns:
        A dict mapping each source record to whatever
        ``faiss_initialised.search(record, k)`` returned for it.
    """
    return {rec1: faiss_initialised.search(rec1, k) for rec1 in src1_vectorised}
def quick_dedup(data_src):
    """Return ``data_src`` with duplicates removed.

    Duplicates are located with ``find_dups`` on the ``patientguid`` column
    and the result is handed to ``delete_dups``.

    Args:
        data_src: The dataset to de-duplicate.

    Returns:
        Whatever ``delete_dups`` returns for the duplicates that were found.
    """
    # The column is passed by name as a string; the original used a bare,
    # undefined identifier here.
    dups = find_dups(dup_col="patientguid", data=data_src)
    # NOTE(review): delete_dups only receives the duplicates, not data_src --
    # confirm it can produce the unique rows from that alone.
    return delete_dups(dups)