trevor_linkage_pseudo_code
unknown
python
a month ago
1.7 kB
1
Indexable
Never
# Record-linkage sketch: de-duplicate a dataset, vectorise it, use a FAISS
# index to build candidate blocks, then score every candidate pair.
#
# NOTE(review): this is pseudo-code. Faiss, HQ_SC, vectorise,
# columns_to_vectorise, find_dups, delete_dups, patientguid, artnumber, jaro,
# lev, exact_match and the scoring helpers (deterministic_comparison,
# jaro_score, leve_score, string_similarity_matches, final_score_compute) are
# not defined here and must be supplied elsewhere.


def quick_dedup(data_src):
    """Return ``data_src`` with duplicate records removed.

    Duplicates are located with ``find_dups`` using ``patientguid`` as the
    duplicate column, and the result of ``delete_dups`` is returned.

    Args:
        data_src: The dataset to de-duplicate.

    Returns:
        Whatever ``delete_dups`` returns for the detected duplicates.
    """
    dups = find_dups(dup_col=patientguid, data=data_src)
    unique_results = delete_dups(dups)
    return unique_results


def create_blocks(src1_vectorised, faiss_initialised, k=20):
    """Build one candidate block per source record.

    Args:
        src1_vectorised: Iterable of vectorised source records. Each record is
            used as a dict key -- presumably hashable; TODO confirm.
        faiss_initialised: Index object exposing ``search(record, k)``.
        k: Second argument passed to ``search`` (defaults to 20, the value the
            original sketch hard-coded).

    Returns:
        A dict mapping each source record to the result of its search.
    """
    # The original wrote into an undefined ``blocks`` name; build it locally.
    return {rec1: faiss_initialised.search(rec1, k) for rec1 in src1_vectorised}


def compare_items(rec1, trg_record, comparison_colums):
    """Score one candidate pair over the configured comparison fields.

    For each field in ``comparison_colums`` the deterministic, Jaro and
    Levenshtein helpers are called and their results are combined with
    ``final_score_compute``.

    Args:
        rec1: The source record.
        trg_record: The candidate record from the block of ``rec1``.
        comparison_colums: Iterable of comparison fields (iterating a dict
            yields its keys).

    Returns:
        The combined score of the last field iterated, or ``None`` when
        ``comparison_colums`` is empty.
    """
    # NOTE(review): only the last field's score survives the loop. The worked
    # example below suggests a per-field result may have been intended --
    # confirm before relying on multi-field configurations.
    score = None
    for comparison_field_type in comparison_colums:
        deter_score = deterministic_comparison(
            rec1, trg_record, comparison_field_type
        )
        jaro_w_score = jaro_score(rec1, trg_record, comparison_field_type)
        lev_score = leve_score(rec1, trg_record, comparison_field_type)
        # NOTE(review): the original sketch made this call and then discarded
        # its result; the call is kept, but it does not feed the final score.
        string_similarity_matches(rec1, trg_record, comparison_field_type)
        score = final_score_compute(deter_score, jaro_w_score, lev_score)
    return score


# Worked example from the original notes:
#   rec1 vs trg1: jaro_score: 10, lev_score: 9, deter_score: 2
#       => overall computation: 3
#   result = {artnum_jaro: 8, artnum_lev: 6, artnum_deterministic: 10,
#             fullname_jaro: 10, fullname_lev: 30, ...}


def compare_records(blocks):
    """Score every (source record, candidate) pair found in ``blocks``.

    Args:
        blocks: Mapping of source record to an iterable of candidate records,
            as produced by ``create_blocks``.

    Returns:
        A list of ``(rec1, trg_record, score)`` tuples, one per compared pair.
        The original sketch discarded the scores and returned nothing.
    """
    comparisons = []
    for rec1, items_in_block in blocks.items():
        for trg_record in items_in_block:
            # Reads the module-level ``comparison_colums`` configuration.
            score = compare_items(rec1, trg_record, comparison_colums)
            comparisons.append((rec1, trg_record, score))
    return comparisons


# --- Pipeline -------------------------------------------------------------

faiss = Faiss()  # https://ai.meta.com/tools/faiss/
dataset_to_deduplicate = HQ_SC
deduped = quick_dedup(dataset_to_deduplicate)

# Both sides of the linkage are copies of the same de-duplicated data.
src_1 = deduped.copy()
src_2 = deduped.copy()
src1_vectorised = vectorise(src_1, columns_to_vectorise)
src2_vectorised = vectorise(src_2, columns_to_vectorise)

# The misspelt name is kept on purpose: compare_records reads it.
comparison_colums = {artnumber: [jaro, lev, exact_match]}

faiss_initialised = faiss.index(src2_vectorised)
blocks = create_blocks(src1_vectorised, faiss_initialised)
comparisons = compare_records(blocks)