nord vpnnord vpn
Ad

trevor_linkage_pseudo_code

mail@pastecode.io avatar
unknown
python
a month ago
1.7 kB
1
Indexable
Never
# Record-linkage pipeline sketch: de-duplicate the dataset, vectorise two
# copies of it, index one copy with FAISS, then build candidate blocks and
# compare the records inside each block.
# FAISS reference: https://ai.meta.com/tools/faiss/
#
# NOTE(review): quick_dedup, create_blocks and compare_records are defined
# further down the file; in a runnable script the calls below have to execute
# after those definitions.
faiss = Faiss()

dataset_to_deduplicate = HQ_SC

deduped = quick_dedup(dataset_to_deduplicate)

# Both sides of the comparison start as copies of the same de-duplicated data.
src1 = deduped.copy()
src2 = deduped.copy()

src1_vectorised = vectorise(src1, columns_to_vectorise)
src2_vectorised = vectorise(src2, columns_to_vectorise)


# Presumably: field name -> comparison methods to apply to that field.
# NOTE(review): the misspelled name is kept because compare_records reads this
# global as `comparison_colums`; rename both together.
comparison_colums = {"artnumber": ["jaro", "lev", "exact_match"]}

faiss_initialised = faiss.index(src2_vectorised)


blocks = create_blocks(src1_vectorised, faiss_initialised)
comparisons = compare_records(blocks)

def compare_records(blocks):
    """Score every source record against each candidate in its block.

    Args:
        blocks: Mapping of source record -> iterable of candidate records.

    Returns:
        A list of ``(rec1, trg_record, score)`` tuples, one per compared
        pair, where ``score`` is whatever ``compare_items`` returned for
        that pair. The list is empty when ``blocks`` is empty.
    """
    comparisons = []
    for rec1, items_in_block in blocks.items():
        for trg_record in items_in_block:
            # The comparison configuration comes from the module-level
            # ``comparison_colums`` global (spelling as used in this file).
            score = compare_items(rec1, trg_record, comparison_colums)
            comparisons.append((rec1, trg_record, score))
    return comparisons

def compare_items(rec1, trg_record, comparison_colums):
    """Score one source record against one candidate record.

    Args:
        rec1: The source record.
        trg_record: The candidate record being compared with ``rec1``.
        comparison_colums: Iterable of fields to compare. When a dict is
            passed, iteration yields its keys.

    Returns:
        The value of ``final_score_compute`` for the last field compared, or
        ``None`` when ``comparison_colums`` is empty.
    """
    # NOTE(review): with several fields only the last field's score survives;
    # confirm whether the per-field scores should be aggregated or returned
    # individually.
    score = None
    for comparison_field_type in comparison_colums:
        deter_score = deterministic_comparison(rec1, trg_record, comparison_field_type)
        jaro_w_score = jaro_score(rec1, trg_record, comparison_field_type)
        lev_score = leve_score(rec1, trg_record, comparison_field_type)
        # NOTE(review): this result is not part of the final score (it was
        # overwritten immediately before); confirm whether the call should
        # be removed or fed into final_score_compute.
        string_similarity_matches(rec1, trg_record, comparison_field_type)
        # Pass the computed Jaro value, not the jaro_score function itself.
        score = final_score_compute(deter_score, jaro_w_score, lev_score)
    return score
# Example - rec1 vs trg1: jaro_score: 10, lev_score: 9, deter_score: 2 => overall computation: 3
# result = {artnum_jaro: 8, artnum_lev: 6, artnum_deterministic: 10, fullname_jaro: 10, fullname_lev: 30, ...}

def create_blocks(src1_vectorised, faiss_initialised, k=20):
    """Build one candidate block per source record by querying the index.

    Args:
        src1_vectorised: Iterable of source records to look up. Each record
            is used as a dict key, so it must be hashable.
        faiss_initialised: Object exposing ``search(record, k)``.
        k: Number of neighbours requested per record (default 20).

    Returns:
        Dict mapping each source record to whatever
        ``faiss_initialised.search(record, k)`` returned for it. Empty when
        ``src1_vectorised`` is empty.
    """
    # NOTE(review): raw vectors such as numpy arrays are not hashable; if
    # that is what src1_vectorised holds, key the blocks by a record id.
    blocks = {}
    for rec1 in src1_vectorised:
        blocks[rec1] = faiss_initialised.search(rec1, k)
    return blocks



def quick_dedup(data_src, dup_col="patientguid"):
    """Remove duplicate records from ``data_src``.

    Args:
        data_src: The dataset to de-duplicate.
        dup_col: Name of the column used to detect duplicates. Defaults to
            ``"patientguid"``, the column this function previously
            hard-coded.

    Returns:
        Whatever ``delete_dups`` returns for the duplicates found by
        ``find_dups``.
    """
    # NOTE(review): the column was previously written as a bare, undefined
    # name; it is assumed here to be a column label — confirm against
    # find_dups.
    dups = find_dups(dup_col=dup_col, data=data_src)
    unique_results = delete_dups(dups)
    return unique_results

nord vpnnord vpn
Ad