# Paste metadata captured from pastecode.io (not part of the program):
# Untitled
#
# mail@pastecode.io avatar
# unknown
# python
# a year ago
# 2.9 kB
# 3
# Indexable
# Never
def _find_target_and_query(aids, types, target_type):
    """Locate the target event and the event right before it in a session.

    The session is walked from its last element towards its first. Events
    are skipped until one whose type equals ``target_type`` is met; the aid
    of that event is the target. The next event visited by the walk (the one
    stored immediately before the target, whatever its type) is the query.

    Args:
        aids: Sequence of item ids of the session.
        types: Sequence of event types, parallel to ``aids``.
        target_type: Event-type value that identifies the target event.

    Returns:
        ``(target_aid, query_aid)``, or ``None`` when the session holds no
        event of ``target_type`` or nothing is stored before that event.
    """
    target_found = False
    target_aid = None
    for aid, event_type in zip(aids[::-1], types[::-1]):
        if target_found:
            return target_aid, aid
        if event_type == target_type:
            target_aid = aid
            target_found = True
    return None


def gen_by_emb(df, max_candidates=40):
    """Build embedding dot-product features and labels for one session.

    Reads the module-level globals ``embeds`` (iterable of embedding tables,
    each ``dict[int] -> np.array((dim,))``), ``cand_dict`` (session -> list
    of candidate ids), ``type_dict`` (event-type name -> value) and ``args``
    (``args.type`` selects the target event type, ``args.mode`` the return
    shape).

    The candidate list is the session's own aids (de-duplicated, last
    occurrence first, as strings) followed by ``cand_dict[session]``, cut to
    ``max_candidates`` entries. For every embedding table the query aid's
    vector is dotted with every candidate vector (a zero vector stands in
    for candidates missing from the table), and three columns are emitted
    per table: the dot product, its row-wise sum and its row-wise max.

    NOTE(review): when the query aid has no vector in a table, the labels
    for that table are all zeros, and the returned labels are those of the
    last table only. This mirrors the previous behaviour; confirm it is
    intended.

    Args:
        df: Indexable record with the session key at ``df[0]``, the aids at
            ``df[1]`` and the parallel event types at ``df[2]``.
        max_candidates: Maximum number of candidates kept (default 40, the
            value that used to be hard-coded).

    Returns:
        ``(feas, truths, candidates)``. ``feas`` has one row per candidate
        and three columns per embedding table; ``truths`` is a column vector
        with 1 for the candidate equal to the target aid. The third element
        is the candidate list unless ``args.mode == 'train'``, in which case
        it is ``[]``. ``([], [], [])`` is returned when the session has no
        usable target/query pair.

    Raises:
        ValueError: If an embedding table is empty or its values do not
            expose a ``shape`` with at least one dimension.
    """
    global embeds, cand_dict, type_dict

    session = df[0]
    aids = df[1]
    types = df[2]

    # dict.fromkeys keeps first-seen order, so iterating the reversed aids
    # yields each aid once, ordered by its last occurrence.
    unique_aids = [str(aid) for aid in dict.fromkeys(aids[::-1])]
    candidates = (unique_aids + cand_dict[session])[:max_candidates]
    n_cand = len(candidates)

    # None of the following depends on the embedding table: compute it once.
    cand_ids = [int(cand) for cand in candidates]
    pair = _find_target_and_query(aids, types, type_dict[args.type])

    truths = []
    blocks = []
    for embed in embeds:
        # Checked inside the loop so that an empty ``embeds`` still falls
        # through to the final return, exactly as before.
        if pair is None:
            return [], [], []
        target_aid, query_aid = pair

        try:
            dim = next(iter(embed.values())).shape[0]
        except (StopIteration, AttributeError, IndexError) as err:
            raise ValueError(
                "embedding table is empty or its values are not arrays "
                "with at least one dimension"
            ) from err

        zero_vec = np.zeros(dim)
        cand_emb = np.array(
            [embed[cid] if cid in embed else zero_vec for cid in cand_ids]
        )

        if query_aid in embed:
            # Row-wise dot product between the query and every candidate.
            dots = np.sum(embed[query_aid] * cand_emb, axis=1)
            dots = dots.reshape((n_cand, 1))
            # Candidates are strings while aids are used as int keys above;
            # compare string forms so the positive candidate is matched.
            target_key = str(target_aid)
            labels = [
                1. if str(cand) == target_key else 0.
                for cand in candidates
            ]
        else:
            dots = np.full((n_cand, 1), 0.)
            labels = [0] * n_cand

        truths = np.expand_dims(np.hstack(labels), axis=1)

        blocks.append(
            np.hstack(
                (
                    dots,
                    np.sum(dots, axis=1, keepdims=True),
                    np.amax(dots, axis=1, keepdims=True),
                )
            )
        )

    # Horizontally combine the feature columns of all embedding tables.
    feas = np.hstack(blocks) if blocks else []

    if args.mode != 'train':
        return feas, truths, candidates
    return feas, truths, []