# Untitled

def ner_to_bio(self, entities: list, text: str) -> list:
    """Project character-level entity spans onto space-separated tokens.

    ``text`` is split on single spaces and every token starts out tagged
    ``'O'``. For each entity, every token overlapped by the character span
    ``[start, end)`` receives the entity's label.

    The label is copied verbatim from ``entity['entity']``: no ``B-`` or
    ``I-`` prefix is added here, so the first and the following tokens of a
    span all get the same string. Later entities overwrite earlier ones on
    tokens they share.

    Args:
        entities: Mappings with at least the keys ``'start'`` and ``'end'``
            (character offsets into ``text``) and ``'entity'`` (the label).
            Any other keys, such as ``'score'``, are ignored.
        text: The text the offsets refer to.

    Returns:
        A list with one tag per token of ``text.split(" ")``.
    """
    from bisect import bisect_left, bisect_right
    from itertools import accumulate

    tokens = text.split(" ")
    bio_tags = ['O'] * len(tokens)

    # Running character position just past each token *and* the single
    # separator that follows it. Computed once instead of once per entity.
    # Strictly increasing, because every token contributes at least 1.
    boundaries = list(accumulate(len(token) + 1 for token in tokens))

    for entity in entities:
        start = entity['start']
        end = entity['end']
        entity_type = entity['entity']

        # First token whose boundary lies strictly after the span start.
        token_start = bisect_right(boundaries, start)
        # First token whose boundary reaches the span end.
        token_end = bisect_left(boundaries, end)

        # Skip spans that run past the text or that collapse to nothing.
        if token_end >= len(tokens) or token_start > token_end:
            continue

        span_length = token_end - token_start + 1
        bio_tags[token_start:token_end + 1] = [entity_type] * span_length

    return bio_tags
# Editor is loading...