Untitled

 avatar
unknown
plain_text
4 months ago
3.8 kB
2
Indexable
def convert_to_bio(text, tokens):
    """Project character-level NER predictions onto whitespace-separated words.

    Args:
        text: The original string the predictions refer to.
        tokens: Iterable of dicts, each providing the keys ``'entity'`` (the
            tag to assign), ``'start'`` and ``'end'`` (character offsets into
            ``text``, treated as a half-open range) and ``'score'``.

    Returns:
        A tuple ``(bio_tags, filtered_scores)``. ``bio_tags`` holds one tag
        per word of ``text.split()``, with ``'O'`` for words no prediction
        touched. ``filtered_scores`` holds, in word order, the score of every
        word whose tag is not ``'O'``.

    Raises:
        ValueError: If a prediction's start or end offset does not fall
            inside any word of ``text``.

    When several predictions touch the same word (for example sub-word
    pieces), the last one in ``tokens`` supplies both the tag and the score
    for that word.
    """
    words = text.split()

    # Locate every word's real character span. Searching from a moving cursor
    # (instead of assuming a single space between words) keeps the offsets
    # correct for leading, repeated or non-space whitespace.
    word_spans = []
    cursor = 0
    for word in words:
        begin = text.find(word, cursor)
        cursor = begin + len(word)
        word_spans.append((begin, cursor))

    bio_tags = ['O'] * len(words)
    # One score slot per word, so scores stay aligned with bio_tags.
    word_scores = [None] * len(words)

    for token in tokens:
        entity = token['entity']
        start_char = token['start']
        end_char = token['end']
        score = token['score']

        # First word containing the start offset.
        start_idx = next(
            (idx for idx, (begin, stop) in enumerate(word_spans)
             if begin <= start_char < stop),
            None,
        )
        # First word containing the (exclusive) end offset.
        end_idx = next(
            (idx for idx, (begin, stop) in enumerate(word_spans)
             if begin < end_char <= stop),
            None,
        )

        if start_idx is None or end_idx is None:
            raise ValueError(f"Entity with start {start_char} and end {end_char} could not be matched to tokens in the text.")

        # Tag every word the prediction spans and remember its score in the
        # same slot, so tag and score always come from the same prediction.
        for idx in range(start_idx, end_idx + 1):
            bio_tags[idx] = entity
            word_scores[idx] = score

    # Keep only the scores of words that ended up with a non-'O' tag.
    filtered_scores = [
        score for score, tag in zip(word_scores, bio_tags) if tag != 'O'
    ]

    return bio_tags, filtered_scores


# Sample sentence; the 'start'/'end' values in list_dicts below are character
# offsets into this string (e.g. text[20:23] == 'art').
text = 'altera a redação do art. 487 da consolidação das leis do trabalho - clt, a fim de dispor sobre o aviso prévio proporcional.'

# Sample token-level predictions for `text`, one dict per sub-word piece.
list_dicts = [
    {'entity': 'I-FUNDAMENTO', 'score': 0.5929501, 'index': 6, 'word': 'art', 'start': 20, 'end': 23},
    {'entity': 'I-FUNDAMENTO', 'score': 0.85288566, 'index': 7, 'word': '.', 'start': 23, 'end': 24},
    {'entity': 'I-FUNDAMENTO', 'score': 0.88864744, 'index': 8, 'word': '48', 'start': 25, 'end': 27},
    {'entity': 'I-FUNDAMENTO', 'score': 0.9004599, 'index': 9, 'word': '##7', 'start': 27, 'end': 28},
    {'entity': 'I-FUNDAMENTO', 'score': 0.8508824, 'index': 10, 'word': 'da', 'start': 29, 'end': 31},
    {'entity': 'I-FUNDAMENTO', 'score': 0.8259002, 'index': 11, 'word': 'consolidação', 'start': 32, 'end': 44},
    {'entity': 'I-FUNDAMENTO', 'score': 0.8576865, 'index': 12, 'word': 'das', 'start': 45, 'end': 48},
    {'entity': 'I-FUNDAMENTO', 'score': 0.8123742, 'index': 13, 'word': 'leis', 'start': 49, 'end': 53},
    {'entity': 'I-FUNDAMENTO', 'score': 0.839848, 'index': 14, 'word': 'do', 'start': 54, 'end': 56},
    {'entity': 'I-FUNDAMENTO', 'score': 0.80648446, 'index': 15, 'word': 'trabalho', 'start': 57, 'end': 65},
    {'entity': 'I-FUNDAMENTO', 'score': 0.82544446, 'index': 16, 'word': '-', 'start': 66, 'end': 67},
    {'entity': 'I-FUNDAMENTO', 'score': 0.83774996, 'index': 17, 'word': 'cl', 'start': 68, 'end': 70},
    {'entity': 'I-FUNDAMENTO', 'score': 0.8439327, 'index': 18, 'word': '##t', 'start': 70, 'end': 71},
]
Editor is loading...
Leave a Comment