Untitled
unknown
plain_text
4 months ago
3.8 kB
2
Indexable
def _word_spans(text, words):
    """Return the (start, end) character span of each word inside ``text``."""
    spans = []
    cursor = 0
    for word in words:
        # Search forward from the end of the previous word so that repeated
        # words and arbitrary whitespace runs (double spaces, newlines,
        # leading blanks) still produce the true character offsets.
        start = text.index(word, cursor)
        cursor = start + len(word)
        spans.append((start, cursor))
    return spans


def convert_to_bio(text, tokens):
    """Project token-level NER predictions onto whitespace-separated words.

    Args:
        text: The string the predictions refer to.
        tokens: Iterable of dicts, each providing 'entity' (the tag string),
            'score', and 'start' / 'end' character offsets into ``text``
            ('end' is exclusive).

    Returns:
        A ``(bio_tags, filtered_scores)`` tuple. ``bio_tags`` holds one tag
        per word of ``text.split()`` and defaults to ``'O'``.
        ``filtered_scores`` holds, in word order, the score of every word
        whose tag is not ``'O'``. When several tokens land on the same word
        (e.g. sub-word pieces), the last one processed sets both the tag and
        the score of that word.

    Raises:
        ValueError: If a token's start or end offset does not fall inside
            any word of ``text``.
    """
    words = text.split()
    spans = _word_spans(text, words)

    bio_tags = ['O'] * len(words)
    # One slot per word, so scores stay aligned with bio_tags by position.
    word_scores = [None] * len(words)

    for token in tokens:
        entity = token['entity']
        start_char = token['start']
        end_char = token['end']
        score = token['score']

        # Word containing the first character of the entity.
        start_idx = next(
            (i for i, (lo, hi) in enumerate(spans) if lo <= start_char < hi),
            None,
        )
        # Word containing the last character of the entity (end is exclusive).
        end_idx = next(
            (i for i, (lo, hi) in enumerate(spans) if lo < end_char <= hi),
            None,
        )

        if start_idx is None or end_idx is None:
            raise ValueError(f"Entity with start {start_char} and end {end_char} could not be matched to tokens in the text.")

        for idx in range(start_idx, end_idx + 1):
            bio_tags[idx] = entity
            word_scores[idx] = score

    filtered_scores = [
        score for score, tag in zip(word_scores, bio_tags) if tag != 'O'
    ]
    return bio_tags, filtered_scores


# Sample input: a sentence and the token-classification output for it.
text = 'altera a redação do art. 487 da consolidação das leis do trabalho - clt, a fim de dispor sobre o aviso prévio proporcional.'
list_dicts = [
    {'entity': 'I-FUNDAMENTO', 'score': 0.5929501, 'index': 6, 'word': 'art', 'start': 20, 'end': 23},
    {'entity': 'I-FUNDAMENTO', 'score': 0.85288566, 'index': 7, 'word': '.', 'start': 23, 'end': 24},
    {'entity': 'I-FUNDAMENTO', 'score': 0.88864744, 'index': 8, 'word': '48', 'start': 25, 'end': 27},
    {'entity': 'I-FUNDAMENTO', 'score': 0.9004599, 'index': 9, 'word': '##7', 'start': 27, 'end': 28},
    {'entity': 'I-FUNDAMENTO', 'score': 0.8508824, 'index': 10, 'word': 'da', 'start': 29, 'end': 31},
    {'entity': 'I-FUNDAMENTO', 'score': 0.8259002, 'index': 11, 'word': 'consolidação', 'start': 32, 'end': 44},
    {'entity': 'I-FUNDAMENTO', 'score': 0.8576865, 'index': 12, 'word': 'das', 'start': 45, 'end': 48},
    {'entity': 'I-FUNDAMENTO', 'score': 0.8123742, 'index': 13, 'word': 'leis', 'start': 49, 'end': 53},
    {'entity': 'I-FUNDAMENTO', 'score': 0.839848, 'index': 14, 'word': 'do', 'start': 54, 'end': 56},
    {'entity': 'I-FUNDAMENTO', 'score': 0.80648446, 'index': 15, 'word': 'trabalho', 'start': 57, 'end': 65},
    {'entity': 'I-FUNDAMENTO', 'score': 0.82544446, 'index': 16, 'word': '-', 'start': 66, 'end': 67},
    {'entity': 'I-FUNDAMENTO', 'score': 0.83774996, 'index': 17, 'word': 'cl', 'start': 68, 'end': 70},
    {'entity': 'I-FUNDAMENTO', 'score': 0.8439327, 'index': 18, 'word': '##t', 'start': 70, 'end': 71},
]
Editor is loading...
Leave a Comment