Untitled
unknown
plain_text
10 months ago
3.8 kB
3
Indexable
def convert_to_bio(text, tokens):
    """Project character-level NER predictions onto whitespace-separated words.

    Args:
        text: The original string the predictions refer to.
        tokens: Iterable of dicts, each providing the keys ``'entity'`` (the
            tag to assign, e.g. ``'B-PER'``), ``'start'`` and ``'end'``
            (character offsets into ``text``, end exclusive) and ``'score'``.
            Presumably the output of a token-classification pipeline --
            confirm against the caller.

    Returns:
        A ``(bio_tags, filtered_scores)`` tuple. ``bio_tags`` holds one tag
        per word of ``text.split()``, defaulting to ``'O'``.
        ``filtered_scores`` holds one score per word whose tag is not ``'O'``,
        in word order. When several predictions cover the same word, the last
        one wins for both the tag and the score.

    Raises:
        ValueError: If the start or end offset of a prediction does not fall
            inside any word (for example, it points at whitespace).
    """
    # NOTE: debug output kept as-is so the function's stdout does not change.
    print(f'ESSE EH O TEXT {text}')
    print(f'ESSE EH O TOKENS {tokens}')

    words = text.split()
    bio_tags = ['O'] * len(words)
    # One score slot per word, kept in lockstep with bio_tags so that the
    # final filtering pairs every score with the tag of the same word.
    word_scores = [None] * len(words)

    # Locate each word's real span in the text. Searching from the end of the
    # previous word (instead of assuming a single separating space) stays
    # correct with leading whitespace, repeated spaces, tabs and newlines.
    word_positions = []
    cursor = 0
    for word in words:
        word_start = text.index(word, cursor)
        cursor = word_start + len(word)
        word_positions.append((word_start, cursor))

    for token in tokens:
        entity = token['entity']
        start_char = token['start']
        end_char = token['end']
        score = token['score']

        # First word whose half-open span [start, end) contains start_char.
        start_idx = next(
            (
                idx
                for idx, (start, end) in enumerate(word_positions)
                if start <= start_char < end
            ),
            None,
        )
        # First word whose span (start, end] contains the exclusive end offset.
        end_idx = next(
            (
                idx
                for idx, (start, end) in enumerate(word_positions)
                if start < end_char <= end
            ),
            None,
        )

        if start_idx is None or end_idx is None:
            raise ValueError(f"Entity with start {start_char} and end {end_char} could not be matched to tokens in the text.")

        # Tag every word the prediction spans and remember its score.
        for idx in range(start_idx, end_idx + 1):
            bio_tags[idx] = entity
            word_scores[idx] = score

    # Keep only the scores of words that ended up with an entity tag.
    filtered_scores = [
        score for score, tag in zip(word_scores, bio_tags) if tag != 'O'
    ]
    return bio_tags, filtered_scores
# Sample input: a sentence and the token-level NER predictions for it.
# Each prediction's 'start'/'end' are character offsets into `text`.
text = (
    'altera a redação do art. 487 da consolidação das leis do trabalho - clt, '
    'a fim de dispor sobre o aviso prévio proporcional.'
)
list_dicts = [
    {'entity': 'I-FUNDAMENTO', 'score': 0.5929501, 'index': 6, 'word': 'art', 'start': 20, 'end': 23},
    {'entity': 'I-FUNDAMENTO', 'score': 0.85288566, 'index': 7, 'word': '.', 'start': 23, 'end': 24},
    {'entity': 'I-FUNDAMENTO', 'score': 0.88864744, 'index': 8, 'word': '48', 'start': 25, 'end': 27},
    {'entity': 'I-FUNDAMENTO', 'score': 0.9004599, 'index': 9, 'word': '##7', 'start': 27, 'end': 28},
    {'entity': 'I-FUNDAMENTO', 'score': 0.8508824, 'index': 10, 'word': 'da', 'start': 29, 'end': 31},
    {'entity': 'I-FUNDAMENTO', 'score': 0.8259002, 'index': 11, 'word': 'consolidação', 'start': 32, 'end': 44},
    {'entity': 'I-FUNDAMENTO', 'score': 0.8576865, 'index': 12, 'word': 'das', 'start': 45, 'end': 48},
    {'entity': 'I-FUNDAMENTO', 'score': 0.8123742, 'index': 13, 'word': 'leis', 'start': 49, 'end': 53},
    {'entity': 'I-FUNDAMENTO', 'score': 0.839848, 'index': 14, 'word': 'do', 'start': 54, 'end': 56},
    {'entity': 'I-FUNDAMENTO', 'score': 0.80648446, 'index': 15, 'word': 'trabalho', 'start': 57, 'end': 65},
    {'entity': 'I-FUNDAMENTO', 'score': 0.82544446, 'index': 16, 'word': '-', 'start': 66, 'end': 67},
    {'entity': 'I-FUNDAMENTO', 'score': 0.83774996, 'index': 17, 'word': 'cl', 'start': 68, 'end': 70},
    {'entity': 'I-FUNDAMENTO', 'score': 0.8439327, 'index': 18, 'word': '##t', 'start': 70, 'end': 71},
]
Leave a Comment