import copy

# `tokenizer` is assumed to be a Hugging Face tokenizer loaded elsewhere,
# e.g. via transformers.AutoTokenizer.from_pretrained(...).

def tokenize_function_wo_promtloss(examples, max_length=220):
    inputs = examples['prompt']
    outputs = examples['completion']
    # Concatenate prompt + completion and terminate each example with EOS.
    texts = [inp + out + tokenizer.eos_token for inp, out in zip(inputs, outputs)]
    tokenized = tokenizer(texts, padding='max_length',
                          truncation=True, max_length=max_length)
    # Deep-copy input_ids so the label edits below do not mutate the inputs.
    labels = copy.deepcopy(tokenized['input_ids'])
    for i, input_ids in enumerate(tokenized['input_ids']):
        attention_mask = tokenized['attention_mask'][i]
        prompt_length = len(tokenizer.encode(inputs[i], add_special_tokens=False))
        # First non-padding position, so left-padded sequences are handled too.
        start_position = attention_mask.index(1)
        # Mask the prompt so it contributes no loss; the +1 covers one extra
        # leading special token (e.g., a BOS prepended by the tokenizer).
        labels[i][start_position:start_position + prompt_length + 1] = \
            [-100] * (prompt_length + 1)
        # Mask padding tokens.
        for j, mask in enumerate(attention_mask):
            if mask == 0:
                labels[i][j] = -100
        # Pre-shift labels for next-token prediction. Note that Hugging Face
        # causal-LM models shift labels internally, so keep this line only if
        # the training loop computes the loss itself without shifting.
        labels[i] = labels[i][1:] + [-100]
    tokenized['labels'] = labels
    return tokenized
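
# --- Usage sketch (illustrative, not from the original paste) ---
# A minimal example assuming a `datasets.Dataset` with 'prompt' and
# 'completion' columns; the model name and example rows are placeholders.
from datasets import Dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 ships without a pad token

dataset = Dataset.from_dict({
    'prompt': ['Translate to French: cat\n'],
    'completion': ['chat'],
})

tokenized_dataset = dataset.map(
    tokenize_function_wo_promtloss,
    batched=True,
    remove_columns=['prompt', 'completion'],
)

# Sanity check: the unmasked labels should decode to (roughly) the completion
# plus EOS. Exact boundaries depend on whether the tokenizer prepends a BOS
# token, since the masking above assumes one extra leading special token.
row = tokenized_dataset[0]
kept = [lab for lab in row['labels'] if lab != -100]
print(tokenizer.decode(kept))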