import copy


def tokenize_function_wo_promptloss(examples, max_length=220):
    # NOTE: relies on a `tokenizer` defined in the enclosing scope.
    inputs = examples['prompt']
    outputs = examples['completion']
    texts = [inp + out + tokenizer.eos_token for inp, out in zip(inputs, outputs)]
    tokenized = tokenizer(texts, padding='max_length', truncation=True, max_length=max_length)

    # Create labels by deep-copying input_ids so masking does not modify the original input_ids
    labels = copy.deepcopy(tokenized['input_ids'])
    for i, input_ids in enumerate(tokenized['input_ids']):
        attention_mask = tokenized['attention_mask'][i]
        prompt_length = len(tokenizer.encode(inputs[i], add_special_tokens=False))
        # Find the actual start of the real tokens in the padded sequence
        start_position = attention_mask.index(1)
        # Mask the prompt tokens, plus one extra token following the prompt
        labels[i][start_position:start_position + prompt_length + 1] = [-100] * (prompt_length + 1)
        # Mask padding tokens
        for j, mask in enumerate(attention_mask):
            if mask == 0:
                labels[i][j] = -100
        # Shift labels left by one and pad the end with -100, so labels[t]
        # is the token the model should predict at step t
        labels[i] = labels[i][1:] + [-100]

    tokenized['labels'] = labels
    return tokenized
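
A minimal usage sketch, assuming a Hugging Face datasets.Dataset with 'prompt' and 'completion' columns and the global `tokenizer` the function expects. The model name and example rows below are placeholders, not taken from the original paste:

from datasets import Dataset
from transformers import AutoTokenizer

# Hypothetical tokenizer choice; the paste does not name a model
tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token by default

data = Dataset.from_dict({
    'prompt': ['Translate to French: cat\n'],
    'completion': ['chat'],
})

# batched=True passes a dict of column lists, matching the function's signature
tokenized_data = data.map(
    tokenize_function_wo_promptloss,
    batched=True,
    remove_columns=['prompt', 'completion'],
)
print(tokenized_data[0]['labels'][:10])  # prompt positions should be -100

Because the loss on prompt positions is set to -100 (ignored by PyTorch cross-entropy), training only optimizes the completion tokens; note the function shifts labels itself, so it is suited to a training loop that does not re-shift them internally.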