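The function below assumes a tokenizer is already in scope. A minimal setup sketch, assuming the Hugging Face transformers library (the checkpoint name is a placeholder, not from the original snippet):

from transformers import AutoTokenizer

# Placeholder checkpoint; substitute the model you are actually fine-tuning
tokenizer = AutoTokenizer.from_pretrained('gpt2')
# Many causal LM tokenizers ship without a pad token; reuse EOS for padding
tokenizer.pad_token = tokenizer.eos_token
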
import copy

def tokenize_function_wo_promtloss(examples, max_length=220):
    """Tokenize prompt/completion pairs so that loss is computed only on the
    completion: prompt and padding positions are masked with -100 in the labels."""
    inputs = examples['prompt']
    outputs = examples['completion']
    texts = [inp + out + tokenizer.eos_token for inp, out in zip(inputs, outputs)]
    tokenized = tokenizer(texts, padding='max_length',
                          truncation=True, max_length=max_length)

    # Create labels by deep-copying input_ids so the original input_ids are not modified
    labels = copy.deepcopy(tokenized['input_ids'])

    for i, input_ids in enumerate(tokenized['input_ids']):
        attention_mask = tokenized['attention_mask'][i]
        prompt_length = len(tokenizer.encode(inputs[i], add_special_tokens=False))

        # Find the first real (non-padding) token: index 0 with right padding,
        # the index of the first 1 in the attention mask with left padding
        start_position = attention_mask.index(1)

        # Mask the prompt tokens; the +1 covers one extra leading token (a BOS
        # token, if the tokenizer prepends one). Clamp the end index so the slice
        # assignment cannot grow the list past max_length when the prompt itself
        # was truncated.
        end_position = min(start_position + prompt_length + 1, len(labels[i]))
        labels[i][start_position:end_position] = [-100] * (end_position - start_position)

        # Mask padding tokens
        for j, mask in enumerate(attention_mask):
            if mask == 0:
                labels[i][j] = -100

        # Shift labels one position to the left, padding the end with -100.
        # Note: Hugging Face causal LM models shift labels internally, so keep
        # this manual shift only if the training loop does not shift for you.
        labels[i] = labels[i][1:] + [-100]

    tokenized['labels'] = labels
    return tokenized
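
A minimal usage sketch, assuming the Hugging Face datasets library; the toy data and the 'prompt'/'completion' column names below are illustrative, matching the keys the function reads:

from datasets import Dataset

# Illustrative toy data; in practice this would be your fine-tuning dataset
raw = Dataset.from_dict({
    'prompt': ['Translate to French: Hello', 'Translate to French: Goodbye'],
    'completion': [' Bonjour', ' Au revoir'],
})

# batched=True passes dicts of lists, matching the function's signature;
# remove_columns drops the raw text columns after tokenization
tokenized_ds = raw.map(tokenize_function_wo_promtloss, batched=True,
                       remove_columns=['prompt', 'completion'])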