# Install necessary libraries
!pip install transformers datasets


from datasets import load_dataset

# Load the SQuAD dataset
squad = load_dataset("squad")
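
# Quick look at the loaded dataset (optional sanity check): SQuAD v1.1 has
# "train" and "validation" splits, each row holding a question, a context
# paragraph and its answers.
print(squad)
print(squad["train"][0])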


from transformers import RobertaTokenizerFast

# Load the fast tokenizer (the slow RobertaTokenizer does not support return_offsets_mapping)
tokenizer = RobertaTokenizerFast.from_pretrained("deepset/roberta-base-squad2")

def preprocess_data(examples):
    # Strip the questions only; answer_start indices refer to the original,
    # unmodified context strings, so the contexts must be left untouched
    questions = [q.strip() for q in examples['question']]
    contexts = examples['context']

    # Tokenize question/context pairs; truncate only the context ("only_second")
    # and keep the character offsets so answer spans can be mapped to tokens
    inputs = tokenizer(
        questions,
        contexts,
        truncation="only_second",
        padding="max_length",
        max_length=384,
        return_offsets_mapping=True,
    )

    start_positions = []
    end_positions = []

    for i in range(len(examples['answers'])):
        answer = examples['answers'][i]['text'][0]
        start_char = examples['answers'][i]['answer_start'][0]
        end_char = start_char + len(answer)

        # Character offsets of each token; restrict the search to context tokens
        # (sequence id 1) so a token from the question is never matched by mistake
        offsets = inputs["offset_mapping"][i]
        sequence_ids = inputs.sequence_ids(i)
        context_token_ids = [idx for idx, sid in enumerate(sequence_ids) if sid == 1]
        context_start, context_end = context_token_ids[0], context_token_ids[-1]

        # If the answer was truncated away, label both positions with the first token
        if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Find the first and last context tokens that contain the answer characters
        start_position = next(
            idx for idx in range(context_start, context_end + 1)
            if offsets[idx][0] <= start_char and offsets[idx][1] > start_char
        )
        end_position = next(
            idx for idx in range(context_start, context_end + 1)
            if offsets[idx][0] < end_char and offsets[idx][1] >= end_char
        )

        start_positions.append(start_position)
        end_positions.append(end_position)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    # Offset mappings were only needed to compute the labels
    inputs.pop("offset_mapping")

    return inputs

# Preprocess the dataset
train_data = squad["train"].map(preprocess_data, batched=True, remove_columns=squad["train"].column_names)
validation_data = squad["validation"].map(preprocess_data, batched=True, remove_columns=squad["validation"].column_names)
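
# Optional sanity check (a small sketch): decode the tokens between the computed
# start/end positions for one training example and compare with the gold answer.
first_example = train_data[0]
recovered_span = tokenizer.decode(
    first_example["input_ids"][first_example["start_positions"]:first_example["end_positions"] + 1]
)
print("Gold answer:   ", squad["train"][0]["answers"]["text"][0])
print("Recovered span:", recovered_span)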


from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import torch

# Convert lists to tensors
train_dataset = TensorDataset(
    torch.tensor(train_data['input_ids']),
    torch.tensor(train_data['attention_mask']),
    torch.tensor(train_data['start_positions']),
    torch.tensor(train_data['end_positions'])
)

validation_dataset = TensorDataset(
    torch.tensor(validation_data['input_ids']),
    torch.tensor(validation_data['attention_mask']),
    torch.tensor(validation_data['start_positions']),
    torch.tensor(validation_data['end_positions'])
)

# Create DataLoaders
train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=8
)

validation_dataloader = DataLoader(
    validation_dataset,
    sampler=SequentialSampler(validation_dataset),
    batch_size=8
)
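
# Optional sanity check: pull one batch and confirm the tensor shapes
# (input_ids/attention_mask should be (8, 384); start/end positions (8,))
sample_batch = next(iter(train_dataloader))
print([t.shape for t in sample_batch])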


from transformers import RobertaForQuestionAnswering

# Load the model
model = RobertaForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")

from torch.optim import AdamW

# Set up the optimizer (transformers' own AdamW is deprecated; use the PyTorch one)
optimizer = AdamW(model.parameters(), lr=3e-5)

# Training loop
model.train()
epochs = 3

for epoch in range(epochs):
    for batch in train_dataloader:
        optimizer.zero_grad()
        
        # Unpack the inputs from the DataLoader
        input_ids, attention_mask, start_positions, end_positions = batch
        
        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs.loss
        
        # Backward pass
        loss.backward()
        optimizer.step()
        
        print(f"Epoch: {epoch}, Loss: {loss.item()}")


# Ensure the model is in evaluation mode
model.eval()

# Sample question and context from the validation set (you can also use your own)
sample_idx = 0  # change this index to look at a different validation example
sample = validation_data[sample_idx]
sample_input_ids = sample['input_ids']
sample_attention_mask = sample['attention_mask']
sample_context = squad['validation'][sample_idx]['context']
sample_question = squad['validation'][sample_idx]['question']

# Run the model on the sample
with torch.no_grad():
    outputs = model(input_ids=torch.tensor([sample_input_ids]), attention_mask=torch.tensor([sample_attention_mask]))
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits

# Get the most likely beginning and end of the answer
start_index = torch.argmax(start_logits)
end_index = torch.argmax(end_logits)

# Convert tokens to the answer string
tokens = tokenizer.convert_ids_to_tokens(sample_input_ids[start_index:end_index+1])
answer = tokenizer.convert_tokens_to_string(tokens)

# Print the results
print("Question:", sample_question)
print("Context:", sample_context)
print("\nPredicted Answer:", answer)


# Your custom question and context
custom_question = "What is RoBERTa?"
custom_context = "RoBERTa is an optimized version of BERT, which stands for Robustly Optimized BERT Approach. It was trained with more data and longer sequences, and is generally more robust for various NLP tasks."

# Tokenize the inputs
inputs = tokenizer(custom_question, custom_context, return_tensors='pt', truncation=True, padding="max_length", max_length=384)

# Run the model on the custom input
with torch.no_grad():
    outputs = model(**inputs)
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits

# Get the most likely beginning and end of the answer
start_index = torch.argmax(start_logits)
end_index = torch.argmax(end_logits)

# Convert tokens to the answer string
tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][start_index:end_index+1])
answer = tokenizer.convert_tokens_to_string(tokens)

# Print the results
print("Question:", custom_question)
print("Context:", custom_context)
print("\nPredicted Answer:", answer)


