Untitled
unknown
plain_text
a year ago
5.7 kB
8
Indexable
# Install necessary libraries
!pip install transformers datasets
from datasets import load_dataset

# Load the SQuAD dataset
squad = load_dataset("squad")

# The preprocessing step below asks the tokenizer for `return_offsets_mapping`,
# which only the fast (Rust-backed) tokenizers implement; the slow
# `RobertaTokenizer` raises NotImplementedError for that argument.
from transformers import RobertaTokenizerFast

# Load the tokenizer
tokenizer = RobertaTokenizerFast.from_pretrained("deepset/roberta-base-squad2")
def preprocess_data(examples):
    """Tokenize a batch of SQuAD examples and locate each answer's token span.

    Args:
        examples: A batch with the keys ``'question'``, ``'context'`` and
            ``'answers'``; each entry of ``'answers'`` holds parallel
            ``'text'`` and ``'answer_start'`` lists. Only the first answer
            of each example is used.

    Returns:
        A dict with the tokenizer outputs (minus ``'offset_mapping'``) plus
        ``'start_positions'`` and ``'end_positions'``: one token index per
        example. Both are 0 when the example has no answer or the answer
        cannot be located in the (possibly truncated) context tokens.
    """
    questions = [q.strip() for q in examples['question']]
    # Contexts are deliberately NOT stripped: `answer_start` is a character
    # index into the original context, so removing leading whitespace would
    # shift every offset.
    contexts = list(examples['context'])

    # Tokenize inputs
    inputs = tokenizer(
        questions,
        contexts,
        truncation=True,
        padding="max_length",
        max_length=384,
        return_offsets_mapping=True,
    )

    start_positions = []
    end_positions = []
    for i, answers in enumerate(examples['answers']):
        start_position = None
        end_position = None

        if answers['text']:
            answer = answers['text'][0]
            start_char = answers['answer_start'][0]
            end_char = start_char + len(answer)

            offsets = inputs["offset_mapping"][i]
            # Offsets of question tokens are relative to the question string,
            # so only tokens of the second sequence (the context) may match.
            sequence_ids = inputs.sequence_ids(i)

            # Find start and end token indices
            start_position = next(
                (
                    idx
                    for idx, (tok_start, tok_end) in enumerate(offsets)
                    if sequence_ids[idx] == 1 and tok_start <= start_char < tok_end
                ),
                None,
            )
            end_position = next(
                (
                    idx
                    for idx, (tok_start, tok_end) in enumerate(offsets)
                    if sequence_ids[idx] == 1 and tok_start < end_char <= tok_end
                ),
                None,
            )

        if start_position is None or end_position is None:
            # No answer, or the answer was truncated away / does not align
            # with token boundaries: fall back to index 0 (the first token)
            # instead of raising StopIteration and aborting the whole map.
            start_position = 0
            end_position = 0

        start_positions.append(start_position)
        end_positions.append(end_position)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    # Remove offset mappings
    inputs = {k: v for k, v in inputs.items() if k != 'offset_mapping'}
    return inputs
# Run the preprocessing over both splits, dropping the raw text columns so
# that only the columns produced by preprocess_data remain.
train_split = squad["train"]
train_data = train_split.map(
    preprocess_data,
    batched=True,
    remove_columns=train_split.column_names,
)

validation_split = squad["validation"]
validation_data = validation_split.map(
    preprocess_data,
    batched=True,
    remove_columns=validation_split.column_names,
)
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import torch
# Columns fed to the model, in the order the training loop unpacks them.
_TENSOR_COLUMNS = ("input_ids", "attention_mask", "start_positions", "end_positions")

# Turn each preprocessed column into a tensor and bundle them per split
train_dataset = TensorDataset(
    *(torch.tensor(train_data[column]) for column in _TENSOR_COLUMNS)
)
validation_dataset = TensorDataset(
    *(torch.tensor(validation_data[column]) for column in _TENSOR_COLUMNS)
)

# Batch the data: random order for training, fixed order for validation
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=8)

validation_sampler = SequentialSampler(validation_dataset)
validation_dataloader = DataLoader(validation_dataset, sampler=validation_sampler, batch_size=8)
from transformers import RobertaForQuestionAnswering

# Load the model
model = RobertaForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")

# `transformers.AdamW` was deprecated in favour of the PyTorch implementation
# and is not exported by recent transformers releases, so import it from torch.
from torch.optim import AdamW

# Set up optimizer. `eps` and `weight_decay` are passed explicitly to match
# the defaults of the old transformers optimizer (torch defaults differ).
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-6, weight_decay=0.0)

# Train on a GPU when one is available; otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop
model.train()
epochs = 3
for epoch in range(epochs):
    for batch in train_dataloader:
        optimizer.zero_grad()

        # Unpack the inputs from the DataLoader and move them to the model's device
        input_ids, attention_mask, start_positions, end_positions = (
            tensor.to(device) for tensor in batch
        )

        # Forward pass (the model computes the loss when positions are given)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            start_positions=start_positions,
            end_positions=end_positions,
        )
        loss = outputs.loss

        # Backward pass
        loss.backward()
        optimizer.step()

    # Reports the loss of the last batch of the epoch
    print(f"Epoch: {epoch}, Loss: {loss.item()}")

# The inference code that follows builds plain CPU tensors, so return the
# model to the CPU to keep inputs and weights on the same device.
model.to("cpu")
# Put the model in evaluation mode before running inference
model.eval()

# Pick one example from the validation set (you can also use your own)
sample_idx = 0  # Change this index to see different examples from the validation set
sample_input_ids = validation_data['input_ids'][sample_idx]
sample_attention_mask = validation_data['attention_mask'][sample_idx]
sample_context = squad['validation']['context'][sample_idx]
sample_question = squad['validation']['question'][sample_idx]

# Wrap the single example in a batch dimension and run it through the model
sample_batch_ids = torch.tensor([sample_input_ids])
sample_batch_mask = torch.tensor([sample_attention_mask])
with torch.no_grad():
    outputs = model(input_ids=sample_batch_ids, attention_mask=sample_batch_mask)
start_logits, end_logits = outputs.start_logits, outputs.end_logits

# The highest-scoring positions mark the predicted start and end of the answer
start_index = torch.argmax(start_logits)
end_index = torch.argmax(end_logits)

# Map the selected token ids back to text
answer_token_ids = sample_input_ids[start_index:end_index+1]
tokens = tokenizer.convert_ids_to_tokens(answer_token_ids)
answer = tokenizer.convert_tokens_to_string(tokens)

# Show the question, its context and the prediction
print("Question:", sample_question)
print("Context:", sample_context)
print("\nPredicted Answer:", answer)
# A hand-written question/context pair to try the model on
custom_question = "What is RoBERTa?"
custom_context = "RoBERTa is an optimized version of BERT, which stands for Robustly Optimized BERT Approach. It was trained with more data and longer sequences, and is generally more robust for various NLP tasks."

# Encode the pair as PyTorch tensors, padded/truncated to 384 tokens
inputs = tokenizer(
    custom_question,
    custom_context,
    return_tensors='pt',
    truncation=True,
    padding="max_length",
    max_length=384,
)

# Forward pass without gradient tracking
with torch.no_grad():
    outputs = model(**inputs)
start_logits, end_logits = outputs.start_logits, outputs.end_logits

# The highest-scoring positions mark the predicted start and end of the answer
start_index = torch.argmax(start_logits)
end_index = torch.argmax(end_logits)

# Map the selected token ids back to text
answer_token_ids = inputs['input_ids'][0][start_index:end_index+1]
tokens = tokenizer.convert_ids_to_tokens(answer_token_ids)
answer = tokenizer.convert_tokens_to_string(tokens)

# Show the question, its context and the prediction
print("Question:", custom_question)
print("Context:", custom_context)
print("\nPredicted Answer:", answer)
Editor is loading...
Leave a Comment