# Untitled
# Fine-tune and query a RoBERTa extractive question-answering model on SQuAD.
#
# Install the necessary libraries first (shell command; prefix it with "!"
# only inside a notebook cell):
#   pip install transformers datasets torch

import torch
from datasets import load_dataset
from torch.utils.data import (
    DataLoader,
    RandomSampler,
    SequentialSampler,
    TensorDataset,
)
from transformers import RobertaForQuestionAnswering, RobertaTokenizerFast

MODEL_NAME = "deepset/roberta-base-squad2"
MAX_LENGTH = 384
BATCH_SIZE = 8

# Load the SQuAD dataset
squad = load_dataset("squad")

# Load the tokenizer. The "fast" tokenizer is required: the pure-Python
# RobertaTokenizer does not implement return_offsets_mapping.
tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_NAME)


def _find_answer_token_span(offsets, sequence_ids, start_char, end_char):
    """Map a character span in the context to a (start, end) token span.

    Args:
        offsets: per-token ``(char_start, char_end)`` pairs for one example.
        sequence_ids: per-token sequence id (``None`` for special/padding
            tokens, ``0`` for the question, ``1`` for the context).
        start_char: index of the first answer character in the context.
        end_char: index one past the last answer character in the context.

    Returns:
        ``(start_token, end_token)``, both inclusive. ``(0, 0)`` (the first
        token of the sequence) is returned when the answer does not lie
        completely inside the context tokens, e.g. after truncation.
    """
    # Only context tokens are candidates: question-token offsets refer to the
    # question string and would otherwise produce false matches.
    context_indices = [
        idx for idx, seq_id in enumerate(sequence_ids) if seq_id == 1
    ]
    if not context_indices:
        return 0, 0
    context_start, context_end = context_indices[0], context_indices[-1]

    # Answer (partly) outside the tokenized context window.
    if (
        offsets[context_start][0] > start_char
        or offsets[context_end][1] < end_char
    ):
        return 0, 0

    # Last context token that starts at or before the answer start.
    idx = context_start
    while idx <= context_end and offsets[idx][0] <= start_char:
        idx += 1
    start_token = idx - 1

    # First context token that ends at or after the answer end.
    idx = context_end
    while idx >= context_start and offsets[idx][1] >= end_char:
        idx -= 1
    end_token = idx + 1

    return start_token, end_token


def preprocess_data(examples):
    """Tokenize a batch of SQuAD examples and attach answer token positions.

    Args:
        examples: batch dict with ``question``, ``context`` and ``answers``
            columns, as passed by ``Dataset.map(..., batched=True)``.

    Returns:
        Dict with the tokenizer outputs (without ``offset_mapping``) plus
        ``start_positions`` and ``end_positions`` lists, one entry per example.
        Only the first listed answer of each example is used; examples without
        an answer, or whose answer was truncated away, get position ``0``.
    """
    questions = [q.strip() for q in examples["question"]]
    # Contexts are deliberately NOT stripped: ``answer_start`` is a character
    # offset into the original context string.
    contexts = examples["context"]

    # Tokenize inputs; "only_second" truncates the context, never the question.
    inputs = tokenizer(
        questions,
        contexts,
        truncation="only_second",
        padding="max_length",
        max_length=MAX_LENGTH,
        return_offsets_mapping=True,
    )

    start_positions = []
    end_positions = []
    for i, answers in enumerate(examples["answers"]):
        if not answers["text"]:
            # No annotated answer for this example.
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answers["answer_start"][0]
        end_char = start_char + len(answers["text"][0])

        # Find start and end token indices
        start_position, end_position = _find_answer_token_span(
            inputs["offset_mapping"][i],
            inputs.sequence_ids(i),
            start_char,
            end_char,
        )
        start_positions.append(start_position)
        end_positions.append(end_position)

    # Remove offset mappings (only needed to locate the answers)
    features = {k: v for k, v in inputs.items() if k != "offset_mapping"}
    features["start_positions"] = start_positions
    features["end_positions"] = end_positions
    return features


def _to_tensor_dataset(encoded):
    """Pack the encoded columns into a TensorDataset in training order."""
    return TensorDataset(
        torch.tensor(encoded["input_ids"]),
        torch.tensor(encoded["attention_mask"]),
        torch.tensor(encoded["start_positions"]),
        torch.tensor(encoded["end_positions"]),
    )


# Preprocess the dataset
train_data = squad["train"].map(
    preprocess_data,
    batched=True,
    remove_columns=squad["train"].column_names,
)
validation_data = squad["validation"].map(
    preprocess_data,
    batched=True,
    remove_columns=squad["validation"].column_names,
)

# Convert lists to tensors
train_dataset = _to_tensor_dataset(train_data)
validation_dataset = _to_tensor_dataset(validation_data)

# Create DataLoaders
train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=BATCH_SIZE,
)
validation_dataloader = DataLoader(
    validation_dataset,
    sampler=SequentialSampler(validation_dataset),
    batch_size=BATCH_SIZE,
)

# Load the model and move it to the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RobertaForQuestionAnswering.from_pretrained(MODEL_NAME)
model.to(device)

# Set up optimizer. transformers.AdamW is deprecated/removed; eps and
# weight_decay are set to the values it used by default.
optimizer = torch.optim.AdamW(
    model.parameters(), lr=3e-5, eps=1e-6, weight_decay=0.0
)

# Training loop
model.train()
epochs = 3
for epoch in range(epochs):
    for batch in train_dataloader:
        optimizer.zero_grad()

        # Unpack the inputs from the DataLoader
        input_ids, attention_mask, start_positions, end_positions = (
            tensor.to(device) for tensor in batch
        )

        # Forward pass
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            start_positions=start_positions,
            end_positions=end_positions,
        )
        loss = outputs.loss

        # Backward pass
        loss.backward()
        optimizer.step()

    # Reports the loss of the last batch of the epoch
    print(f"Epoch: {epoch}, Loss: {loss.item()}")

# Ensure the model is in evaluation mode
model.eval()


def predict_answer(input_ids, attention_mask):
    """Return the answer text the model extracts from one encoded example.

    Args:
        input_ids: tensor of shape ``(1, seq_len)`` with the token ids.
        attention_mask: tensor of shape ``(1, seq_len)``.

    Returns:
        The decoded answer string; empty when the predicted end precedes the
        predicted start or when only special tokens are selected.
    """
    with torch.no_grad():
        outputs = model(
            input_ids=input_ids.to(device),
            attention_mask=attention_mask.to(device),
        )

    # Get the most likely beginning and end of the answer
    start_index = int(torch.argmax(outputs.start_logits))
    end_index = int(torch.argmax(outputs.end_logits))

    # Convert tokens to the answer string
    answer_ids = input_ids[0][start_index:end_index + 1].tolist()
    return tokenizer.decode(answer_ids, skip_special_tokens=True).strip()


# Sample question and context from the validation set (you can also use your own)
sample_idx = 0  # Change this index to see different examples from the validation set
sample_input_ids = validation_data[sample_idx]["input_ids"]
sample_attention_mask = validation_data[sample_idx]["attention_mask"]
sample_example = squad["validation"][sample_idx]
sample_context = sample_example["context"]
sample_question = sample_example["question"]

# Run the model on the sample
answer = predict_answer(
    torch.tensor([sample_input_ids]),
    torch.tensor([sample_attention_mask]),
)

# Print the results
print("Question:", sample_question)
print("Context:", sample_context)
print("\nPredicted Answer:", answer)

# Your custom question and context
custom_question = "What is RoBERTa?"
custom_context = "RoBERTa is an optimized version of BERT, which stands for Robustly Optimized BERT Approach. It was trained with more data and longer sequences, and is generally more robust for various NLP tasks."

# Tokenize the inputs
inputs = tokenizer(
    custom_question,
    custom_context,
    return_tensors="pt",
    truncation="only_second",
    padding="max_length",
    max_length=MAX_LENGTH,
)

# Run the model on the custom input
answer = predict_answer(inputs["input_ids"], inputs["attention_mask"])

# Print the results
print("Question:", custom_question)
print("Context:", custom_context)
print("\nPredicted Answer:", answer)
# Leave a Comment