# Install necessary libraries (sentencepiece is required by T5Tokenizer,
# accelerate by recent versions of Trainer)
!pip install torch
!pip install transformers datasets sentencepiece accelerate
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset
# Load Bengali paraphrase dataset
dataset = load_dataset("dataset_name") # Replace "dataset_name" with the name of your dataset
# Preprocess the dataset
def preprocess(example):
    return {"source_text": example["source_text"], "target_text": example["target_text"]}

dataset = dataset.map(preprocess)
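# (If your dataset stores the sentence pairs under other column names, renaming
# them also works; the column names below are hypothetical examples:)
# dataset = dataset.rename_column("bn_sentence", "source_text")
# dataset = dataset.rename_column("bn_paraphrase", "target_text")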
# Load pre-trained T5 model and tokenizer
# (note: vanilla T5 was pretrained on English text; for Bengali, a multilingual
# checkpoint such as "google/mt5-small" via MT5ForConditionalGeneration is
# likely a better starting point)
model_name = "t5-small"  # You can change this to any other T5 variant
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
# Tokenize and prepare dataset for training
def tokenize_function(examples):
    model_inputs = tokenizer(
        examples["source_text"], max_length=128, padding="max_length", truncation=True
    )
    # Tokenize the paraphrase targets; their token ids become the training labels
    labels = tokenizer(
        text_target=examples["target_text"], max_length=128, padding="max_length", truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(tokenize_function, batched=True)
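# If the loaded dataset has no "validation" split, one can be held out here
# (the 10% test_size and the seed are illustrative; train_test_split returns
# "train" and "test" splits):
# split = tokenized_dataset["train"].train_test_split(test_size=0.1, seed=42)
# tokenized_dataset = {"train": split["train"], "validation": split["test"]}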
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    logging_steps=100,
    save_steps=500,
    evaluation_strategy="steps",
    eval_steps=500,
    overwrite_output_dir=True,
)
# Define data collator: batch the tokenized fields and mask label padding
# (Trainer passes a list of per-example dicts, not a single tensor dict)
def data_collator(features):
    labels = torch.tensor([f["labels"] for f in features])
    # Replace pad token ids in the labels with -100 so the loss ignores them
    labels[labels == tokenizer.pad_token_id] = -100
    return {
        "input_ids": torch.tensor([f["input_ids"] for f in features]),
        "attention_mask": torch.tensor([f["attention_mask"] for f in features]),
        "labels": labels,
    }
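# Alternatively, transformers ships DataCollatorForSeq2Seq, which pads batches
# dynamically and masks label padding the same way; an equivalent drop-in,
# assuming dynamic padding suits your setup:
# from transformers import DataCollatorForSeq2Seq
# data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100)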
# Instantiate Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
)
# Fine-tune the model
trainer.train()
# Evaluate the model on the validation split
eval_metrics = trainer.evaluate()
print(eval_metrics)
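# Note: trainer.evaluate() reports only the loss by default. A paraphrase-quality
# score such as sacreBLEU could be computed over generated outputs separately,
# e.g. with the "evaluate" package (a sketch, not run here):
# import evaluate
# sacrebleu = evaluate.load("sacrebleu")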
# Save the fine-tuned model and tokenizer
output_dir = "path/to/save/model"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)  # required so the tokenizer can be reloaded below
# Load the fine-tuned model
fine_tuned_model = T5ForConditionalGeneration.from_pretrained(output_dir)
fine_tuned_tokenizer = T5Tokenizer.from_pretrained(output_dir)
# Example of inference
def paraphrase_sentence(sentence, model, tokenizer):
    input_ids = tokenizer.encode(sentence, return_tensors="pt", max_length=128, truncation=True)
    output_ids = model.generate(input_ids=input_ids, max_length=128, num_beams=4, early_stopping=True)
    paraphrase = tokenizer.decode(output_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return paraphrase
# Example usage
input_sentence = "আমি বাংলায় কথা বলতে পারি।"  # "I can speak Bengali."
paraphrase = paraphrase_sentence(input_sentence, fine_tuned_model, fine_tuned_tokenizer)
print("Paraphrase:", paraphrase)