# Untitled
#
# mail@pastecode.io avatar
# unknown
# python
# a month ago
# 3.0 kB
# 3
# Indexable
# Never
# Install necessary libraries
!pip install torch torchvision
!pip install transformers datasets

import inspect

import torch
from datasets import load_dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments

# Load Bengali paraphrase dataset
# "dataset_name" is a placeholder: substitute the identifier of your dataset.
dataset = load_dataset(path="dataset_name")

# Preprocess the dataset
def preprocess(example):
    """Project an example onto its source/target text pair.

    Args:
        example: Mapping that holds ``"source_text"`` and ``"target_text"``.

    Returns:
        A new dict containing exactly those two keys, with the values passed
        through unchanged.
    """
    wanted = ("source_text", "target_text")
    return {field: example[field] for field in wanted}

# Run every example through preprocess.
dataset = dataset.map(function=preprocess)

# Load pre-trained T5 model and tokenizer
# The checkpoint name can be swapped for any other T5 variant.
# NOTE(review): confirm the chosen checkpoint's vocabulary actually covers
# Bengali script before training - TODO verify.
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Tokenize and prepare dataset for training
def tokenize_function(examples):
    """Tokenize a batch of source/target pairs into model-ready features.

    Intended for ``dataset.map(..., batched=True)``, where ``examples`` maps
    column names to lists, so ``examples["source_text"]`` and
    ``examples["target_text"]`` are lists of strings.

    Args:
        examples: Batch with ``"source_text"`` and ``"target_text"`` columns.

    Returns:
        The tokenizer output for the source texts, extended with a ``labels``
        column that holds the target token ids. Both sides are padded or
        truncated to 128 tokens. Padding positions in ``labels`` are set to
        -100 so they are excluded from the loss.
    """
    model_inputs = tokenizer(
        examples["source_text"],
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    # Encoding only the inputs would leave the model without anything to
    # learn from, so the targets are encoded with the same settings.
    target_encoding = tokenizer(
        examples["target_text"],
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in target_encoding["input_ids"]
    ]
    return model_inputs

# Tokenize the dataset; batched=True hands tokenize_function whole batches.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Define training arguments
# The evaluation-schedule keyword was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases. Pick whichever name the
# installed version accepts so the script runs on either side of the rename.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    logging_steps=100,
    save_steps=500,
    eval_steps=500,
    overwrite_output_dir=True,
    **{_eval_strategy_kwarg: "steps"},
)

# Define data collator
def data_collator(features):
    """Assemble tokenized examples into a tensor batch with loss-masked labels.

    Args:
        features: Either a list/tuple of per-example dicts (the form a
            ``Trainer`` hands to its collator) or a single mapping of
            already-batched columns. It must provide ``input_ids``,
            ``attention_mask`` and the target ids under ``target_ids`` or
            ``labels``. Per-example sequences must share one length.

    Returns:
        Dict with ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        Label positions equal to the tokenizer's pad id are replaced by -100.

    Raises:
        KeyError: If neither ``target_ids`` nor ``labels`` is present.
    """
    if isinstance(features, (list, tuple)):
        # Per-example dicts: stack each field into one batch tensor.
        available = features[0]

        def column(key):
            return torch.stack([torch.as_tensor(example[key]) for example in features])
    else:
        # Already-batched mapping: convert the column as a whole.
        available = features

        def column(key):
            return torch.as_tensor(features[key])

    if "target_ids" in available:
        label_key = "target_ids"
    elif "labels" in available:
        label_key = "labels"
    else:
        raise KeyError("features must contain 'target_ids' or 'labels'")

    # Clone before masking so the caller's data is never modified in place.
    labels = column(label_key).clone()
    labels[labels == tokenizer.pad_token_id] = -100
    return {
        "input_ids": column("input_ids"),
        "attention_mask": column("attention_mask"),
        "labels": labels,
    }

# Instantiate Trainer
# Wires the model, the hyperparameters, both data splits and the collator together.
# NOTE(review): assumes the dataset exposes "train" and "validation" splits - confirm.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

# Fine-tune the model
trainer.train()

# Evaluate the model
trainer.evaluate()

# Save the model together with its tokenizer.
# model.save_pretrained writes only the model files; the tokenizer files must
# sit in the same directory for the checkpoint to be reloadable on its own.
model.save_pretrained("path/to/save/model")
tokenizer.save_pretrained("path/to/save/model")








# Save the model and its tokenizer side by side.
# model.save_pretrained writes only the model files; without the tokenizer
# files in the same directory, T5Tokenizer.from_pretrained(output_dir) below
# has nothing to load.
output_dir = "path/to/save/model"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Load the fine-tuned model
fine_tuned_model = T5ForConditionalGeneration.from_pretrained(output_dir)
fine_tuned_tokenizer = T5Tokenizer.from_pretrained(output_dir)

# Example of inference
def paraphrase_sentence(sentence, model, tokenizer):
    """Generate a paraphrase of ``sentence`` with a sequence-to-sequence model.

    Args:
        sentence: Text to paraphrase.
        model: Model exposing ``generate``, e.g. the fine-tuned T5.
        tokenizer: Tokenizer used to encode the input and decode the output.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens removed. Input and output are capped at 128 tokens and
        generation uses 4 beams.
    """
    input_ids = tokenizer.encode(sentence, return_tensors="pt", max_length=128, truncation=True)
    # The encoded ids live on the CPU. Move them to the model's device so the
    # function also works when the model sits on a GPU (e.g. right after
    # training) instead of failing on a device mismatch.
    device = getattr(model, "device", None)
    if device is not None:
        input_ids = input_ids.to(device)
    output_ids = model.generate(input_ids=input_ids, max_length=128, num_beams=4, early_stopping=True)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

# Example usage
# Paraphrase one Bengali sentence with the reloaded model and print the result.
input_sentence = "আমি বাংলায় কথা বলতে পারি।"
paraphrase = paraphrase_sentence(
    sentence=input_sentence,
    model=fine_tuned_model,
    tokenizer=fine_tuned_tokenizer,
)
print("Paraphrase:", paraphrase)

# Leave a Comment