# Install necessary libraries (run in a notebook cell or shell first):
# !pip install torch torchvision
# !pip install transformers datasets

import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset

# Load Bengali paraphrase dataset
dataset = load_dataset("dataset_name")  # Replace "dataset_name" with the name of your dataset

# Preprocess the dataset (placeholder: adapt the field names to your dataset's schema)
def preprocess(example):
    return {"source_text": example["source_text"], "target_text": example["target_text"]}

dataset = dataset.map(preprocess)

# Load pre-trained T5 model and tokenizer.
# Note: "t5-small" was pretrained on English text; for Bengali, a multilingual
# checkpoint such as "google/mt5-small" (with the matching MT5 classes) is
# usually a better starting point.
model_name = "t5-small"  # You can change this to any other T5 variant
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Tokenize and prepare the dataset for training: encode the source sentences
# as inputs and the target sentences as labels, masking label pad tokens with
# -100 so they are ignored by the loss.
def tokenize_function(examples):
    model_inputs = tokenizer(
        examples["source_text"],
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    labels = tokenizer(
        examples["target_text"],
        max_length=128,
        padding="max_length",
        truncation=True,
    )["input_ids"]
    model_inputs["labels"] = [
        [(token if token != tokenizer.pad_token_id else -100) for token in label]
        for label in labels
    ]
    return model_inputs

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    logging_steps=100,
    save_steps=500,
    evaluation_strategy="steps",
    eval_steps=500,
    overwrite_output_dir=True,
)

# Instantiate Trainer. The default collator suffices here because the labels
# are already padded and masked inside tokenize_function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

# Fine-tune the model
trainer.train()

# Evaluate the model
trainer.evaluate()

# Save the fine-tuned model and tokenizer (the tokenizer must be saved too,
# or loading it from output_dir below will fail)
output_dir = "path/to/save/model"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Load the fine-tuned model
fine_tuned_model = T5ForConditionalGeneration.from_pretrained(output_dir)
fine_tuned_tokenizer = T5Tokenizer.from_pretrained(output_dir)

# Example of inference
def paraphrase_sentence(sentence, model, tokenizer):
    input_ids = tokenizer.encode(sentence, return_tensors="pt", max_length=128, truncation=True)
    output_ids = model.generate(input_ids=input_ids, max_length=128, num_beams=4, early_stopping=True)
    paraphrase = tokenizer.decode(output_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return paraphrase

# Example usage
input_sentence = "আমি বাংলায় কথা বলতে পারি।"  # Bengali: "I can speak in Bengali."
paraphrase = paraphrase_sentence(input_sentence, fine_tuned_model, fine_tuned_tokenizer)
print("Paraphrase:", paraphrase)
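
# Optional alternative (a sketch, not part of the original script): instead of
# padding every example to max_length and masking the labels by hand, the
# transformers library provides DataCollatorForSeq2Seq, which pads each batch
# dynamically and replaces label padding with -100 automatically. To use it,
# tokenize with padding=False (truncation only) and pass the collator to the
# Trainer. This assumes the same `tokenizer`, `model`, `training_args`, and
# `tokenized_dataset` names as above.
from transformers import DataCollatorForSeq2Seq

seq2seq_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,               # lets the collator prepare decoder_input_ids for T5
    label_pad_token_id=-100,   # padded label positions are ignored by the loss
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=seq2seq_collator,
)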