from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
# Load the pretrained English->Hindi MarianMT model and its tokenizer.
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-hi")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-hi")
model.eval()  # inference only: make sure dropout/batch-norm layers are disabled

# Example English sentences to translate.
english_sentences = [
    "Hello, how are you?", "I am fine, thank you.", "What is your name?",
    "How old are you?", "Where are you from?", "What do you like to do?",
    "Can you speak Hindi?", "I love listening to music.",
]

# Tokenize all sentences at once; padding=True pads to the longest sentence,
# truncation=True caps inputs at the model's maximum length.
inputs = tokenizer(english_sentences, padding=True, truncation=True, return_tensors="pt")

# Number of sentences translated per generate() call.
batch_size = 8

# Generate the Hindi translations in batches.
hindi_sentences = []
# no_grad: generation needs no gradients — skips autograd bookkeeping,
# saving memory and time at inference.
with torch.no_grad():
    for i in range(0, len(inputs.input_ids), batch_size):
        outputs = model.generate(
            inputs.input_ids[i:i + batch_size],
            attention_mask=inputs.attention_mask[i:i + batch_size],
            max_length=40,
            num_beams=4,
            early_stopping=True,
        )
        # batch_decode decodes every sequence in the batch in one call,
        # replacing the manual per-output decode loop.
        hindi_sentences.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))

print(hindi_sentences)