Untitled

mail@pastecode.io avatar
unknown
python
a year ago
1.3 kB
3
Indexable
Never
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Load the tokenizer and seq2seq model for the English -> Hindi checkpoint.
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-hi")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-hi")

# Sample English inputs to translate.
english_sentences = [
    "Hello, how are you?",
    "I am fine, thank you.",
    "What is your name?",
    "How old are you?",
    "Where are you from?",
    "What do you like to do?",
    "Can you speak Hindi?",
    "I love listening to music.",
]

# Encode every sentence up front, with padding and truncation enabled,
# requesting PyTorch tensors back (return_tensors="pt").
inputs = tokenizer(
    english_sentences,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

# Number of encoded sentences passed to model.generate per call.
batch_size = 8

# Translate one slice of the encoded inputs at a time and collect the decoded
# strings in the same order as english_sentences.
hindi_sentences = []
total_rows = len(inputs.input_ids)
for start in range(0, total_rows, batch_size):
    stop = start + batch_size
    generated = model.generate(
        inputs.input_ids[start:stop],
        attention_mask=inputs.attention_mask[start:stop],
        max_length=40,
        num_beams=4,
        early_stopping=True,
    )
    # Decode each generated id sequence to text, dropping special tokens.
    hindi_sentences.extend(
        tokenizer.decode(token_ids, skip_special_tokens=True)
        for token_ids in generated
    )

print(hindi_sentences)