Untitled
unknown
python
a year ago
1.3 kB
3
Indexable
Never
"""Translate a batch of English sentences to Hindi with the
Helsinki-NLP/opus-mt-en-hi MarianMT model and print the translations."""
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-hi")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-hi")
model.eval()  # inference only: disable dropout etc.

# Example English sentences
english_sentences = [
    "Hello, how are you?",
    "I am fine, thank you.",
    "What is your name?",
    "How old are you?",
    "Where are you from?",
    "What do you like to do?",
    "Can you speak Hindi?",
    "I love listening to music.",
]

# Tokenize all sentences at once; padding aligns them into one tensor.
inputs = tokenizer(
    english_sentences, padding=True, truncation=True, return_tensors="pt"
)

# Set the batch size
batch_size = 8

# Generate the Hindi translations in batches so memory stays bounded
# even for long input lists; no_grad skips autograd bookkeeping at
# inference time.
hindi_sentences = []
with torch.no_grad():
    for start in range(0, inputs.input_ids.size(0), batch_size):
        batch_ids = inputs.input_ids[start:start + batch_size]
        batch_mask = inputs.attention_mask[start:start + batch_size]
        outputs = model.generate(
            batch_ids,
            attention_mask=batch_mask,
            max_length=40,
            num_beams=4,
            early_stopping=True,
        )
        # batch_decode handles every generated sequence in one call.
        hindi_sentences.extend(
            tokenizer.batch_decode(outputs, skip_special_tokens=True)
        )

print(hindi_sentences)