Untitled
ptrdung
plain_text
2 years ago
4.7 kB
6
Indexable
import argparse
import glob
import gzip
import multiprocessing as mp
import os
from os.path import basename, exists, join
import ipdb
from pytorch_pretrained_bert import BertTokenizer
from tqdm import tqdm
from cytoolz import curry, partition_all
import streamlit as st
import sentencepiece as spm
import ctranslate2
from nltk import sent_tokenize
# Streamlit page setup must be the first st.* call in the script.
st.set_page_config(page_title="NMT", page_icon="🤖")

# '@@' marks word-internal subword pieces so the original tokenization can be
# reconstructed after generation (the BERT tokenizer does not preserve white
# spaces). It is assumed not to occur in the corpora this is used on.
IN_WORD = '@@'
# '##' is BERT's own wordpiece-continuation prefix.
BERT_IN_WORD = '##'
HYPHEN = '@-@'
UNK = '<unk>'
BUF = 65536
CHUNK = 4096

# Escape entities emitted by the Moses tokenizer (escape-special-chars),
# mapped back to the literal character each one stands for.
# NOTE(review): the pasted source had these entities HTML-decoded into
# identity mappings (and a syntax error); restored from the Moses
# tokenizer's documented escape set — confirm against the training pipeline.
MOSES_SPECIALS = {'&amp;': '&', '&#124;': '|', '&lt;': '<', '&gt;': '>',
                  '&apos;': "'", '&quot;': '"', '&#91;': '[', '&#93;': ']'}
# The ampersand entity is also handled separately in detokenize().
AMP = '&amp;'
AMP_MOSES = '&'

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
def tokenize(line):
    """Split a Moses-tokenized line into BERT wordpieces, prefixing every
    word-internal piece with IN_WORD so the original tokenization can be
    restored after generation."""
    if IN_WORD in line:
        # Safeguard: the corpus must not already contain the IN_WORD tag.
        raise ValueError()
    line = line.strip()
    # The Gigaword test set writes the unknown token as a bare 'UNK'.
    line = line.replace(' UNK ', f' {UNK} ')
    if line.startswith('UNK '):
        line = UNK + line[3:]
    if line.endswith(' UNK'):
        line = line[:-3] + UNK

    words = []
    for word in line.split():
        if word[0] != '&':
            words.append(word)
            continue
        # Word begins with a Moses escape entity: emit the literal character
        # plus an IN_WORD-tagged remainder.
        for entity, literal in MOSES_SPECIALS.items():
            if word.startswith(entity):
                words.append(literal)
                words.append(IN_WORD + word[len(entity):])
                break
        else:
            # Starts with '&' but matches no known Moses entity.
            raise ValueError()

    pieces = []
    for word in words:
        if word in (UNK, HYPHEN):
            # Pass reserved tokens through untouched.
            pieces.append(word)
        elif word.startswith(IN_WORD):
            pieces.extend(IN_WORD + sub
                          for sub in tokenizer.tokenize(word[len(IN_WORD):]))
        else:
            pieces.extend(sub if idx == 0 else IN_WORD + sub
                          for idx, sub in enumerate(tokenizer.tokenize(word)))
    return pieces
def convert_moses(tok):
    """Map a Moses escape entity to its literal character; other tokens
    pass through unchanged."""
    return MOSES_SPECIALS.get(tok, tok)
def detokenize(line, moses=True):
    """Reassemble a line of IN_WORD/BERT-tagged pieces into plain words.

    Pieces prefixed with IN_WORD (optionally followed by BERT's '##') are
    glued onto the current word; untagged pieces start a new word. The
    ampersand entity is always unescaped; the remaining Moses entities are
    unescaped via convert_moses() when `moses` is True.

    Raises:
        ValueError: if a piece carries BERT's '##' prefix without the
            IN_WORD tag (malformed model output).
    """
    word = ''
    words = []
    for tok in line.split():
        if tok.startswith(IN_WORD):
            # Continuation piece: strip tags and append to the current word.
            tok = tok[2:]
            if tok.startswith(BERT_IN_WORD):
                tok = tok[2:]
            tok = tok.replace(AMP, AMP_MOSES)
            if moses:
                tok = convert_moses(tok)
            word += tok
        else:
            if tok.startswith(BERT_IN_WORD):
                # Fix: removed a leftover ipdb.set_trace() debugger
                # breakpoint that preceded this raise in the original.
                raise ValueError(f'unexpected BERT continuation piece: {tok!r}')
            words.append(word)
            tok = tok.replace(AMP, AMP_MOSES)
            if moses:
                tok = convert_moses(tok)
            word = tok
    words.append(word)
    # The first append pushed an empty leading word; strip() removes the
    # resulting leading space.
    text = ' '.join(words).strip()
    return text
def translate(source, translator):
    """Tokenize `source`, translate it with the given CTranslate2
    translator, and return the detokenized translation string."""
    tokens = tokenize(source)
    batch_result = translator.translate_batch([tokens])[0]
    # First hypothesis of the first (and only) batch entry.
    hypothesis = list(batch_result)[0]['tokens']
    return detokenize(' '.join(hypothesis))
# [Modify] File path here to the CTranslate2 SentencePiece models.
ct_model_path = "/home/t12/QuangNV/mini_project_stp/code/c2translate/"

# CTranslate2 Translator loading the converted model.
translator = ctranslate2.Translator(ct_model_path, "cpu")  # or "cuda" for GPU

# Header
st.title("Translate")

# Form to add your items
with st.form("my_form"):
    # Textarea to type the source text.
    user_input = st.text_area("Source Text", max_chars=200)
    # Create a button
    submitted = st.form_submit_button("Translate")
    # Fix: only translate once the button is pressed. The original called
    # translate() on every rerun — including the very first one, where
    # user_input is the empty string — before checking `submitted`.
    if submitted:
        translation = translate(user_input, translator)
        # Here, we use "st.info", but you can try "st.write", "st.code",
        # or "st.success".
        st.write("Translation")
        st.info(translation)
Editor is loading...
Leave a Comment