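"""Minimal Streamlit app that translates text with a CTranslate2 model,
using the multilingual BERT tokenizer for pre- and post-processing."""
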
import ctranslate2
import streamlit as st
from pytorch_pretrained_bert import BertTokenizer

st.set_page_config(page_title="NMT", page_icon="🤖")



# Prefix used to mark word-internal sub-tokens so that the original
# tokenization can be reconstructed after generation (the BERT tokenizer
# does not preserve whitespace). It does not seem to conflict with the
# corpora we test on.
IN_WORD = '@@'
BERT_IN_WORD = '##'
HYPHEN = '@-@'
UNK = '<unk>'

# Moses tokenizer escapes -> original characters (used when tokenizing input)
MOSES_UNESCAPE = {'&amp;': '&', '&#124;': '|', '&lt;': '<', '&gt;': '>',
                  '&apos;': "'", '&quot;': '"', '&#91;': '[', '&#93;': ']'}
# Original characters -> Moses escapes (re-applied when detokenizing output)
MOSES_ESCAPE = {'|': '&#124;', '<': '&lt;', '>': '&gt;',
                "'": '&apos;', '"': '&quot;', '[': '&#91;', ']': '&#93;'}
AMP = '&'
AMP_MOSES = '&amp;'

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

def tokenize(line):
    """Split a Moses-tokenized line into BERT sub-tokens, prefixing
    word-internal pieces with IN_WORD so they can be merged back later."""
    if IN_WORD in line:
        # safeguard in case the corpus already contains the IN_WORD tag
        raise ValueError(f'line already contains {IN_WORD!r}')
    line = line.strip()
    # Gigaword test set
    line = line.replace(' UNK ', f' {UNK} ')
    if line.startswith('UNK '):
        line = UNK + line[3:]
    if line.endswith(' UNK'):
        line = line[:-3] + UNK

    words = []
    for word in line.split():
        if word[0] == '&':
            # un-escape Moses entities such as '&amp;' or '&#124;'
            for special, char in MOSES_UNESCAPE.items():
                if word.startswith(special):
                    words.append(char)
                    words.append(IN_WORD + word[len(special):])
                    break
            else:
                raise ValueError(f'unrecognized Moses escape in word: {word!r}')
        else:
            words.append(word)

    tokens = []
    for word in words:
        if word == UNK:
            tokens.append(word)
        elif word == HYPHEN:
            tokens.append(word)
        elif word.startswith(IN_WORD):
            tokens.extend(IN_WORD+tok
                          for tok in tokenizer.tokenize(word[len(IN_WORD):]))
        else:
            tokens.extend(tok if i == 0 else IN_WORD+tok
                          for i, tok in enumerate(tokenizer.tokenize(word)))
    return tokens

def convert_moses(tok):
    """Re-apply Moses escaping to a single token."""
    if tok in MOSES_ESCAPE:
        return MOSES_ESCAPE[tok]
    return tok


def detokenize(line, moses=True):
    """Merge IN_WORD-prefixed sub-tokens back into words and, if requested,
    re-apply Moses escaping."""
    word = ''
    words = []
    for tok in line.split():
        if tok.startswith(IN_WORD):
            # continuation of the previous word
            tok = tok[len(IN_WORD):]
            if tok.startswith(BERT_IN_WORD):
                tok = tok[len(BERT_IN_WORD):]
            tok = tok.replace(AMP, AMP_MOSES)
            if moses:
                tok = convert_moses(tok)
            word += tok
        else:
            if tok.startswith(BERT_IN_WORD):
                raise ValueError(f'unexpected BERT continuation token: {tok!r}')
            words.append(word)
            tok = tok.replace(AMP, AMP_MOSES)
            if moses:
                tok = convert_moses(tok)
            word = tok
    words.append(word)
    text = ' '.join(words).strip()
    return text
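
# Illustrative round trip of tokenize/detokenize. The WordPiece splits shown
# here are hypothetical; the actual pieces depend on the
# bert-base-multilingual-cased vocabulary:
#   tokenize('hello &#124; world')  ->  ['hell', '@@##o', '|', 'wor', '@@##ld']
#   detokenize('hell @@##o |')      ->  'hello &#124;'  (Moses escaping re-applied)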

def translate(source, translator):
    """Tokenize the source text, translate it with the CTranslate2 model,
    and return the detokenized best hypothesis."""
    source_tokens = tokenize(source)
    hypotheses = translator.translate_batch([source_tokens])[0]
    target_tokens = list(hypotheses)[0]['tokens']  # tokens of the best hypothesis
    return detokenize(' '.join(target_tokens))


# [Modify] Path to the CTranslate2 model directory.
ct_model_path = "/home/t12/QuangNV/mini_project_stp/code/c2translate/"

# Load the CTranslate2 translator.
translator = ctranslate2.Translator(ct_model_path, "cpu")    # or "cuda" for GPU
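# Decoding and threading can be tuned if needed, e.g. beam_size in
# translate_batch or inter_threads/intra_threads on the Translator
# (the defaults are used here).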



# Header
st.title("Translate")

# Form to collect the source text
with st.form("my_form"):
    # Textarea to type the source text.
    user_input = st.text_area("Source Text", max_chars=200)

    # Submit button
    submitted = st.form_submit_button("Translate")
    # Translate only when the button is pressed and the input is not empty.
    # Here we use "st.info", but "st.write", "st.code", or "st.success" also work.
    if submitted and user_input.strip():
        translation = translate(user_input, translator)
        st.write("Translation")
        st.info(translation)
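
# To launch the app (assuming this file is saved as app.py):
#   streamlit run app.py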
