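# Streamlit demo that translates text with a CTranslate2 NMT model. Source
# text is tokenized with the multilingual BERT WordPiece tokenizer (keeping
# Moses-style escapes intact via the @@/## prefixes) and detokenized after
# generation.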
import ctranslate2
import streamlit as st
from nltk import sent_tokenize
from pytorch_pretrained_bert import BertTokenizer

# Title for the page and nice icon
st.set_page_config(page_title="NMT", page_icon="🤖")

# This prefix is used for reconstructing the original tokenization after
# generation (the BERT tokenizer does not preserve white spaces). It does not
# seem to conflict with the corpus we test on.
IN_WORD = '@@'
BERT_IN_WORD = '##'
HYPHEN = '@-@'
UNK = '<unk>'

# special chars escaped by the Moses tokenizer (escape -> character),
# applied when tokenizing the source text
MOSES_SPECIALS = {'&amp;': '&', '&#124;': '|',
                  '&lt;': '<', '&gt;': '>',
                  '&apos;': "'", '&quot;': '"',
                  '&#91;': '[', '&#93;': ']'}

# inverse mapping (character -> Moses escape), applied when detokenizing
MOSES_ESCAPES = {'|': '&#124;', '<': '&lt;', '>': '&gt;',
                 "'": '&apos;', '"': '&quot;',
                 '[': '&#91;', ']': '&#93;'}
AMP = '&'
AMP_MOSES = '&amp;'

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')


def tokenize(line):
    if IN_WORD in line:
        # safeguard in case the corpus contains the IN_WORD tag
        raise ValueError()
    line = line.strip()
    # the Gigaword test set marks unknown words with a bare "UNK"
    line = line.replace(' UNK ', f' {UNK} ')
    if line.startswith('UNK '):
        line = UNK + line[3:]
    if line.endswith(' UNK'):
        line = line[:-3] + UNK
    words = []
    for word in line.split():
        if word[0] == '&':
            # split a leading Moses escape off the rest of the word
            for special, char in MOSES_SPECIALS.items():
                if word.startswith(special):
                    words.append(char)
                    words.append(IN_WORD + word[len(special):])
                    break
            else:
                raise ValueError()
        else:
            words.append(word)
    tokens = []
    for word in words:
        if word == UNK:
            tokens.append(word)
        elif word == HYPHEN:
            tokens.append(word)
        elif word.startswith(IN_WORD):
            tokens.extend(IN_WORD + tok
                          for tok in tokenizer.tokenize(word[len(IN_WORD):]))
        else:
            tokens.extend(tok if i == 0 else IN_WORD + tok
                          for i, tok in enumerate(tokenizer.tokenize(word)))
    return tokens


def convert_moses(tok):
    # map a special character back to its Moses escape
    if tok in MOSES_ESCAPES:
        return MOSES_ESCAPES[tok]
    return tok


def detokenize(line, moses=True):
    word = ''
    words = []
    for tok in line.split():
        if tok.startswith(IN_WORD):
            # continuation of the current word
            tok = tok[2:]
            if tok.startswith(BERT_IN_WORD):
                tok = tok[2:]
            tok = tok.replace(AMP, AMP_MOSES)
            if moses:
                tok = convert_moses(tok)
            word += tok
        else:
            # start of a new word
            if tok.startswith(BERT_IN_WORD):
                raise ValueError()
            words.append(word)
            tok = tok.replace(AMP, AMP_MOSES)
            if moses:
                tok = convert_moses(tok)
            word = tok
    words.append(word)
    return ' '.join(words).strip()


def translate(source, translator):
    source_tokens = tokenize(source)
    # translate a single sentence and take the tokens of the best hypothesis
    translations = translator.translate_batch([source_tokens])[0]
    target_tokens = translations[0]['tokens']
    return detokenize(' '.join(target_tokens))


# [Modify] Path to the CTranslate2 model directory.
ct_model_path = "/home/t12/QuangNV/mini_project_stp/code/c2translate/"

# Create a CTranslate2 Translator to load the model
translator = ctranslate2.Translator(ct_model_path, "cpu")  # or "cuda" for GPU
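# Optional sketch (not in the original app): translate multi-sentence input by
# splitting it with nltk's sent_tokenize and sending all sentences to
# translate_batch in one call. The name translate_text is made up here, and the
# sketch assumes the same dict-style CTranslate2 result access used in
# translate() above, plus an available NLTK "punkt" sentence model.
def translate_text(source, translator):
    sentences = sent_tokenize(source)
    batch = [tokenize(s) for s in sentences]
    results = translator.translate_batch(batch)
    # best hypothesis of every sentence, detokenized and rejoined
    return ' '.join(detokenize(' '.join(res[0]['tokens'])) for res in results)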
# Header
st.title("Translate")

# Form for the source text
with st.form("my_form"):
    # Text area to type the source text
    user_input = st.text_area("Source Text", max_chars=200)
    # Button that submits the form
    submitted = st.form_submit_button("Translate")

# If the button was pressed, translate with the CTranslate2 model and print
# the result. Here we use "st.info", but you can try "st.write", "st.code",
# or "st.success".
if submitted and user_input.strip():
    translation = translate(user_input, translator)
    st.write("Translation")
    st.info(translation)
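# Usage sketch (assumptions: this script is saved as app.py, the model
# directory above exists, and the NLTK "punkt" data is downloaded):
#
#   pip install streamlit ctranslate2 pytorch-pretrained-bert nltk
#   streamlit run app.py
#
# Streamlit serves the form on http://localhost:8501 by default.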