# Uni Leipzig corpus to Full and Dual Pinyin v3
# Input corpora: Leipzig Corpora Collection sentence files (Windows paths).
infiles = [
    'zho_news_2020_30K\\zho_news_2020_30K-sentences.txt',
    'zho_wikipedia_2018_30K\\zho_wikipedia_2018_30K-sentences.txt',
    'zho-cn_web_2015_30K\\zho-cn_web_2015_30K-sentences.txt',
    'zho-mo_web_2016_30K\\zho-mo_web_2016_30K-sentences.txt',
    'zho-simp-tw_web_2014_30K\\zho-simp-tw_web_2014_30K-sentences.txt',
    'zho-trad_newscrawl_2011_30K\\zho-trad_newscrawl_2011_30K-sentences.txt'
]
# Keep only every skip_n-th sentence of each corpus.
skip_n = 2
# Model IME-confirmation space presses, with this probability per word.
spaces = True
spaces_factor = 0.8
# Output file names encode the active options.
_opts = ('_skip' + str(skip_n) if skip_n > 1 else '') + ('_spaces' if spaces else '') + '.txt'
out = 'pinyin-6x30k' + _opts
out_dual = 'pinyin-6x30k_dual' + _opts
# Double-pinyin schemes to emit (one output file each).
dual_methods = ['ms', 'abc', 'natural']

from pypinyin import pinyin, lazy_pinyin, Style
import csv, random, pynlpir

pynlpir.open(encoding_errors = "ignore")

# Build a str.translate table mapping full-width symbols to their ASCII
# equivalents, loaded from a two-column TSV.
# https://pastebin.com/cKYugjMN
with open("translate.txt", "r", encoding="utf-8") as f:
    translate_dict = {}
    for row in csv.reader(f, delimiter="\t", quotechar='"', escapechar="\\"):
        translate_dict[ord(row[0])] = row[1]
    
# for double pinyin
# mappings are from https://zh.wikipedia.org/zh-cn/%E5%8F%8C%E6%8B%BC
# https://pastecode.io/s/0wpfmb2d (updated 2024-12-17)
# Double-pinyin key mappings, one set of TSV tables per scheme.
# Mappings are from https://zh.wikipedia.org/zh-cn/%E5%8F%8C%E6%8B%BC
# https://pastecode.io/s/0wpfmb2d (updated 2024-12-17)
dual_initials = {}
dual_finals_orig = {}
dual_finals = {}
dual_cleanup = {}
for d in dual_methods:
    # initial -> key
    with open("initials_" + d + ".txt", "r", encoding="utf-8") as f:
        dual_initials[d] = dict(csv.reader(f, delimiter="\t"))
    # final -> key (raw; patched into dual_finals later)
    with open("finals_" + d + ".txt", "r", encoding="utf-8") as f:
        dual_finals_orig[d] = dict(csv.reader(f, delimiter="\t"))
    # zero-initial fixups; format: pinyin <TAB> fixed initial <TAB> fixed final
    with open("translate_cleanup_" + d + ".txt", "r", encoding="utf-8") as f:
        dual_cleanup[d] = {row[0]: (row[1], row[2]) for row in csv.reader(f, delimiter="\t")}

# handle finals where the lazy_pinyin output doesn't match what needs to be typed, specifically
# lazy_pinyin emits canonical finals, but the typist enters the contracted
# spelling; remap those finals to the key of the typed form.
finals_pre_clean = {
    "iou" : "iu",
    "uei" : "ui",
    "uen" : "un",
    "van" : "uan",
    "v"   : "u",
    "ve"  : "ue",
    "vn"  : "un"
}
for d in dual_methods:
    patched = dual_finals_orig[d].copy()
    for lazy_form, typed_form in finals_pre_clean.items():
        patched[lazy_form] = dual_finals_orig[d][typed_form]
    # v is typed as u everywhere except after n/l, so give those combined
    # forms the key of their v-less final
    for combo in ("nv", "nve", "lv", "lve"):
        patched[combo] = dual_finals_orig[d][combo[1:]]
    dual_finals[d] = patched

i_read = 0       # corpus lines seen
i_processed = 0  # lines actually converted (after skip_n filtering)
# One dual-pinyin output file per scheme.
f_out_dual = {d:open(out_dual.replace(".txt", "_" + d + ".txt"), "w", encoding="utf-8") for d in dual_methods}
try:
    with open(out, "w", encoding="utf-8") as f_out:
        for inf in infiles:
            # fix: open the corpus file via `with` so each handle is closed
            with open(inf, "r", encoding="utf-8") as f_in:
                for line in f_in:
                    i_read = i_read + 1
                    if i_read % skip_n != 0:
                        continue
                    i_processed = i_processed + 1

                    # remove sentence number from corpus and convert full width punctuation to ASCII
                    text = line.split("\t")[1].translate(translate_dict)

                    # parse to words
                    segments = pynlpir.segment(text)

                    sentence_full = ''
                    sentence_dual = {d:'' for d in dual_methods}
                    for word, pos in segments:
                        # convert to pinyin
                        # fix: renamed from `pinyin`, which shadowed the
                        # pypinyin.pinyin function imported at the top of the file
                        syllables = lazy_pinyin(word)
                        initials  = lazy_pinyin(word, style = Style.INITIALS)
                        finals    = lazy_pinyin(word, style = Style.FINALS)

                        # model hitting the space bar to select an IME suggestion
                        # would in reality be a number key if the first suggestion doesn't fit
                        space_char = ''
                        if spaces:
                            # factor less than 1 models sometimes converting multiple words at once
                            # no spaces after text that doesn't need IME conversion
                            if random.uniform(0, 1) < spaces_factor and pos != "punctuation mark" and pos != "numeral":
                                space_char = ' '

                        sentence_full = sentence_full + ''.join(syllables) + space_char

                        for d in dual_methods:
                            # cleanup for zero initials (where key sequence != lazy_pinyin output)
                            initials_out = initials.copy()
                            finals_out   = finals.copy()
                            for idx, p in enumerate(syllables):
                                if p in dual_cleanup[d]:
                                    initials_out[idx] = dual_cleanup[d][p][0]
                                    finals_out[idx]   = dual_cleanup[d][p][1]

                            # convert to dual pinyin; when initial == final the
                            # syllable is non-pinyin text (e.g. punctuation) and
                            # passes through unchanged
                            dual = [dual_initials[d].get(i,i) + dual_finals[d].get(f,f) if (i != f) else p for i,f,p in zip(initials_out, finals_out, syllables)]
                            sentence_dual[d] = sentence_dual[d] + ''.join(dual) + space_char

                    # merge 5 sentences to one paragraph (excessive newlines skew analysis results)
                    # NOTE(review): `% 5 == 4` makes the very first paragraph only
                    # 4 sentences long; kept as-is to preserve existing output
                    end_char = '\n' if (i_processed % 5 == 4) else ' '
                    f_out.write(sentence_full + end_char)
                    for d in dual_methods:
                        f_out_dual[d].write(sentence_dual[d] + end_char)

                    print(i_read)
                    # input()
finally:
    # fix: the dual-pinyin output files were never closed
    for fh in f_out_dual.values():
        fh.close()
  