Uni Leipzig corpus to Full and Dual Pinyin v3
unknown
python
5 months ago
5.3 kB
7
No Index
# Convert Uni Leipzig Chinese corpora to (a) full-pinyin and (b) dual/double-pinyin
# key-sequence text files, for keyboard-layout frequency analysis.
# Input corpora: Leipzig "*-sentences.txt" files (one "<number>\t<sentence>" per line).

infiles = [
    'zho_news_2020_30K\\zho_news_2020_30K-sentences.txt',
    'zho_wikipedia_2018_30K\\zho_wikipedia_2018_30K-sentences.txt',
    'zho-cn_web_2015_30K\\zho-cn_web_2015_30K-sentences.txt',
    'zho-mo_web_2016_30K\\zho-mo_web_2016_30K-sentences.txt',
    'zho-simp-tw_web_2014_30K\\zho-simp-tw_web_2014_30K-sentences.txt',
    'zho-trad_newscrawl_2011_30K\\zho-trad_newscrawl_2011_30K-sentences.txt',
]
skip_n = 2            # keep only every skip_n-th sentence
spaces = True         # model hitting the space bar to confirm an IME suggestion
spaces_factor = 0.8   # probability a word boundary actually gets a space

out = ('pinyin-6x30k'
       + ('_skip' + str(skip_n) if skip_n > 1 else '')
       + ('_spaces' if spaces else '') + '.txt')
out_dual = ('pinyin-6x30k_dual'
            + ('_skip' + str(skip_n) if skip_n > 1 else '')
            + ('_spaces' if spaces else '') + '.txt')
dual_methods = ['ms', 'abc', 'natural']

from pypinyin import pinyin, lazy_pinyin, Style
import csv, random, pynlpir

pynlpir.open(encoding_errors="ignore")

# for full width symbol --> ascii conversion
# https://pastebin.com/cKYugjMN
with open("translate.txt", "r", encoding="utf-8") as f:
    translate_dict = {
        ord(rows[0]): rows[1]
        for rows in csv.reader(f, delimiter="\t", quotechar='"', escapechar="\\")
    }

# for double pinyin
# mappings are from https://zh.wikipedia.org/zh-cn/%E5%8F%8C%E6%8B%BC
# https://pastecode.io/s/0wpfmb2d (updated 2024-12-17)
dual_initials = {}
dual_finals_orig = {}
dual_finals = {}
dual_cleanup = {}
for d in dual_methods:
    with open("initials_" + d + ".txt", "r", encoding="utf-8") as f:
        dual_initials[d] = {rows[0]: rows[1] for rows in csv.reader(f, delimiter="\t")}
    with open("finals_" + d + ".txt", "r", encoding="utf-8") as f:
        dual_finals_orig[d] = {rows[0]: rows[1] for rows in csv.reader(f, delimiter="\t")}
    # format: pinyin <TAB> fixed initial <TAB> fixed final
    with open("translate_cleanup_" + d + ".txt", "r", encoding="utf-8") as f:
        dual_cleanup[d] = {rows[0]: (rows[1], rows[2]) for rows in csv.reader(f, delimiter="\t")}

# handle finals where the lazy_pinyin output doesn't match what needs to be typed
finals_pre_clean = {
    "iou": "iu", "uei": "ui", "uen": "un",
    "van": "uan", "v": "u", "ve": "ue", "vn": "un",
}
for d in dual_methods:
    dual_finals[d] = dual_finals_orig[d].copy()
    for k, v in finals_pre_clean.items():
        dual_finals[d][k] = dual_finals_orig[d][v]
    # workaround for typing v as u in all cases except these
    # (renamed loop var from 'f' so it cannot shadow a file handle)
    for fin in ["nv", "nve", "lv", "lve"]:
        dual_finals[d][fin] = dual_finals_orig[d][fin[1:]]

i_read = 0
i_processed = 0
f_out_dual = {
    d: open(out_dual.replace(".txt", "_" + d + ".txt"), "w", encoding="utf-8")
    for d in dual_methods
}
try:
    with open(out, "w", encoding="utf-8") as f_out:
        for inf in infiles:
            # FIX: original opened input files without closing them; use 'with'
            with open(inf, "r", encoding="utf-8") as f_in:
                for line in f_in:
                    i_read += 1
                    if i_read % skip_n != 0:
                        continue
                    i_processed += 1
                    # remove sentence number from corpus and convert
                    # full width punctuation to ASCII
                    text = line.split("\t")[1].translate(translate_dict)
                    # parse to words
                    segments = pynlpir.segment(text)
                    sentence_full = ''
                    sentence_dual = {d: '' for d in dual_methods}
                    for word, pos in segments:
                        # convert to pinyin
                        # (renamed from 'pinyin': the original shadowed the
                        # pypinyin.pinyin import)
                        syllables = lazy_pinyin(word)
                        initials = lazy_pinyin(word, style=Style.INITIALS)
                        finals = lazy_pinyin(word, style=Style.FINALS)
                        # model hitting the space bar to select an IME suggestion
                        # would in reality be a number key if the first suggestion doesn't fit
                        space_char = ''
                        if spaces:
                            # a factor below 1 models sometimes converting multiple words at once
                            # no spaces after text that doesn't need IME conversion
                            if (random.uniform(0, 1) < spaces_factor
                                    and pos != "punctuation mark" and pos != "numeral"):
                                space_char = ' '
                        sentence_full += ''.join(syllables) + space_char
                        for d in dual_methods:
                            # cleanup for zero initials (where key sequence != lazy_pinyin output)
                            initials_out = initials.copy()
                            finals_out = finals.copy()
                            for idx, p in enumerate(syllables):
                                if p in dual_cleanup[d]:
                                    initials_out[idx] = dual_cleanup[d][p][0]
                                    finals_out[idx] = dual_cleanup[d][p][1]
                            # convert to dual pinyin; i == f means non-hanzi
                            # text that passes through unchanged
                            dual = [
                                dual_initials[d].get(i, i) + dual_finals[d].get(f, f)
                                if (i != f) else p
                                for i, f, p in zip(initials_out, finals_out, syllables)
                            ]
                            sentence_dual[d] += ''.join(dual) + space_char
                    # merge 5 sentences to one paragraph (excessive newlines skew
                    # analysis results). FIX: was '% 5 == 4', which made the first
                    # paragraph only 4 sentences long since i_processed starts at 1.
                    end_char = '\n' if (i_processed % 5 == 0) else ' '
                    f_out.write(sentence_full + end_char)
                    for d in dual_methods:
                        f_out_dual[d].write(sentence_dual[d] + end_char)
    print(i_read)
finally:
    # FIX: original leaked the dual-pinyin output handles and never closed pynlpir
    for fh in f_out_dual.values():
        fh.close()
    pynlpir.close()
Editor is loading...
Leave a Comment