# Uni Leipzig corpus to Full and Dual Pinyin v3
# (paste metadata: unknown author · python · 10 months ago · 5.3 kB · 7 views · No Index)
# Leipzig corpus sentence files to convert (Windows-style relative paths).
infiles = [
    'zho_news_2020_30K\\zho_news_2020_30K-sentences.txt',
    'zho_wikipedia_2018_30K\\zho_wikipedia_2018_30K-sentences.txt',
    'zho-cn_web_2015_30K\\zho-cn_web_2015_30K-sentences.txt',
    'zho-mo_web_2016_30K\\zho-mo_web_2016_30K-sentences.txt',
    'zho-simp-tw_web_2014_30K\\zho-simp-tw_web_2014_30K-sentences.txt',
    'zho-trad_newscrawl_2011_30K\\zho-trad_newscrawl_2011_30K-sentences.txt'
]
# Keep only every skip_n-th sentence from the corpora.
skip_n = 2
# Insert simulated IME space-bar presses after converted words.
spaces = True
spaces_factor = 0.8
# Output filenames encode the skip and spaces settings.
_name_suffix = ('_skip' + str(skip_n) if skip_n > 1 else '') + ('_spaces' if spaces else '')
out = 'pinyin-6x30k' + _name_suffix + '.txt'
out_dual = 'pinyin-6x30k_dual' + _name_suffix + '.txt'
# Double-pinyin layouts to generate (one output file each).
dual_methods = ['ms', 'abc', 'natural']
from pypinyin import pinyin, lazy_pinyin, Style
import csv, random, pynlpir
# Initialise the NLPIR segmenter, ignoring encoding errors in its data files.
pynlpir.open(encoding_errors="ignore")

# Full-width symbol -> ASCII conversion table, in str.translate() form
# (ordinal of the full-width char -> replacement string).
# https://pastebin.com/cKYugjMN
with open("translate.txt", "r", encoding="utf-8") as trans_file:
    trans_rows = csv.reader(trans_file, delimiter="\t", quotechar='"', escapechar="\\")
    translate_dict = {ord(row[0]): row[1] for row in trans_rows}
# Key tables for double pinyin.
# Mappings are from https://zh.wikipedia.org/zh-cn/%E5%8F%8C%E6%8B%BC
# https://pastecode.io/s/0wpfmb2d (updated 2024-12-17)
def _read_tsv(path):
    # Read a tab-separated file into a list of row lists.
    with open(path, "r", encoding="utf-8") as fh:
        return list(csv.reader(fh, delimiter="\t"))

dual_initials = {}
dual_finals_orig = {}
dual_finals = {}
dual_cleanup = {}
for method in dual_methods:
    dual_initials[method] = {row[0]: row[1] for row in _read_tsv("initials_" + method + ".txt")}
    dual_finals_orig[method] = {row[0]: row[1] for row in _read_tsv("finals_" + method + ".txt")}
    # format: pinyin <TAB> fixed initial <TAB> fixed final
    dual_cleanup[method] = {row[0]: (row[1], row[2]) for row in _read_tsv("translate_cleanup_" + method + ".txt")}
# Finals where lazy_pinyin's output differs from the key sequence that must
# actually be typed (e.g. "iou" is typed as "iu").
finals_pre_clean = {
    "iou": "iu",
    "uei": "ui",
    "uen": "un",
    "van": "uan",
    "v": "u",
    "ve": "ue",
    "vn": "un"
}
for method in dual_methods:
    # Start from a copy of the layout's original finals table, then patch it.
    patched = dual_finals_orig[method].copy()
    for odd_final, typed_final in finals_pre_clean.items():
        patched[odd_final] = dual_finals_orig[method][typed_final]
    # Workaround for typing v as u in all cases except after n/l:
    # nv/nve/lv/lve keep the key assigned to the v-form final.
    for syllable in ["nv", "nve", "lv", "lve"]:
        patched[syllable] = dual_finals_orig[method][syllable[1:]]
    dual_finals[method] = patched
from contextlib import ExitStack

i_read = 0       # sentences read from the corpora
i_processed = 0  # sentences kept after skipping

# ExitStack closes every output file (full pinyin + one per dual layout) even
# on error; previously the dual-pinyin handles were never closed.
with ExitStack() as stack:
    f_out = stack.enter_context(open(out, "w", encoding="utf-8"))
    f_out_dual = {
        d: stack.enter_context(
            open(out_dual.replace(".txt", "_" + d + ".txt"), "w", encoding="utf-8"))
        for d in dual_methods
    }
    for inf in infiles:
        # `with` so each corpus file is closed (the bare open() leaked handles).
        with open(inf, "r", encoding="utf-8") as f_in:
            for line in f_in:
                i_read = i_read + 1
                if i_read % skip_n != 0:
                    continue
                i_processed = i_processed + 1
                # Remove sentence number from corpus and convert full width
                # punctuation to ASCII.
                text = line.split("\t")[1].translate(translate_dict)
                # Parse to (word, part-of-speech) segments.
                segments = pynlpir.segment(text)
                sentence_full = ''
                sentence_dual = {d: '' for d in dual_methods}
                for word, pos in segments:
                    # Convert to toneless pinyin syllables.
                    # (Renamed from `pinyin`: that name shadowed the pypinyin
                    # function imported at the top of the file.)
                    syllables = lazy_pinyin(word)
                    initials = lazy_pinyin(word, style=Style.INITIALS)
                    finals = lazy_pinyin(word, style=Style.FINALS)
                    # Model hitting the space bar to select an IME suggestion;
                    # would in reality be a number key if the first suggestion
                    # doesn't fit.
                    space_char = ''
                    if spaces:
                        # A factor less than 1 models sometimes converting
                        # multiple words at once; no spaces after text that
                        # doesn't need IME conversion.
                        if random.uniform(0, 1) < spaces_factor and pos not in ("punctuation mark", "numeral"):
                            space_char = ' '
                    sentence_full = sentence_full + ''.join(syllables) + space_char
                    for d in dual_methods:
                        # Cleanup for zero initials (where key sequence !=
                        # lazy_pinyin output).
                        initials_out = initials.copy()
                        finals_out = finals.copy()
                        for idx, p in enumerate(syllables):
                            if p in dual_cleanup[d]:
                                initials_out[idx], finals_out[idx] = dual_cleanup[d][p]
                        # Convert to dual pinyin; tokens where initial == final
                        # (no pinyin split, e.g. non-Chinese text) pass through.
                        dual = [
                            dual_initials[d].get(i, i) + dual_finals[d].get(f, f) if i != f else p
                            for i, f, p in zip(initials_out, finals_out, syllables)
                        ]
                        sentence_dual[d] = sentence_dual[d] + ''.join(dual) + space_char
                # Merge 5 sentences to one paragraph (excessive newlines skew
                # analysis results). Fixed off-by-one: `% 5 == 4` made the very
                # first paragraph only 4 sentences long.
                sep = '\n' if (i_processed % 5 == 0) else ' '
                f_out.write(sentence_full + sep)
                for d in dual_methods:
                    f_out_dual[d].write(sentence_dual[d] + sep)
print(i_read)
# (paste footer: "Editor is loading..." / "Leave a Comment")