# Spaces: Running  (page-header residue captured with this listing; not part of the module)
import os | |
import sys | |
import re | |
from pypinyin import lazy_pinyin, BOPOMOFO | |
import jieba | |
import cn2an | |
import logging | |
# Quieten jieba's own logger before it loads anything.
logging.getLogger('jieba').setLevel(logging.WARNING)

# The dictionary is looked up relative to the directory of the launching
# script (sys.argv[0]), not relative to this module's own location.
_JIEBA_DICT_PATH = (
    os.path.dirname(os.path.realpath(sys.argv[0])) + '/jieba/dict.txt'
)
jieba.set_dictionary(_JIEBA_DICT_PATH)

# Load the dictionary eagerly at import time.
jieba.initialize()
# (compiled regex, bopomofo) pairs spelling out each Latin letter.
# Patterns are case-insensitive, so 'A' and 'a' share one entry.
_latin_to_bopomofo = [
    (re.compile(letter, re.IGNORECASE), reading)
    for letter, reading in (
        ('a', 'ㄟˉ'),
        ('b', 'ㄅㄧˋ'),
        ('c', 'ㄙㄧˉ'),
        ('d', 'ㄉㄧˋ'),
        ('e', 'ㄧˋ'),
        ('f', 'ㄝˊㄈㄨˋ'),
        ('g', 'ㄐㄧˋ'),
        ('h', 'ㄝˇㄑㄩˋ'),
        ('i', 'ㄞˋ'),
        ('j', 'ㄐㄟˋ'),
        ('k', 'ㄎㄟˋ'),
        ('l', 'ㄝˊㄛˋ'),
        ('m', 'ㄝˊㄇㄨˋ'),
        ('n', 'ㄣˉ'),
        ('o', 'ㄡˉ'),
        ('p', 'ㄆㄧˉ'),
        ('q', 'ㄎㄧㄡˉ'),
        ('r', 'ㄚˋ'),
        ('s', 'ㄝˊㄙˋ'),
        ('t', 'ㄊㄧˋ'),
        ('u', 'ㄧㄡˉ'),
        ('v', 'ㄨㄧˉ'),
        ('w', 'ㄉㄚˋㄅㄨˋㄌㄧㄡˋ'),
        ('x', 'ㄝˉㄎㄨˋㄙˋ'),
        ('y', 'ㄨㄞˋ'),
        ('z', 'ㄗㄟˋ'),
    )
]
# (compiled regex, romaji) pairs, applied in order by bopomofo_to_romaji.
# Order matters: multi-symbol keys (e.g. 'ㄅㄛ', 'ㄧㄢ') must come before the
# single symbols they contain.
#
# Keys are plain literals, so they are passed through re.escape. The
# punctuation keys are the FULLWIDTH forms (',', '!', '?') mapped to ASCII,
# in line with the '。' -> '.' and '—' -> '-' entries; a bare ASCII '?' key is
# not a valid regular expression and makes re.compile raise at import time.
_bopomofo_to_romaji = [
    (re.compile(re.escape(symbol)), romaji)
    for symbol, romaji in (
        ('ㄅㄛ', 'p⁼wo'),
        ('ㄆㄛ', 'pʰwo'),
        ('ㄇㄛ', 'mwo'),
        ('ㄈㄛ', 'fwo'),
        ('ㄅ', 'p⁼'),
        ('ㄆ', 'pʰ'),
        ('ㄇ', 'm'),
        ('ㄈ', 'f'),
        ('ㄉ', 't⁼'),
        ('ㄊ', 'tʰ'),
        ('ㄋ', 'n'),
        ('ㄌ', 'l'),
        ('ㄍ', 'k⁼'),
        ('ㄎ', 'kʰ'),
        ('ㄏ', 'h'),
        ('ㄐ', 'ʧ⁼'),
        ('ㄑ', 'ʧʰ'),
        ('ㄒ', 'ʃ'),
        ('ㄓ', 'ʦ`⁼'),
        ('ㄔ', 'ʦ`ʰ'),
        ('ㄕ', 's`'),
        ('ㄖ', 'ɹ`'),
        ('ㄗ', 'ʦ⁼'),
        ('ㄘ', 'ʦʰ'),
        ('ㄙ', 's'),
        ('ㄚ', 'a'),
        ('ㄛ', 'o'),
        ('ㄜ', 'ə'),
        ('ㄝ', 'e'),
        ('ㄞ', 'ai'),
        ('ㄟ', 'ei'),
        ('ㄠ', 'au'),
        ('ㄡ', 'ou'),
        ('ㄧㄢ', 'yeNN'),
        ('ㄢ', 'aNN'),
        ('ㄧㄣ', 'iNN'),
        ('ㄣ', 'əNN'),
        ('ㄤ', 'aNg'),
        ('ㄧㄥ', 'iNg'),
        ('ㄨㄥ', 'uNg'),
        ('ㄩㄥ', 'yuNg'),
        ('ㄥ', 'əNg'),
        ('ㄦ', 'əɻ'),
        ('ㄧ', 'i'),
        ('ㄨ', 'u'),
        ('ㄩ', 'ɥ'),
        # Tone marks -> pitch arrows; the neutral-tone dot is dropped.
        ('ˉ', '→'),
        ('ˊ', '↑'),
        ('ˇ', '↓↑'),
        ('ˋ', '↓'),
        ('˙', ''),
        # CJK / fullwidth punctuation -> ASCII.
        (',', ','),
        ('。', '.'),
        ('!', '!'),
        ('?', '?'),
        ('—', '-'),
    )
]
# (compiled regex, ipa) pairs applied on top of the romaji output by
# chinese_to_lazy_ipa. All patterns are compiled case-insensitively, and the
# longer 'ʃy' / 'ʧʰy' / 'ʧ⁼y' keys precede the bare 'y' key.
_romaji_to_ipa = [
    (re.compile(romaji, re.IGNORECASE), ipa)
    for romaji, ipa in (
        ('ʃy', 'ʃ'),
        ('ʧʰy', 'ʧʰ'),
        ('ʧ⁼y', 'ʧ⁼'),
        ('NN', 'n'),
        ('Ng', 'ŋ'),
        ('y', 'j'),
        ('h', 'x'),
    )
]
# (compiled regex, ipa) pairs, applied in order by bopomofo_to_ipa.
# Order matters: multi-symbol keys (e.g. 'ㄅㄛ', 'ㄩㄢ') must come before the
# single symbols they contain.
#
# Keys are plain literals, so they are passed through re.escape. The
# punctuation keys are the FULLWIDTH forms (',', '!', '?') mapped to ASCII,
# in line with the '。' -> '.' and '—' -> '-' entries; a bare ASCII '?' key is
# not a valid regular expression and makes re.compile raise at import time.
_bopomofo_to_ipa = [
    (re.compile(re.escape(symbol)), ipa)
    for symbol, ipa in (
        ('ㄅㄛ', 'p⁼wo'),
        ('ㄆㄛ', 'pʰwo'),
        ('ㄇㄛ', 'mwo'),
        ('ㄈㄛ', 'fwo'),
        ('ㄅ', 'p⁼'),
        ('ㄆ', 'pʰ'),
        ('ㄇ', 'm'),
        ('ㄈ', 'f'),
        ('ㄉ', 't⁼'),
        ('ㄊ', 'tʰ'),
        ('ㄋ', 'n'),
        ('ㄌ', 'l'),
        ('ㄍ', 'k⁼'),
        ('ㄎ', 'kʰ'),
        ('ㄏ', 'x'),
        ('ㄐ', 'tʃ⁼'),
        ('ㄑ', 'tʃʰ'),
        ('ㄒ', 'ʃ'),
        ('ㄓ', 'ts`⁼'),
        ('ㄔ', 'ts`ʰ'),
        ('ㄕ', 's`'),
        ('ㄖ', 'ɹ`'),
        ('ㄗ', 'ts⁼'),
        ('ㄘ', 'tsʰ'),
        ('ㄙ', 's'),
        ('ㄚ', 'a'),
        ('ㄛ', 'o'),
        ('ㄜ', 'ə'),
        ('ㄝ', 'ɛ'),
        ('ㄞ', 'aɪ'),
        ('ㄟ', 'eɪ'),
        ('ㄠ', 'ɑʊ'),
        ('ㄡ', 'oʊ'),
        ('ㄧㄢ', 'jɛn'),
        ('ㄩㄢ', 'ɥæn'),
        ('ㄢ', 'an'),
        ('ㄧㄣ', 'in'),
        ('ㄩㄣ', 'ɥn'),
        ('ㄣ', 'ən'),
        ('ㄤ', 'ɑŋ'),
        ('ㄧㄥ', 'iŋ'),
        ('ㄨㄥ', 'ʊŋ'),
        ('ㄩㄥ', 'jʊŋ'),
        ('ㄥ', 'əŋ'),
        ('ㄦ', 'əɻ'),
        ('ㄧ', 'i'),
        ('ㄨ', 'u'),
        ('ㄩ', 'ɥ'),
        # Tone marks -> pitch arrows; the neutral-tone dot is dropped.
        ('ˉ', '→'),
        ('ˊ', '↑'),
        ('ˇ', '↓↑'),
        ('ˋ', '↓'),
        ('˙', ''),
        # CJK / fullwidth punctuation -> ASCII.
        (',', ','),
        ('。', '.'),
        ('!', '!'),
        ('?', '?'),
        ('—', '-'),
    )
]
# (compiled regex, ipa2) pairs, applied in order by bopomofo_to_ipa2.
# Unlike the other IPA table, tones are written with tone letters (˥, ˧˥,
# ˨˩˦, ˥˩). Order matters: multi-symbol keys (e.g. 'ㄅㄛ', 'ㄩㄢ') must come
# before the single symbols they contain.
#
# Keys are plain literals, so they are passed through re.escape. The
# punctuation keys are the FULLWIDTH forms (',', '!', '?') mapped to ASCII,
# in line with the '。' -> '.' and '—' -> '-' entries; a bare ASCII '?' key is
# not a valid regular expression and makes re.compile raise at import time.
_bopomofo_to_ipa2 = [
    (re.compile(re.escape(symbol)), ipa)
    for symbol, ipa in (
        ('ㄅㄛ', 'pwo'),
        ('ㄆㄛ', 'pʰwo'),
        ('ㄇㄛ', 'mwo'),
        ('ㄈㄛ', 'fwo'),
        ('ㄅ', 'p'),
        ('ㄆ', 'pʰ'),
        ('ㄇ', 'm'),
        ('ㄈ', 'f'),
        ('ㄉ', 't'),
        ('ㄊ', 'tʰ'),
        ('ㄋ', 'n'),
        ('ㄌ', 'l'),
        ('ㄍ', 'k'),
        ('ㄎ', 'kʰ'),
        ('ㄏ', 'h'),
        ('ㄐ', 'tɕ'),
        ('ㄑ', 'tɕʰ'),
        ('ㄒ', 'ɕ'),
        ('ㄓ', 'tʂ'),
        ('ㄔ', 'tʂʰ'),
        ('ㄕ', 'ʂ'),
        ('ㄖ', 'ɻ'),
        ('ㄗ', 'ts'),
        ('ㄘ', 'tsʰ'),
        ('ㄙ', 's'),
        ('ㄚ', 'a'),
        ('ㄛ', 'o'),
        ('ㄜ', 'ɤ'),
        ('ㄝ', 'ɛ'),
        ('ㄞ', 'aɪ'),
        ('ㄟ', 'eɪ'),
        ('ㄠ', 'ɑʊ'),
        ('ㄡ', 'oʊ'),
        ('ㄧㄢ', 'jɛn'),
        ('ㄩㄢ', 'yæn'),
        ('ㄢ', 'an'),
        ('ㄧㄣ', 'in'),
        ('ㄩㄣ', 'yn'),
        ('ㄣ', 'ən'),
        ('ㄤ', 'ɑŋ'),
        ('ㄧㄥ', 'iŋ'),
        ('ㄨㄥ', 'ʊŋ'),
        ('ㄩㄥ', 'jʊŋ'),
        ('ㄥ', 'ɤŋ'),
        ('ㄦ', 'əɻ'),
        ('ㄧ', 'i'),
        ('ㄨ', 'u'),
        ('ㄩ', 'y'),
        # Tone marks -> tone letters; the neutral-tone dot is dropped.
        ('ˉ', '˥'),
        ('ˊ', '˧˥'),
        ('ˇ', '˨˩˦'),
        ('ˋ', '˥˩'),
        ('˙', ''),
        # CJK / fullwidth punctuation -> ASCII.
        (',', ','),
        ('。', '.'),
        ('!', '!'),
        ('?', '?'),
        ('—', '-'),
    )
]
# (compiled regex, replacement) pairs that spell out arithmetic notation in
# Chinese, applied in order by symbols_to_chinese. Replacements may use
# group back-references (\1, \2).
#
# Patterns are raw strings: sequences such as '\.', '\+' and '\*' are
# invalid escape sequences in an ordinary string literal and trigger a
# warning at compile time on current Python versions.
_symbols_to_chinese = [
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r'([0-9]+(?:\.?[0-9]+)?)%', r'百分之\1'),   # N%   -> 百分之N
        (r'([0-9]+)/([0-9]+)', r'\2分之\1'),         # A/B  -> B分之A
        (r'\+', r'加'),
        (r'([0-9]+)-([0-9]+)', r'\1减\2'),           # only between digits
        (r'×', r'乘以'),
        (r'([0-9]+)x([0-9]+)', r'\1乘以\2'),         # only between digits
        (r'([0-9]+)\*([0-9]+)', r'\1乘以\2'),        # only between digits
        (r'÷', r'除以'),
        (r'=', r'等于'),
        (r'≠', r'不等于'),
    )
]
def symbols_to_chinese(text):
    """Spell out arithmetic symbols in *text* as Chinese words.

    Every (pattern, replacement) pair of ``_symbols_to_chinese`` is applied
    in table order; each substitution sees the output of the previous one.
    """
    for pattern, spoken in _symbols_to_chinese:
        text = pattern.sub(spoken, text)
    return text
def number_to_chinese(text):
    """Replace each ASCII number in *text* with its ``cn2an.an2cn`` reading.

    A number is a run of digits, optionally followed by more digits with at
    most one '.' in between. Text without digits is returned unchanged.
    """
    return re.sub(
        r'[0-9]+(?:\.?[0-9]+)?',
        lambda found: cn2an.an2cn(found.group(0)),
        text,
    )
def chinese_to_bopomofo(text):
    """Convert the Chinese words of *text* to tone-marked bopomofo.

    The text is segmented with jieba. Segments without any character in
    U+4E00..U+9FFF are copied through untouched; every other segment is
    converted with ``lazy_pinyin(..., BOPOMOFO)`` and separated from the
    preceding output by a single space.
    """
    # Fold list/clause separators into a comma before segmentation.
    for separator in ('、', ';', ':'):
        text = text.replace(separator, ',')

    converted = ''
    for segment in jieba.lcut(text, cut_all=False):
        syllables = lazy_pinyin(segment, BOPOMOFO)
        if re.search('[\u4e00-\u9fff]', segment):
            # A syllable that ends in a bare bopomofo letter carries no tone
            # mark; give it an explicit 'ˉ'.
            toned = [
                re.sub(r'([\u3105-\u3129])$', r'\1ˉ', syllable)
                for syllable in syllables
            ]
            if converted:
                converted += ' '
            converted += ''.join(toned)
        else:
            converted += segment
    return converted
def latin_to_bopomofo(text):
    """Replace every Latin letter in *text* with its bopomofo spelling.

    Applies the pairs of ``_latin_to_bopomofo`` one after another.
    """
    for letter_pattern, reading in _latin_to_bopomofo:
        text = letter_pattern.sub(reading, text)
    return text
def bopomofo_to_romaji(text):
    """Rewrite bopomofo symbols, tone marks and punctuation as romaji.

    Applies the pairs of ``_bopomofo_to_romaji`` in table order.
    """
    for symbol_pattern, romaji in _bopomofo_to_romaji:
        text = symbol_pattern.sub(romaji, text)
    return text
def bopomofo_to_ipa(text):
    """Rewrite bopomofo symbols, tone marks and punctuation as IPA.

    Applies the pairs of ``_bopomofo_to_ipa`` in table order.
    """
    for symbol_pattern, ipa in _bopomofo_to_ipa:
        text = symbol_pattern.sub(ipa, text)
    return text
def bopomofo_to_ipa2(text):
    """Rewrite bopomofo as IPA with tone letters.

    Applies the pairs of ``_bopomofo_to_ipa2`` in table order.
    """
    for symbol_pattern, ipa in _bopomofo_to_ipa2:
        text = symbol_pattern.sub(ipa, text)
    return text
def chinese_to_romaji(text):
    """Transcribe Chinese *text* into the romaji notation.

    Symbols and numbers are spelled out, the text is converted to bopomofo
    (Latin letters included), mapped to romaji, and finally patched up with
    a few regex rules.
    """
    stages = (
        symbols_to_chinese,
        number_to_chinese,
        chinese_to_bopomofo,
        latin_to_bopomofo,
        bopomofo_to_romaji,
    )
    for stage in stages:
        text = stage(text)

    # 'i' / 'u' directly before another vowel become the glides 'y' / 'w'.
    text = re.sub('i([aoe])', r'y\1', text)
    text = re.sub('u([aoəe])', r'w\1', text)
    # A backtick-marked initial followed straight away by tone arrows, a
    # space or end of text gets 'ɹ`' appended.
    text = re.sub('([ʦsɹ]`[⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ`\2', text)
    text = text.replace('ɻ', 'ɹ`')
    # Same for plain 'ʦ' / 's' initials, which get 'ɹ' appended.
    text = re.sub('([ʦs][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text)
    return text
def chinese_to_lazy_ipa(text):
    """Transcribe Chinese *text* to IPA by way of the romaji notation.

    Runs ``chinese_to_romaji`` and then applies the ``_romaji_to_ipa``
    substitutions in table order.
    """
    transcription = chinese_to_romaji(text)
    for romaji_pattern, ipa in _romaji_to_ipa:
        transcription = romaji_pattern.sub(ipa, transcription)
    return transcription
def chinese_to_ipa(text):
    """Transcribe Chinese *text* into IPA with arrow tone marks.

    Symbols and numbers are spelled out, the text is converted to bopomofo
    (Latin letters included), mapped through ``bopomofo_to_ipa``, and
    finally patched up with a few regex rules.
    """
    stages = (
        symbols_to_chinese,
        number_to_chinese,
        chinese_to_bopomofo,
        latin_to_bopomofo,
        bopomofo_to_ipa,
    )
    for stage in stages:
        text = stage(text)

    # 'i' / 'u' directly before another vowel become the glides 'j' / 'w'.
    text = re.sub('i([aoe])', r'j\1', text)
    text = re.sub('u([aoəe])', r'w\1', text)
    # A backtick-marked initial followed straight away by tone arrows, a
    # space or end of text gets 'ɹ`' appended.
    text = re.sub('([sɹ]`[⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ`\2', text)
    text = text.replace('ɻ', 'ɹ`')
    # Same for a plain 's', which gets 'ɹ' appended.
    text = re.sub('([s][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text)
    return text
def chinese_to_ipa2(text):
    """Transcribe Chinese *text* into IPA with tone letters.

    Symbols and numbers are spelled out, the text is converted to bopomofo
    (Latin letters included), mapped through ``bopomofo_to_ipa2``, and
    finally patched up with a few regex rules.
    """
    stages = (
        symbols_to_chinese,
        number_to_chinese,
        chinese_to_bopomofo,
        latin_to_bopomofo,
        bopomofo_to_ipa2,
    )
    for stage in stages:
        text = stage(text)

    # 'i' / 'u' directly before another vowel become the glides 'j' / 'w'.
    text = re.sub(r'i([aoe])', r'j\1', text)
    text = re.sub(r'u([aoəe])', r'w\1', text)
    # An initial followed straight away by tone letters, a space or end of
    # text gets a vowel appended: 'ʅ' after ʂ/ɹ, 'ɿ' after s.
    # NOTE(review): the ipa2 table emits 'ɻ' for 'ㄖ', while this class lists
    # 'ɹ' -- confirm whether 'ɻ' was meant to be covered here.
    text = re.sub(r'([ʂɹ]ʰ?)([˩˨˧˦˥ ]+|$)', r'\1ʅ\2', text)
    text = re.sub(r'(sʰ?)([˩˨˧˦˥ ]+|$)', r'\1ɿ\2', text)
    return text