# Spaces: Running on Zero  (web-page banner text captured with the source; not code)
# https://github.com/polm/cutlet/blob/master/cutlet/cutlet.py
import re
import unicodedata
from dataclasses import dataclass

import mojimoji
from fugashi import Tagger
from num2kana import Convert
# Kana -> phoneme-string lookup table.
#
# The values use IPA-style symbols (ɯ, ɕ, dʑ, ...) rather than plain ASCII
# romaji.  The table is built in stages, and the order of the stages matters:
#   1. single katakana U+30A1..U+30F6 (12449..12534), except ッ and ン, which
#      are handled by dedicated logic instead of a table entry
#   2. the hiragana equivalents, which sit exactly 96 code points lower
#   3. ヷ ヸ ヹ ヺ
#   4. the small katakana extension block U+31F0..U+31FF (12784..12799)
#   5. two-character katakana digraphs
#   6. the hiragana equivalents of those digraphs
#   7. punctuation and stray combining marks
# The asserts after each stage check that no entry was dropped or duplicated.
HEPBURN = {
    chr(12449):'a', #ァ
    chr(12450):'a', #ア
    chr(12451):'i', #ィ
    chr(12452):'i', #イ
    chr(12453):'ɯ', #ゥ
    chr(12454):'ɯ', #ウ
    chr(12455):'e', #ェ
    chr(12456):'e', #エ
    chr(12457):'o', #ォ
    chr(12458):'o', #オ
    chr(12459):'ka', #カ
    chr(12460):'ɡa', #ガ
    chr(12461):'ki', #キ
    chr(12462):'ɡi', #ギ
    chr(12463):'kɯ', #ク
    chr(12464):'ɡɯ', #グ
    chr(12465):'ke', #ケ
    chr(12466):'ɡe', #ゲ
    chr(12467):'ko', #コ
    chr(12468):'ɡo', #ゴ
    chr(12469):'sa', #サ
    chr(12470):'za', #ザ
    chr(12471):'ɕi', #シ
    chr(12472):'dʑi', #ジ
    chr(12473):'sɨ', #ス
    chr(12474):'zɨ', #ズ
    chr(12475):'se', #セ
    chr(12476):'ze', #ゼ
    chr(12477):'so', #ソ
    chr(12478):'zo', #ゾ
    chr(12479):'ta', #タ
    chr(12480):'da', #ダ
    chr(12481):'tɕi', #チ
    chr(12482):'dʑi', #ヂ
    # chr(12483) #ッ
    chr(12484):'tsɨ', #ツ
    chr(12485):'zɨ', #ヅ
    chr(12486):'te', #テ
    chr(12487):'de', #デ
    chr(12488):'to', #ト
    chr(12489):'do', #ド
    chr(12490):'na', #ナ
    chr(12491):'ɲi', #ニ
    chr(12492):'nɯ', #ヌ
    chr(12493):'ne', #ネ
    chr(12494):'no', #ノ
    chr(12495):'ha', #ハ
    chr(12496):'ba', #バ
    chr(12497):'pa', #パ
    chr(12498):'çi', #ヒ
    chr(12499):'bi', #ビ
    chr(12500):'pi', #ピ
    chr(12501):'ɸɯ', #フ
    chr(12502):'bɯ', #ブ
    chr(12503):'pɯ', #プ
    chr(12504):'he', #ヘ
    chr(12505):'be', #ベ
    chr(12506):'pe', #ペ
    chr(12507):'ho', #ホ
    chr(12508):'bo', #ボ
    chr(12509):'po', #ポ
    chr(12510):'ma', #マ
    chr(12511):'mi', #ミ
    chr(12512):'mɯ', #ム
    chr(12513):'me', #メ
    chr(12514):'mo', #モ
    chr(12515):'ja', #ャ
    chr(12516):'ja', #ヤ
    chr(12517):'jɯ', #ュ
    chr(12518):'jɯ', #ユ
    chr(12519):'jo', #ョ
    chr(12520):'jo', #ヨ
    chr(12521):'ra', #ラ
    chr(12522):'ri', #リ
    chr(12523):'rɯ', #ル
    chr(12524):'re', #レ
    chr(12525):'ro', #ロ
    chr(12526):'wa', #ヮ
    chr(12527):'wa', #ワ
    chr(12528):'i', #ヰ
    chr(12529):'e', #ヱ
    chr(12530):'o', #ヲ
    # chr(12531) #ン
    chr(12532):'vɯ', #ヴ
    chr(12533):'ka', #ヵ
    chr(12534):'ke', #ヶ
}
assert len(HEPBURN) == 84 and all(i in {12483, 12531} or chr(i) in HEPBURN for i in range(12449, 12535))

# Stage 2: mirror every katakana entry onto its hiragana counterpart
# (hiragana code point = katakana code point - 96).
for k, v in list(HEPBURN.items()):
    HEPBURN[chr(ord(k)-96)] = v
assert len(HEPBURN) == 84*2

# Stage 3: katakana with no hiragana counterpart in the mirrored range.
HEPBURN.update({
    chr(12535):'va', #ヷ
    chr(12536):'vi', #ヸ
    chr(12537):'ve', #ヹ
    chr(12538):'vo', #ヺ
})
assert len(HEPBURN) == 84*2+4 and all(chr(i) in HEPBURN for i in range(12535, 12539))

# Stage 4: small katakana extensions, mapped like their full-size forms.
HEPBURN.update({
    chr(12784):'kɯ', #ㇰ
    chr(12785):'ɕi', #ㇱ
    chr(12786):'sɨ', #ㇲ
    chr(12787):'to', #ㇳ
    chr(12788):'nɯ', #ㇴ
    chr(12789):'ha', #ㇵ
    chr(12790):'çi', #ㇶ
    chr(12791):'ɸɯ', #ㇷ
    chr(12792):'he', #ㇸ
    chr(12793):'ho', #ㇹ
    chr(12794):'mɯ', #ㇺ
    chr(12795):'ra', #ㇻ
    chr(12796):'ri', #ㇼ
    chr(12797):'rɯ', #ㇽ
    chr(12798):'re', #ㇾ
    chr(12799):'ro', #ㇿ
})
assert len(HEPBURN) == 84*2+4+16 and all(chr(i) in HEPBURN for i in range(12784, 12800))

# Stage 5: two-character katakana digraphs (full-size kana + small kana).
HEPBURN.update({
    chr(12452)+chr(12455):'je', #イェ
    chr(12454)+chr(12451):'wi', #ウィ
    chr(12454)+chr(12455):'we', #ウェ
    chr(12454)+chr(12457):'wo', #ウォ
    chr(12461)+chr(12455):'kʲe', #キェ
    chr(12461)+chr(12515):'kʲa', #キャ
    chr(12461)+chr(12517):'kʲɨ', #キュ
    chr(12461)+chr(12519):'kʲo', #キョ
    chr(12462)+chr(12515):'ɡʲa', #ギャ
    chr(12462)+chr(12517):'ɡʲɨ', #ギュ
    chr(12462)+chr(12519):'ɡʲo', #ギョ
    chr(12463)+chr(12449):'kʷa', #クァ
    chr(12463)+chr(12451):'kʷi', #クィ
    chr(12463)+chr(12455):'kʷe', #クェ
    chr(12463)+chr(12457):'kʷo', #クォ
    chr(12464)+chr(12449):'ɡʷa', #グァ
    chr(12464)+chr(12451):'ɡʷi', #グィ
    chr(12464)+chr(12455):'ɡʷe', #グェ
    chr(12464)+chr(12457):'ɡʷo', #グォ
    chr(12471)+chr(12455):'ɕe', #シェ
    chr(12471)+chr(12515):'ɕa', #シャ
    chr(12471)+chr(12517):'ɕɨ', #シュ
    chr(12471)+chr(12519):'ɕo', #ショ
    chr(12472)+chr(12455):'dʑe', #ジェ
    chr(12472)+chr(12515):'dʑa', #ジャ
    chr(12472)+chr(12517):'dʑɨ', #ジュ
    chr(12472)+chr(12519):'dʑo', #ジョ
    chr(12481)+chr(12455):'tɕe', #チェ
    chr(12481)+chr(12515):'tɕa', #チャ
    chr(12481)+chr(12517):'tɕɨ', #チュ
    chr(12481)+chr(12519):'tɕo', #チョ
    chr(12482)+chr(12515):'dʑa', #ヂャ
    chr(12482)+chr(12517):'dʑɨ', #ヂュ
    chr(12482)+chr(12519):'dʑo', #ヂョ
    chr(12484)+chr(12449):'tsa', #ツァ
    chr(12484)+chr(12451):'tsi', #ツィ
    chr(12484)+chr(12455):'tse', #ツェ
    chr(12484)+chr(12457):'tso', #ツォ
    chr(12486)+chr(12451):'ti', #ティ
    chr(12486)+chr(12517):'tʲɨ', #テュ
    chr(12487)+chr(12451):'di', #ディ
    chr(12487)+chr(12517):'dʲɨ', #デュ
    chr(12488)+chr(12453):'tɯ', #トゥ
    chr(12489)+chr(12453):'dɯ', #ドゥ
    chr(12491)+chr(12455):'ɲe', #ニェ
    chr(12491)+chr(12515):'ɲa', #ニャ
    chr(12491)+chr(12517):'ɲɨ', #ニュ
    chr(12491)+chr(12519):'ɲo', #ニョ
    chr(12498)+chr(12455):'çe', #ヒェ
    chr(12498)+chr(12515):'ça', #ヒャ
    chr(12498)+chr(12517):'çɨ', #ヒュ
    chr(12498)+chr(12519):'ço', #ヒョ
    chr(12499)+chr(12515):'bʲa', #ビャ
    chr(12499)+chr(12517):'bʲɨ', #ビュ
    chr(12499)+chr(12519):'bʲo', #ビョ
    chr(12500)+chr(12515):'pʲa', #ピャ
    chr(12500)+chr(12517):'pʲɨ', #ピュ
    chr(12500)+chr(12519):'pʲo', #ピョ
    chr(12501)+chr(12449):'ɸa', #ファ
    chr(12501)+chr(12451):'ɸi', #フィ
    chr(12501)+chr(12455):'ɸe', #フェ
    chr(12501)+chr(12457):'ɸo', #フォ
    chr(12501)+chr(12517):'ɸʲɨ', #フュ
    chr(12501)+chr(12519):'ɸʲo', #フョ
    chr(12511)+chr(12515):'mʲa', #ミャ
    chr(12511)+chr(12517):'mʲɨ', #ミュ
    chr(12511)+chr(12519):'mʲo', #ミョ
    chr(12522)+chr(12515):'rʲa', #リャ
    chr(12522)+chr(12517):'rʲɨ', #リュ
    chr(12522)+chr(12519):'rʲo', #リョ
    chr(12532)+chr(12449):'va', #ヴァ
    chr(12532)+chr(12451):'vi', #ヴィ
    chr(12532)+chr(12455):'ve', #ヴェ
    chr(12532)+chr(12457):'vo', #ヴォ
    chr(12532)+chr(12517):'vʲɨ', #ヴュ
    chr(12532)+chr(12519):'vʲo', #ヴョ
})
assert len(HEPBURN) == 84*2+4+16+76

# Stage 6: mirror the digraphs onto hiragana.  At this point the only
# two-character keys are the katakana digraphs added above, which is why
# this loop must run before the punctuation entries are added.
for k, v in list(HEPBURN.items()):
    if len(k) != 2:
        continue
    a, b = k
    assert a in HEPBURN and b in HEPBURN, (a, b)
    a = chr(ord(a)-96)
    b = chr(ord(b)-96)
    assert a in HEPBURN and b in HEPBURN, (a, b)
    HEPBURN[a+b] = v
assert len(HEPBURN) == 84*2+4+16+76*2

# Stage 7: punctuation and marks that have no sound of their own.
HEPBURN.update({
    # symbols
    # 'ー': '-', # 長音符, only used when repeated
    '。': '.',
    '、': ',',
    '?': '?',
    '!': '!',
    '「': '"',
    '」': '"',
    '『': '"',
    '』': '"',
    ':': ':',
    '(': '(',
    ')': ')',
    '《': '(',
    '》': ')',
    '【': '[',
    '】': ']',
    '・': ' ',#'/',
    ',': ',',
    '~': '—',
    '〜': '—',
    '—': '—',
    '«': '«',
    '»': '»',
    # other
    '゚': '', # combining handakuten by itself, just discard
    '゙': '', # combining dakuten by itself
})
def add_dakuten(kk):
    """Return the voiced (dakuten) form of a single kana, or None.

    Args:
        kk: a one-character string.  Katakana and hiragana from the
            k/s/t/h rows are supported.

    Returns:
        The kana with a dakuten added, or None when ``kk`` is empty, longer
        than one character, or has no voiced counterpart.  None is the
        normal result for nonsense input, not an error.
    """
    # Guard the length explicitly: a substring search would otherwise match
    # the empty string (and multi-character runs) at index 0.
    if not kk or len(kk) != 1:
        return None
    plain = ('カキクケコサシスセソタチツテトハヒフヘホ'
             'かきくけこさしすせそたちつてとはひふへほ')
    voiced = ('ガギグゲゴザジズゼゾダヂヅデドバビブベボ'
              'がぎぐげござじずぜぞだぢづでどばびぶべぼ')
    ii = plain.find(kk)
    return voiced[ii] if ii >= 0 else None
# Small katakana that combine with the preceding kana.
SUTEGANA = 'ャュョァィゥェォ' #'ゃゅょぁぃぅぇぉ'
PUNCT = '\'".!?(),;:-'
# Iteration marks: 々 (kanji), 〃 (ditto mark), ゝ/ゞ (hiragana, plain/voiced)
# and ヽ/ヾ (katakana, plain/voiced).  The voiced katakana mark ヾ must be
# listed here, otherwise it is never recognised as an iteration mark.
ODORI = '々〃ゝゞヽヾ'
@dataclass
class Token:
    """One piece of output text plus whether a space follows it.

    The ``@dataclass`` decorator supplies the ``Token(surface, space)``
    constructor; without it the class cannot be called with arguments.
    Instances are mutable so that spacing can be adjusted after creation.
    """

    surface: str
    space: bool  # if a space should follow

    def __str__(self):
        """Return the surface text, followed by one space when ``space`` is set."""
        sp = " " if self.space else ""
        return f"{self.surface}{sp}"
class Katsu:
    """Convert Japanese text to a phoneme string using a tagger and a kana table."""

    def __init__(self):
        """Create a Katsu object, which holds configuration as well as
        tokenizer state.

        Typical usage:

        ```python
        katsu = Katsu()
        roma = katsu.romaji("カツカレーを食べた")
        ```

        Attributes set here:
            tagger: the fugashi ``Tagger`` used to split text into words.
            table: a private copy of ``HEPBURN``, safe to modify per instance.
            exceptions: surface form -> output overrides, empty by default.
        """
        self.tagger = Tagger()
        self.table = dict(HEPBURN)  # make a copy so we can modify it
        self.exceptions = {}

    def romaji(self, text):
        """Build a complete output string from input text.

        Returns '' for empty input.  Otherwise the text is normalized,
        tokenized, mapped word by word, and runs of whitespace in the result
        are collapsed to single spaces.
        """
        if not text:
            return ''
        text = self._normalize_text(text)
        words = self.tagger(text)
        tokens = self._romaji_tokens(words)
        out = ''.join(str(tok) for tok in tokens)
        return re.sub(r'\s+', ' ', out.strip())

    def phonemize(self, texts):
        """Convert each text in ``texts``; returns a list in the same order."""
        # espeak-ng API
        return [self.romaji(text) for text in texts]

    def _normalize_text(self, text):
        """Given text, normalize variations in Japanese.

        This removes variations that are meaningless for conversion, in
        this order:

        - A wave dash directly before a digit becomes 'から' (numeric range)
        - Unicode NFKC normalization
        - Full-width Latin to half-width
        - Half-width katakana to full-width
        - Each run of digits is passed to ``Convert`` and prefixed with a space
        """
        text = re.sub(r'[〜~](?=\d)', 'から', text)  # wave dash range
        # perform unicode normalization
        text = unicodedata.normalize('NFKC', text)
        # convert all full-width alphanum to half-width, since it can go out as-is
        text = mojimoji.zen_to_han(text, kana=False)
        # replace half-width katakana with full-width
        text = mojimoji.han_to_zen(text, digit=False, ascii=False)
        # split into alternating digit / non-digit runs and spell out the digits
        return ''.join(
            (' ' + Convert(t)) if t.isdigit() else t
            for t in re.findall(r'\d+|\D+', text)
        )

    def _romaji_tokens(self, words):
        """Build a list of tokens from input nodes.

        Each word becomes one ``Token``.  The ``space`` flags of the new
        token and of the previous one are set so that opening brackets
        attach to what follows and closing punctuation to what precedes.
        """
        out = []
        for word in words:
            po = out[-1] if out else None
            roma = self._romaji_word(word)
            tok = Token(roma, False)
            # handle punctuation with atypical spacing
            # NOTE: `in` is a substring test here, so an empty `roma` also
            # takes the first branch.
            surface = word.surface
            if surface in '「『' or roma in '([':
                if po:
                    po.space = True
            elif surface in '」』' or roma in ']).,?!:':
                if po:
                    po.space = False
                tok.space = True
            elif roma == ' ':
                tok.space = False
            else:
                tok.space = True
            out.append(tok)
        # remove any leftover sokuon
        for tok in out:
            tok.surface = tok.surface.replace(chr(12483), '')
        return out

    def _romaji_word(self, word):
        """Return the output string for a single word (node).

        Lookup order: an entry in ``self.exceptions``; ASCII surfaces
        unchanged; otherwise the word's reading (``pron``, then ``kana``,
        then the surface itself) mapped kana by kana.  Unknown words are
        only mapped when they are katakana (char_type 7) or symbols
        (char_type 3); any other unknown word yields ''.
        """
        surface = word.surface
        if surface in self.exceptions:
            return self.exceptions[surface]
        # digit runs are expected to have been spelled out before tagging
        assert not surface.isdigit(), surface
        if surface.isascii():
            return surface
        kana = word.feature.pron or word.feature.kana or surface
        if word.is_unk:
            if word.char_type == 7:  # katakana
                pass
            elif word.char_type == 3:  # symbol
                return ''.join(self.table.get(c, c) for c in surface)
            else:
                return ''  # TODO: silently fail
        last = len(kana) - 1
        return ''.join(
            self._get_single_mapping(
                kana[ki - 1] if ki > 0 else None,
                char,
                kana[ki + 1] if ki < last else None,
            )
            for ki, char in enumerate(kana)
        )

    def _get_single_mapping(self, pk, kk, nk):
        """Given a single kana and its neighbors, return the mapped string.

        Args:
            pk: previous kana, or None at the start of the word.
            kk: the kana being mapped (one character).
            nk: next kana, or None at the end of the word.
        """
        # handle odoriji
        # NOTE: This is very rarely useful at present because odoriji are not
        # left in readings for dictionary words, and we can't follow kana
        # across word boundaries.
        if kk in ODORI:
            if kk in 'ゝヽ':
                # repeat the previous kana: emit its mapping, not the raw
                # kana character; '' when there is nothing to repeat
                return self.table.get(pk, '') if pk else ''
            if kk in 'ゞヾ':  # repeat with voicing
                if not pk:
                    return ''
                vv = add_dakuten(pk)
                return self.table.get(vv, '') if vv else ''
            # remaining are 々 for kanji and 〃 for symbols, but we can't
            # infer their span reliably (or handle rendaku)
            return ''
        # handle digraphs
        if pk and (pk + kk) in self.table:
            return self.table[pk + kk]
        if nk and (kk + nk) in self.table:
            # the pair is emitted when the second kana is processed
            return ''
        if nk and nk in SUTEGANA:
            if kk == 'ッ':
                return ''  # never valid, just ignore
            base = self.table.get(kk)
            if base is not None:
                # drop the final vowel and attach the small kana's sound
                return base[:-1] + self.table[nk]
            # kk has no table entry (e.g. ン or ー): fall through to the
            # dedicated handling below instead of raising KeyError
        if kk in SUTEGANA:
            return ''
        if kk == 'ー':  # 長音符
            return 'ː'
        if ord(kk) in {12387, 12483}:  # っ or ッ
            # double the first sound of the following kana when it is known
            tnk = self.table.get(nk)
            if tnk and tnk[0] in 'bdɸɡhçijkmnɲoprstɯvwz':
                return tnk[0]
            return kk
        if ord(kk) in {12435, 12531}:  # ん or ン
            # https://en.wikipedia.org/wiki/N_(kana)
            # m before m,p,b
            # ŋ before k,g
            # ɲ before ɲ,tɕ,dʑ
            # n before n,t,d,r,z
            # ɴ otherwise
            tnk = self.table.get(nk)
            if tnk:
                if tnk[0] in 'mpb':
                    return 'm'
                elif tnk[0] in 'kɡ':
                    return 'ŋ'
                elif tnk.startswith(('ɲ', 'tɕ', 'dʑ')):
                    return 'ɲ'
                elif tnk[0] in 'ntdrz':
                    return 'n'
            return 'ɴ'
        return self.table.get(kk, '')