File size: 1,703 Bytes
bcdb559 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
import re
from Utils.phonemize.cotlet_phon import phonemize
from Utils.phonemize.cotlet_phon_dir_backend import latn_phonemize
# Make sure Japanese and romaji are separated by spaces when mixing them; otherwise the text goes into alphabet reading mode.
# Inclusive Unicode code-point ranges that are treated as Japanese script.
# Built once at import time instead of on every call.
_JAPANESE_RANGES = (
    (0x3005, 0x3007),  # Iteration mark (U+3005), U+3006, ideographic zero (U+3007)
    (0x3040, 0x309F),  # Hiragana
    (0x30A0, 0x30FF),  # Katakana
    (0x3400, 0x4DBF),  # CJK Unified Ideographs Extension A (rare kanji)
    (0x4E00, 0x9FFF),  # CJK Unified Ideographs (kanji)
    (0xFF66, 0xFF9F),  # Half-width katakana
)


def is_japanese(text):
    """Return True if *text* contains at least one Japanese character.

    A character counts as Japanese when its code point falls inside one of
    the ranges in ``_JAPANESE_RANGES`` (hiragana, katakana, half-width
    katakana, kanji, and the iteration/closing marks). Punctuation such as
    the ideographic full stop is deliberately not included, so a Latin word
    followed by Japanese punctuation is still reported as non-Japanese.

    Note that this is an "any" check, not an "all" check: a mixed string
    such as ``"hello 世界"`` returns True. An empty string returns False.

    Args:
        text: The string to inspect (a single character is fine).

    Returns:
        bool: True if any character of *text* is Japanese, else False.
    """
    for char in text:
        code_point = ord(char)
        if any(start <= code_point <= end for start, end in _JAPANESE_RANGES):
            return True
    return False
def has_only_japanese(text):
    """Return True if every non-whitespace character of *text* is Japanese.

    Whitespace characters are skipped; each remaining character is checked
    individually with ``is_japanese``. A string that is empty or contains
    only whitespace has nothing to reject and therefore returns True.
    """
    for char in text:
        if char.isspace():
            # Spacing never disqualifies a string.
            continue
        if not is_japanese(char):
            return False
    return True
def has_only_romaji(text):
    """Return True if every non-whitespace character of *text* is ASCII.

    "Romaji" here means any code point below 128, so digits and ASCII
    punctuation are accepted as well as letters. Whitespace (including
    non-ASCII whitespace) is ignored. An empty or whitespace-only string
    returns True.
    """
    for char in text:
        if char.isspace():
            continue
        if ord(char) >= 128:
            # First non-ASCII, non-space character settles the answer.
            return False
    return True
def mixed_phonemize(text):
    """Phonemize *text* token by token, choosing a backend per token.

    The text is split into alternating runs of non-whitespace and
    whitespace. Whitespace runs are copied through unchanged. A
    non-whitespace token that contains at least one Japanese character
    (per ``is_japanese``) is passed to ``phonemize``; any other token is
    passed to ``latn_phonemize``. The pieces are concatenated in their
    original order.

    Because the choice is made per whitespace-delimited token, a token that
    mixes Japanese and Latin characters is sent to ``phonemize`` as a whole.
    """
    def _convert(token):
        # Keep the original spacing exactly as written.
        if token.isspace():
            return token
        backend = phonemize if is_japanese(token) else latn_phonemize
        return backend(token)

    tokens = re.findall(r'\S+|\s+', text)
    # NOTE(review): both backends presumably return str, since the results
    # are joined - confirm.
    pieces = [_convert(token) for token in tokens]
    return ''.join(pieces)
def smart_phonemize(text):
    """Phonemize *text*, picking the backend from the scripts it contains.

    Dispatch order:
      1. If every non-whitespace character is Japanese (``has_only_japanese``),
         the whole string goes to ``phonemize``. This branch also catches
         empty and whitespace-only input, for which that check returns True.
      2. Otherwise, if every non-whitespace character is ASCII
         (``has_only_romaji``), the whole string goes to ``latn_phonemize``.
      3. Otherwise the string is handled token by token by
         ``mixed_phonemize``.

    Args:
        text: The string to phonemize.

    Returns:
        Whatever the selected backend returns for *text*.
    """
    if has_only_japanese(text):
        return phonemize(text)
    if has_only_romaji(text):
        return latn_phonemize(text)
    # Mixed scripts (or characters neither check accepts, e.g. Japanese
    # punctuation): fall back to per-token dispatch.
    return mixed_phonemize(text)