import re | |
from Utils.phonemize.cotlet_phon import phonemize | |
from Utils.phonemize.cotlet_phon_dir_backend import latn_phonemize | |
# Make sure the spacing is correct when mixing Japanese and romaji; otherwise the text goes into alphabet reading mode.
def is_japanese(text):
    """Return True if ``text`` contains at least one Japanese character.

    A character counts as Japanese when its code point falls inside the
    Hiragana (U+3040-U+309F), Katakana (U+30A0-U+30FF) or Kanji
    (U+4E00-U+9FFF) range. An empty string yields False.
    """
    blocks = (
        (0x3040, 0x309F),  # Hiragana
        (0x30A0, 0x30FF),  # Katakana
        (0x4E00, 0x9FFF),  # Kanji
    )
    # A single hit anywhere in the text is enough.
    return any(
        low <= ord(ch) <= high
        for ch in text
        for low, high in blocks
    )
def has_only_japanese(text):
    """Return True if every non-whitespace character of ``text`` is Japanese.

    Whitespace characters are ignored. Text that is empty or consists only
    of whitespace yields True, because there is no character left to fail
    the check.
    """
    return all(is_japanese(ch) for ch in text if not ch.isspace())
def has_only_romaji(text):
    """Return True if every non-whitespace character of ``text`` is ASCII.

    Whitespace characters (including non-ASCII ones) are ignored. Text that
    is empty or consists only of whitespace yields True.
    """
    # Any visible character with a code point of 128 or above disqualifies
    # the text.
    return not any(ord(ch) >= 128 for ch in text if not ch.isspace())
def mixed_phonemize(text):
    """Phonemize ``text`` token by token, choosing a backend per token.

    The text is split into alternating runs of non-whitespace and
    whitespace. Whitespace runs are kept verbatim. A token containing at
    least one Japanese character is passed to ``phonemize``; every other
    token is passed to ``latn_phonemize``. The pieces are concatenated in
    their original order.
    """
    def convert(token):
        # Whitespace is passed through untouched so spacing is preserved.
        if token.isspace():
            return token
        backend = phonemize if is_japanese(token) else latn_phonemize
        return backend(token)

    pieces = [convert(token) for token in re.findall(r'\S+|\s+', text)]
    return ''.join(pieces)
def smart_phonemize(text):
    """Phonemize ``text`` with the backend that matches its script.

    Text whose non-whitespace characters are all Japanese is passed whole
    to ``phonemize``; text whose non-whitespace characters are all ASCII
    is passed whole to ``latn_phonemize``; anything else is handled token
    by token by ``mixed_phonemize``. The Japanese check runs first.
    """
    if has_only_japanese(text):
        return phonemize(text)
    if has_only_romaji(text):
        return latn_phonemize(text)
    return mixed_phonemize(text)