Respair
/

Tsukasa_Speech

Model card Files Files and versions Community

Tsukasa_Speech / Utils /phonemize /mixed_phon.py

Respair's picture

Upload folder using huggingface_hub

bcdb559 verified 23 days ago

1.7 kB

	import re
	from Utils.phonemize.cotlet_phon import phonemize
	from Utils.phonemize.cotlet_phon_dir_backend import latn_phonemize

	# Make sure the spacing is correct when mixing Japanese and romaji in one input; otherwise it goes into alphabet-reading mode.

	def is_japanese(text):
	    """Return True if *text* contains at least one Japanese character.

	    A character counts as Japanese when its code point falls inside one
	    of the three inclusive ranges listed below. Characters outside these
	    ranges are ignored, so an empty string yields False.
	    """
	    blocks = (
	        (0x3040, 0x309F),  # Hiragana
	        (0x30A0, 0x30FF),  # Katakana
	        (0x4E00, 0x9FFF),  # Kanji
	    )
	    # Short-circuits on the first character that lands in any range.
	    return any(
	        low <= ord(ch) <= high
	        for ch in text
	        for low, high in blocks
	    )

	def has_only_japanese(text):
	    """Return True if every non-whitespace character of *text* is Japanese.

	    Whitespace (anything ``str.isspace`` accepts) is skipped. Each
	    remaining character is checked with ``is_japanese``. Empty or
	    whitespace-only input is vacuously True.
	    """
	    for ch in text:
	        if ch.isspace():
	            continue
	        if not is_japanese(ch):
	            # One non-Japanese character is enough to reject the text.
	            return False
	    return True

	def has_only_romaji(text):
	    """Return True if every non-whitespace character of *text* is ASCII.

	    "Romaji" here means any code point below 128, so digits and ASCII
	    punctuation qualify too. Whitespace (anything ``str.isspace``
	    accepts, including non-ASCII spaces) is skipped. Empty or
	    whitespace-only input is vacuously True.
	    """
	    for ch in text:
	        if not ch.isspace() and ord(ch) >= 128:
	            # First non-ASCII, non-space character rejects the text.
	            return False
	    return True

	def mixed_phonemize(text):
	    """Phonemize text that mixes Japanese and Latin-script words.

	    The text is split into alternating runs of non-whitespace ("words")
	    and whitespace. Whitespace runs are copied through unchanged so the
	    original spacing survives. Each word goes to ``phonemize`` when it
	    contains at least one Japanese character (per ``is_japanese``) and
	    to ``latn_phonemize`` otherwise.

	    Args:
	        text: The input string; may be empty.

	    Returns:
	        The concatenation of the processed words and the untouched
	        whitespace runs, in input order. Assumes both phonemizers
	        return ``str`` -- TODO confirm against their implementations.
	    """
	    # The alternation has to be a bare ``|``. With an escaped ``\|`` the
	    # pattern only matched "word, literal pipe, whitespace", so ordinary
	    # text produced no tokens and the function returned ''.
	    tokens = re.findall(r'\S+|\s+', text)

	    pieces = []
	    for token in tokens:
	        if token.isspace():
	            # Keep spacing exactly as written.
	            pieces.append(token)
	        elif is_japanese(token):
	            pieces.append(phonemize(token))
	        else:
	            pieces.append(latn_phonemize(token))

	    return ''.join(pieces)

	def smart_phonemize(text):
	    """Phonemize *text* with the backend that matches its script.

	    Purely Japanese text (per ``has_only_japanese``) goes to
	    ``phonemize``, purely ASCII text (per ``has_only_romaji``) goes to
	    ``latn_phonemize``, and anything else goes to ``mixed_phonemize``.
	    The Japanese check runs first, so input that satisfies both checks
	    is handled by ``phonemize``.
	    """
	    if has_only_japanese(text):
	        backend = phonemize
	    elif has_only_romaji(text):
	        backend = latn_phonemize
	    else:
	        backend = mixed_phonemize
	    return backend(text)