File size: 5,320 Bytes
bcdb559 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 |
import re

# The wildcard import stays ahead of `import cutlet` so the explicit import
# still wins if both provide the same name.
from Utils.phonemize.cotlet_utils import *
import cutlet
# Shared module-level Cutlet instance; process_japanese_text() calls
# katsu.romaji() on it for every input.
# ensure_ascii=False: presumably lets non-ASCII characters pass through the
# romaji output unchanged -- confirm against the cutlet documentation.
katsu = cutlet.Cutlet(ensure_ascii=False)
# NOTE(review): presumably makes loanwords romanise by their Japanese reading
# rather than their original foreign spelling -- confirm against cutlet docs.
katsu.use_foreign_spelling = False
def process_japanese_text(ml):
    """Convert Japanese text into a phoneme string.

    The steps run in a fixed order, and later steps depend on the output of
    earlier ones:

    1. Widen the small vowel kana (ぁぃぅぇぉ) to their full-size forms.
    2. Pre-process with ``alphabetreading`` and ``apply_transformations``,
       then romanise with the module-level ``katsu`` converter and lowercase.
    3. Apply literal rewrites to the romaji (``j``, doubled ``t``, ``ssh``).
    4. Run the helper chain ``convert_numbers_in_string`` -> ``Roma2IPA`` ->
       ``hira2ipa`` -> ``replace_chars_2`` -> ``replace_tashdid_2`` ->
       ``replace_repeated_chars`` -> ``nasal_mapper``.
    5. Apply a handful of hard-coded phrase corrections.

    Args:
        ml: Input text as a ``str``.

    Returns:
        The processed string with leading whitespace stripped.
    """
    # One pass instead of five chained replaces; none of the targets is also
    # a source character, so the result is the same.
    ml = ml.translate(str.maketrans("ぁぃぅぇぉ", "あいうえお"))

    output = katsu.romaji(
        apply_transformations(alphabetreading(ml)), capitalize=False
    ).lower()

    # Order matters: 'tt' and 't t' must become 'ʔt' before the rule that
    # removes the space in front of 'ʔt' can see them.
    romaji_rewrites = (
        ('j', "dʑ"),
        ('tt', "ʔt"),
        ('t t', "ʔt"),
        (' ʔt', "ʔt"),
        ('ssh', "ɕɕ"),
    )
    for old, new in romaji_rewrites:
        output = output.replace(old, new)

    # Romaji -> IPA, then the post-processing helpers.
    output = Roma2IPA(convert_numbers_in_string(output))
    output = hira2ipa(output)
    output = replace_chars_2(output)
    output = replace_repeated_chars(replace_tashdid_2(output))
    output = nasal_mapper(output)

    # Hard-coded corrections for specific phrases.
    output = output.replace(" ɴ", "ɴ")
    # NOTE(review): this also drops the spaces on both sides of the phrase --
    # confirm that is intended.
    output = output.replace(' neɽitai ', "naɽitai")
    output = output.replace('harɯdʑisama', "arɯdʑisama")

    # Insert a space before these phrases when they are glued to the
    # preceding text. A match at the very start also gets a space, which the
    # final lstrip() removes.
    output = re.sub(r'(?<!\s)ki ni ɕinai', r' ki ni ɕinai', output)
    # NOTE(review): a substitution of 'ʔt' with the identical text 'ʔt' used
    # to sit here; it changed nothing and was removed. If a space was meant
    # to be inserted, that needs to be decided and added explicitly.
    output = re.sub(r'(?<!\s)de aɽoɯ', r' de aɽoɯ', output)

    return output.lstrip()
# def replace_repeating_patterns(text):
# def replace_repeats(match):
# pattern = match.group(1)
# if len(match.group(0)) // len(pattern) >= 3:
# return pattern + "~~~"
# return match.group(0)
# # Pattern for space-separated repeats
# pattern1 = r'((?:\S+\s+){1,5}?)(?:\1){2,}'
# # Pattern for continuous repeats without spaces
# pattern2 = r'(.+?)\1{2,}'
# text = re.sub(pattern1, replace_repeats, text)
# text = re.sub(pattern2, replace_repeats, text)
# return text
def replace_repeating_a(output):
    """Collapse drawn-out long vowels (aː, oː, eː, e) into a ``~`` marker.

    The rules are applied one after another, each to the result of the
    previous one, so their order is significant. The rules built around a
    back-reference (``\\1+\\s*``) also consume any whitespace that follows the
    repeated vowel.

    Args:
        output: Phoneme string to rewrite.

    Returns:
        The string after every rule has been applied in sequence.
    """
    rules = (
        # "a" family
        (r'(aː)\s*\1+\s*', r'\1~'),   # "aː aː"  -> "aː~"
        (r'(aːa)\s*aː', r'\1~'),      # "aːa aː" -> "aːa~" (next rule -> "aː~~")
        (r'aːa', r'aː~'),             # "aːa"    -> "aː~"
        (r'naː\s*aː', r'naː~'),       # "naː aː" -> "naː~"
        # "o" family
        (r'(oː)\s*\1+\s*', r'\1~'),   # "oː oː"  -> "oː~"
        (r'(oːo)\s*oː', r'\1~'),      # "oːo oː" -> "oːo~" (next rule -> "oː~~")
        (r'oːo', r'oː~'),             # "oːo"    -> "oː~"
        # "e" family
        (r'(eː)\s*\1+\s*', r'\1~'),   # "eː eː"  -> "eː~"
        (r'(e)\s*\1+\s*', r'\1~'),    # "ee" / "e e" -> "e~"
        (r'(eːe)\s*eː', r'\1~'),      # "eːe eː" -> "eːe~" (next rule -> "eː~~")
        (r'eːe', r'eː~'),             # "eːe"    -> "eː~"
        (r'neː\s*eː', r'neː~'),       # "neː eː" -> "neː~"
    )

    rewritten = output
    for regex, substitute in rules:
        rewritten = re.sub(regex, substitute, rewritten)
    return rewritten
def phonemize(text):
    """Turn raw text into the final phoneme string.

    Runs ``process_japanese_text`` and ``post_fix``, then applies an ordered
    set of literal clean-ups around ``replace_repeating_a``, and finally the
    ``random_space_fix`` / ``random_sym_fix`` / ``random_sym_fix_no_space``
    helpers.

    Args:
        text: Input text.

    Returns:
        The phoneme string with leading whitespace stripped.
    """
    phonemes = post_fix(process_japanese_text(text))

    # Literal rewrites applied before the long-vowel collapsing step.
    # They run in sequence, each on the result of the previous one.
    before_vowel_pass = (
        (" ɴ", "ɴ"),
        ("y", "j"),
        ("ɯa", "wa"),
        ("a aː", "a~"),
        ("a a", "a~"),
    )
    for needle, substitute in before_vowel_pass:
        phonemes = phonemes.replace(needle, substitute)

    phonemes = replace_repeating_a(phonemes)
    # Pull each "~" back onto the preceding token by removing the whitespace
    # in front of it.
    phonemes = re.sub(r'\s+~', '~', phonemes)

    # Hand-written fixes for leftovers the regex pass does not catch.
    # The order is significant: later entries match the output of earlier ones.
    after_vowel_pass = (
        ("oː~o oː~ o", "oː~~~~~~"),
        ("aː~aː", "aː~~~"),
        ("oɴ naː", "onnaː"),
        ("aː~~ aː", "aː~~~~"),
        ("oː~o", "oː~~"),
        ("oː~~o o", "oː~~~~"),
    )
    for needle, substitute in after_vowel_pass:
        phonemes = phonemes.replace(needle, substitute)

    phonemes = random_space_fix(phonemes)
    # Per the original note: handles symbols that have surrounding whitespace,
    # e.g. "miku& sakura" -> "miku ando sakura".
    phonemes = random_sym_fix(phonemes)
    # Per the original note: the same for symbols with no whitespace,
    # e.g. "miku&sakura" -> "miku ando sakura".
    phonemes = random_sym_fix_no_space(phonemes)

    return phonemes.lstrip()
# def process_row(row):
# return {'phonemes': [phonemize(word) for word in row['phonemes']]}
|