import re  # used by process_latin_text; previously reachable only through the star import below

from Utils.phonemize.cotlet_utils import *
import cutlet

# Module-level Cutlet instance. It is not used by process_latin_text (the
# romaji call there is commented out) but is kept because other code may
# reference it.
katsu = cutlet.Cutlet(ensure_ascii=False)
katsu.use_foreign_spelling = False

# Literal rewrites applied, in this order, to the lower-cased input before
# IPA conversion. Order matters: 'j' must become "dʑ" before 'y' is turned
# into "j", and ' ʔt' is collapsed only after 'tt' / 't t' have produced "ʔt".
_PRE_IPA_REWRITES = (
    ("j", "dʑ"),
    ("y", "j"),
    ("tt", "ʔt"),
    ("t t", "ʔt"),
    (" ʔt", "ʔt"),
    ("ssh", "ɕɕ"),
)

# Literal rewrites applied, in this order, after the helper pipeline has run.
_POST_IPA_REWRITES = (
    (" ɴ", "ɴ"),
    (" neɽitai ", "naɽitai"),
    ("harɯdʑisama", "arɯdʑisama"),
)


def process_latin_text(ml: str) -> str:
    """Turn Latin-script (romaji) text into a phoneme string.

    The input is lower-cased, a fixed list of literal substitutions is
    applied, and the result is passed through the conversion helpers
    star-imported from ``Utils.phonemize.cotlet_utils`` (``Roma2IPA``,
    ``hira2ipa``, ``replace_chars_2``, ``replace_tashdid_2``,
    ``replace_repeated_chars``, ``nasal_mapper``,
    ``random_sym_fix_no_space``). A few literal post-fixes are applied
    afterwards.

    Args:
        ml: Text to convert. Only ``str.lower`` is applied before the
            rewrites; the Cutlet romaji step is disabled.

    Returns:
        The converted string with leading whitespace removed.
    """
    # Kana pre-processing and the Cutlet romaji conversion are disabled for
    # this entry point; the original call is kept for reference:
    # output = katsu.romaji(ml, capitalize=False).lower()
    output = ml.lower()

    # Replace specific romaji sequences.
    for old, new in _PRE_IPA_REWRITES:
        output = output.replace(old, new)

    # Convert romaji to IPA.
    output = Roma2IPA(convert_numbers_in_string(output))
    output = hira2ipa(output)

    # Apply additional transformations.
    output = replace_chars_2(output)
    output = replace_repeated_chars(replace_tashdid_2(output))
    output = nasal_mapper(output)

    # Final adjustments.
    for old, new in _POST_IPA_REWRITES:
        output = output.replace(old, new)

    if "ki ni ɕinai" in output:
        # NOTE(review): the source was truncated right after "(?" in this
        # pattern. It is reconstructed here as a negative lookbehind that
        # inserts a space before the phrase when none precedes it — confirm
        # against the upstream file.
        output = re.sub(r"(?<!\s)ki ni ɕinai", " ki ni ɕinai", output)

    # NOTE(review): the text between the truncated regex above and the call
    # below was lost from the source. It may have contained further steps
    # (the surviving comment says "same as above", presumably referring to a
    # variant that handles symbols surrounded by white space) — TODO restore
    # from the upstream file.

    # For symbols written without white space, such as
    # miku&sakura -> miku ando sakura.
    output = random_sym_fix_no_space(output)

    # if "ɯ" in output:
    #     output = output.replace("ɯ","U")ss
    # if "ʔ" in output:
    #     output = output.replace("ʔ","!")
    return output.lstrip()


# def process_row(row):
#     return {'phonemes': [phonemize(word) for word in row['phonemes']]}