# IPA Phonemizer: https://github.com/bootphon/phonemizer | |
_pad = "$" | |
_punctuation = ';:,.!?¡¿—…"«»“” ' | |
_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" | |
_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" | |
# Export all symbols: | |
symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa) | |
dicts = {} | |
for i in range(len((symbols))): | |
dicts[symbols[i]] = i | |
class TextCleaner: | |
def __init__(self, dummy=None): | |
self.word_index_dictionary = dicts | |
print(len(dicts)) | |
def __call__(self, text): | |
indexes = [] | |
for char in text: | |
try: | |
indexes.append(self.word_index_dictionary[char]) | |
except KeyError: | |
print(text) | |
return indexes | |