Flux9665 committed on
Commit 606f0bc
1 Parent(s): 46f4c3c

Upload 7 files

ArticulatoryTextFrontend.py ADDED
@@ -0,0 +1,1073 @@
+ # -*- coding: utf-8 -*-
+
+
+ import json
+ import logging
+ import re
+
+ import torch
+ from articulatory_features import generate_feature_table
+ from articulatory_features import get_feature_to_index_lookup
+ from articulatory_features import get_phone_to_id
+ from dragonmapper.transcriptions import pinyin_to_ipa
+ from phonemizer.backend import EspeakBackend
+ from pypinyin import pinyin
+
+
+ def load_json_from_path(path):  # redundant to the one in utils, but necessary to avoid circular imports
+     with open(path, "r", encoding="utf8") as f:
+         obj = json.loads(f.read())
+
+     return obj
+
+
+ class ArticulatoryTextFrontend:
+
+     def __init__(self,
+                  language,
+                  add_silence_to_end=True,  # empirically, I found that most TTS systems benefit from having a pause token at the end
+                  use_word_boundaries=True,  # in multilingual scenarios, this can help. Only really works on languages that use whitespace
+                  use_explicit_end_of_utterance_token=True,  # this can help with autoregressive models
+                  device="cpu"):
+         """
+         Mostly preparing ID lookups
+         """
+
+         # this locks the device, so it has to happen here and not at the top
+         from transphone.g2p import read_g2p
+
+         self.language = language
+         self.use_explicit_eos = use_explicit_end_of_utterance_token
+         self.add_silence_to_end = add_silence_to_end
+         self.use_word_boundaries = use_word_boundaries
+
+         register_to_height = {
+             "˥": 5,
+             "˦": 4,
+             "˧": 3,
+             "˨": 2,
+             "˩": 1
+         }
+         self.rising_perms = list()
+         self.falling_perms = list()
+         self.peaking_perms = list()
+         self.dipping_perms = list()
+
+         for first_tone in ["˥", "˦", "˧", "˨", "˩"]:
+             for second_tone in ["˥", "˦", "˧", "˨", "˩"]:
+                 if register_to_height[first_tone] > register_to_height[second_tone]:
+                     self.falling_perms.append(first_tone + second_tone)
+                 else:
+                     self.rising_perms.append(first_tone + second_tone)
+                 for third_tone in ["˥", "˦", "˧", "˨", "˩"]:
+                     if register_to_height[first_tone] > register_to_height[second_tone] < register_to_height[third_tone]:
+                         self.dipping_perms.append(first_tone + second_tone + third_tone)
+                     elif register_to_height[first_tone] < register_to_height[second_tone] > register_to_height[third_tone]:
+                         self.peaking_perms.append(first_tone + second_tone + third_tone)
+
+         if language == "eng" or language == "en-us":
+             self.g2p_lang = "en-us"  # English as spoken in USA
+             self.expand_abbreviations = english_text_expansion
+             self.phonemizer = "espeak"
+
+         elif language == "deu":
+             self.g2p_lang = "de"  # German
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "ell":
+             self.g2p_lang = "el"  # Greek
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "spa":
+             self.g2p_lang = "es"  # Spanish
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "spa-lat":
+             self.g2p_lang = "es-419"  # Spanish
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "fin":
+             self.g2p_lang = "fi"  # Finnish
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "rus":
+             self.g2p_lang = "ru"  # Russian
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "hun":
+             self.g2p_lang = "hu"  # Hungarian
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "nld":
+             self.g2p_lang = "nl"  # Dutch
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "fra":
+             self.g2p_lang = "fr-fr"  # French
+             self.expand_abbreviations = remove_french_spacing
+             self.phonemizer = "espeak"
+
+         elif language == "fr-be":
+             self.g2p_lang = "fr-be"  # French
+             self.expand_abbreviations = remove_french_spacing
+             self.phonemizer = "espeak"
+
+         elif language == "fr-sw":
+             self.g2p_lang = "fr-ch"  # French
+             self.expand_abbreviations = remove_french_spacing
+             self.phonemizer = "espeak"
+
+         elif language == "ita":
+             self.g2p_lang = "it"  # Italian
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "por":
+             self.g2p_lang = "pt"  # Portuguese
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "pt-br":
+             self.g2p_lang = "pt-br"  # Portuguese
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "pol":
+             self.g2p_lang = "pl"  # Polish
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "cmn":
+             self.g2p_lang = "cmn"  # Mandarin
+             self.expand_abbreviations = convert_kanji_to_pinyin_mandarin
+             self.phonemizer = "dragonmapper"
+
+         elif language == "vie":
+             self.g2p_lang = "vi"  # Northern Vietnamese
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "vi-ctr":
+             self.g2p_lang = "vi-vn-x-central"  # Central Vietnamese
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "vi-so":
+             self.g2p_lang = "vi-vn-x-south"  # Southern Vietnamese
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "ukr":
+             self.g2p_lang = "uk"  # Ukrainian
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "pes":
+             self.g2p_lang = "fa"  # Western Farsi
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "afr":
+             self.g2p_lang = "af"  # Afrikaans
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "aln":
+             self.g2p_lang = "sq"  # Albanian
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "amh":
+             self.g2p_lang = "am"  # Amharic
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "arb":
+             self.g2p_lang = "ar"  # Arabic
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "arg":
+             self.g2p_lang = "an"  # Aragonese
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "hye":
+             self.g2p_lang = "hy"  # East Armenian
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "hyw":
+             self.g2p_lang = "hyw"  # West Armenian
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "azj":
+             self.g2p_lang = "az"  # Azerbaijani
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "bak":
+             self.g2p_lang = "ba"  # Bashkir
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "eus":
+             self.g2p_lang = "eu"  # Basque
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "bel":
+             self.g2p_lang = "be"  # Belarusian
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "ben":
+             self.g2p_lang = "bn"  # Bengali
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "bpy":
+             self.g2p_lang = "bpy"  # Bishnupriya Manipuri
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "bos":
+             self.g2p_lang = "bs"  # Bosnian
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "bul":
+             self.g2p_lang = "bg"  # Bulgarian
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "mya":
+             self.g2p_lang = "my"  # Burmese
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "chr":
+             self.g2p_lang = "chr"  # Cherokee
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "yue":
+             self.g2p_lang = "yue"  # Chinese Cantonese
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "hak":
+             self.g2p_lang = "hak"  # Chinese Hakka
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "haw":
+             self.g2p_lang = "haw"  # Hawaiian
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "hrv":
+             self.g2p_lang = "hr"  # Croatian
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "ces":
+             self.g2p_lang = "cs"  # Czech
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "dan":
+             self.g2p_lang = "da"  # Danish
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "ekk":
+             self.g2p_lang = "et"  # Estonian
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "gle":
+             self.g2p_lang = "ga"  # Gaelic Irish
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "gla":
+             self.g2p_lang = "gd"  # Gaelic Scottish
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "en-sc":
+             self.g2p_lang = "en-gb-scotland"
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "kat":
+             self.g2p_lang = "ka"  # Georgian
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "kal":
+             self.g2p_lang = "kl"  # Greenlandic
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "guj":
+             self.g2p_lang = "gu"  # Gujarati
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "heb":
+             self.g2p_lang = "he"  # Hebrew
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "hin":
+             self.g2p_lang = "hi"  # Hindi
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "isl":
+             self.g2p_lang = "is"  # Icelandic
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "ind":
+             self.g2p_lang = "id"  # Indonesian
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "jpn":
+             import pykakasi
+
+             self.kakasi = pykakasi.Kakasi()  # this is not a satisfactory solution, but it is the best one I could come up with so far.
+             self.expand_abbreviations = lambda x: " ".join([chunk["hepburn"] for chunk in self.kakasi.convert(x)])
+             self.g2p_lang = language
+             self.phonemizer = "transphone"
+             self.transphone = read_g2p(device=device)
+
+         elif language == "kan":
+             self.g2p_lang = "kn"  # Kannada
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "knn":
+             self.g2p_lang = "kok"  # Konkani
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "kor":
+             self.g2p_lang = "ko"  # Korean
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "ckb":
+             self.g2p_lang = "ku"  # Kurdish
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "kaz":
+             self.g2p_lang = "kk"  # Kazakh
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "kir":
+             self.g2p_lang = "ky"  # Kyrgyz
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "lat":
+             self.g2p_lang = "la"  # Latin
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "ltz":
+             self.g2p_lang = "lb"  # Luxembourgish
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "lvs":
+             self.g2p_lang = "lv"  # Latvian
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "lit":
+             self.g2p_lang = "lt"  # Lithuanian
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "mri":
+             self.g2p_lang = "mi"  # Māori
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "mkd":
+             self.g2p_lang = "mk"  # Macedonian
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "zlm":
+             self.g2p_lang = "ms"  # Malay
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "mal":
+             self.g2p_lang = "ml"  # Malayalam
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "mlt":
+             self.g2p_lang = "mt"  # Maltese
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "mar":
+             self.g2p_lang = "mr"  # Marathi
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "nci":
+             self.g2p_lang = "nci"  # Nahuatl
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "npi":
+             self.g2p_lang = "ne"  # Nepali
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "nob":
+             self.g2p_lang = "nb"  # Norwegian Bokmål
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "nog":
+             self.g2p_lang = "nog"  # Nogai
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "ory":
+             self.g2p_lang = "or"  # Oriya
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "gaz":
+             self.g2p_lang = "om"  # Oromo
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "pap":
+             self.g2p_lang = "pap"  # Papiamento
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "pan":
+             self.g2p_lang = "pa"  # Punjabi
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "ron":
+             self.g2p_lang = "ro"  # Romanian
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "lav":
+             self.g2p_lang = "ru-lv"  # Russian Latvia
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "srp":
+             self.g2p_lang = "sr"  # Serbian
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "tsn":
+             self.g2p_lang = "tn"  # Setswana
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "snd":
+             self.g2p_lang = "sd"  # Sindhi
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "slk":
+             self.g2p_lang = "sk"  # Slovak
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "slv":
+             self.g2p_lang = "sl"  # Slovenian
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "smj":
+             self.g2p_lang = "smj"  # Lule Saami
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "swh":
+             self.g2p_lang = "sw"  # Swahili
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "swe":
+             self.g2p_lang = "sv"  # Swedish
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "tam":
+             self.g2p_lang = "ta"  # Tamil
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "tha":
+             self.g2p_lang = "th"  # Thai
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "tuk":
+             self.g2p_lang = "tk"  # Turkmen
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "tat":
+             self.g2p_lang = "tt"  # Tatar
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "tel":
+             self.g2p_lang = "te"  # Telugu
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "tur":
+             self.g2p_lang = "tr"  # Turkish
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "uig":
+             self.g2p_lang = "ug"  # Uyghur
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "urd":
+             self.g2p_lang = "ur"  # Urdu
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "uzn":
+             self.g2p_lang = "uz"  # Uzbek
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         elif language == "cym":
+             self.g2p_lang = "cy"  # Welsh
+             self.expand_abbreviations = lambda x: x
+             self.phonemizer = "espeak"
+
+         else:
+             # blanket solution for the rest
+             print("Using Transphone. A specialized phonemizer might work better.")
+             self.g2p_lang = language
+             self.phonemizer = "transphone"
+             self.expand_abbreviations = lambda x: x
+             self.transphone = read_g2p(device=device)
+
+         # remember to also update get_language_id() below when adding something here, as well as the get_example_sentence function
+
+         if self.phonemizer == "espeak":
+             try:
+                 self.phonemizer_backend = EspeakBackend(language=self.g2p_lang,
+                                                         punctuation_marks=';:,.!?¡¿—…()"«»“”~/。【】、‥،؟“”؛',
+                                                         preserve_punctuation=True,
+                                                         language_switch='remove-flags',
+                                                         with_stress=True,
+                                                         logger=logging.getLogger(__file__))
+             except RuntimeError:
+                 print("Error in loading espeak! \n"
+                       "Maybe espeak is not installed on your system? \n"
+                       "Falling back to transphone.")
+                 from transphone.g2p import read_g2p
+                 self.g2p_lang = self.language
+                 self.phonemizer = "transphone"
+                 self.expand_abbreviations = lambda x: x
+                 self.transphone = read_g2p()
+         self.phone_to_vector = generate_feature_table()
+         self.phone_to_id = get_phone_to_id()
+         self.id_to_phone = {v: k for k, v in self.phone_to_id.items()}
+         self.text_vector_to_phone_cache = dict()
+
+     @staticmethod
+     def get_example_sentence(lang):
+         if lang == "eng":
+             return "This is a complex sentence, it even has a pause!"
+         elif lang == "deu":
+             return "Dies ist ein komplexer Satz, er hat sogar eine Pause!"
+         elif lang == "ell":
+             return "Αυτή είναι μια σύνθετη πρόταση, έχει ακόμη και παύση!"
+         elif lang == "spa":
+             return "Esta es una oración compleja, ¡incluso tiene una pausa!"
+         elif lang == "fin":
+             return "Tämä on monimutkainen lause, sillä on jopa tauko!"
+         elif lang == "rus":
+             return "Это сложное предложение, в нем даже есть пауза!"
+         elif lang == "hun":
+             return "Ez egy összetett mondat, még szünet is van benne!"
+         elif lang == "nld":
+             return "Dit is een complexe zin, er zit zelfs een pauze in!"
+         elif lang == "fra":
+             return "C'est une phrase complexe, elle a même une pause !"
+         elif lang == "por":
+             return "Esta é uma frase complexa, tem até uma pausa!"
+         elif lang == "pol":
+             return "To jest zdanie złożone, ma nawet pauzę!"
+         elif lang == "ita":
+             return "Questa è una frase complessa, ha anche una pausa!"
+         elif lang == "cmn":
+             return "这是一个复杂的句子,它甚至包含一个停顿。"
+         elif lang == "vie":
+             return "Đây là một câu phức tạp, nó thậm chí còn chứa một khoảng dừng."
+         else:
+             print(f"No example sentence specified for the language: {lang}\n "
+                   f"Please specify an example sentence in the get_example_sentence function in Preprocessing/TextFrontend to track your progress.")
+             return None
+
+     def string_to_tensor(self, text, view=False, device="cpu", handle_missing=True, input_phonemes=False):
+         """
+         Fixes unicode errors, expands some abbreviations,
+         turns graphemes into phonemes and then vectorizes
+         the sequence as articulatory features
+         """
+         if input_phonemes:
+             phones = text
+         else:
+             phones = self.get_phone_string(text=text, include_eos_symbol=True, for_feature_extraction=True)
+         phones = phones.replace("ɚ", "əɹ").replace("ᵻ", "ɨ")
+         if view:
+             print("Phonemes: \n{}\n".format(phones))
+         phones_vector = list()
+         # turn into numeric vectors
+         stressed_flag = False
+
+         for char in phones:
+             # affects following phoneme -----------------
+             if char.strip() == '\u02C8':
+                 # primary stress
+                 stressed_flag = True
+             # affects previous phoneme -----------------
+             elif char.strip() == '\u02D0':
+                 # lengthened
+                 phones_vector[-1][get_feature_to_index_lookup()["lengthened"]] = 1
+             elif char.strip() == '\u02D1':
+                 # half length
+                 phones_vector[-1][get_feature_to_index_lookup()["half-length"]] = 1
+             elif char.strip() == '\u0306':
+                 # shortened
+                 phones_vector[-1][get_feature_to_index_lookup()["shortened"]] = 1
+             elif char.strip() == '̃' and phones_vector[-1][get_feature_to_index_lookup()["nasal"]] != 1:
+                 # nasalized (vowel)
+                 phones_vector[-1][get_feature_to_index_lookup()["nasal"]] = 2
+             elif char.strip() == "̧" and phones_vector[-1][get_feature_to_index_lookup()["palatal"]] != 1:
+                 # palatalized
+                 phones_vector[-1][get_feature_to_index_lookup()["palatal"]] = 2
+             elif char.strip() == "ʷ" and phones_vector[-1][get_feature_to_index_lookup()["labial-velar"]] != 1:
+                 # labialized
+                 phones_vector[-1][get_feature_to_index_lookup()["labial-velar"]] = 2
+             elif char.strip() == "ʰ" and phones_vector[-1][get_feature_to_index_lookup()["aspirated"]] != 1:
+                 # aspirated
+                 phones_vector[-1][get_feature_to_index_lookup()["aspirated"]] = 2
+             elif char.strip() == "ˠ" and phones_vector[-1][get_feature_to_index_lookup()["velar"]] != 1:
+                 # velarized
+                 phones_vector[-1][get_feature_to_index_lookup()["velar"]] = 2
+             elif char.strip() == "ˁ" and phones_vector[-1][get_feature_to_index_lookup()["pharyngal"]] != 1:
+                 # pharyngealized
+                 phones_vector[-1][get_feature_to_index_lookup()["pharyngal"]] = 2
+             elif char.strip() == "ˀ" and phones_vector[-1][get_feature_to_index_lookup()["glottal"]] != 1:
+                 # glottalized
+                 phones_vector[-1][get_feature_to_index_lookup()["glottal"]] = 2
+             elif char.strip() == "ʼ" and phones_vector[-1][get_feature_to_index_lookup()["ejective"]] != 1:
+                 # ejective
+                 phones_vector[-1][get_feature_to_index_lookup()["ejective"]] = 2
+             elif char.strip() == "̹" and phones_vector[-1][get_feature_to_index_lookup()["rounded"]] != 1:
+                 # rounding
+                 phones_vector[-1][get_feature_to_index_lookup()["rounded"]] = 2
+             elif char.strip() == "̞" and phones_vector[-1][get_feature_to_index_lookup()["open"]] != 1:
+                 # open
+                 phones_vector[-1][get_feature_to_index_lookup()["open"]] = 2
+             elif char.strip() == "̪" and phones_vector[-1][get_feature_to_index_lookup()["dental"]] != 1:
+                 # dental
+                 phones_vector[-1][get_feature_to_index_lookup()["dental"]] = 2
+             elif char.strip() == "̬" and phones_vector[-1][get_feature_to_index_lookup()["voiced"]] != 1:
+                 # voiced
+                 phones_vector[-1][get_feature_to_index_lookup()["voiced"]] = 2
+             elif char.strip() == "̝" and phones_vector[-1][get_feature_to_index_lookup()["close"]] != 1:
+                 # closed
+                 phones_vector[-1][get_feature_to_index_lookup()["close"]] = 2
+             elif char.strip() == "̰" and phones_vector[-1][get_feature_to_index_lookup()["glottal"]] != 1 and phones_vector[-1][get_feature_to_index_lookup()["epiglottal"]] != 1:
+                 # laryngalization
+                 phones_vector[-1][get_feature_to_index_lookup()["glottal"]] = 2
+                 phones_vector[-1][get_feature_to_index_lookup()["epiglottal"]] = 2
+             elif char.strip() == "̈" and phones_vector[-1][get_feature_to_index_lookup()["central"]] != 1:
+                 # centralization
+                 phones_vector[-1][get_feature_to_index_lookup()["central"]] = 2
+             elif char.strip() == "̜" and phones_vector[-1][get_feature_to_index_lookup()["unrounded"]] != 1:
+                 # unrounded
+                 phones_vector[-1][get_feature_to_index_lookup()["unrounded"]] = 2
+             elif char.strip() == "̥" and phones_vector[-1][get_feature_to_index_lookup()["unvoiced"]] != 1:
+                 # voiceless
+                 phones_vector[-1][get_feature_to_index_lookup()["unvoiced"]] = 2
+             elif char.strip() == "˥":
+                 # very high tone
+                 phones_vector[-1][get_feature_to_index_lookup()["very-high-tone"]] = 1
+             elif char.strip() == "˦":
+                 # high tone
+                 phones_vector[-1][get_feature_to_index_lookup()["high-tone"]] = 1
+             elif char.strip() == "˧":
+                 # mid tone
+                 phones_vector[-1][get_feature_to_index_lookup()["mid-tone"]] = 1
+             elif char.strip() == "˨":
+                 # low tone
+                 phones_vector[-1][get_feature_to_index_lookup()["low-tone"]] = 1
+             elif char.strip() == "˩":
+                 # very low tone
+                 phones_vector[-1][get_feature_to_index_lookup()["very-low-tone"]] = 1
+             elif char.strip() == "⭧":
+                 # rising tone
+                 phones_vector[-1][get_feature_to_index_lookup()["rising-tone"]] = 1
+             elif char.strip() == "⭨":
+                 # falling tone
+                 phones_vector[-1][get_feature_to_index_lookup()["falling-tone"]] = 1
+             elif char.strip() == "⮁":
+                 # peaking tone
+                 phones_vector[-1][get_feature_to_index_lookup()["peaking-tone"]] = 1
+             elif char.strip() == "⮃":
+                 # dipping tone
+                 phones_vector[-1][get_feature_to_index_lookup()["dipping-tone"]] = 1
+             else:
+                 if handle_missing:
+                     try:
+                         phones_vector.append(self.phone_to_vector[char].copy())
+                     except KeyError:
+                         print("unknown phoneme: {}".format(char))
+                 else:
+                     phones_vector.append(self.phone_to_vector[char].copy())  # leave error handling to elsewhere
+                 # the following lines try to emulate whispering by removing all voiced features
+                 # phones_vector[-1][get_feature_to_index_lookup()["voiced"]] = 0
+                 # phones_vector[-1][get_feature_to_index_lookup()["unvoiced"]] = 1
+                 # the following lines explore what would happen, if the system is told to produce sounds a human cannot
+                 # for dim, _ in enumerate(phones_vector[-1]):
+                 #     phones_vector[-1][dim] = 1
+                 if stressed_flag:
+                     stressed_flag = False
+                     phones_vector[-1][get_feature_to_index_lookup()["stressed"]] = 1
+
+         return torch.Tensor(phones_vector, device=device)
+
+     def get_phone_string(self, text, include_eos_symbol=True, for_feature_extraction=False, for_plot_labels=False):
+         if text == "":
+             return ""
+         # expand abbreviations
+         utt = self.expand_abbreviations(text)
+
+         # convert the graphemes to phonemes here
+         if self.phonemizer == "espeak":
+             try:
+                 phones = self.phonemizer_backend.phonemize([utt], strip=True)[0]  # To use a different phonemizer, this is the only line that needs to be exchanged
+             except Exception:
+                 print(f"There was an error with espeak. \nFalling back to transphone.\nSentence: {utt} \nLanguage {self.g2p_lang}")
+                 from transphone.g2p import read_g2p
+                 self.g2p_lang = self.language
+                 self.phonemizer = "transphone"
+                 self.expand_abbreviations = lambda x: x
+                 self.transphone = read_g2p()
+                 return self.get_phone_string(text, include_eos_symbol, for_feature_extraction, for_plot_labels)
+         elif self.phonemizer == "transphone":
+             replacements = [
+                 # punctuation in languages with non-latin script
+                 ("。", "~"),
+                 (",", "~"),
+                 ("【", '~'),
+                 ("】", '~'),
+                 ("、", "~"),
+                 ("‥", "~"),
+                 ("؟", "~"),
+                 ("،", "~"),
+                 ("“", '~'),
+                 ("”", '~'),
+                 ("؛", "~"),
+                 ("《", '~'),
+                 ("》", '~'),
+                 ("?", "~"),
+                 ("!", "~"),
+                 (" :", "~"),
+                 (" ;", "~"),
+                 ("-", "~"),
+                 ("·", " "),
+                 ("`", ""),
+                 # symbols that indicate a pause or silence
+                 ('"', "~"),
+                 (" - ", "~ "),
+                 ("- ", "~ "),
+                 ("-", ""),
+                 ("…", "~"),
+                 (":", "~"),
+                 (";", "~"),
+                 (",", "~")  # make sure this remains the final one when adding new ones
+             ]
+             for replacement in replacements:
+                 utt = utt.replace(replacement[0], replacement[1])
+             utt = re.sub("~+", "~", utt)
+             utt = re.sub(r"\s+", " ", utt)
+             utt = re.sub(r"\.+", ".", utt)
+             chunk_list = list()
+             for chunk in utt.split("~"):
+                 # unfortunately, the transphone tokenizer does not seem suited for any language besides English.
+                 # this is not much better, but maybe a little.
+                 word_list = list()
+                 for word_by_whitespace in chunk.split():
+                     word_list.append(self.transphone.inference(word_by_whitespace, self.g2p_lang))
+                 chunk_list.append(" ".join(["".join(word) for word in word_list]))
+             phones = "~ ".join(chunk_list)
+         elif self.phonemizer == "dragonmapper":
+             phones = pinyin_to_ipa(utt)
+
+         # Unfortunately, tonal languages don't agree on tone notation: most tonal
+         # languages use different tones, denoted by different numbering
+         # systems. At this point in the script, we attempt to unify
+         # them all to the tones in the IPA standard.
+         if self.g2p_lang == "vi":
+             phones = phones.replace('1', "˧")
+             phones = phones.replace('2', "˨˩")
+             phones = phones.replace('ɜ', "˧˥")  # I'm fairly certain that this is a bug in espeak and ɜ is meant to be 3
+             phones = phones.replace('3', "˧˥")  # I'm fairly certain that this is a bug in espeak and ɜ is meant to be 3
+             phones = phones.replace('4', "˦˧˥")
+             phones = phones.replace('5', "˧˩˧")
+             phones = phones.replace('6', "˧˩˨ʔ")  # very weird tone, because the tone introduces another phoneme
+             phones = phones.replace('7', "˧")
+         elif self.g2p_lang == "yue":
+             phones = phones.replace('1', "˥")
+             phones = phones.replace('2', "˧˥")
+             phones = phones.replace('3', "˧")
+             phones = phones.replace('4', "˧˩")
+             phones = phones.replace('5', "˩˧")
+             phones = phones.replace('6', "˨")
+         # more of this handling for more tonal languages can be added here, simply make an elif statement and check for the language.
+         return self.postprocess_phoneme_string(phones, for_feature_extraction, include_eos_symbol, for_plot_labels)
+
+     def postprocess_phoneme_string(self, phoneme_string, for_feature_extraction, include_eos_symbol, for_plot_labels):
+         """
+         Takes a phoneme string as input and processes it to work best with the way we represent phonemes as feature vectors
+         """
+         replacements = [
+             # punctuation in languages with non-latin script
+             ("。", "."),
+             (",", ","),
+             ("【", '"'),
+             ("】", '"'),
+             ("、", ","),
+             ("‥", "…"),
+             ("؟", "?"),
+             ("،", ","),
+             ("“", '"'),
+             ("”", '"'),
+             ("؛", ","),
+             ("《", '"'),
+             ("》", '"'),
+             ("?", "?"),
+             ("!", "!"),
+             (" :", ":"),
+             (" ;", ";"),
+             ("-", "-"),
+             ("·", " "),
+             # latin script punctuation
+             ("/", " "),
+             ("—", ""),
+             ("(", "~"),
+             (")", "~"),
+             ("...", "…"),
+             ("\n", ", "),
+             ("\t", " "),
+             ("¡", ""),
+             ("¿", ""),
+             ("«", '"'),
+             ("»", '"'),
+             # unifying some phoneme representations
+             ("N", "ŋ"),  # somehow transphone doesn't transform this to IPA
+             ("ɫ", "l"),  # alveolopalatal
+             ("ɚ", "ə"),
+             ("g", "ɡ"),
+             ("ε", "e"),
+             ("ʦ", "ts"),
+             ("ˤ", "ˁ"),
+             ('ᵻ', 'ɨ'),
+             ("ɧ", "ç"),  # velopalatal
+             ("ɥ", "j"),  # labiopalatal
+             ("ɬ", "s"),  # lateral
+             ("ɮ", "z"),  # lateral
+             ('ɺ', 'ɾ'),  # lateral
+             ('ʲ', 'j'),  # decomposed palatalization
+             ('\u02CC', ""),  # secondary stress
+             ('\u030B', "˥"),
+             ('\u0301', "˦"),
+             ('\u0304', "˧"),
+             ('\u0300', "˨"),
+             ('\u030F', "˩"),
+             ('\u0302', "⭨"),
+             ('\u030C', "⭧"),
+             ("꜖", "˩"),
+             ("꜕", "˨"),
+             ("꜔", "˧"),
+             ("꜓", "˦"),
+             ("꜒", "˥"),
+             # symbols that indicate a pause or silence
+             ('"', "~"),
+             (" - ", "~ "),
+             ("- ", "~ "),
+             ("-", ""),
+             ("…", "."),
+             (":", "~"),
+             (";", "~"),
+             (",", "~")  # make sure this remains the final one when adding new ones
+         ]
+         unsupported_ipa_characters = {'̙', '̯', '̤', '̩', '̠', '̟', 'ꜜ', '̽', '|', '•', '↘',
+                                       '‖', '‿', 'ᷝ', 'ᷠ', '̚', '↗', 'ꜛ', '̻', '̘', '͡', '̺'}
+         # https://en.wikipedia.org/wiki/IPA_number
+         for char in unsupported_ipa_characters:
+             replacements.append((char, ""))
+
+         if not for_feature_extraction:
+             # in case we want to plot etc., we only need the segmental units, so we remove everything else.
+             replacements = replacements + [
+                 ('\u02C8', ""),  # primary stress
+                 ('\u02D0', ""),  # lengthened
+                 ('\u02D1', ""),  # half-length
+                 ('\u0306', ""),  # shortened
+                 ("˥", ""),  # very high tone
+                 ("˦", ""),  # high tone
+                 ("˧", ""),  # mid tone
+                 ("˨", ""),  # low tone
+                 ("˩", ""),  # very low tone
+                 ('\u030C', ""),  # rising tone
+                 ('\u0302', ""),  # falling tone
+                 ('⭧', ""),  # rising
+                 ('⭨', ""),  # falling
+                 ('⮃', ""),  # dipping
+                 ('⮁', ""),  # peaking
+                 ('̃', ""),  # nasalizing
+                 ("̧", ""),  # palatalized
+                 ("ʷ", ""),  # labialized
+                 ("ʰ", ""),  # aspirated
+                 ("ˠ", ""),  # velarized
+                 ("ˁ", ""),  # pharyngealized
+                 ("ˀ", ""),  # glottalized
+                 ("ʼ", ""),  # ejective
+                 ("̹", ""),  # rounding
+                 ("̞", ""),  # open
+                 ("̪", ""),  # dental
+                 ("̬", ""),  # voiced
+                 ("̝", ""),  # closed
+                 ("̰", ""),  # laryngalization
+                 ("̈", ""),  # centralization
+                 ("̜", ""),  # unrounded
+                 ("̥", ""),  # voiceless
+             ]
+         for replacement in replacements:
+             phoneme_string = phoneme_string.replace(replacement[0], replacement[1])
+         phones = re.sub("~+", "~", phoneme_string)
+         phones = re.sub(r"\s+", " ", phones)
+         phones = re.sub(r"\.+", ".", phones)
+         phones = phones.lstrip("~").rstrip("~")
+
+         # peaking tones
+         for peaking_perm in self.peaking_perms:
+             phones = phones.replace(peaking_perm, "⮁".join(peaking_perm))
+         # dipping tones
+         for dipping_perm in self.dipping_perms:
+             phones = phones.replace(dipping_perm, "⮃".join(dipping_perm))
+         # rising tones
+         for rising_perm in self.rising_perms:
+             phones = phones.replace(rising_perm, "⭧".join(rising_perm))
+         # falling tones
+         for falling_perm in self.falling_perms:
+             phones = phones.replace(falling_perm, "⭨".join(falling_perm))
+
+         if self.add_silence_to_end:
+             phones += "~"  # adding a silence in the end during inference produces more natural sounding prosody
+         if include_eos_symbol:
+             phones += "#"
+         if not self.use_word_boundaries:
+             phones = phones.replace(" ", "")
+         if for_plot_labels:
+             phones = phones.replace(" ", "|")
+
+         phones = "~" + phones
+         phones = re.sub("~+", "~", phones)
+
+         return phones
+
+     def text_vectors_to_id_sequence(self, text_vector):
+         tokens = list()
+         for vector in text_vector:
+             if vector[get_feature_to_index_lookup()["word-boundary"]] == 0:
+                 # we don't include word boundaries when performing alignment, since they are not always present in audio.
+                 features = vector.cpu().numpy().tolist()
+                 immutable_vector = tuple(features)
+                 if immutable_vector in self.text_vector_to_phone_cache:
+                     tokens.append(self.phone_to_id[self.text_vector_to_phone_cache[immutable_vector]])
+                     continue
+                 features = features[13:]
+                 # the first 13 dimensions are for modifiers, so we ignore those when trying to find the phoneme in the ID lookup
+                 for index in range(len(features)):
+                     if features[index] == 2:
+                         # we remove all features that stem from a modifier, so we can map back to the unmodified sound
+                         features[index] = 0
+                 for phone in self.phone_to_vector:
+                     if features == self.phone_to_vector[phone][13:]:
+                         tokens.append(self.phone_to_id[phone])
+                         self.text_vector_to_phone_cache[immutable_vector] = phone
+                         # this is terribly inefficient, but it's fine, since we're building a cache over time that makes this instant
+                         break
+         return tokens
+
+
+ def english_text_expansion(text):
+     """
+     Apply a small part of the Tacotron-style text cleaning pipeline, suitable for e.g. LJSpeech.
+     See https://github.com/keithito/tacotron/
+     Careful: Only apply to English datasets. Different languages need different cleaners.
+     """
+     # the pattern below appends the escaped period itself, so the abbreviations are listed without it
+     _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in
+                       [('Mrs', 'misess'), ('Mr', 'mister'), ('Dr', 'doctor'), ('St', 'saint'), ('Co', 'company'), ('Jr', 'junior'), ('Maj', 'major'),
+                        ('Gen', 'general'), ('Drs', 'doctors'), ('Rev', 'reverend'), ('Lt', 'lieutenant'), ('Hon', 'honorable'), ('Sgt', 'sergeant'),
+                        ('Capt', 'captain'), ('Esq', 'esquire'), ('Ltd', 'limited'), ('Col', 'colonel'), ('Ft', 'fort')]]
+     for regex, replacement in _abbreviations:
+         text = re.sub(regex, replacement, text)
+     return text
+
+
+ def remove_french_spacing(text):
+     text = text.replace(" »", '"').replace("« ", '"')
+     for punc in ["!", ";", ":", ".", ",", "?", "-"]:
+         text = text.replace(f" {punc}", punc)
+     return text
+
+
+ def convert_kanji_to_pinyin_mandarin(text):
+     return " ".join([x[0] for x in pinyin(text)])
+
+
+ def get_language_id(language, iso_lookup_path="./iso_lookup.json"):
+     iso_codes_to_ids = load_json_from_path(iso_lookup_path)[-1]
+     if language not in iso_codes_to_ids:
+         print("Please specify the language as an ISO 639-3 code (https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes)")
+         return None
+     return torch.LongTensor([iso_codes_to_ids[language]])
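
As a quick orientation for reviewers, here is a minimal usage sketch of the frontend defined above (not part of the uploaded files; it assumes espeak-ng and the imported Python packages are installed, and get_language_id additionally assumes an iso_lookup.json is available in the working directory):

    from ArticulatoryTextFrontend import ArticulatoryTextFrontend, get_language_id

    tf = ArticulatoryTextFrontend(language="eng")
    text = ArticulatoryTextFrontend.get_example_sentence("eng")

    # graphemes -> IPA string, with silence (~) and end-of-sentence (#) marks
    print(tf.get_phone_string(text, include_eos_symbol=True))

    # graphemes -> articulatory feature matrix, one row per phone or symbol
    features = tf.string_to_tensor(text)

    # feature vectors -> phone ID sequence (word boundaries are skipped)
    print(tf.text_vectors_to_id_sequence(features))

    # ISO 639-3 code -> LongTensor holding the numeric ID for the language
    print(get_language_id("eng"))
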
articulatory_features.py ADDED
@@ -0,0 +1,966 @@
+ # -*- coding: utf-8 -*-
+
+
+ # partly derived from an open-source resource provided by Papercup Technologies Limited
+ # Resource-Author: Marlene Staib
+ # Modified by Florian Lux, 2021
+ # Further modified by Florian Lux, 2022
+
+
+ """
+ All phonemes in the IPA standard are supported.
+
+ Zero-width characters are generally not supported, and
+ neither are some other modifiers. Tone, stress and
+ lengthening are represented with placeholder dimensions;
+ however, they need to be set manually, since this
+ conversion from phonemes to features works on a
+ character-by-character basis. In a few cases, the place
+ of articulation is approximated, because only one phoneme
+ had such a combination, which does not warrant a new
+ dimension.
+ """
+
+
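+ # Each entry below maps one symbol to a dictionary of articulatory descriptors.
+ # ArticulatoryTextFrontend.py imports generate_feature_table(),
+ # get_feature_to_index_lookup() and get_phone_to_id() from this module;
+ # presumably those functions, defined further down in this file, turn these
+ # descriptors into the fixed-length feature vectors and ID lookups used there.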
25
+ def generate_feature_lookup():
26
+ return {
27
+ '~': {'symbol_type': 'silence'},
28
+ '#': {'symbol_type': 'end of sentence'},
29
+ '?': {'symbol_type': 'questionmark'},
30
+ '!': {'symbol_type': 'exclamationmark'},
31
+ '.': {'symbol_type': 'fullstop'},
32
+ ' ': {'symbol_type': 'word-boundary'},
33
+ 'ɜ': {
34
+ 'symbol_type' : 'phoneme',
35
+ 'vowel_consonant' : 'vowel',
36
+ 'VUV' : 'voiced',
37
+ 'vowel_frontness' : 'central',
38
+ 'vowel_openness' : 'open-mid',
39
+ 'vowel_roundedness': 'unrounded',
40
+ },
41
+ 'ə': {
42
+ 'symbol_type' : 'phoneme',
43
+ 'vowel_consonant' : 'vowel',
44
+ 'VUV' : 'voiced',
45
+ 'vowel_frontness' : 'central',
46
+ 'vowel_openness' : 'mid',
47
+ 'vowel_roundedness': 'unrounded',
48
+ },
49
+ 'a': {
50
+ 'symbol_type' : 'phoneme',
51
+ 'vowel_consonant' : 'vowel',
52
+ 'VUV' : 'voiced',
53
+ 'vowel_frontness' : 'front',
54
+ 'vowel_openness' : 'open',
55
+ 'vowel_roundedness': 'unrounded',
56
+ },
57
+ 'ð': {
58
+ 'symbol_type' : 'phoneme',
59
+ 'vowel_consonant' : 'consonant',
60
+ 'VUV' : 'voiced',
61
+ 'consonant_place' : 'dental',
62
+ 'consonant_manner': 'fricative'
63
+ },
64
+ 'ɛ': {
65
+ 'symbol_type' : 'phoneme',
66
+ 'vowel_consonant' : 'vowel',
67
+ 'VUV' : 'voiced',
68
+ 'vowel_frontness' : 'front',
69
+ 'vowel_openness' : 'open-mid',
70
+ 'vowel_roundedness': 'unrounded',
71
+ },
72
+ 'ɪ': {
73
+ 'symbol_type' : 'phoneme',
74
+ 'vowel_consonant' : 'vowel',
75
+ 'VUV' : 'voiced',
76
+ 'vowel_frontness' : 'front_central',
77
+ 'vowel_openness' : 'close_close-mid',
78
+ 'vowel_roundedness': 'unrounded',
79
+ },
80
+ 'ŋ': {
81
+ 'symbol_type' : 'phoneme',
82
+ 'vowel_consonant' : 'consonant',
83
+ 'VUV' : 'voiced',
84
+ 'consonant_place' : 'velar',
85
+ 'consonant_manner': 'nasal'
86
+ },
87
+ 'ɔ': {
88
+ 'symbol_type' : 'phoneme',
89
+ 'vowel_consonant' : 'vowel',
90
+ 'VUV' : 'voiced',
91
+ 'vowel_frontness' : 'back',
92
+ 'vowel_openness' : 'open-mid',
93
+ 'vowel_roundedness': 'rounded',
94
+ },
95
+ 'ɒ': {
96
+ 'symbol_type' : 'phoneme',
97
+ 'vowel_consonant' : 'vowel',
98
+ 'VUV' : 'voiced',
99
+ 'vowel_frontness' : 'back',
100
+ 'vowel_openness' : 'open',
101
+ 'vowel_roundedness': 'rounded',
102
+ },
103
+ 'ɾ': {
104
+ 'symbol_type' : 'phoneme',
105
+ 'vowel_consonant' : 'consonant',
106
+ 'VUV' : 'voiced',
107
+ 'consonant_place' : 'alveolar',
108
+ 'consonant_manner': 'flap'
109
+ },
110
+ 'ʃ': {
111
+ 'symbol_type' : 'phoneme',
112
+ 'vowel_consonant' : 'consonant',
113
+ 'VUV' : 'unvoiced',
114
+ 'consonant_place' : 'postalveolar',
115
+ 'consonant_manner': 'fricative'
116
+ },
117
+ 'θ': {
118
+ 'symbol_type' : 'phoneme',
119
+ 'vowel_consonant' : 'consonant',
120
+ 'VUV' : 'unvoiced',
121
+ 'consonant_place' : 'dental',
122
+ 'consonant_manner': 'fricative'
123
+ },
124
+ 'ʊ': {
125
+ 'symbol_type' : 'phoneme',
126
+ 'vowel_consonant' : 'vowel',
127
+ 'VUV' : 'voiced',
128
+ 'vowel_frontness' : 'central_back',
129
+ 'vowel_openness' : 'close_close-mid',
130
+ 'vowel_roundedness': 'unrounded'
131
+ },
132
+ 'ʌ': {
133
+ 'symbol_type' : 'phoneme',
134
+ 'vowel_consonant' : 'vowel',
135
+ 'VUV' : 'voiced',
136
+ 'vowel_frontness' : 'back',
137
+ 'vowel_openness' : 'open-mid',
138
+ 'vowel_roundedness': 'unrounded'
139
+ },
140
+ 'ʒ': {
141
+ 'symbol_type' : 'phoneme',
142
+ 'vowel_consonant' : 'consonant',
143
+ 'VUV' : 'voiced',
144
+ 'consonant_place' : 'postalveolar',
145
+ 'consonant_manner': 'fricative'
146
+ },
147
+ 'æ': {
148
+ 'symbol_type' : 'phoneme',
149
+ 'vowel_consonant' : 'vowel',
150
+ 'VUV' : 'voiced',
151
+ 'vowel_frontness' : 'front',
152
+ 'vowel_openness' : 'open-mid_open',
153
+ 'vowel_roundedness': 'unrounded'
154
+ },
155
+ 'b': {
156
+ 'symbol_type' : 'phoneme',
157
+ 'vowel_consonant' : 'consonant',
158
+ 'VUV' : 'voiced',
159
+ 'consonant_place' : 'bilabial',
160
+ 'consonant_manner': 'plosive'
161
+ },
162
+ 'ʔ': {
163
+ 'symbol_type' : 'phoneme',
164
+ 'vowel_consonant' : 'consonant',
165
+ 'VUV' : 'unvoiced',
166
+ 'consonant_place' : 'glottal',
167
+ 'consonant_manner': 'plosive'
168
+ },
169
+ 'd': {
170
+ 'symbol_type' : 'phoneme',
171
+ 'vowel_consonant' : 'consonant',
172
+ 'VUV' : 'voiced',
173
+ 'consonant_place' : 'alveolar',
174
+ 'consonant_manner': 'plosive'
175
+ },
176
+ 'e': {
177
+ 'symbol_type' : 'phoneme',
178
+ 'vowel_consonant' : 'vowel',
179
+ 'VUV' : 'voiced',
180
+ 'vowel_frontness' : 'front',
181
+ 'vowel_openness' : 'close-mid',
182
+ 'vowel_roundedness': 'unrounded'
183
+ },
184
+ 'f': {
185
+ 'symbol_type' : 'phoneme',
186
+ 'vowel_consonant' : 'consonant',
187
+ 'VUV' : 'unvoiced',
188
+ 'consonant_place' : 'labiodental',
189
+ 'consonant_manner': 'fricative'
190
+ },
191
+ 'ɡ': {
192
+ 'symbol_type' : 'phoneme',
193
+ 'vowel_consonant' : 'consonant',
194
+ 'VUV' : 'voiced',
195
+ 'consonant_place' : 'velar',
196
+ 'consonant_manner': 'plosive'
197
+ },
198
+ 'h': {
199
+ 'symbol_type' : 'phoneme',
200
+ 'vowel_consonant' : 'consonant',
201
+ 'VUV' : 'unvoiced',
202
+ 'consonant_place' : 'glottal',
203
+ 'consonant_manner': 'fricative'
204
+ },
205
+ 'i': {
206
+ 'symbol_type' : 'phoneme',
207
+ 'vowel_consonant' : 'vowel',
208
+ 'VUV' : 'voiced',
209
+ 'vowel_frontness' : 'front',
210
+ 'vowel_openness' : 'close',
211
+ 'vowel_roundedness': 'unrounded'
212
+ },
213
+ 'j': {
214
+ 'symbol_type' : 'phoneme',
215
+ 'vowel_consonant' : 'consonant',
216
+ 'VUV' : 'voiced',
217
+ 'consonant_place' : 'palatal',
218
+ 'consonant_manner': 'approximant'
219
+ },
220
+ 'k': {
221
+ 'symbol_type' : 'phoneme',
222
+ 'vowel_consonant' : 'consonant',
223
+ 'VUV' : 'unvoiced',
224
+ 'consonant_place' : 'velar',
225
+ 'consonant_manner': 'plosive'
226
+ },
227
+ 'l': {
228
+ 'symbol_type' : 'phoneme',
229
+ 'vowel_consonant' : 'consonant',
230
+ 'VUV' : 'voiced',
231
+ 'consonant_place' : 'alveolar',
232
+ 'consonant_manner': 'lateral-approximant'
233
+ },
234
+ 'm': {
235
+ 'symbol_type' : 'phoneme',
236
+ 'vowel_consonant' : 'consonant',
237
+ 'VUV' : 'voiced',
238
+ 'consonant_place' : 'bilabial',
239
+ 'consonant_manner': 'nasal'
240
+ },
241
+ 'n': {
242
+ 'symbol_type' : 'phoneme',
243
+ 'vowel_consonant' : 'consonant',
244
+ 'VUV' : 'voiced',
245
+ 'consonant_place' : 'alveolar',
246
+ 'consonant_manner': 'nasal'
247
+ },
248
+ 'ɳ': {
249
+ 'symbol_type' : 'phoneme',
250
+ 'vowel_consonant' : 'consonant',
251
+ 'VUV' : 'voiced',
252
+ 'consonant_place' : 'retroflex',
253
+ 'consonant_manner': 'nasal'
254
+ },
255
+ 'o': {
256
+ 'symbol_type' : 'phoneme',
257
+ 'vowel_consonant' : 'vowel',
258
+ 'VUV' : 'voiced',
259
+ 'vowel_frontness' : 'back',
260
+ 'vowel_openness' : 'close-mid',
261
+ 'vowel_roundedness': 'rounded'
262
+ },
263
+ 'p': {
264
+ 'symbol_type' : 'phoneme',
265
+ 'vowel_consonant' : 'consonant',
266
+ 'VUV' : 'unvoiced',
267
+ 'consonant_place' : 'bilabial',
268
+ 'consonant_manner': 'plosive'
269
+ },
270
+ 'ɹ': {
271
+ 'symbol_type' : 'phoneme',
272
+ 'vowel_consonant' : 'consonant',
273
+ 'VUV' : 'voiced',
274
+ 'consonant_place' : 'alveolar',
275
+ 'consonant_manner': 'approximant'
276
+ },
277
+ 'r': {
278
+ 'symbol_type' : 'phoneme',
279
+ 'vowel_consonant' : 'consonant',
280
+ 'VUV' : 'voiced',
281
+ 'consonant_place' : 'alveolar',
282
+ 'consonant_manner': 'trill'
283
+ },
284
+ 's': {
285
+ 'symbol_type' : 'phoneme',
286
+ 'vowel_consonant' : 'consonant',
287
+ 'VUV' : 'unvoiced',
288
+ 'consonant_place' : 'alveolar',
289
+ 'consonant_manner': 'fricative'
290
+ },
291
+ 't': {
292
+ 'symbol_type' : 'phoneme',
293
+ 'vowel_consonant' : 'consonant',
294
+ 'VUV' : 'unvoiced',
295
+ 'consonant_place' : 'alveolar',
296
+ 'consonant_manner': 'plosive'
297
+ },
298
+ 'u': {
299
+ 'symbol_type' : 'phoneme',
300
+ 'vowel_consonant' : 'vowel',
301
+ 'VUV' : 'voiced',
302
+ 'vowel_frontness' : 'back',
303
+ 'vowel_openness' : 'close',
304
+ 'vowel_roundedness': 'rounded',
305
+ },
306
+ 'v': {
307
+ 'symbol_type' : 'phoneme',
308
+ 'vowel_consonant' : 'consonant',
309
+ 'VUV' : 'voiced',
310
+ 'consonant_place' : 'labiodental',
311
+ 'consonant_manner': 'fricative'
312
+ },
313
+ 'w': {
314
+ 'symbol_type' : 'phoneme',
315
+ 'vowel_consonant' : 'consonant',
316
+ 'VUV' : 'voiced',
317
+ 'consonant_place' : 'labial-velar',
318
+ 'consonant_manner': 'approximant'
319
+ },
320
+ 'x': {
321
+ 'symbol_type' : 'phoneme',
322
+ 'vowel_consonant' : 'consonant',
323
+ 'VUV' : 'unvoiced',
324
+ 'consonant_place' : 'velar',
325
+ 'consonant_manner': 'fricative'
326
+ },
327
+ 'z': {
328
+ 'symbol_type' : 'phoneme',
329
+ 'vowel_consonant' : 'consonant',
330
+ 'VUV' : 'voiced',
331
+ 'consonant_place' : 'alveolar',
332
+ 'consonant_manner': 'fricative'
333
+ },
334
+ 'ʀ': {
335
+ 'symbol_type' : 'phoneme',
336
+ 'vowel_consonant' : 'consonant',
337
+ 'VUV' : 'voiced',
338
+ 'consonant_place' : 'uvular',
339
+ 'consonant_manner': 'trill'
340
+ },
341
+ 'ø': {
342
+ 'symbol_type' : 'phoneme',
343
+ 'vowel_consonant' : 'vowel',
344
+ 'VUV' : 'voiced',
345
+ 'vowel_frontness' : 'front',
346
+ 'vowel_openness' : 'close-mid',
+ 'vowel_roundedness': 'rounded'
+ },
+ 'ç': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'unvoiced',
+ 'consonant_place' : 'palatal',
+ 'consonant_manner': 'fricative'
+ },
+ 'ɐ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'vowel',
+ 'VUV' : 'voiced',
+ 'vowel_frontness' : 'central',
+ 'vowel_openness' : 'open',
+ 'vowel_roundedness': 'unrounded'
+ },
+ 'œ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'vowel',
+ 'VUV' : 'voiced',
+ 'vowel_frontness' : 'front',
+ 'vowel_openness' : 'open-mid',
+ 'vowel_roundedness': 'rounded'
+ },
+ 'y': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'vowel',
+ 'VUV' : 'voiced',
+ 'vowel_frontness' : 'front',
+ 'vowel_openness' : 'close',
+ 'vowel_roundedness': 'rounded'
+ },
+ 'ʏ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'vowel',
+ 'VUV' : 'voiced',
+ 'vowel_frontness' : 'front_central',
+ 'vowel_openness' : 'close_close-mid',
+ 'vowel_roundedness': 'rounded'
+ },
+ 'ɑ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'vowel',
+ 'VUV' : 'voiced',
+ 'vowel_frontness' : 'back',
+ 'vowel_openness' : 'open',
+ 'vowel_roundedness': 'unrounded'
+ },
+ 'c': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'unvoiced',
+ 'consonant_place' : 'palatal',
+ 'consonant_manner': 'plosive'
+ },
+ 'ɲ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'voiced',
+ 'consonant_place' : 'palatal',
+ 'consonant_manner': 'nasal'
+ },
+ 'ɣ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'voiced',
+ 'consonant_place' : 'velar',
+ 'consonant_manner': 'fricative'
+ },
+ 'ʎ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'voiced',
+ 'consonant_place' : 'palatal',
+ 'consonant_manner': 'lateral-approximant'
+ },
+ 'β': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'voiced',
+ 'consonant_place' : 'bilabial',
+ 'consonant_manner': 'fricative'
+ },
+ 'ʝ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'voiced',
+ 'consonant_place' : 'palatal',
+ 'consonant_manner': 'fricative'
+ },
+ 'ɟ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'voiced',
+ 'consonant_place' : 'palatal',
+ 'consonant_manner': 'plosive'
+ },
+ 'q': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'unvoiced',
+ 'consonant_place' : 'uvular',
+ 'consonant_manner': 'plosive'
+ },
+ 'ɕ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'unvoiced',
+ 'consonant_place' : 'alveolopalatal',
+ 'consonant_manner': 'fricative'
+ },
+ 'ɭ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'voiced',
+ 'consonant_place' : 'retroflex',
+ 'consonant_manner': 'lateral-approximant'
+ },
+ 'ɵ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'vowel',
+ 'VUV' : 'voiced',
+ 'vowel_frontness' : 'central',
+ 'vowel_openness' : 'close-mid',
+ 'vowel_roundedness': 'rounded'
+ },
+ 'ʑ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'voiced',
+ 'consonant_place' : 'alveolopalatal',
+ 'consonant_manner': 'fricative'
+ },
+ 'ʋ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'voiced',
+ 'consonant_place' : 'labiodental',
+ 'consonant_manner': 'approximant'
+ },
+ 'ʁ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'voiced',
+ 'consonant_place' : 'uvular',
+ 'consonant_manner': 'fricative'
+ },
+ 'ɨ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'vowel',
+ 'VUV' : 'voiced',
+ 'vowel_frontness' : 'central',
+ 'vowel_openness' : 'close',
+ 'vowel_roundedness': 'unrounded'
+ },
+ 'ʂ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'unvoiced',
+ 'consonant_place' : 'retroflex',
+ 'consonant_manner': 'fricative'
+ },
+ 'ɓ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'voiced',
+ 'consonant_place' : 'bilabial',
+ 'consonant_manner': 'implosive'
+ },
+ 'ʙ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'voiced',
+ 'consonant_place' : 'bilabial',
+ 'consonant_manner': 'vibrant'
+ },
+ 'ɗ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'voiced',
+ 'consonant_place' : 'dental',
+ 'consonant_manner': 'implosive'
+ },
+ 'ɖ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'voiced',
+ 'consonant_place' : 'retroflex',
+ 'consonant_manner': 'plosive'
+ },
+ 'χ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'unvoiced',
+ 'consonant_place' : 'uvular',
+ 'consonant_manner': 'fricative'
+ },
+ 'ʛ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'voiced',
+ 'consonant_place' : 'uvular',
+ 'consonant_manner': 'implosive'
+ },
+ 'ʟ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'voiced',
+ 'consonant_place' : 'velar',
+ 'consonant_manner': 'lateral-approximant'
+ },
+ 'ɽ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'voiced',
+ 'consonant_place' : 'retroflex',
+ 'consonant_manner': 'flap'
+ },
+ 'ɢ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'voiced',
+ 'consonant_place' : 'uvular',
+ 'consonant_manner': 'plosive'
+ },
+ 'ɠ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'voiced',
+ 'consonant_place' : 'velar',
+ 'consonant_manner': 'implosive'
+ },
+ 'ǂ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'unvoiced',
+ 'consonant_place' : 'alveolopalatal',
+ 'consonant_manner': 'click'
+ },
+ 'ɦ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'voiced',
+ 'consonant_place' : 'glottal',
+ 'consonant_manner': 'fricative'
+ },
+ 'ǁ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'unvoiced',
+ 'consonant_place' : 'alveolar',
+ 'consonant_manner': 'click'
+ },
+ 'ĩ': { # identical to i, except nasalized
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'vowel',
+ 'VUV' : 'voiced',
+ 'vowel_frontness' : 'front',
+ 'vowel_openness' : 'close',
+ 'vowel_roundedness': 'unrounded',
+ 'consonant_manner' : 'nasal'
+ },
+ 'ʍ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'unvoiced',
+ 'consonant_place' : 'labial-velar',
+ 'consonant_manner': 'fricative'
+ },
+ 'ʕ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'voiced',
+ 'consonant_place' : 'pharyngal',
+ 'consonant_manner': 'fricative'
+ },
+ 'ɻ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'voiced', # the retroflex approximant is voiced; a voiceless one would carry a devoicing diacritic
+ 'consonant_place' : 'retroflex',
+ 'consonant_manner': 'approximant'
+ },
+ 'ʄ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'voiced',
+ 'consonant_place' : 'palatal',
+ 'consonant_manner': 'implosive'
+ },
+ 'ũ': { # identical to u, except nasalized
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'vowel',
+ 'VUV' : 'voiced',
+ 'vowel_frontness' : 'back',
+ 'vowel_openness' : 'close',
+ 'vowel_roundedness': 'rounded',
+ 'consonant_manner' : 'nasal'
+ },
+ 'ɤ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'vowel',
+ 'VUV' : 'voiced',
+ 'vowel_frontness' : 'back',
+ 'vowel_openness' : 'close-mid',
+ 'vowel_roundedness': 'unrounded',
+ },
+ 'ɶ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'vowel',
+ 'VUV' : 'voiced',
+ 'vowel_frontness' : 'front',
+ 'vowel_openness' : 'open',
+ 'vowel_roundedness': 'rounded',
+ },
+ 'õ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'vowel',
+ 'VUV' : 'voiced',
+ 'vowel_frontness' : 'back',
+ 'vowel_openness' : 'close-mid',
+ 'vowel_roundedness': 'rounded',
+ 'consonant_manner' : 'nasal'
+ },
+ 'ʡ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'unvoiced',
+ 'consonant_place' : 'epiglottal',
+ 'consonant_manner': 'plosive'
+ },
+ 'ʈ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'unvoiced',
+ 'consonant_place' : 'retroflex',
+ 'consonant_manner': 'plosive'
+ },
+ 'ʜ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'unvoiced',
+ 'consonant_place' : 'epiglottal',
+ 'consonant_manner': 'fricative'
+ },
+ 'ɱ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'voiced',
+ 'consonant_place' : 'labiodental',
+ 'consonant_manner': 'nasal'
+ },
+ 'ɯ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'vowel',
+ 'VUV' : 'voiced',
+ 'vowel_frontness' : 'back',
+ 'vowel_openness' : 'close',
+ 'vowel_roundedness': 'unrounded'
+ },
+ 'ǀ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'unvoiced',
+ 'consonant_place' : 'dental',
+ 'consonant_manner': 'click'
+ },
+ 'ɸ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'unvoiced',
+ 'consonant_place' : 'bilabial',
+ 'consonant_manner': 'fricative'
+ },
+ 'ʘ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'unvoiced',
+ 'consonant_place' : 'bilabial',
+ 'consonant_manner': 'click'
+ },
+ 'ʐ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'voiced',
+ 'consonant_place' : 'retroflex',
+ 'consonant_manner': 'fricative'
+ },
+ 'ɰ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'voiced',
+ 'consonant_place' : 'velar',
+ 'consonant_manner': 'approximant'
+ },
+ 'ɘ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'vowel',
+ 'VUV' : 'voiced',
+ 'vowel_frontness' : 'central',
+ 'vowel_openness' : 'close-mid',
+ 'vowel_roundedness': 'unrounded'
+ },
+ 'ħ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'unvoiced',
+ 'consonant_place' : 'pharyngal',
+ 'consonant_manner': 'fricative'
+ },
+ 'ɞ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'vowel',
+ 'VUV' : 'voiced',
+ 'vowel_frontness' : 'central',
+ 'vowel_openness' : 'open-mid',
+ 'vowel_roundedness': 'rounded'
+ },
+ 'ʉ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'vowel',
+ 'VUV' : 'voiced',
+ 'vowel_frontness' : 'central',
+ 'vowel_openness' : 'close',
+ 'vowel_roundedness': 'rounded'
+ },
+ 'ɴ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'voiced',
+ 'consonant_place' : 'uvular',
+ 'consonant_manner': 'nasal'
+ },
+ 'ʢ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'voiced',
+ 'consonant_place' : 'epiglottal',
+ 'consonant_manner': 'fricative'
+ },
+ 'ѵ': {
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'voiced',
+ 'consonant_place' : 'labiodental',
+ 'consonant_manner': 'flap'
+ },
+ 'ǃ': { # looks deceptively like an exclamation mark, but it is a different Unicode code point
+ 'symbol_type' : 'phoneme',
+ 'vowel_consonant' : 'consonant',
+ 'VUV' : 'unvoiced',
+ 'consonant_place' : 'postalveolar',
+ 'consonant_manner': 'click'
+ },
+
+ } # REMEMBER: any phoneme added here must also be appended to the ID lookup below as the new highest ID
+
+
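For orientation, each entry above is a plain dict of articulatory attributes. A minimal sketch of querying it (hedged; it assumes the enclosing function is generate_feature_lookup, the name under which the table is called further below):

    feats = generate_feature_lookup()
    print(feats['ç'])
    # {'symbol_type': 'phoneme', 'vowel_consonant': 'consonant', 'VUV': 'unvoiced',
    #  'consonant_place': 'palatal', 'consonant_manner': 'fricative'}
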
+ def get_phone_to_id():
+ """
+ Used for the states of the CTC loss and the Dijkstra/MAS search in the aligner.
+ This cannot be extracted trivially from the feature lookup above, because sets are unordered and the IDs need to stay consistent.
+ """
+ phone_to_id = dict()
+ for index, phone in enumerate("~#?!ǃ.ɜəaðɛɪŋɔɒɾʃθʊʌʒæbʔdefghijklmnɳopɡɹrstuvwxzʀøçɐœyʏɑcɲɣʎβʝɟqɕɭɵʑʋʁɨʂɓʙɗɖχʛʟɽɢɠǂɦǁĩʍʕɻʄũɤɶõʡʈʜɱɯǀɸʘʐɰɘħɞʉɴʢѵ"):
+ phone_to_id[phone] = index
+ # The following lines fix an issue with the aligner: while the different punctuation marks have
+ # different effects on their context, their realization in the signal is typically just silence.
+ # Since this is common to all of them, the CTC objective malfunctions for our purposes of
+ # alignment search, so it turned out to be better to map all punctuation marks to silence.
+ phone_to_id["#"] = phone_to_id["~"]
+ phone_to_id["?"] = phone_to_id["~"]
+ phone_to_id["!"] = phone_to_id["~"]
+ phone_to_id["."] = phone_to_id["~"]
+ return phone_to_id
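Since all punctuation IDs are collapsed onto the silence symbol "~", lookups behave as in this minimal sketch (it relies only on the enumeration string above):

    phone_to_id = get_phone_to_id()
    assert phone_to_id["?"] == phone_to_id["~"] == 0  # "~" is the first symbol in the enumeration string
    assert phone_to_id["!"] != phone_to_id["ǃ"]       # the ASCII '!' is remapped to silence, the IPA click 'ǃ' keeps its own ID
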
+
+
+ def get_feature_to_index_lookup():
+ return {
+ # MODIFIER
+ # -- stress: modified by the previous symbol
+ "stressed" : 0,
+ # -- tone: modified by the following symbol
+ "very-high-tone" : 1,
+ "high-tone" : 2,
+ "mid-tone" : 3,
+ "low-tone" : 4,
+ "very-low-tone" : 5,
+ "rising-tone" : 6,
+ "falling-tone" : 7,
+ "peaking-tone" : 8,
+ "dipping-tone" : 9,
+ # -- lengthening: modified by the following symbol
+ "lengthened" : 10,
+ "half-length" : 11,
+ "shortened" : 12,
+
+ # CATEGORIES
+ "consonant" : 13,
+ "vowel" : 14,
+ "phoneme" : 15,
+
+ # NON-SPEECH-MARKERS
+ "silence" : 16,
+ "end of sentence" : 17,
+ "questionmark" : 18,
+ "exclamationmark" : 19,
+ "fullstop" : 20,
+ "word-boundary" : 21,
+
+ # PLACE
+ "dental" : 22,
+ "postalveolar" : 23,
+ "velar" : 24,
+ "palatal" : 25,
+ "glottal" : 26,
+ "uvular" : 27,
+ "labiodental" : 28,
+ "labial-velar" : 29,
+ "alveolar" : 30,
+ "bilabial" : 31,
+ "alveolopalatal" : 32,
+ "retroflex" : 33,
+ "pharyngal" : 34,
+ "epiglottal" : 35,
+
+ # TONGUE POSITION
+ "central" : 36,
+ "back" : 37,
+ "front_central" : 38,
+ "front" : 39,
+ "central_back" : 40,
+
+ # MOUTH OPENNESS
+ "mid" : 41,
+ "close-mid" : 42,
+ "close" : 43,
+ "open-mid" : 44,
+ "close_close-mid" : 45,
+ "open-mid_open" : 46,
+ "open" : 47,
+
+ # MOUTH SHAPE
+ "rounded" : 48,
+ "unrounded" : 49,
+
+ # MANNER
+ "plosive" : 50,
+ "nasal" : 51,
+ "approximant" : 52,
+ "trill" : 53,
+ "flap" : 54,
+ "fricative" : 55,
+ "lateral-approximant": 56,
+ "implosive" : 57,
+ "vibrant" : 58,
+ "click" : 59,
+ "ejective" : 60,
+
+ # TYPE
+ "aspirated" : 61,
+ "unvoiced" : 62,
+ "voiced" : 63,
+ }
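The names above address individual dimensions of the multi-hot vectors built by generate_feature_table below. A minimal sketch of the intended usage (hedged; it combines only the two lookups defined in this file):

    lookup = get_feature_to_index_lookup()
    vector = generate_feature_table()['ç']
    assert vector[lookup["consonant"]] == 1
    assert vector[lookup["fricative"]] == 1
    assert vector[lookup["voiced"]] == 0  # 'ç' is described as unvoiced in the feature lookup
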
+
+
+ def generate_feature_table():
+ """
+ Turns the feature lookup above into a multi-hot feature vector per single-character phone.
+ """
+ ipa_to_phonemefeats = generate_feature_lookup()
+
+ feat_types = set()
+ for ipa in ipa_to_phonemefeats:
+ if len(ipa) == 1:
+ feat_types.update(ipa_to_phonemefeats[ipa].keys())
+
+ feat_to_val_set = dict()
+ for feat in feat_types:
+ feat_to_val_set[feat] = set()
+ for ipa in ipa_to_phonemefeats:
+ if len(ipa) == 1:
+ for feat in ipa_to_phonemefeats[ipa]:
+ feat_to_val_set[feat].add(ipa_to_phonemefeats[ipa][feat])
+
+ # print(feat_to_val_set)
+
+ # collect all attested feature values (only needed for the debug prints below)
+ value_list = set()
+ for val_set in [feat_to_val_set[feat] for feat in feat_to_val_set]:
+ for value in val_set:
+ value_list.add(value)
+ # print("{")
+ # for index, value in enumerate(list(value_list)):
+ # print('"{}":{},'.format(value, index))
+ # print("}")
+
+ value_to_index = get_feature_to_index_lookup()
+
+ phone_to_vector = dict()
+ for ipa in ipa_to_phonemefeats:
+ if len(ipa) == 1:
+ phone_to_vector[ipa] = [0] * (15 + sum([len(values) for values in [feat_to_val_set[feat] for feat in feat_to_val_set]]))
+ # 15 features come from modifiers rather than from lexical sounds, so we have to add them to the ones we encounter naturally in the lexical sounds
+ for feat in ipa_to_phonemefeats[ipa]:
+ if ipa_to_phonemefeats[ipa][feat] in value_to_index:
+ phone_to_vector[ipa][value_to_index[ipa_to_phonemefeats[ipa][feat]]] = 1
+ if phone_to_vector[ipa][value_to_index["phoneme"]] != 1:
+ # it's not a phoneme, so we give it the silence marker, regardless of what it is
+ phone_to_vector[ipa][value_to_index["silence"]] = 1
+
+ for feat in feat_to_val_set:
+ for value in feat_to_val_set[feat]:
+ if value not in value_to_index:
+ print(f"Unknown feature value in featureset! {value}")
+
+ # print(f"{sum([len(values) for values in [feat_to_val_set[feat] for feat in feat_to_val_set]])} should be 49")
+
+ return phone_to_vector
+
+
+ if __name__ == '__main__':
+ print(generate_feature_table())
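A minimal usage sketch (hedged): per the debug comment in the function, there are 49 lexical feature values, which together with the 15 modifier dimensions gives 64 entries per vector.

    phone_to_vector = generate_feature_table()
    print(len(phone_to_vector['ç']))  # 64 = 15 modifier dimensions + 49 lexical feature values
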
iso_lookup.json ADDED
The diff for this file is too large to render.

iso_to_fullname.json ADDED
The diff for this file is too large to render.

requirements.txt ADDED
@@ -0,0 +1,12 @@
+ torch~=2.1.0
+ epitran==1.24
+ numpy~=1.23.4
+ pypinyin~=0.47.1
+ matplotlib~=3.7.0
+ phonemizer~=3.2.1
+ dragonmapper~=0.2.6
+ transphone==1.5.3
+ pykakasi~=2.2.1
+ jamo~=0.4.1
+ g2pk~=0.9.4
+ gradio~=4.26.0
run_feature_visualization.py ADDED
@@ -0,0 +1,55 @@
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import torch
+
+ from ArticulatoryTextFrontend import ArticulatoryTextFrontend
+
+
+ def visualize_one_hot_encoded_sequence(tensor, sentence, col_labels, cmap='BuGn'):
+ """
+ Visualize a 2D one-hot encoded tensor as a heatmap.
+ """
+ tensor = torch.clamp(tensor, min=0, max=1).transpose(0, 1).cpu().numpy()
+ if tensor.ndim != 2:
+ raise ValueError("Input tensor must be a 2D array")
+
+ # Check that the number of labels matches the tensor dimensions
+ row_labels = ["stressed", "very-high-tone", "high-tone", "mid-tone", "low-tone", "very-low-tone", "rising-tone", "falling-tone", "peaking-tone", "dipping-tone", "lengthened", "half-length", "shortened", "consonant", "vowel", "phoneme", "silence", "end of sentence", "questionmark", "exclamationmark", "fullstop", "word-boundary", "dental", "postalveolar",
+ "velar", "palatal", "glottal", "uvular", "labiodental", "labial-velar", "alveolar", "bilabial", "alveolopalatal", "retroflex", "pharyngal", "epiglottal", "central", "back", "front_central", "front", "central_back", "mid", "close-mid", "close", "open-mid", "close_close-mid", "open-mid_open", "open", "rounded", "unrounded", "plosive",
+ "nasal", "approximant", "trill", "flap", "fricative", "lateral-approximant", "implosive", "vibrant", "click", "ejective", "aspirated", "unvoiced", "voiced"]
+
+ if row_labels and len(row_labels) != tensor.shape[0]:
+ raise ValueError("Number of row labels must match the number of rows in the tensor")
+ if col_labels and len(col_labels) != tensor.shape[1]:
+ raise ValueError("Number of column labels must match the number of columns in the tensor")
+
+ plt.figure(figsize=(10, 8))
+
+ # Create the heatmap
+ plt.imshow(tensor, cmap=cmap, aspect='auto')
+
+ # Add labels
+ if row_labels:
+ plt.yticks(np.arange(tensor.shape[0]), row_labels)
+ if col_labels:
+ plt.xticks(np.arange(tensor.shape[1]), col_labels, rotation=0)
+
+ plt.grid(False)
+ plt.xlabel('Phones')
+ plt.ylabel('Features')
+
+ # Display the heatmap
+ plt.title(f"»{sentence}«")
+ plt.tight_layout()
+ plt.show()
+
+
+ if __name__ == '__main__':
+ sentence = "Rằng: Trong Thánh trạch dồi dào."
+ language = "vie"
+
+ tf = ArticulatoryTextFrontend(language=language)
+ features = tf.string_to_tensor(sentence)
+ phones = tf.get_phone_string(sentence)
+
+ visualize_one_hot_encoded_sequence(tensor=features, sentence=sentence, col_labels=phones)
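On a headless machine, plt.show() has no display to attach to; a hedged variant writes the heatmap to disk instead (the filename is only an example):

    # inside visualize_one_hot_encoded_sequence, replace plt.show() with:
    plt.savefig("feature_heatmap.png", dpi=200)
    plt.close()
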
run_grapheme_to_feature_demo.py ADDED
@@ -0,0 +1,33 @@
+ from ArticulatoryTextFrontend import ArticulatoryTextFrontend, get_language_id
+
+ if __name__ == '__main__':
+
+ # demonstrating the language ID lookup
+ print(get_language_id("eng"))
+ print(get_language_id("deu"))
+ print(get_language_id("fra"))
+
+ # demonstrating the conversion from graphemes to features
+ print("\n\nEnglish Test")
+ tf = ArticulatoryTextFrontend(language="eng")
+ features = tf.string_to_tensor("This is a complex sentence, it even has a pause!", view=True)
+
+ print("\n\nChinese Test")
+ tf = ArticulatoryTextFrontend(language="cmn")
+ features = tf.string_to_tensor("这是一个复杂的句子,它甚至包含一个停顿。", view=True)
+ features = tf.string_to_tensor("李绅 《悯农》 锄禾日当午, 汗滴禾下土。 谁知盘中餐, 粒粒皆辛苦。", view=True)
+ features = tf.string_to_tensor("巴 拔 把 爸 吧", view=True)
+
+ print("\n\nVietnamese Test")
+ tf = ArticulatoryTextFrontend(language="vie")
+ features = tf.string_to_tensor("Xin chào thế giới, quả là một ngày tốt lành để học nói tiếng Việt!", view=True)
+ features = tf.string_to_tensor("ba bà bá bạ bả bã", view=True)
+
+ print("\n\nJapanese Test")
+ tf = ArticulatoryTextFrontend(language="jpn")
+ features = tf.string_to_tensor("医師会がなくても、近隣の病院なら紹介してくれると思います。", view=True)
+
+ print("\n\nZero-Shot Test")
+ tf = ArticulatoryTextFrontend(language="acr")
+ features = tf.string_to_tensor("I don't know this language, but this is just a placeholder text anyway.", view=True)
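A hedged sketch of what these calls return: string_to_tensor produces one articulatory feature vector per phone, so the output should follow the 64-dimension lookup defined above (the sentence here is just an example, and the exact orientation of the tensor is an assumption based on run_feature_visualization.py):

    tf = ArticulatoryTextFrontend(language="eng")
    features = tf.string_to_tensor("Hello world!")
    print(features.shape)  # expected: (number_of_phones, 64), matching the row labels in run_feature_visualization.py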