Pendrokar committed on
Commit
2080fde
β€’
1 Parent(s): b2e5090

ionite34's h2p_parser and dep required for English

Browse files
Files changed (29) hide show
  1. requirements.txt +3 -0
  2. resources/app/python/xvapitch/text/h2p_parser/__init__.py +22 -0
  3. resources/app/python/xvapitch/text/h2p_parser/__main__.py +185 -0
  4. resources/app/python/xvapitch/text/h2p_parser/cmudictext.py +253 -0
  5. resources/app/python/xvapitch/text/h2p_parser/compat/__init__.py +7 -0
  6. resources/app/python/xvapitch/text/h2p_parser/compat/cmudict.py +19 -0
  7. resources/app/python/xvapitch/text/h2p_parser/data/__init__.py +0 -0
  8. resources/app/python/xvapitch/text/h2p_parser/data/cmudict-0.7b.txt +0 -0
  9. resources/app/python/xvapitch/text/h2p_parser/data/cmudict.dict +0 -0
  10. resources/app/python/xvapitch/text/h2p_parser/data/dict.json +1500 -0
  11. resources/app/python/xvapitch/text/h2p_parser/data/example.json +16 -0
  12. resources/app/python/xvapitch/text/h2p_parser/dict_reader.py +109 -0
  13. resources/app/python/xvapitch/text/h2p_parser/dictionary.py +85 -0
  14. resources/app/python/xvapitch/text/h2p_parser/filter.py +34 -0
  15. resources/app/python/xvapitch/text/h2p_parser/format_ph.py +99 -0
  16. resources/app/python/xvapitch/text/h2p_parser/h2p.py +123 -0
  17. resources/app/python/xvapitch/text/h2p_parser/h2p_parser.egg-info/PKG-INFO +14 -0
  18. resources/app/python/xvapitch/text/h2p_parser/h2p_parser.egg-info/SOURCES.txt +19 -0
  19. resources/app/python/xvapitch/text/h2p_parser/h2p_parser.egg-info/dependency_links.txt +1 -0
  20. resources/app/python/xvapitch/text/h2p_parser/h2p_parser.egg-info/requires.txt +2 -0
  21. resources/app/python/xvapitch/text/h2p_parser/h2p_parser.egg-info/top_level.txt +1 -0
  22. resources/app/python/xvapitch/text/h2p_parser/pos_parser.py +17 -0
  23. resources/app/python/xvapitch/text/h2p_parser/processors.py +392 -0
  24. resources/app/python/xvapitch/text/h2p_parser/symbols.py +82 -0
  25. resources/app/python/xvapitch/text/h2p_parser/text/__init__.py +0 -0
  26. resources/app/python/xvapitch/text/h2p_parser/text/numbers.py +166 -0
  27. resources/app/python/xvapitch/text/h2p_parser/utils/__init__.py +0 -0
  28. resources/app/python/xvapitch/text/h2p_parser/utils/converter.py +79 -0
  29. resources/app/python/xvapitch/text/h2p_parser/utils/parser.py +133 -0
requirements.txt CHANGED
@@ -26,9 +26,11 @@ idna==2.10
26
  importlib-metadata==2.0.0
27
  importlib-resources==5.2.2
28
  inflect==4.1.0
 
29
  jaconv==0.3
30
  joblib==0.17.0
31
  librosa
 
32
  num2words==0.5.10
33
  numpy
34
  omegaconf==2.1.1
@@ -43,6 +45,7 @@ pydub==0.25.1
43
  pykakasi==2.2.1
44
  pyparsing==2.4.7
45
  python-crfsuite==0.9.8
 
46
  PyYAML
47
  regex==2021.8.28
48
  requests==2.25.1
 
26
  importlib-metadata==2.0.0
27
  importlib-resources==5.2.2
28
  inflect==4.1.0
29
+ inquirerpy~=0.3.3
30
  jaconv==0.3
31
  joblib==0.17.0
32
  librosa
33
+ nltk~=3.7
34
  num2words==0.5.10
35
  numpy
36
  omegaconf==2.1.1
 
45
  pykakasi==2.2.1
46
  pyparsing==2.4.7
47
  python-crfsuite==0.9.8
48
+ pywordsegment~=0.2.1
49
  PyYAML
50
  regex==2021.8.28
51
  requests==2.25.1
resources/app/python/xvapitch/text/h2p_parser/__init__.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ h2p_parser
3
+
4
+ Heteronym to Phoneme Parser
5
+
6
+ """
7
+
8
+ import sys
9
+
10
+ if sys.version_info < (3, 9):
11
+ # In Python versions below 3.9, this is needed
12
+ from importlib_resources import files
13
+ else:
14
+ # Since python 3.9+, importlib.resources.files is built-in
15
+ from importlib.resources import files
16
+
17
+ __version__ = "1.0.0"
18
+
19
+ # Data module
20
+ DATA_PATH = files(__name__ + '.data')
21
+ # Iterable collection of all files in data.
22
+ DATA_FILES = DATA_PATH.iterdir()
resources/app/python/xvapitch/text/h2p_parser/__main__.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Interactive CLI entry point for h2p_parser: converts dictionary formats
# and parses metadata files to report dictionary coverage statistics.
import sys
from collections import Counter

from InquirerPy import inquirer
from InquirerPy.utils import patched_print, color_print
from InquirerPy.base.control import Choice
from InquirerPy.validator import PathValidator
from h2p_parser.utils import converter
from h2p_parser.utils import parser


def convert_h2p(input_file, output_file, delimiter):
    """
    Converts a h2p dictionary file from one format to another.

    :param input_file: Path of the delimited source dictionary.
    :param output_file: Path the converted JSON dictionary is written to.
    :param delimiter: Field delimiter used by the source file.
    """
    converter.bin_delim_to_json(input_file, output_file, delimiter)
    print('Converted h2p_dict to json.')


def prompt_action() -> str:
    """
    Prompts the user to select an action; exits the process on 'Exit'.

    :return: The selected action name ('Convert' or 'Parse').
    """
    action = inquirer.select(
        message='Select action:',
        choices=[
            "Convert",
            "Parse",
            Choice(value=None, name='Exit')
        ],
        default=0,
    ).execute()
    if not action:
        # Fix: use sys.exit rather than the site-injected exit() builtin,
        # which is not guaranteed to exist in every interpreter environment.
        sys.exit(0)
    return action


def prompt_f_input():
    """
    Prompts for input file.
    """
    return inquirer.filepath(
        message='Select input file:',
        validate=PathValidator(is_file=True, message='Input must be a file.')
    ).execute()


def prompt_f_output():
    """
    Prompts for output file.
    """
    return inquirer.filepath(
        message='Select output file:',
        validate=PathValidator(is_file=True, message='Output must be a file.')
    ).execute()


def _percent(part, total):
    """Returns part/total as a percentage rounded to 2 places, or 0 when total is 0."""
    return round(part / total * 100, 2) if total else 0


def action_convert():
    """
    Converts a h2p dictionary file from one format to another.
    """
    # Select input file
    input_file = prompt_f_input()
    if not input_file:
        return

    # Select output file
    output_file = prompt_f_output()
    if not output_file:
        return

    # Ask for delimiter
    delimiter = inquirer.text(
        message='Enter delimiter:',
        default='|'
    ).execute()
    if not delimiter:
        return

    # Run Process
    convert_h2p(input_file, output_file, delimiter)


def action_parse_file():
    """
    Parses a metadata.csv file and checks for dictionary coverage
    :return:
    """
    # Select input file
    input_file = prompt_f_input()
    if not input_file:
        return

    # Ask for delimiter
    delimiter = inquirer.text(
        message='Enter delimiter:',
        default='|'
    ).execute()
    if not delimiter:
        return

    # Run Process
    result = parser.check_lines(parser.read_file(input_file, delimiter))

    # Print results
    color_print([("#e5c07b", "Unresolved Words")])
    color_print([("#d21205", "[All]: "),
                 ("#ffffff", f"{len(result.unres_all_words)}/{len(result.all_words)}")])
    color_print([("#7e3b41", "[Unique]: "),
                 ("#ffffff", f"{len(result.unres_words)}/{len(result.words)}")])

    color_print([("#4ce5c8", "-" * 10)])

    color_print([("#e5c07b", "Unresolved Lines")])
    color_print([("#d21205", "[All]: "),
                 ("#ffffff", f"{len(result.unres_all_lines)}/{len(result.all_lines)}")])
    color_print([("#7e3b41", "[Unique]: "),
                 ("#ffffff", f"{len(result.unres_lines)}/{len(result.lines)}")])

    color_print([("#4ce5c8", "-" * 10)])

    color_print([("#e5c07b", "Expected Coverage")])
    color_print([("#d21205", "[Lines]: "),
                 ("#ffffff", f"{result.line_coverage()}%")])
    color_print([("#7e3b41", "[Words]: "),
                 ("#ffffff", f"{result.word_coverage()}%")])

    color_print([("#4ce5c8", "-" * 10)])

    color_print([("#e5c07b", "H2p parser")])
    color_print([("#d21205", "[Lines with Heteronyms]: "),
                 ("#ffffff", f"{len(result.all_lines_cont_het)}/{len(result.all_lines)}"
                             f" | {result.percent_line_het()}%")])
    color_print([("#7e3b41", "[Words Resolved by H2p]: "),
                 ("#ffffff", f"{result.n_words_het}/{result.n_words_res}"
                             f" | {result.percent_word_h2p()}%")])
    # Calcs — guarded against division by zero when no words were resolved
    # (previously raised ZeroDivisionError on such inputs).
    feature_res = result.n_words_fet
    feature_percent = _percent(feature_res, result.n_words_res)
    cmu_res = result.n_words_cmu
    cmu_percent = _percent(cmu_res, result.n_words_res)
    color_print([("#c8bd20", "[Transformed Resolves]: "),
                 ("#ffffff", f"{feature_res}/{result.n_words_res}"
                             f" | {feature_percent}%")])
    color_print([("#25a0c8", "[Words in CMUDict]: "),
                 ("#ffffff", f"{cmu_res}/{result.n_words_res}"
                             f" | {cmu_percent}%")])

    color_print([("#4ce5c8", "-" * 10)])

    color_print([("#e5c07b", "Feature Usage")])

    # Loop through feature results
    for ft in result.ft_stats:
        color_print([("#d21205", f"{ft}: "),
                     ("#ffffff", f"{result.ft_stats[ft]}/{result.n_words_res}"
                                 f" | {_percent(result.ft_stats[ft], result.n_words_res)}%")])

    color_print([("#4ce5c8", "-" * 10)])

    # Print 100 sampled unresolved words by frequency
    color_print([("#e5c07b", "Top 100 most frequent unresolved words")])
    # Count frequency of words
    word_freq = Counter(result.unres_all_words)
    # Sort by frequency
    word_freq = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
    # Print top 100
    for word, freq in word_freq[:100]:
        color_print([("#d21205", f"{word}: "),
                     ("#ffffff", f"{freq}")])


def entry():
    """
    Program entry point: prompts for an action and dispatches to it.
    """
    # Select action type
    action = prompt_action()
    if action == 'Convert':
        action_convert()
    elif action == 'Parse':
        action_parse_file()


if __name__ == "__main__":
    entry()
resources/app/python/xvapitch/text/h2p_parser/cmudictext.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Extended Grapheme to Phoneme conversion using CMU Dictionary and Heteronym parsing.
2
+ from __future__ import annotations
3
+
4
+ import re
5
+ from typing import Optional
6
+
7
+ import pywordsegment
8
+ import nltk
9
+ from nltk.stem import WordNetLemmatizer
10
+ from nltk.stem.snowball import SnowballStemmer
11
+ from .h2p import H2p
12
+ from .h2p import replace_first
13
+ from . import format_ph as ph
14
+ from .dict_reader import DictReader
15
+ from .text.numbers import normalize_numbers
16
+ from .filter import filter_text
17
+ from .processors import Processor
18
+ from copy import deepcopy
19
+
20
+ re_digit = re.compile(r"\((\d+)\)")
21
+ re_bracket_with_digit = re.compile(r"\(.*\)")
22
+
23
+ # Check that the nltk data is downloaded, if not, download it
24
+ try:
25
+ nltk.data.find('corpora/wordnet.zip')
26
+ nltk.data.find('corpora/omw-1.4.zip')
27
+ except LookupError:
28
+ nltk.download('wordnet')
29
+ nltk.download('omw-1.4')
30
+
31
+
32
+ class CMUDictExt:
33
+ def __init__(self, cmu_dict_path: str = None, h2p_dict_path: str = None, cmu_multi_mode: int = 0,
34
+ process_numbers: bool = True, phoneme_brackets: bool = True, unresolved_mode: str = 'keep'):
35
+ # noinspection GrazieInspection
36
+ """
37
+ Initialize CMUDictExt - Extended Grapheme to Phoneme conversion using CMU Dictionary with Heteronym parsing.
38
+
39
+ CMU multi-entry resolution modes:
40
+ - -2 : Raw entry (i.e. 'A' resolves to 'AH0' and 'A(1)' to 'EY1')
41
+ - -1 : Skip resolving any entry with multiple pronunciations.
42
+ - 0 : Resolve using default un-numbered pronunciation.
43
+ - 1 : Resolve using (1) numbered pronunciation.
44
+ - n : Resolve using (n) numbered pronunciation.
45
+ - If a higher number is specified than available for the word, the highest available number is used.
46
+
47
+ Unresolved word resolution modes:
48
+ - keep : Keep the text-form word in the output.
49
+ - remove : Remove the text-form word from the output.
50
+ - drop : Return the line as None if any word is unresolved.
51
+
52
+ :param cmu_dict_path: Path to CMU dictionary file (.txt)
53
+ :type: str
54
+ :param h2p_dict_path: Path to Custom H2p dictionary (.json)
55
+ :type: str
56
+ :param cmu_multi_mode: CMU resolution mode for entries with multiple pronunciations.
57
+ :type: int
58
+ """
59
+
60
+ # Check valid unresolved_mode argument
61
+ if unresolved_mode not in ['keep', 'remove', 'drop']:
62
+ raise ValueError('Invalid value for unresolved_mode: {}'.format(unresolved_mode))
63
+ self.unresolved_mode = unresolved_mode
64
+
65
+ self.cmu_dict_path = cmu_dict_path # Path to CMU dictionary file (.txt), if None, uses built-in
66
+ self.h2p_dict_path = h2p_dict_path # Path to Custom H2p dictionary (.json), if None, uses built-in
67
+ self.cmu_multi_mode = cmu_multi_mode # CMU multi-entry resolution mode
68
+ self.process_numbers = process_numbers # Normalize numbers to text form, if enabled
69
+ self.phoneme_brackets = phoneme_brackets # If True, phonemes are wrapped in curly brackets.
70
+ self.dict = DictReader(self.cmu_dict_path).dict # CMU Dictionary
71
+ self.h2p = H2p(self.h2p_dict_path, preload=True) # H2p parser
72
+ self.lemmatize = WordNetLemmatizer().lemmatize # WordNet Lemmatizer - used to find singular form
73
+ self.stem = SnowballStemmer('english').stem # Snowball Stemmer - used to find stem root of words
74
+ self.segment = pywordsegment.WordSegmenter().segment # Word Segmenter
75
+ self.p = Processor(self) # Processor for processing text
76
+
77
+ # Features
78
+ # Auto pluralization and de-pluralization
79
+ self.ft_auto_plural = True
80
+ # Auto splits and infers possessive forms of original words
81
+ self.ft_auto_pos = True
82
+ # Auto splits 'll
83
+ self.ft_auto_ll = True
84
+ # Auto splits and infers hyphenated words
85
+ self.ft_auto_hyphenated = True
86
+ # Auto splits possible compound words
87
+ self.ft_auto_compound = True
88
+ # Analyzes word root stem and infers pronunciation separately
89
+ # i.e. 'generously' -> 'generous' + 'ly'
90
+ self.ft_stem = True
91
+ # Forces compound words using manual lookup
92
+ self.ft_auto_compound_l2 = True
93
+
94
+ def lookup(self, text: str, pos: str = None, ph_format: str = 'sds') -> str | list | None:
95
+ # noinspection GrazieInspection
96
+ """
97
+ Gets the CMU Dictionary entry for a word.
98
+
99
+ Options for ph_format:
100
+
101
+ - 'sds' space delimited string
102
+ - 'sds_b' space delimited string with curly brackets
103
+ - 'list' list of phoneme strings
104
+
105
+ :param pos: Part of speech tag (Optional)
106
+ :param ph_format: Format of the phonemes to return:
107
+ :type: str
108
+ :param text: Word to lookup
109
+ :type: str
110
+ """
111
+
112
+ def format_as(in_phoneme):
113
+ if ph_format == 'sds':
114
+ output = ph.to_sds(in_phoneme)
115
+ elif ph_format == 'sds_b':
116
+ output = ph.with_cb(ph.to_sds(in_phoneme))
117
+ elif ph_format == 'list':
118
+ output = ph.to_list(in_phoneme)
119
+ else:
120
+ raise ValueError('Invalid value for ph_format: {}'.format(ph_format))
121
+ return output
122
+
123
+ # Get the CMU Dictionary entry for the word
124
+ word = text.lower()
125
+ entry = deepcopy(self.dict.get(word)) # Ensure safe copy of entry
126
+
127
+ # Has entry, return it directly
128
+ if entry is not None:
129
+ return format_as(entry)
130
+
131
+ # Auto Possessive Processor
132
+ if self.ft_auto_pos:
133
+ res = self.p.auto_possessives(word)
134
+ if res is not None:
135
+ return format_as(res)
136
+
137
+ # Auto Contractions for "ll" or "d"
138
+ if self.ft_auto_ll:
139
+ res = self.p.auto_contractions(word)
140
+ if res is not None:
141
+ return format_as(res)
142
+
143
+ # Check for hyphenated words
144
+ if self.ft_auto_hyphenated:
145
+ res = self.p.auto_hyphenated(word)
146
+ if res is not None:
147
+ return format_as(res)
148
+
149
+ # Check for compound words
150
+ if self.ft_auto_compound:
151
+ res = self.p.auto_compound(word)
152
+ if res is not None:
153
+ return format_as(res)
154
+
155
+ # No entry, detect if this is a multi-word entry
156
+ if '(' in word and ')' in word and any(char.isdigit() for char in word):
157
+ # Parse the integer from the word using regex
158
+ num = int(re.findall(re_digit, word)[0])
159
+ # If found
160
+ if num is not None:
161
+ # Remove the integer and bracket from the word
162
+ actual_word = re.sub(re_bracket_with_digit, "", word)
163
+ # See if this is a valid entry
164
+ result = deepcopy(self.dict.get(actual_word)) # Ensure safe copy of entry
165
+ # If found:
166
+ if result is not None:
167
+ # Translate the integer to index
168
+ index = min(num - 1, 0)
169
+ # Check if index is less than the number of pronunciations
170
+ if index < len(result):
171
+ # Return the entry using the provided num index
172
+ return format_as(result[index])
173
+ # If entry is higher
174
+ else:
175
+ # Return the highest available entry
176
+ return format_as(result[-1])
177
+
178
+ # Auto de-pluralization
179
+ # This is placed near the end because we need to do a pos-tag process
180
+ if self.ft_auto_plural:
181
+ res = self.p.auto_plural(word, pos)
182
+ if res is not None:
183
+ return format_as(res)
184
+
185
+ # Stem check
186
+ # noinspection SpellCheckingInspection
187
+ """
188
+ Supported modes for words ending in:
189
+ "ing", "ingly", "ly"
190
+ """
191
+ if self.ft_stem:
192
+ res = self.p.auto_stem(word)
193
+ if res is not None:
194
+ return format_as(res)
195
+
196
+ # Force compounding
197
+ if self.ft_auto_compound_l2:
198
+ res = self.p.auto_compound_l2(word)
199
+ if res is not None:
200
+ return format_as(res)
201
+
202
+ # If not found
203
+ return None
204
+
205
+ def convert(self, text: str) -> str | None:
206
+ # noinspection GrazieInspection
207
+ """
208
+ Replace a grapheme text line with phonemes.
209
+
210
+ :param text: Text line to be converted
211
+ :type: str
212
+ """
213
+
214
+ # Check valid unresolved_mode argument
215
+ if self.unresolved_mode not in ['keep', 'remove', 'drop']:
216
+ raise ValueError('Invalid value for unresolved_mode: {}'.format(self.unresolved_mode))
217
+ ur_mode = self.unresolved_mode
218
+
219
+ # Normalize numbers, if enabled
220
+ if self.process_numbers:
221
+ text = normalize_numbers(text)
222
+ # Filter and Tokenize
223
+ f_text = filter_text(text, preserve_case=True)
224
+ words = self.h2p.tokenize(f_text)
225
+ # Run POS tagging
226
+ tags = self.h2p.get_tags(words)
227
+
228
+ # Loop through words and pos tags
229
+ for word, pos in tags:
230
+ # Skip punctuation
231
+ if word == '.':
232
+ continue
233
+ # If word not in h2p dict, check CMU dict
234
+ if not self.h2p.dict.contains(word):
235
+ entry = self.lookup(word, pos)
236
+ if entry is None:
237
+ if ur_mode == 'drop':
238
+ return None
239
+ if ur_mode == 'remove':
240
+ text = replace_first(word, '', text)
241
+ continue
242
+ # Do replace
243
+ f_ph = ph.with_cb(ph.to_sds(entry))
244
+ text = replace_first(word, f_ph, text)
245
+ continue
246
+ # For word in h2p dict, get phonemes
247
+ phonemes = self.h2p.dict.get_phoneme(word, pos)
248
+ # Format phonemes
249
+ f_ph = ph.with_cb(ph.to_sds(phonemes))
250
+ # Replace word with phonemes
251
+ text = replace_first(word, f_ph, text)
252
+ # Return text
253
+ return text
resources/app/python/xvapitch/text/h2p_parser/compat/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ """
2
+ Compatibility module.
3
+
4
+ This module contains compatibility wrappers for existing
5
+ implementations of CMUDict and other dictionaries.
6
+
7
+ """
resources/app/python/xvapitch/text/h2p_parser/compat/cmudict.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Compatibility layer for using CMUDictExt with CMUDict-like API calls.
# Designed to be compatible with the implementation of CMUDict in:
# https://github.com/NVIDIA/DeepLearningExamples/
#
# Example usage:
# from h2p_parser.compat.cmudict import CMUDict

from h2p_parser.cmudictext import CMUDictExt


class CMUDict(CMUDictExt):
    """CMUDict-compatible facade over CMUDictExt."""

    def __init__(self, file_or_path=None, heteronyms_path=None, keep_ambiguous=True):
        # Parameter Mapping:
        # file_or_path => Mapped to cmu_dict_path
        # heteronyms_path => Dropped as CMUDictExt uses H2p for heteronym parsing.
        # keep_ambiguous => Mapped to cmu_multi_mode | True => -2, False => -1
        #
        # Bug fix: heteronyms_path was previously forwarded positionally as
        # h2p_dict_path (contradicting the "Dropped" mapping above), and
        # keep_ambiguous was silently ignored instead of being mapped.
        super().__init__(cmu_dict_path=file_or_path,
                         cmu_multi_mode=-2 if keep_ambiguous else -1)
        self._entries = {}
        self.heteronyms = []
resources/app/python/xvapitch/text/h2p_parser/data/__init__.py ADDED
File without changes
resources/app/python/xvapitch/text/h2p_parser/data/cmudict-0.7b.txt ADDED
The diff for this file is too large to render. See raw diff
 
resources/app/python/xvapitch/text/h2p_parser/data/cmudict.dict ADDED
The diff for this file is too large to render. See raw diff
 
resources/app/python/xvapitch/text/h2p_parser/data/dict.json ADDED
@@ -0,0 +1,1500 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "absent": {
3
+ "DEFAULT": "AE1 B S AH0 N T",
4
+ "VERB": "AH1 B S AE1 N T"
5
+ },
6
+ "abstract": {
7
+ "DEFAULT": "AE1 B S T R AE2 K T",
8
+ "VERB": "AE0 B S T R AE1 K T"
9
+ },
10
+ "abstracts": {
11
+ "DEFAULT": "AE1 B S T R AE0 K T S",
12
+ "VERB": "AE0 B S T R AE1 K T S"
13
+ },
14
+ "abuse": {
15
+ "DEFAULT": "AH0 B Y UW1 S",
16
+ "VERB": "AH0 B Y UW1 Z"
17
+ },
18
+ "abuses": {
19
+ "DEFAULT": "AH0 B Y UW1 S IH0 Z",
20
+ "VERB": "AH0 B Y UW1 Z IH0 Z"
21
+ },
22
+ "accent": {
23
+ "DEFAULT": "AE1 K S EH2 N T",
24
+ "VERB": "AH0 K S EH1 N T"
25
+ },
26
+ "accents": {
27
+ "DEFAULT": "AE1 K S EH0 N T S",
28
+ "VERB": "AE1 K S EH0 N T S"
29
+ },
30
+ "addict": {
31
+ "DEFAULT": "AE1 D IH2 K T",
32
+ "VERB": "AH0 D IH1 K T"
33
+ },
34
+ "addicts": {
35
+ "DEFAULT": "AE1 D IH2 K T S",
36
+ "VERB": "AH0 D IH1 K T S"
37
+ },
38
+ "advocate": {
39
+ "DEFAULT": "AE1 D V AH0 K AH0 T",
40
+ "VERB": "AE1 D V AH0 K EY2 T"
41
+ },
42
+ "advocates": {
43
+ "DEFAULT": "AE1 D V AH0 K AH0 T S",
44
+ "VERB": "AE1 D V AH0 K EY2 T S"
45
+ },
46
+ "affect": {
47
+ "DEFAULT": "AE1 F EH0 K T",
48
+ "VERB": "AH0 F EH1 K T"
49
+ },
50
+ "affects": {
51
+ "DEFAULT": "AE1 F EH0 K T S",
52
+ "VERB": "AH0 F EH1 K T S"
53
+ },
54
+ "affix": {
55
+ "DEFAULT": "AE1 F IH0 K S",
56
+ "VERB": "AH0 F IH1 K S"
57
+ },
58
+ "affixes": {
59
+ "DEFAULT": "AE1 F IH0 K S IH0 Z",
60
+ "VERB": "AH0 F IH1 K S IH0 Z"
61
+ },
62
+ "agglomerate": {
63
+ "DEFAULT": "AH0 G L AA1 M ER0 AH0 T",
64
+ "VERB": "AH0 G L AA1 M ER0 EY2 T"
65
+ },
66
+ "aggregate": {
67
+ "DEFAULT": "AE1 G R AH0 G AH0 T",
68
+ "VERB": "AE1 G R AH0 G EY0 T"
69
+ },
70
+ "aggregates": {
71
+ "DEFAULT": "AE1 G R AH0 G IH0 T S",
72
+ "VERB": "AE1 G R AH0 G EY2 T S"
73
+ },
74
+ "allies": {
75
+ "DEFAULT": "AE1 L AY0 Z",
76
+ "VERB": "AH0 L AY1 Z"
77
+ },
78
+ "alloy": {
79
+ "DEFAULT": "AE1 L OY2",
80
+ "VERB": "AH0 L OY1"
81
+ },
82
+ "alloys": {
83
+ "DEFAULT": "AE1 L OY2 Z",
84
+ "VERB": "AH0 L OY1 Z"
85
+ },
86
+ "ally": {
87
+ "DEFAULT": "AE1 L AY0",
88
+ "VERB": "AH0 L AY1"
89
+ },
90
+ "alternate": {
91
+ "DEFAULT": "AO0 L T ER1 N AH0 T",
92
+ "VERB": "AO1 L T ER0 N EY2 T"
93
+ },
94
+ "analyses": {
95
+ "DEFAULT": "AE1 N AH0 L AY0 Z IH2 Z",
96
+ "VERB": "AH0 N AE1 L IH0 S IY2 Z"
97
+ },
98
+ "animate": {
99
+ "DEFAULT": "AE1 N AH0 M AH0 T",
100
+ "VERB": "AE1 N AH0 M EY2 T"
101
+ },
102
+ "annex": {
103
+ "DEFAULT": "AE1 N EH2 K S",
104
+ "VERB": "AH0 N EH1 K S"
105
+ },
106
+ "annexes": {
107
+ "DEFAULT": "AE1 N EH2 K S IH0 Z",
108
+ "VERB": "AH0 N EH1 K S IH0 Z"
109
+ },
110
+ "appropriate": {
111
+ "DEFAULT": "AH0 P R OW1 P R IY0 AH0 T",
112
+ "VERB": "AH0 P R OW1 P R IY0 EY2 T"
113
+ },
114
+ "approximate": {
115
+ "DEFAULT": "AH0 P R AA1 K S AH0 M AH0 T",
116
+ "VERB": "AH0 P R AA1 K S AH0 M EY2 T"
117
+ },
118
+ "articulate": {
119
+ "DEFAULT": "AA0 R T IH1 K Y AH0 L EY2 T",
120
+ "VERB": "AA0 R T IH1 K Y AH0 L AH0 T"
121
+ },
122
+ "aspirate": {
123
+ "DEFAULT": "AE1 S P ER0 AH0 T",
124
+ "VERB": "AE1 S P ER0 EY2 T"
125
+ },
126
+ "aspirates": {
127
+ "DEFAULT": "AE1 S P ER0 AH0 T S",
128
+ "VERB": "AE1 S P ER0 EY2 T S"
129
+ },
130
+ "associate": {
131
+ "DEFAULT": "AH0 S OW1 S IY0 AH0 T",
132
+ "VERB": "AH0 S OW1 S IY0 EY2 T"
133
+ },
134
+ "associates": {
135
+ "DEFAULT": "AH0 S OW1 S IY0 AH0 T S",
136
+ "VERB": "AH0 S OW1 S IY0 EY2 T S"
137
+ },
138
+ "attribute": {
139
+ "DEFAULT": "AE1 T R IH0 B Y UW0 T",
140
+ "VERB": "AH0 T R IH1 B Y UW2 T"
141
+ },
142
+ "attributes": {
143
+ "DEFAULT": "AE1 T R IH0 B Y UW0 T S",
144
+ "VERB": "AH0 T R IH1 B Y UW2 T S"
145
+ },
146
+ "baths": {
147
+ "DEFAULT": "B AE1 DH Z",
148
+ "VERB": "B AE1 TH S"
149
+ },
150
+ "blessed": {
151
+ "DEFAULT": "B L EH1 S T",
152
+ "VERB": "B L EH1 S IH0 D"
153
+ },
154
+ "certificate": {
155
+ "DEFAULT": "S ER0 T IH1 F IH0 K EY2 T",
156
+ "VERB": "S ER0 T IH1 F IH0 K AH0 T"
157
+ },
158
+ "certificates": {
159
+ "DEFAULT": "S ER0 T IH1 F IH0 K AH0 T S",
160
+ "VERB": "S ER0 T IH1 F IH0 K EY2 T S"
161
+ },
162
+ "close": {
163
+ "DEFAULT": "K L OW1 S",
164
+ "VERB": "K L OW1 Z"
165
+ },
166
+ "closer": {
167
+ "DEFAULT": "K L OW1 S ER0",
168
+ "NOUN": "K L OW1 Z ER0"
169
+ },
170
+ "closes": {
171
+ "DEFAULT": "K L OW1 S IH0 Z",
172
+ "VERB": "K L OW1 Z IH0 Z"
173
+ },
174
+ "collect": {
175
+ "DEFAULT": "K AA1 L EH0 K T",
176
+ "VERB": "K AH0 L EH1 K T"
177
+ },
178
+ "collects": {
179
+ "DEFAULT": "K AA1 L EH0 K T S",
180
+ "VERB": "K AH0 L EH1 K T S"
181
+ },
182
+ "combat": {
183
+ "DEFAULT": "K AA1 M B AE0 T",
184
+ "VERB": "K AH0 M B AE1 T"
185
+ },
186
+ "combats": {
187
+ "DEFAULT": "K AH1 M B AE0 T S",
188
+ "VERB": "K AH0 M B AE1 T S"
189
+ },
190
+ "combine": {
191
+ "DEFAULT": "K AA1 M B AY0 N",
192
+ "VERB": "K AH0 M B AY1 N"
193
+ },
194
+ "commune": {
195
+ "DEFAULT": "K AA1 M Y UW0 N",
196
+ "VERB": "K AH0 M Y UW1 N"
197
+ },
198
+ "communes": {
199
+ "DEFAULT": "K AA1 M Y UW0 N Z",
200
+ "VERB": "K AH0 M Y UW1 N Z"
201
+ },
202
+ "compact": {
203
+ "DEFAULT": "K AA1 M P AE0 K T",
204
+ "VERB": "K AH0 M P AE1 K T"
205
+ },
206
+ "compacts": {
207
+ "DEFAULT": "K AA1 M P AE0 K T S",
208
+ "VERB": "K AH0 M P AE1 K T S"
209
+ },
210
+ "complex": {
211
+ "ADJ": "K AH0 M P L EH1 K S",
212
+ "DEFAULT": " K AA1 M P L EH0 K S"
213
+ },
214
+ "compliment": {
215
+ "DEFAULT": "K AA1 M P L AH0 M AH0 N T",
216
+ "VERB": "K AA1 M P L AH0 M EH0 N T"
217
+ },
218
+ "compliments": {
219
+ "DEFAULT": "K AA1 M P L AH0 M AH0 N T S",
220
+ "VERB": "K AA1 M P L AH0 M EH0 N T S"
221
+ },
222
+ "compound": {
223
+ "DEFAULT": "K AA1 M P AW0 N D",
224
+ "VERB": "K AH0 M P AW1 N D"
225
+ },
226
+ "compounds": {
227
+ "DEFAULT": "K AA1 M P AW0 N D Z",
228
+ "VERB": "K AH0 M P AW1 N D Z"
229
+ },
230
+ "compress": {
231
+ "DEFAULT": "K AA1 M P R EH0 S",
232
+ "VERB": "K AH0 M P R EH1 S"
233
+ },
234
+ "compresses": {
235
+ "DEFAULT": "K AA1 M P R EH0 S AH0 Z",
236
+ "VERB": "K AH0 M P R EH1 S IH0 Z"
237
+ },
238
+ "concert": {
239
+ "DEFAULT": "K AA1 N S ER0 T",
240
+ "VERB": "K AH0 N S ER1 T"
241
+ },
242
+ "concerts": {
243
+ "DEFAULT": "K AA1 N S ER0 T S",
244
+ "VERB": "K AH0 N S ER1 T S"
245
+ },
246
+ "conduct": {
247
+ "DEFAULT": "K AA1 N D AH0 K T",
248
+ "VERB": "K AA0 N D AH1 K T"
249
+ },
250
+ "confederate": {
251
+ "DEFAULT": "K AH0 N F EH1 D ER0 AH0 T",
252
+ "VERB": "K AH0 N F EH1 D ER0 EY2 T"
253
+ },
254
+ "confederates": {
255
+ "DEFAULT": "K AH0 N F EH1 D ER0 AH0 T S",
256
+ "VERB": "K AH0 N F EH1 D ER0 EY2 T S"
257
+ },
258
+ "confines": {
259
+ "DEFAULT": "K AA1 N F AY2 N Z",
260
+ "VERB": "K AH0 N F AY1 N Z"
261
+ },
262
+ "conflict": {
263
+ "DEFAULT": "K AA1 N F L IH0 K T",
264
+ "VERB": "K AH0 N F L IH1 K T"
265
+ },
266
+ "conflicts": {
267
+ "DEFAULT": "K AA1 N F L IH0 K T S",
268
+ "VERB": "K AH0 N F L IH1 K T S"
269
+ },
270
+ "conglomerate": {
271
+ "DEFAULT": "K AH0 N G L AA1 M ER0 AH0 T",
272
+ "VERB": "K AH0 N G L AA1 M ER0 EY2 T"
273
+ },
274
+ "conglomerates": {
275
+ "DEFAULT": "K AH0 N G L AA1 M ER0 AH0 T S",
276
+ "VERB": "K AH0 N G L AA1 M ER0 EY2 T S"
277
+ },
278
+ "conscript": {
279
+ "DEFAULT": "K AA1 N S K R IH0 P T",
280
+ "VERB": "K AH0 N S K R IH1 P T"
281
+ },
282
+ "conscripts": {
283
+ "DEFAULT": "K AA1 N S K R IH0 P T S",
284
+ "VERB": "K AH0 N S K R IH1 P T S"
285
+ },
286
+ "console": {
287
+ "DEFAULT": "K AA1 N S OW0 L",
288
+ "VERB": "K AH0 N S OW1 L"
289
+ },
290
+ "consoles": {
291
+ "DEFAULT": "K AA1 N S OW0 L Z",
292
+ "VERB": "K AH0 N S OW1 L Z"
293
+ },
294
+ "consort": {
295
+ "DEFAULT": "K AA1 N S AO0 R T",
296
+ "VERB": "K AH0 N S AO1 R T"
297
+ },
298
+ "construct": {
299
+ "DEFAULT": "K AA1 N S T R AH0 K T",
300
+ "VERB": "K AH0 N S T R AH1 K T"
301
+ },
302
+ "constructs": {
303
+ "DEFAULT": "K AA1 N S T R AH0 K T S",
304
+ "VERB": "K AH0 N S T R AH1 K T S"
305
+ },
306
+ "consummate": {
307
+ "DEFAULT": "K AA0 N S AH1 M AH0 T",
308
+ "VERB": "K AA1 N S AH0 M EY2 T"
309
+ },
310
+ "content": {
311
+ "DEFAULT": "K AH0 N T EH1 N T",
312
+ "NOUN": "K AA1 N T EH0 N T"
313
+ },
314
+ "contents": {
315
+ "DEFAULT": "K AA1 N T EH0 N T S",
316
+ "VERB": "K AH0 N T EH1 N T S"
317
+ },
318
+ "contest": {
319
+ "DEFAULT": "K AA1 N T EH0 S T",
320
+ "VERB": "K AH0 N T EH1 S T"
321
+ },
322
+ "contests": {
323
+ "DEFAULT": "K AA1 N T EH0 S T S",
324
+ "VERB": "K AH0 N T EH1 S T S"
325
+ },
326
+ "contract": {
327
+ "DEFAULT": "K AA1 N T R AE2 K T",
328
+ "VERB": "K AH0 N T R AE1 K T"
329
+ },
330
+ "contracts": {
331
+ "DEFAULT": "K AA1 N T R AE2 K T S",
332
+ "VERB": "K AH0 N T R AE1 K T S"
333
+ },
334
+ "contrast": {
335
+ "DEFAULT": "K AA1 N T R AE0 S T",
336
+ "VERB": "K AH0 N T R AE1 S T"
337
+ },
338
+ "contrasts": {
339
+ "DEFAULT": "K AA1 N T R AE0 S T S",
340
+ "VERB": "K AH0 N T R AE1 S T S"
341
+ },
342
+ "converse": {
343
+ "DEFAULT": "K AA1 N V ER0 S",
344
+ "VERB": "K AH0 N V ER1 S"
345
+ },
346
+ "convert": {
347
+ "DEFAULT": "K AA1 N V ER0 T",
348
+ "VERB": "K AH0 N V ER1 T"
349
+ },
350
+ "converts": {
351
+ "DEFAULT": "K AA1 N V ER0 T S",
352
+ "VERB": "K AH0 N V ER1 T S"
353
+ },
354
+ "convict": {
355
+ "DEFAULT": "K AA1 N V IH0 K T",
356
+ "VERB": "K AH0 N V IH1 K T"
357
+ },
358
+ "convicts": {
359
+ "DEFAULT": "K AA1 N V IH0 K T S",
360
+ "VERB": "K AH0 N V IH1 K T S"
361
+ },
362
+ "coordinate": {
363
+ "DEFAULT": "K OW0 AO1 R D AH0 N AH0 T",
364
+ "VERB": "K OW0 AO1 R D AH0 N EY2 T"
365
+ },
366
+ "coordinates": {
367
+ "DEFAULT": "K OW0 AO1 R D AH0 N AH0 T S",
368
+ "VERB": "K OW0 AO1 R D AH0 N EY2 T S"
369
+ },
370
+ "counterbalance": {
371
+ "DEFAULT": "K AW2 N T ER0 B AE1 L AH0 N S",
372
+ "VERB": "K AW1 N T ER0 B AE2 L AH0 N S"
373
+ },
374
+ "counterbalances": {
375
+ "DEFAULT": "K AW1 N T ER0 B AE2 L AH0 N S IH0 Z",
376
+ "VERB": "K AW2 N T ER0 B AE1 L AH0 N S IH0 Z"
377
+ },
378
+ "crabbed": {
379
+ "DEFAULT": "K R AE1 B IH0 D",
380
+ "VERB": "K R AE1 B D"
381
+ },
382
+ "crooked": {
383
+ "DEFAULT": "K R UH1 K AH0 D",
384
+ "VERB": "K R UH1 K T"
385
+ },
386
+ "curate": {
387
+ "DEFAULT": "K Y UH1 R AH0 T",
388
+ "VERB": "K Y UH0 R AH1 T"
389
+ },
390
+ "cursed": {
391
+ "DEFAULT": "K ER1 S IH0 D",
392
+ "VERB": "K ER1 S T"
393
+ },
394
+ "decoy": {
395
+ "DEFAULT": "D IY1 K OY0",
396
+ "VERB": "D IY0 K OY1"
397
+ },
398
+ "decoys": {
399
+ "DEFAULT": "D IY1 K OY0 Z",
400
+ "VERB": "D IY0 K OY1 Z"
401
+ },
402
+ "decrease": {
403
+ "DEFAULT": "D IY1 K R IY2 S",
404
+ "VERB": "D IH0 K R IY1 S"
405
+ },
406
+ "decreases": {
407
+ "DEFAULT": "D IY1 K R IY2 S IH0 Z",
408
+ "VERB": "D IH0 K R IY1 S IH0 Z"
409
+ },
410
+ "defect": {
411
+ "DEFAULT": "D IY1 F EH0 K T",
412
+ "VERB": "D IH0 F EH1 K T"
413
+ },
414
+ "defects": {
415
+ "DEFAULT": "D IY1 F EH0 K T S",
416
+ "VERB": "D IH0 F EH1 K T S"
417
+ },
418
+ "degenerate": {
419
+ "DEFAULT": "D IH0 JH EH1 N ER0 AH0 T",
420
+ "VERB": "D IH0 JH EH1 N ER0 EY2 T"
421
+ },
422
+ "degenerates": {
423
+ "DEFAULT": "D IH0 JH EH1 N ER0 AH0 T S",
424
+ "VERB": "D IH0 JH EH1 N ER0 EY2 T S"
425
+ },
426
+ "delegate": {
427
+ "DEFAULT": "D EH1 L AH0 G AH0 T",
428
+ "VERB": "D EH1 L AH0 G EY2 T"
429
+ },
430
+ "delegates": {
431
+ "DEFAULT": "D EH1 L AH0 G AH0 T S",
432
+ "VERB": "D EH1 L AH0 G EY2 T S"
433
+ },
434
+ "deliberate": {
435
+ "DEFAULT": "D IH0 L IH1 B ER0 AH0 T",
436
+ "VERB": "D IH0 L IH1 B ER0 EY2 T"
437
+ },
438
+ "desert": {
439
+ "DEFAULT": "D EH1 Z ER0 T",
440
+ "VERB": "D IH0 Z ER1 T"
441
+ },
442
+ "deserts": {
443
+ "DEFAULT": "D EH1 Z ER0 T S",
444
+ "VERB": "D IH0 Z ER1 T S"
445
+ },
446
+ "desolate": {
447
+ "DEFAULT": "D EH1 S AH0 L AH0 T",
448
+ "VERB": "D EH1 S AH0 L EY2 T"
449
+ },
450
+ "diagnoses": {
451
+ "DEFAULT": "D AY2 AH0 G N OW1 S IY0 Z",
452
+ "VERB": "D AY1 AH0 G N OW2 Z IY0 Z"
453
+ },
454
+ "dictate": {
455
+ "DEFAULT": "D IH1 K T EY2 T",
456
+ "VERB": "D IH0 K T EY1 T"
457
+ },
458
+ "dictates": {
459
+ "DEFAULT": "D IH1 K T EY2 T S",
460
+ "VERB": "D IH0 K T EY1 T S"
461
+ },
462
+ "diffuse": {
463
+ "DEFAULT": "D IH0 F Y UW1 S",
464
+ "VERB": "D IH0 F Y UW1 Z"
465
+ },
466
+ "digest": {
467
+ "DEFAULT": "D AY1 JH EH0 S T",
468
+ "VERB": "D AY0 JH EH1 S T"
469
+ },
470
+ "digests": {
471
+ "DEFAULT": "D AY1 JH EH0 S T S",
472
+ "VERB": "D AY2 JH EH1 S T S"
473
+ },
474
+ "discard": {
475
+ "DEFAULT": "D IH1 S K AA0 R D",
476
+ "VERB": "D IH0 S K AA1 R D"
477
+ },
478
+ "discards": {
479
+ "DEFAULT": "D IH1 S K AA0 R D Z",
480
+ "VERB": "D IH0 S K AA1 R D Z"
481
+ },
482
+ "discharge": {
483
+ "DEFAULT": "D IH1 S CH AA2 R JH",
484
+ "VERB": "D IH0 S CH AA1 R JH"
485
+ },
486
+ "discharges": {
487
+ "DEFAULT": "D IH1 S CH AA2 R JH AH0 Z",
488
+ "VERB": "D IH0 S CH AA1 R JH AH0 Z"
489
+ },
490
+ "discount": {
491
+ "DEFAULT": "D IH1 S K AW0 N T",
492
+ "VERB": "D IH0 S K AW1 N T"
493
+ },
494
+ "discounts": {
495
+ "DEFAULT": "D IH1 S K AW2 N T S",
496
+ "VERB": "D IH0 S K AW1 N T S"
497
+ },
498
+ "discourse": {
499
+ "DEFAULT": "D IH1 S K AO0 R S",
500
+ "VERB": "D IH0 S K AO1 R S"
501
+ },
502
+ "discourses": {
503
+ "DEFAULT": "D IH1 S K AO0 R S IH0 Z",
504
+ "VERB": "D IH0 S K AO1 R S IH0 Z"
505
+ },
506
+ "document": {
507
+ "DEFAULT": "D AA1 K Y AH0 M AH0 N T",
508
+ "VERB": "D AA1 K Y UW0 M EH0 N T"
509
+ },
510
+ "documents": {
511
+ "DEFAULT": "D AA1 K Y AH0 M AH0 N T S",
512
+ "VERB": "D AA1 K Y UW0 M EH0 N T S"
513
+ },
514
+ "dogged": {
515
+ "DEFAULT": "D AO1 G D",
516
+ "VERB": "D AO1 G IH0 D"
517
+ },
518
+ "duplicate": {
519
+ "DEFAULT": "D UW1 P L AH0 K AH0 T",
520
+ "VERB": "D UW1 P L AH0 K EY2 T"
521
+ },
522
+ "duplicates": {
523
+ "DEFAULT": "D UW1 P L AH0 K AH0 T S",
524
+ "VERB": "D UW1 P L AH0 K EY2 T S"
525
+ },
526
+ "ejaculate": {
527
+ "DEFAULT": "IH0 JH AE1 K Y UW0 L AH0 T",
528
+ "VERB": "IH0 JH AE1 K Y UW0 L EY2 T"
529
+ },
530
+ "ejaculates": {
531
+ "DEFAULT": "IH0 JH AE1 K Y UW0 L AH0 T S",
532
+ "VERB": "IH0 JH AE1 K Y UW0 L EY2 T S"
533
+ },
534
+ "elaborate": {
535
+ "DEFAULT": "IH0 L AE1 B R AH0 T",
536
+ "VERB": "IH0 L AE1 B ER0 EY2 T"
537
+ },
538
+ "entrance": {
539
+ "DEFAULT": "EH1 N T R AH0 N S",
540
+ "VERB": "IH0 N T R AH1 N S"
541
+ },
542
+ "entrances": {
543
+ "DEFAULT": "EH1 N T R AH0 N S AH0 Z",
544
+ "VERB": "IH0 N T R AH1 N S AH0 Z"
545
+ },
546
+ "envelope": {
547
+ "DEFAULT": "EH1 N V AH0 L OW2 P",
548
+ "VERB": "IH0 N V EH1 L AH0 P"
549
+ },
550
+ "envelopes": {
551
+ "DEFAULT": "EH1 N V AH0 L OW2 P S",
552
+ "VERB": "IH0 N V EH1 L AH0 P S"
553
+ },
554
+ "escort": {
555
+ "DEFAULT": "EH1 S K AO0 R T",
556
+ "VERB": "EH0 S K AO1 R T"
557
+ },
558
+ "escorts": {
559
+ "DEFAULT": "EH1 S K AO0 R T S",
560
+ "VERB": "EH0 S K AO1 R T S"
561
+ },
562
+ "essay": {
563
+ "DEFAULT": "EH1 S EY2",
564
+ "VERB": "EH0 S EY1"
565
+ },
566
+ "essays": {
567
+ "DEFAULT": "EH1 S EY2 Z",
568
+ "VERB": "EH0 S EY1 Z"
569
+ },
570
+ "estimate": {
571
+ "DEFAULT": "EH1 S T AH0 M AH0 T",
572
+ "VERB": "EH1 S T AH0 M EY2 T"
573
+ },
574
+ "estimates": {
575
+ "DEFAULT": "EH1 S T AH0 M AH0 T S",
576
+ "VERB": "EH1 S T AH0 M EY2 T S"
577
+ },
578
+ "excess": {
579
+ "DEFAULT": "EH1 K S EH2 S",
580
+ "VERB": "IH0 K S EH1 S"
581
+ },
582
+ "excise": {
583
+ "DEFAULT": "EH1 K S AY0 Z",
584
+ "VERB": "EH0 K S AY1 S"
585
+ },
586
+ "excuse": {
587
+ "DEFAULT": "IH0 K S K Y UW1 S",
588
+ "VERB": "IH0 K S K Y UW1 Z"
589
+ },
590
+ "excuses": {
591
+ "DEFAULT": "IH0 K S K Y UW1 S IH0 Z",
592
+ "VERB": "IH0 K S K Y UW1 Z IH0 Z"
593
+ },
594
+ "expatriate": {
595
+ "DEFAULT": "EH0 K S P EY1 T R IY0 AH0 T",
596
+ "VERB": "EH0 K S P EY1 T R IY0 EY2 T"
597
+ },
598
+ "expatriates": {
599
+ "DEFAULT": "EH0 K S P EY1 T R IY0 AH0 T S",
600
+ "VERB": "EH0 K S P EY1 T R IY0 EY2 T S"
601
+ },
602
+ "exploit": {
603
+ "DEFAULT": "EH2 K S P L OY1 T",
604
+ "VERB": "EH1 K S P L OY2 T"
605
+ },
606
+ "exploits": {
607
+ "DEFAULT": "EH2 K S P L OY1 T S",
608
+ "VERB": "EH1 K S P L OY2 T S"
609
+ },
610
+ "export": {
611
+ "DEFAULT": "EH1 K S P AO0 R T",
612
+ "VERB": "IH0 K S P AO1 R T"
613
+ },
614
+ "exports": {
615
+ "DEFAULT": "EH1 K S P AO0 R T S",
616
+ "VERB": "IH0 K S P AO1 R T S"
617
+ },
618
+ "extract": {
619
+ "DEFAULT": "EH1 K S T R AE2 K T",
620
+ "VERB": "IH0 K S T R AE1 K T"
621
+ },
622
+ "extracts": {
623
+ "DEFAULT": "EH1 K S T R AE2 K T S",
624
+ "VERB": "IH0 K S T R AE1 K T S"
625
+ },
626
+ "ferment": {
627
+ "DEFAULT": "F ER1 M EH0 N T",
628
+ "VERB": "F ER0 M EH1 N T"
629
+ },
630
+ "ferments": {
631
+ "DEFAULT": "F ER1 M EH0 N T S",
632
+ "VERB": "F ER0 M EH1 N T S"
633
+ },
634
+ "fragment": {
635
+ "DEFAULT": "F R AE0 G M EH1 N T",
636
+ "VERB": "F R AE1 G M AH0 N T"
637
+ },
638
+ "fragments": {
639
+ "DEFAULT": "F R AE1 G M AH0 N T S",
640
+ "VERB": "F R AE0 G M EH1 N T S"
641
+ },
642
+ "frequent": {
643
+ "DEFAULT": "F R IY1 K W AH0 N T",
644
+ "VERB": "F R IY1 K W EH2 N T"
645
+ },
646
+ "graduate": {
647
+ "DEFAULT": "G R AE1 JH AH0 W AH0 T",
648
+ "VERB": "G R AE1 JH AH0 W EY2 T"
649
+ },
650
+ "graduates": {
651
+ "DEFAULT": "G R AE1 JH AH0 W AH0 T S",
652
+ "VERB": "G R AE1 JH AH0 W EY2 T S"
653
+ },
654
+ "house": {
655
+ "DEFAULT": "HH AW1 S",
656
+ "VERB": "HH AW1 Z"
657
+ },
658
+ "impact": {
659
+ "DEFAULT": "IH1 M P AE0 K T",
660
+ "VERB": "IH2 M P AE1 K T"
661
+ },
662
+ "impacts": {
663
+ "DEFAULT": "IH1 M P AE0 K T S",
664
+ "VERB": "IH2 M P AE1 K T S"
665
+ },
666
+ "implant": {
667
+ "DEFAULT": "IH1 M P L AE2 N T",
668
+ "VERB": "IH2 M P L AE1 N T"
669
+ },
670
+ "implants": {
671
+ "DEFAULT": "IH1 M P L AE2 N T S",
672
+ "VERB": "IH2 M P L AE1 N T S"
673
+ },
674
+ "implement": {
675
+ "DEFAULT": "IH1 M P L AH0 M AH0 N T",
676
+ "VERB": "IH1 M P L AH0 M EH0 N T"
677
+ },
678
+ "implements": {
679
+ "DEFAULT": "IH1 M P L AH0 M AH0 N T S",
680
+ "VERB": "IH1 M P L AH0 M EH0 N T S"
681
+ },
682
+ "import": {
683
+ "DEFAULT": "IH1 M P AO2 R T",
684
+ "VERB": "IH2 M P AO1 R T"
685
+ },
686
+ "imports": {
687
+ "DEFAULT": "IH1 M P AO2 R T S",
688
+ "VERB": "IH2 M P AO1 R T S"
689
+ },
690
+ "impress": {
691
+ "DEFAULT": "IH1 M P R EH0 S",
692
+ "VERB": "IH0 M P R EH1 S"
693
+ },
694
+ "imprint": {
695
+ "DEFAULT": "IH2 M P R IH1 N T",
696
+ "VERB": "IH1 M P R IH0 N T"
697
+ },
698
+ "imprints": {
699
+ "DEFAULT": "IH1 M P R IH0 N T S",
700
+ "VERB": "IH2 M P R IH1 N T S"
701
+ },
702
+ "incense": {
703
+ "DEFAULT": "IH1 N S EH2 N S",
704
+ "VERB": "IH2 N S EH1 N S"
705
+ },
706
+ "incline": {
707
+ "DEFAULT": "IH1 N K L AY0 N",
708
+ "VERB": "IH2 N K L AY1 N"
709
+ },
710
+ "inclines": {
711
+ "DEFAULT": "IH1 N K L AY0 N Z",
712
+ "VERB": "IH2 N K L AY1 N Z"
713
+ },
714
+ "incorporate": {
715
+ "DEFAULT": "IH2 N K AO1 R P ER0 AH0 T",
716
+ "VERB": "IH2 N K AO1 R P ER0 EY2 T"
717
+ },
718
+ "increase": {
719
+ "DEFAULT": "IH1 N K R IY2 S",
720
+ "VERB": "IH2 N K R IY1 S"
721
+ },
722
+ "increases": {
723
+ "DEFAULT": "IH1 N K R IY2 S IH0 Z",
724
+ "VERB": "IH2 N K R IY1 S IH0 Z"
725
+ },
726
+ "indent": {
727
+ "DEFAULT": "IH1 N D EH0 N T",
728
+ "VERB": "IH2 N D EH1 N T"
729
+ },
730
+ "indents": {
731
+ "DEFAULT": "IH1 N D EH0 N T S",
732
+ "VERB": "IH2 N D EH1 N T S"
733
+ },
734
+ "inebriate": {
735
+ "DEFAULT": "IH2 N EH1 B R IY0 AH0 T",
736
+ "VERB": "IH2 N EH1 B R IY0 EY2 T"
737
+ },
738
+ "inebriates": {
739
+ "DEFAULT": "IH2 N EH1 B R IY0 AH0 T S",
740
+ "VERB": "IH2 N EH1 B R IY0 EY2 T S"
741
+ },
742
+ "initiate": {
743
+ "DEFAULT": "IH2 N IH1 SH IY0 AH0 T",
744
+ "VERB": "IH2 N IH1 SH IY0 EY2 T"
745
+ },
746
+ "initiates": {
747
+ "DEFAULT": "IH2 N IH1 SH IY0 AH0 T S",
748
+ "VERB": "IH2 N IH1 SH IY0 EY2 T S"
749
+ },
750
+ "inlay": {
751
+ "DEFAULT": "IH1 N L EY2",
752
+ "VERB": "IH2 N L EY1"
753
+ },
754
+ "inlays": {
755
+ "DEFAULT": "IH1 N L EY2 Z",
756
+ "VERB": "IH2 N L EY1 Z"
757
+ },
758
+ "insert": {
759
+ "DEFAULT": "IH1 N S ER2 T",
760
+ "VERB": "IH2 N S ER1 T"
761
+ },
762
+ "inserts": {
763
+ "DEFAULT": "IH1 N S ER2 T S",
764
+ "VERB": "IH2 N S ER1 T S"
765
+ },
766
+ "inset": {
767
+ "DEFAULT": "IH1 N S EH2 T",
768
+ "VERB": "IH2 N S EH1 T"
769
+ },
770
+ "insets": {
771
+ "DEFAULT": "IH1 N S EH2 T S",
772
+ "VERB": "IH2 N S EH1 T S"
773
+ },
774
+ "instinct": {
775
+ "DEFAULT": "IH1 N S T IH0 NG K T",
776
+ "VERB": "IH2 N S T IH1 NG K T"
777
+ },
778
+ "insult": {
779
+ "DEFAULT": "IH1 N S AH2 L T",
780
+ "VERB": "IH2 N S AH1 L T"
781
+ },
782
+ "insults": {
783
+ "DEFAULT": "IH1 N S AH2 L T S",
784
+ "VERB": "IH2 N S AH1 L T S"
785
+ },
786
+ "interchange": {
787
+ "DEFAULT": "IH1 N T ER0 CH EY2 N JH",
788
+ "VERB": "IH2 T ER0 CH EY1 N JH"
789
+ },
790
+ "interchanges": {
791
+ "DEFAULT": "IH1 N T ER0 CH EY2 N JH IH0 Z",
792
+ "VERB": "IH2 T ER0 CH EY1 N JH IH0 Z"
793
+ },
794
+ "interdict": {
795
+ "DEFAULT": "IH1 N T ER0 D IH2 K T",
796
+ "VERB": "IH2 N T ER0 D IH1 K T"
797
+ },
798
+ "interdicts": {
799
+ "DEFAULT": "IH1 N T ER0 D IH2 K T S",
800
+ "VERB": "IH2 N T ER0 D IH1 K T S"
801
+ },
802
+ "intern": {
803
+ "DEFAULT": "IH1 N T ER0 N",
804
+ "VERB": "IH0 N T ER1 N"
805
+ },
806
+ "interns": {
807
+ "DEFAULT": "IH1 N T ER0 N Z",
808
+ "VERB": "IH0 N T ER1 N Z"
809
+ },
810
+ "intimate": {
811
+ "DEFAULT": "IH1 N T AH0 M AH0 T",
812
+ "VERB": "IH1 N T IH0 M EY2 T"
813
+ },
814
+ "intimates": {
815
+ "DEFAULT": "IH1 N T AH0 M AH0 T S",
816
+ "VERB": "IH1 N T IH0 M EY2 T S"
817
+ },
818
+ "intrigue": {
819
+ "DEFAULT": "IH1 N T R IY0 G",
820
+ "VERB": "IH2 N T R IY1 G"
821
+ },
822
+ "introvert": {
823
+ "DEFAULT": "IH1 N T R AO0 V ER2 T",
824
+ "VERB": "IH2 N T R AO0 V ER1 T"
825
+ },
826
+ "introverts": {
827
+ "DEFAULT": "IH1 N T R AO0 V ER2 T S",
828
+ "VERB": "IH2 N T R AO0 V ER1 T S"
829
+ },
830
+ "inverse": {
831
+ "DEFAULT": "IH2 N V ER1 S",
832
+ "VERB": "IH1 N V ER0 S"
833
+ },
834
+ "invite": {
835
+ "DEFAULT": "IH1 N V AY0 T",
836
+ "VERB": "IH2 N V AY1 T"
837
+ },
838
+ "invites": {
839
+ "DEFAULT": "IH1 N V AY0 T S",
840
+ "VERB": "IH2 N V AY1 T S"
841
+ },
842
+ "jagged": {
843
+ "DEFAULT": "JH AE1 G IH0 D",
844
+ "VERB": "JH AE1 G D"
845
+ },
846
+ "learned": {
847
+ "DEFAULT": "L ER1 N D",
848
+ "VERB": "L ER1 N IH0 D"
849
+ },
850
+ "legitimate": {
851
+ "DEFAULT": "L AH0 JH IH1 T AH0 M AH0 T",
852
+ "VERB": "L AH0 JH IH1 T AH0 M EY2 T"
853
+ },
854
+ "live": {
855
+ "DEFAULT": "L AY1 V",
856
+ "VERB": "L IH1 V"
857
+ },
858
+ "lives": {
859
+ "DEFAULT": "L AY1 V Z",
860
+ "VERB": "L IH1 V Z"
861
+ },
862
+ "mandate": {
863
+ "DEFAULT": "M AE2 N D EY1 T",
864
+ "VERB": "M AE1 N D EY2 T"
865
+ },
866
+ "misconduct": {
867
+ "DEFAULT": "M IH2 S K AA0 N D AH1 K T",
868
+ "VERB": "M IH2 S K AA1 N D AH0 K T"
869
+ },
870
+ "misprint": {
871
+ "DEFAULT": "M IH1 S P R IH0 N T",
872
+ "VERB": "M IH2 S P R IH1 N T"
873
+ },
874
+ "misprints": {
875
+ "DEFAULT": "M IH1 S P R IH0 N T S",
876
+ "VERB": "M IH2 S P R IH1 N T S"
877
+ },
878
+ "misuse": {
879
+ "DEFAULT": "M IH0 S Y UW1 Z",
880
+ "VERB": "M IH0 S Y UW1 S"
881
+ },
882
+ "misuses": {
883
+ "DEFAULT": "M IH0 S Y UW1 S IH0 Z",
884
+ "VERB": "M IH0 S Y UW1 Z IH0 Z"
885
+ },
886
+ "moderate": {
887
+ "DEFAULT": "M AA1 D ER0 AH0 T",
888
+ "VERB": "M AA1 D ER0 EY2 T"
889
+ },
890
+ "moderates": {
891
+ "DEFAULT": "M AA1 D ER0 AH0 T S",
892
+ "VERB": "M AA1 D ER0 EY2 T S"
893
+ },
894
+ "mouth": {
895
+ "DEFAULT": "M AW1 DH",
896
+ "VERB": "M AW1 TH"
897
+ },
898
+ "mouths": {
899
+ "DEFAULT": "M AW1 TH S",
900
+ "VERB": "M AW1 DH Z"
901
+ },
902
+ "object": {
903
+ "DEFAULT": "AA1 B JH EH0 K T",
904
+ "VERB": "AH0 B JH EH1 K T"
905
+ },
906
+ "objects": {
907
+ "DEFAULT": "AA1 B JH EH0 K T S",
908
+ "VERB": "AH0 B JH EH1 K T S"
909
+ },
910
+ "ornament": {
911
+ "DEFAULT": "AO1 R N AH0 M AH0 N T",
912
+ "VERB": "AO1 R N AH0 M EH0 N T"
913
+ },
914
+ "ornaments": {
915
+ "DEFAULT": "AO1 R N AH0 M AH0 N T S",
916
+ "VERB": "AO1 R N AH0 M EH0 N T S"
917
+ },
918
+ "overcharge": {
919
+ "DEFAULT": "OW1 V ER0 CH AA2 R JH",
920
+ "VERB": "OW2 V ER0 CH AA1 R JH"
921
+ },
922
+ "overcharges": {
923
+ "DEFAULT": "OW1 V ER0 CH AA2 R JH IH0 Z",
924
+ "VERB": "OW2 V ER0 CH AA1 R JH IH0 Z"
925
+ },
926
+ "overflow": {
927
+ "DEFAULT": "OW1 V ER0 F L OW2",
928
+ "VERB": "OW2 V ER0 F L OW1"
929
+ },
930
+ "overflows": {
931
+ "DEFAULT": "OW1 V ER0 F L OW2 Z",
932
+ "VERB": "OW2 V ER0 F L OW1 Z"
933
+ },
934
+ "overhang": {
935
+ "DEFAULT": "OW1 V ER0 HH AE2 NG",
936
+ "VERB": "OW2 V ER0 HH AE1 NG"
937
+ },
938
+ "overhangs": {
939
+ "DEFAULT": "OW1 V ER0 HH AE2 NG Z",
940
+ "VERB": "OW2 V ER0 HH AE1 NG Z"
941
+ },
942
+ "overhaul": {
943
+ "DEFAULT": "OW1 V ER0 HH AO2 L",
944
+ "VERB": "OW2 V ER0 HH AO1 L"
945
+ },
946
+ "overhauls": {
947
+ "DEFAULT": "OW1 V ER0 HH AO2 L Z",
948
+ "VERB": "OW2 V ER0 HH AO1 L Z"
949
+ },
950
+ "overlap": {
951
+ "DEFAULT": "OW1 V ER0 L AE2 P",
952
+ "VERB": "OW2 V ER0 L AE1 P"
953
+ },
954
+ "overlaps": {
955
+ "DEFAULT": "OW1 V ER0 L AE2 P S",
956
+ "VERB": "OW2 V ER0 L AE1 P S"
957
+ },
958
+ "overlay": {
959
+ "DEFAULT": "OW1 V ER0 L EY2",
960
+ "VERB": "OW2 V ER0 L EY1"
961
+ },
962
+ "overlays": {
963
+ "DEFAULT": "OW1 V ER0 L EY2 Z",
964
+ "VERB": "OW2 V ER0 L EY1 Z"
965
+ },
966
+ "overwork": {
967
+ "DEFAULT": "OW1 V ER0 W ER2 K",
968
+ "VERB": "OW2 V ER0 W ER1 K"
969
+ },
970
+ "perfect": {
971
+ "DEFAULT": "P ER1 F IH2 K T",
972
+ "VERB": "P ER0 F EH1 K T"
973
+ },
974
+ "perfume": {
975
+ "DEFAULT": "P ER1 F Y UW0 M",
976
+ "VERB": "P ER0 F Y UW1 M"
977
+ },
978
+ "perfumes": {
979
+ "DEFAULT": "P ER1 F Y UW0 M Z",
980
+ "VERB": "P ER0 F Y UW1 M Z"
981
+ },
982
+ "permit": {
983
+ "DEFAULT": "P ER1 M IH2 T",
984
+ "VERB": "P ER0 M IH1 T"
985
+ },
986
+ "permits": {
987
+ "DEFAULT": "P ER1 M IH2 T S",
988
+ "VERB": "P ER0 M IH1 T S"
989
+ },
990
+ "pervert": {
991
+ "DEFAULT": "P ER1 V ER0 T",
992
+ "VERB": "P ER0 V ER1 T"
993
+ },
994
+ "perverts": {
995
+ "DEFAULT": "P ER1 V ER0 T S",
996
+ "VERB": "P ER0 V ER1 T S"
997
+ },
998
+ "pontificate": {
999
+ "DEFAULT": "P AA0 N T IH1 F AH0 K EY2 T",
1000
+ "VERB": "P AA0 N T IH1 F AH0 K AH0 T"
1001
+ },
1002
+ "pontificates": {
1003
+ "DEFAULT": "P AA0 N T IH1 F AH0 K AH0 T S",
1004
+ "VERB": "P AA0 N T IH1 F AH0 K EY2 T S"
1005
+ },
1006
+ "precipitate": {
1007
+ "DEFAULT": "P R IH0 S IH1 P IH0 T EY2 T",
1008
+ "VERB": "P R IH0 S IH1 P IH0 T AH0 T"
1009
+ },
1010
+ "predicate": {
1011
+ "DEFAULT": "P R EH1 D AH0 K EY2 T",
1012
+ "VERB": "P R EH1 D IH0 K AH0 T"
1013
+ },
1014
+ "predicates": {
1015
+ "DEFAULT": "P R EH1 D IH0 K AH0 T S",
1016
+ "VERB": "P R EH1 D AH0 K EY2 T S"
1017
+ },
1018
+ "prefix": {
1019
+ "DEFAULT": "P R IY1 F IH0 K S",
1020
+ "VERB": "P R IY2 F IH1 K S"
1021
+ },
1022
+ "prefixes": {
1023
+ "DEFAULT": "P R IY1 F IH0 K S IH0 JH",
1024
+ "VERB": "P R IY2 F IH1 K S IH0 JH"
1025
+ },
1026
+ "presage": {
1027
+ "DEFAULT": "P R EH1 S IH0 JH",
1028
+ "VERB": "P R EH2 S IH1 JH"
1029
+ },
1030
+ "presages": {
1031
+ "DEFAULT": "P R EH1 S IH0 JH IH0 JH",
1032
+ "VERB": "P R EH2 S IH1 JH IH0 JH"
1033
+ },
1034
+ "present": {
1035
+ "DEFAULT": "P R EH1 Z AH0 N T",
1036
+ "VERB": "P R IY0 Z EH1 N T"
1037
+ },
1038
+ "presents": {
1039
+ "DEFAULT": "P R EH1 Z AH0 N T S",
1040
+ "VERB": "P R IY0 Z EH1 N T S"
1041
+ },
1042
+ "proceeds": {
1043
+ "DEFAULT": "P R OW1 S IY0 D Z",
1044
+ "VERB": "P R AH0 S IY1 D Z"
1045
+ },
1046
+ "process": {
1047
+ "DEFAULT": "P R AA1 S EH2 S",
1048
+ "VERB": "P R AO2 S EH1 S"
1049
+ },
1050
+ "processes": {
1051
+ "DEFAULT": "P R AO2 S EH1 S AH0 Z",
1052
+ "VERB": "P R AA1 S EH0 S AH0 Z"
1053
+ },
1054
+ "processing": {
1055
+ "DEFAULT": "P R AA1 S EH0 S IH0 NG",
1056
+ "VERB": "P R AA0 S EH1 S IH0 NG"
1057
+ },
1058
+ "produce": {
1059
+ "DEFAULT": "P R OW1 D UW0 S",
1060
+ "VERB": "P R AH0 D UW1 S"
1061
+ },
1062
+ "progress": {
1063
+ "DEFAULT": "P R AA1 G R EH2 S",
1064
+ "VERB": "P R AH0 G R EH1 S"
1065
+ },
1066
+ "progresses": {
1067
+ "DEFAULT": "P R AA1 G R EH2 S AH0 Z",
1068
+ "VERB": "P R OW0 G R EH1 S AH0 Z"
1069
+ },
1070
+ "project": {
1071
+ "DEFAULT": "P R AA1 JH EH0 K T",
1072
+ "VERB": "P R AA0 JH EH1 K T"
1073
+ },
1074
+ "projects": {
1075
+ "DEFAULT": "P R AA1 JH EH0 K T S",
1076
+ "VERB": "P R AA0 JH EH1 K T S"
1077
+ },
1078
+ "prospect": {
1079
+ "DEFAULT": "P R AA1 S P EH0 K T",
1080
+ "VERB": "P R AH2 S P EH1 K T"
1081
+ },
1082
+ "prospects": {
1083
+ "DEFAULT": "P R AA1 S P EH0 K T S",
1084
+ "VERB": "P R AH2 S P EH1 K T S"
1085
+ },
1086
+ "prostrate": {
1087
+ "DEFAULT": "P R AA1 S T R EY0 T",
1088
+ "VERB": "P R AA0 S T R EY1 T"
1089
+ },
1090
+ "protest": {
1091
+ "DEFAULT": "P R OW1 T EH2 S T",
1092
+ "VERB": "P R AH0 T EH1 S T"
1093
+ },
1094
+ "protests": {
1095
+ "DEFAULT": "P R OW1 T EH2 S T S",
1096
+ "VERB": "P R AH0 T EH1 S T S"
1097
+ },
1098
+ "purport": {
1099
+ "DEFAULT": "P ER1 P AO2 R T",
1100
+ "VERB": "P ER0 P AO1 R T"
1101
+ },
1102
+ "quadruple": {
1103
+ "DEFAULT": "K W AA0 D R UW1 P AH0 L",
1104
+ "VERB": "K W AA1 D R UW0 P AH0 L"
1105
+ },
1106
+ "quadruples": {
1107
+ "DEFAULT": "K W AA1 D R UW0 P AH0 L Z",
1108
+ "VERB": "K W AA0 D R UW1 P AH0 L Z"
1109
+ },
1110
+ "ragged": {
1111
+ "DEFAULT": "R AE1 G AH0 D",
1112
+ "VERB": "R AE1 G D"
1113
+ },
1114
+ "rampage": {
1115
+ "DEFAULT": "R AE1 M P EY2 JH",
1116
+ "VERB": "R AE2 M P EY1 JH"
1117
+ },
1118
+ "rampages": {
1119
+ "DEFAULT": "R AE1 M P EY2 JH IH0 Z",
1120
+ "VERB": "R AE2 M P EY1 JH IH0 Z"
1121
+ },
1122
+ "read": {
1123
+ "DEFAULT": "R IY1 D",
1124
+ "VBD": "R EH1 D",
1125
+ "VBN": "R EH1 D",
1126
+ "VBP": "R EH1 D"
1127
+ },
1128
+ "rebel": {
1129
+ "DEFAULT": "R IH0 B EH1 L",
1130
+ "VERB": "R EH1 B AH0 L"
1131
+ },
1132
+ "rebels": {
1133
+ "DEFAULT": "R EH1 B AH0 L Z",
1134
+ "VERB": "R IH0 B EH1 L Z"
1135
+ },
1136
+ "rebound": {
1137
+ "DEFAULT": "R IY1 B AW0 N D",
1138
+ "VERB": "R IY0 B AW1 N D"
1139
+ },
1140
+ "rebounds": {
1141
+ "DEFAULT": "R IY1 B AW0 N D Z",
1142
+ "VERB": "R IY0 B AW1 N D Z"
1143
+ },
1144
+ "recall": {
1145
+ "DEFAULT": "R IY1 K AO2 L",
1146
+ "VERB": "R IH0 K AO1 L"
1147
+ },
1148
+ "recalls": {
1149
+ "DEFAULT": "R IY1 K AO2 L Z",
1150
+ "VERB": "R IH0 K AO1 L Z"
1151
+ },
1152
+ "recap": {
1153
+ "DEFAULT": "R IY1 K AE2 P",
1154
+ "VERB": "R IH0 K AE1 P"
1155
+ },
1156
+ "recapped": {
1157
+ "DEFAULT": "R IY1 K AE2 P T",
1158
+ "VERB": "R IH0 K AE1 P T"
1159
+ },
1160
+ "recapping": {
1161
+ "DEFAULT": "R IY1 K AE2 P IH0 NG",
1162
+ "VERB": "R IH0 K AE1 P IH0 NG"
1163
+ },
1164
+ "recaps": {
1165
+ "DEFAULT": "R IY1 K AE2 P S",
1166
+ "VERB": "R IH0 K AE1 P S"
1167
+ },
1168
+ "record": {
1169
+ "DEFAULT": "R EH1 K ER0 D",
1170
+ "VERB": "R IH0 K AO1 R D"
1171
+ },
1172
+ "records": {
1173
+ "DEFAULT": "R EH1 K ER0 D Z",
1174
+ "VERB": "R IH0 K AO1 R D Z"
1175
+ },
1176
+ "recount": {
1177
+ "DEFAULT": " R IH1 K AW0 N T",
1178
+ "VERB": "R IY2 K AW1 N T"
1179
+ },
1180
+ "recounts": {
1181
+ "DEFAULT": " R IH1 K AW0 N T S",
1182
+ "VERB": "R IY2 K AW1 N T S"
1183
+ },
1184
+ "refill": {
1185
+ "DEFAULT": "R IY1 F IH0 L",
1186
+ "VERB": "R IY0 F IH1 L"
1187
+ },
1188
+ "refills": {
1189
+ "DEFAULT": "R IY1 F IH0 L Z",
1190
+ "VERB": "R IY0 F IH1 L Z"
1191
+ },
1192
+ "refit": {
1193
+ "DEFAULT": "R IY1 F IH0 T",
1194
+ "VERB": "R IY0 F IH1 T"
1195
+ },
1196
+ "refits": {
1197
+ "DEFAULT": "R IY1 F IH0 T S",
1198
+ "VERB": "R IY0 F IH1 T S"
1199
+ },
1200
+ "refresh": {
1201
+ "DEFAULT": "R IH1 F R EH0 SH",
1202
+ "VERB": "R IH0 F R EH1 SH"
1203
+ },
1204
+ "refund": {
1205
+ "DEFAULT": "R IY1 F AH2 N D",
1206
+ "VERB": "R IH0 F AH1 N D"
1207
+ },
1208
+ "refunds": {
1209
+ "DEFAULT": "R IY1 F AH2 N D Z",
1210
+ "VERB": "R IH0 F AH1 N D Z"
1211
+ },
1212
+ "refuse": {
1213
+ "DEFAULT": "R EH1 F Y UW2 Z",
1214
+ "VERB": "R IH0 F Y UW1 Z"
1215
+ },
1216
+ "regenerate": {
1217
+ "DEFAULT": "R IY0 JH EH1 N ER0 AH0 T",
1218
+ "VERB": "R IY0 JH EH1 N ER0 EY2 T"
1219
+ },
1220
+ "rehash": {
1221
+ "DEFAULT": "R IY1 HH AE0 SH",
1222
+ "VERB": "R IY0 HH AE1 SH"
1223
+ },
1224
+ "rehashes": {
1225
+ "DEFAULT": "R IY1 HH AE0 SH IH0 Z",
1226
+ "VERB": "R IY0 HH AE1 SH IH0 Z"
1227
+ },
1228
+ "reincarnate": {
1229
+ "DEFAULT": "R IY2 IH0 N K AA1 R N AH0 T",
1230
+ "VERB": "R IY2 IH0 N K AA1 R N EY2 T"
1231
+ },
1232
+ "reject": {
1233
+ "DEFAULT": "R IY1 JH EH0 K T",
1234
+ "VERB": "R IH0 JH EH1 K T"
1235
+ },
1236
+ "rejects": {
1237
+ "DEFAULT": "R IY1 JH EH0 K T S",
1238
+ "VERB": "R IH0 JH EH1 K T S"
1239
+ },
1240
+ "relay": {
1241
+ "DEFAULT": "R IY1 L EY2",
1242
+ "VERB": "R IY2 L EY1"
1243
+ },
1244
+ "relaying": {
1245
+ "DEFAULT": "R IY1 L EY2 IH0 NG",
1246
+ "VERB": "R IY2 L EY1 IH0 NG"
1247
+ },
1248
+ "relays": {
1249
+ "DEFAULT": "R IY1 L EY2 Z",
1250
+ "VERB": "R IY2 L EY1 Z"
1251
+ },
1252
+ "remake": {
1253
+ "DEFAULT": "R IY1 M EY0 K",
1254
+ "VERB": "R IY2 M EY1 K"
1255
+ },
1256
+ "remakes": {
1257
+ "DEFAULT": "R IY1 M EY0 K S",
1258
+ "VERB": "R IY2 M EY1 K S"
1259
+ },
1260
+ "replay": {
1261
+ "DEFAULT": "R IY1 P L EY0",
1262
+ "VERB": "R IY0 P L EY1"
1263
+ },
1264
+ "replays": {
1265
+ "DEFAULT": "R IY1 P L EY0 Z",
1266
+ "VERB": "R IY0 P L EY1 Z"
1267
+ },
1268
+ "reprint": {
1269
+ "DEFAULT": "R IY1 P R IH0 N T",
1270
+ "VERB": "R IY0 P R IH1 N T"
1271
+ },
1272
+ "reprints": {
1273
+ "DEFAULT": "R IY1 P R IH0 N T S",
1274
+ "VERB": "R IY0 P R IH1 N T S"
1275
+ },
1276
+ "rerun": {
1277
+ "DEFAULT": "R IY1 R AH0 N",
1278
+ "VERB": "R IY2 R AH1 N"
1279
+ },
1280
+ "reruns": {
1281
+ "DEFAULT": "R IY1 R AH0 N Z",
1282
+ "VERB": "R IY2 R AH1 N Z"
1283
+ },
1284
+ "resume": {
1285
+ "DEFAULT": "R EH1 Z AH0 M EY2",
1286
+ "VERB": "R IY0 Z UW1 M"
1287
+ },
1288
+ "retake": {
1289
+ "DEFAULT": "R IY1 T EY0 K",
1290
+ "VERB": "R IY0 T EY1 K"
1291
+ },
1292
+ "retakes": {
1293
+ "DEFAULT": "R IY1 T EY0 K S",
1294
+ "VERB": "R IY0 T EY1 K S"
1295
+ },
1296
+ "rethink": {
1297
+ "DEFAULT": "R IY1 TH IH0 NG K",
1298
+ "VERB": "R IY2 TH IH1 NG K"
1299
+ },
1300
+ "rethinks": {
1301
+ "DEFAULT": "R IY1 TH IH0 NG K S",
1302
+ "VERB": "R IY2 TH IH1 NG K S"
1303
+ },
1304
+ "retread": {
1305
+ "DEFAULT": "R IY1 T R EH0 D",
1306
+ "VERB": "R IY2 T R EH1 D"
1307
+ },
1308
+ "retreads": {
1309
+ "DEFAULT": "R IY1 T R EH0 D Z",
1310
+ "VERB": "R IY2 T R EH1 D Z"
1311
+ },
1312
+ "rewrite": {
1313
+ "DEFAULT": "R IY1 R AY2 T",
1314
+ "VERB": "R IY0 R AY1 T"
1315
+ },
1316
+ "rewrites": {
1317
+ "DEFAULT": "R IY1 R AY2 T S",
1318
+ "VERB": "R IY0 R AY1 T S"
1319
+ },
1320
+ "segment": {
1321
+ "DEFAULT": "S EH2 G M EH1 N T",
1322
+ "VERB": "S EH1 G M AH0 N T"
1323
+ },
1324
+ "segments": {
1325
+ "DEFAULT": "S EH1 G M AH0 N T S",
1326
+ "VERB": "S EH2 G M EH1 N T S"
1327
+ },
1328
+ "separate": {
1329
+ "DEFAULT": "S EH1 P ER0 IH0 T",
1330
+ "VERB": "S EH1 P ER0 EY2 T"
1331
+ },
1332
+ "separates": {
1333
+ "DEFAULT": "S EH1 P ER0 IH0 T S",
1334
+ "VERB": "S EH1 P ER0 EY2 T S"
1335
+ },
1336
+ "subcontract": {
1337
+ "DEFAULT": "S AH2 B K AA0 N T R AE1 K T",
1338
+ "VERB": "S AH0 B K AA1 N T R AE2 K T"
1339
+ },
1340
+ "subcontracts": {
1341
+ "DEFAULT": "S AH0 B K AA1 N T R AE2 K T S",
1342
+ "VERB": "S AH2 B K AA0 N T R AE1 K T S"
1343
+ },
1344
+ "subject": {
1345
+ "DEFAULT": "S AH1 B JH IH0 K T",
1346
+ "VERB": "S AH0 B JH EH1 K T"
1347
+ },
1348
+ "subjects": {
1349
+ "DEFAULT": "S AH1 B JH IH0 K T S",
1350
+ "VERB": "S AH0 B JH EH1 K T S"
1351
+ },
1352
+ "subordinate": {
1353
+ "DEFAULT": "S AH0 B AO1 R D AH0 N AH0 T",
1354
+ "VERB": "S AH0 B AO1 R D AH0 N EY2 T"
1355
+ },
1356
+ "subordinates": {
1357
+ "DEFAULT": "S AH0 B AO1 R D AH0 N AH0 T S",
1358
+ "VERB": "S AH0 B AO1 R D AH0 N EY2 T S"
1359
+ },
1360
+ "supplement": {
1361
+ "DEFAULT": "S AH1 P L AH0 M AH0 N T",
1362
+ "VERB": "S AH1 P L AH0 M EH0 N T"
1363
+ },
1364
+ "supplements": {
1365
+ "DEFAULT": "S AH1 P L AH0 M AH0 N T S",
1366
+ "VERB": "S AH1 P L AH0 M EH0 N T S"
1367
+ },
1368
+ "surmise": {
1369
+ "DEFAULT": "S ER1 M AY0 Z",
1370
+ "VERB": "S ER0 M AY1 Z"
1371
+ },
1372
+ "surmises": {
1373
+ "DEFAULT": "S ER1 M AY0 Z IH0 Z",
1374
+ "VERB": "S ER0 M AY1 Z IH0 Z"
1375
+ },
1376
+ "survey": {
1377
+ "DEFAULT": "S ER1 V EY2",
1378
+ "VERB": "S ER0 V EY1"
1379
+ },
1380
+ "surveys": {
1381
+ "DEFAULT": "S ER1 V EY2 Z",
1382
+ "VERB": "S ER0 V EY1 Z"
1383
+ },
1384
+ "suspect": {
1385
+ "DEFAULT": "S AH1 S P EH2 K T",
1386
+ "VERB": "S AH0 S P EH1 K T"
1387
+ },
1388
+ "suspects": {
1389
+ "DEFAULT": "S AH1 S P EH2 K T S",
1390
+ "VERB": "S AH0 S P EH1 K T S"
1391
+ },
1392
+ "syndicate": {
1393
+ "DEFAULT": "S IH1 N D IH0 K AH0 T",
1394
+ "VERB": "S IH1 N D AH0 K EY2 T"
1395
+ },
1396
+ "syndicates": {
1397
+ "DEFAULT": "S IH1 N D IH0 K AH0 T S",
1398
+ "VERB": "S IH1 N D IH0 K EY2 T S"
1399
+ },
1400
+ "torment": {
1401
+ "DEFAULT": "T AO0 R M EH1 N T",
1402
+ "VERB": "T AO1 R M EH2 N T"
1403
+ },
1404
+ "transfer": {
1405
+ "DEFAULT": "T R AE1 N S F ER0",
1406
+ "VERB": "T R AE0 N S F ER1"
1407
+ },
1408
+ "transfers": {
1409
+ "DEFAULT": "T R AE1 N S F ER0 Z",
1410
+ "VERB": "T R AE0 N S F ER1 Z"
1411
+ },
1412
+ "transplant": {
1413
+ "DEFAULT": "T R AE1 N S P L AE0 N T",
1414
+ "VERB": "T R AE0 N S P L AE1 N T"
1415
+ },
1416
+ "transplants": {
1417
+ "DEFAULT": "T R AE1 N S P L AE0 N T S",
1418
+ "VERB": "T R AE0 N S P L AE1 N T S"
1419
+ },
1420
+ "transport": {
1421
+ "DEFAULT": "T R AE1 N S P AO0 R T",
1422
+ "VERB": "T R AE0 N S P AO1 R T"
1423
+ },
1424
+ "transports": {
1425
+ "DEFAULT": "T R AE1 N S P AO0 R T S",
1426
+ "VERB": "T R AE0 N S P AO1 R T S"
1427
+ },
1428
+ "triplicate": {
1429
+ "DEFAULT": "T R IH1 P L IH0 K AH0 T",
1430
+ "VERB": "T R IH1 P L IH0 K EY2 T"
1431
+ },
1432
+ "triplicates": {
1433
+ "DEFAULT": "T R IH1 P L IH0 K AH0 T S",
1434
+ "VERB": "T R IH1 P L IH0 K EY2 T S"
1435
+ },
1436
+ "undercut": {
1437
+ "DEFAULT": "AH1 N D ER0 K AH2 T",
1438
+ "VERB": "AH2 N D ER0 K AH1 T"
1439
+ },
1440
+ "underestimate": {
1441
+ "DEFAULT": "AH1 N D ER0 EH1 S T AH0 M AH0 T",
1442
+ "VERB": "AH1 N D ER0 EH1 S T AH0 M EY2 T"
1443
+ },
1444
+ "underestimates": {
1445
+ "DEFAULT": "AH1 N D ER0 EH1 S T AH0 M AH0 T S",
1446
+ "VERB": "AH1 N D ER0 EH1 S T AH0 M EY2 T S"
1447
+ },
1448
+ "underline": {
1449
+ "DEFAULT": "AH1 N D ER0 L AY2 N",
1450
+ "VERB": "AH2 N D ER0 L AY1 N"
1451
+ },
1452
+ "underlines": {
1453
+ "DEFAULT": "AH1 N D ER0 L AY2 N Z",
1454
+ "VERB": "AH2 N D ER0 L AY1 N Z"
1455
+ },
1456
+ "undertaking": {
1457
+ "DEFAULT": "AH1 N D ER0 T EY2 K IH0 NG",
1458
+ "VERB": "AH2 N D ER0 T EY1 K IH0 NG"
1459
+ },
1460
+ "undertakings": {
1461
+ "DEFAULT": "AH1 N D ER0 T EY2 K IH0 NG Z",
1462
+ "VERB": "AH2 N D ER0 T EY1 K IH0 NG Z"
1463
+ },
1464
+ "unused": {
1465
+ "DEFAULT": "AH0 N Y UW1 S T",
1466
+ "VERB": "AH0 N Y UW1 Z D"
1467
+ },
1468
+ "upgrade": {
1469
+ "DEFAULT": "AH1 P G R EY0 D",
1470
+ "VERB": "AH0 P G R EY1 D"
1471
+ },
1472
+ "upgrades": {
1473
+ "DEFAULT": "AH1 P G R EY0 D Z",
1474
+ "VERB": "AH0 P G R EY1 D Z"
1475
+ },
1476
+ "uplift": {
1477
+ "DEFAULT": "AH1 P L IH0 F T",
1478
+ "VERB": "AH2 P L IH1 F T"
1479
+ },
1480
+ "upset": {
1481
+ "DEFAULT": "AH1 P S EH2 T",
1482
+ "VERB": "AH0 P S EH1 T"
1483
+ },
1484
+ "upsets": {
1485
+ "DEFAULT": "AH1 P S EH2 T S",
1486
+ "VERB": "AH0 P S EH1 T S"
1487
+ },
1488
+ "use": {
1489
+ "DEFAULT": "Y UW1 S",
1490
+ "VERB": "Y UW1 Z"
1491
+ },
1492
+ "used": {
1493
+ "DEFAULT": "Y UW1 S T",
1494
+ "VBN": "Y UW1 Z D"
1495
+ },
1496
+ "uses": {
1497
+ "DEFAULT": "Y UW1 S IH0 Z",
1498
+ "VERB": "Y UW1 Z IH0 Z"
1499
+ }
1500
+ }
resources/app/python/xvapitch/text/h2p_parser/data/example.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "absent": {
3
+ "VERB": "AH1 B S AE1 N T",
4
+ "DEFAULT": "AE1 B S AH0 N T"
5
+ },
6
+ "reject": {
7
+ "VERB": "R IH0 JH EH1 K T",
8
+ "DEFAULT": "R IY1 JH EH0 K T"
9
+ },
10
+ "read": {
11
+ "VBD": "R EH1 D",
12
+ "VBN": "R EH1 D",
13
+ "VBP": "R EH1 D",
14
+ "DEFAULT": "R IY1 D"
15
+ }
16
+ }
resources/app/python/xvapitch/text/h2p_parser/dict_reader.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This reads a CMUDict formatted dictionary as a dictionary object
2
+ import re
3
+ import h2p_parser.format_ph as ph
4
+ from . import DATA_PATH
5
+
6
+
7
+ _dict_primary = 'cmudict.dict'
8
+
9
+
10
def read_dict(filename: str) -> list:
    """Read a CMUDict-format file and return its raw lines.

    Lines beginning with ";;;" (the CMUDict comment marker) are dropped.

    :param filename: Path to the dictionary file.
    :return: List of raw (newline-terminated) dictionary lines.
    """
    with open(filename, encoding='utf-8', mode='r') as dict_file:
        raw_lines = dict_file.readlines()
    # Discard CMUDict comment lines
    return [entry for entry in raw_lines if not entry.startswith(';;;')]
18
+
19
+
20
def parse_dict(lines: list) -> dict:
    """Parse CMUDict-format lines into ``{word: [[phoneme, ...], ...]}``.

    Two formats are supported:
      - DSD (double-space delimited): ``WORD  W ER1 D`` (";;;" comments,
        already stripped by :func:`read_dict`)
      - SSD (single-space delimited): ``WORD W ER1 D # comment``
    Alternate pronunciations like ``word(2)`` are appended to the base
    word's phoneme list and also stored under their original ``(n)`` key.

    :param lines: Raw dictionary lines.
    :return: Mapping of lowercase word -> list of phoneme lists.
    """
    parsed_dict = {}

    # Detect file format from the first 10 non-empty lines.
    # Default to SSD unless a double-space delimiter is found.
    # NOTE(review): this assumes DSD files use a literal two-space
    # separator — confirm against the bundled data files.
    dict_form = 'SSD'
    for line in lines[:10]:
        line = line.strip()
        if line == '':
            continue
        if '  ' in line:
            dict_form = 'DSD'
            break

    for line in lines:
        line = line.strip()
        # Skip empty lines and lines with no delimiter.
        # (fixed: was `and`, which let space-less non-empty lines through
        # and crashed the split/index below)
        if line == '' or ' ' not in line:
            continue

        # Split word from phonemes depending on format
        if dict_form == 'DSD':
            pairs = line.split('  ', 1)
        else:
            space_index = line.find(' ')
            line_split = line[:space_index], line[space_index + 1:]
            # Drop trailing "#" comments in SSD files; strip so the
            # comment's leading space doesn't become an empty phoneme
            pairs = line_split[0], line_split[1].split('#')[0].strip()

        word = str.lower(pairs[0])  # Get word and lowercase it
        phonemes = ph.to_list(pairs[1])  # Convert to list of phonemes
        phonemes = [phonemes]  # Wrap in nested list
        word_num = 0
        word_orig = None

        # Detect alternate pronunciation entries, e.g. "word(2)"
        if ('(' in word) and (')' in word) and any(char.isdigit() for char in word):
            # Parse the integer from the word using regex
            result = int(re.findall(r"\((\d+)\)", word)[0])
            if result is not None:
                word_orig = word  # Keep the "(n)" key as well
                word = re.sub(r"\(.*\)", "", word)  # Strip the "(n)" suffix
                word_num = result

        # Check existing key
        if word in parsed_dict:
            # Duplicate base entries (number 0) are ignored
            if word_num == 0:
                continue
            # Alternate pronunciation: append to the base word's list
            parsed_dict[word].extend(phonemes)
            # Also store under the original "(n)" key if present
            if word_orig is not None:
                parsed_dict[word_orig] = phonemes
        else:
            # Create a new key
            parsed_dict[word] = phonemes

    return parsed_dict
96
+
97
+
98
class DictReader:
    """Loads a CMUDict-format dictionary file into ``self.dict``."""

    def __init__(self, filename=None):
        """
        :param filename: Path to a CMUDict-format file; when None, the
            bundled ``cmudict.dict`` from the package data folder is used.
        """
        self.filename = filename
        self.dict = {}
        # If filename is None, use the default dictionary
        # default = 'data' uses the dictionary file in the data module
        # default = 'nltk' uses the nltk cmudict
        if filename is not None:
            self.dict = parse_dict(read_dict(filename))
        else:
            # NOTE(review): using the joined path in a `with` statement
            # suggests DATA_PATH is an importlib.resources traversable
            # (or relies on pathlib's removed context-manager support) —
            # confirm on the targeted Python versions.
            with DATA_PATH.joinpath(_dict_primary) as f:
                self.dict = parse_dict(read_dict(f))
resources/app/python/xvapitch/text/h2p_parser/dictionary.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # dictionary.py
2
+
3
+ # Defines a dictionary class that can be used to store and retrieve from the json file
4
+ import sys
5
+ if sys.version_info < (3, 9):
6
+ # In Python versions below 3.9, this is needed
7
+ import importlib_resources as pkg_resources
8
+ else:
9
+ # Since python 3.9+, importlib.resources.files is built-in
10
+ import importlib.resources as pkg_resources
11
+ from os.path import exists
12
+ import json
13
+ import h2p_parser.pos_parser as pos_parser
14
+
15
+
16
# Method to get data path
def get_data_path():
    """Return the package data folder as a traversable path.

    :raises FileNotFoundError: if the data folder cannot be located.
    """
    folder = pkg_resources.files('h2p_parser.data')
    if folder is not None:
        return folder
    raise FileNotFoundError("Data folder not found")
22
+
23
+
24
# Dictionary class
class Dictionary:
    """Heteronym dictionary backed by a JSON file that maps each word to a
    sub-dictionary of {POS-tag: phoneme} entries (plus a DEFAULT key)."""

    def __init__(self, file_name=None):
        """
        :param file_name: Path to a dictionary JSON file; when None the
            bundled ``dict.json`` from the package data folder is used.
        """
        # (fixed: the original assigned self.file_name unconditionally and
        # then again in both branches — the first assignment was dead)
        if file_name is None:
            self.file_name = 'dict.json'
            self.use_default = True
        else:
            self.file_name = file_name
            self.use_default = False
        self.dictionary = self.load_dictionary(file_name)

    def load_dictionary(self, path=None):
        """Load and validate the dictionary JSON.

        :param path: Explicit file path; when None the packaged data file
            named by ``self.file_name`` is used.
        :raises FileNotFoundError: if an explicit path does not exist.
        :raises ValueError: if the file is not valid JSON or is empty.
        :return: The parsed dictionary.
        """
        if path is None:
            data_path = get_data_path()
            dict_path = data_path.joinpath(self.file_name)
            # utf-8 explicitly: JSON data files are utf-8 regardless of locale
            with open(str(dict_path), encoding='utf-8') as def_file:
                read_dict = json.load(def_file)
        else:
            if not exists(path):
                raise FileNotFoundError(f'Dictionary {self.file_name} file not found')
            with open(path, encoding='utf-8') as file:
                try:
                    read_dict = json.load(file)
                except json.decoder.JSONDecodeError:
                    raise ValueError(f'Dictionary {self.file_name} file is not valid JSON')
        # A usable dictionary must have at least one entry
        if len(read_dict) == 0:
            raise ValueError('Dictionary is empty or invalid')
        return read_dict

    def contains(self, word):
        """Case-insensitive membership test for *word*."""
        return word.lower() in self.dictionary

    def get_phoneme(self, word, pos):
        """Resolve a word's phoneme using its POS tag.

        Resolution order: exact tag match, parent POS category
        (e.g. VBD -> VERB), then the DEFAULT entry.

        :return: Phoneme value, or None if no entry matches.
        """
        # Get the sub-dictionary at dictionary[word]
        sub_dict = self.dictionary[word.lower()]
        # Exact tag match takes priority
        if pos in sub_dict:
            return sub_dict[pos]
        # Fall back to the parent category of the tag
        parent_pos = pos_parser.get_parent_pos(pos)
        if parent_pos is not None and parent_pos in sub_dict:
            return sub_dict[parent_pos]
        # Finally fall back to the DEFAULT entry
        if 'DEFAULT' in sub_dict:
            return sub_dict['DEFAULT']
        # No matches
        return None
resources/app/python/xvapitch/text/h2p_parser/filter.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from unicodedata import normalize
2
+ import re
3
+
4
# Pre-compile regex
re_filter = re.compile(r"[^ A-Za-z'.,?!()\-]")
re_filter_with_num = re.compile(r"[^ A-Za-z\d'.,?!()\-]")
re_multi_space = re.compile(r"\s\s+")


def filter_text(text: str, allow_num: bool = False, preserve_case: bool = False) -> str:
    """
    Normalizes raw text ahead of parsing.

    :param text: Input raw text
    :param allow_num: True to keep digits in the output
    :param preserve_case: True to skip lower-casing
    :return: Text with accents stripped, (optionally) lower-cased, and
        invalid punctuation removed
    """
    # Decompose accented characters; the combining marks are then
    # removed by the character filter below
    result = normalize('NFD', text)
    if not preserve_case:
        result = result.lower()
    # Drop every character outside the allowed set
    pattern = re_filter_with_num if allow_num else re_filter
    result = pattern.sub('', result)
    # Collapse any run of whitespace into a single space
    return re_multi_space.sub(' ', result)
resources/app/python/xvapitch/text/h2p_parser/format_ph.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import overload
2
+
3
+ # Converts and outputs various formats of phonemes
4
+
5
+
6
@overload
def to_sds(ph: str) -> str: ...


@overload
def to_sds(ph: list) -> str: ...


def to_sds(ph: list or str) -> str or None:
    """
    Converts phonemes to space delimited string format.

    :param ph: Phoneme as str or list, supports nested lists
    :return: Phoneme as space delimited string, or None
    """
    if ph is None:
        return None
    # A string is already space delimited
    if isinstance(ph, str):
        return ph
    if not isinstance(ph, list):
        raise TypeError('to_sds() expects a list or string')
    # Empty list carries no phoneme
    if not ph:
        return None
    head = ph[0]
    if isinstance(head, list):
        # Unwrap one nesting level and retry
        return to_sds(head)
    if isinstance(head, str):
        # Single element passes through; otherwise join with spaces
        return head if len(ph) == 1 else ' '.join(ph)
    if head is None:
        return None
    raise TypeError('to_sds() encountered an unexpected nested element type')
49
+
50
+
51
@overload
def to_list(ph: str) -> list: ...


@overload
def to_list(ph: list) -> list: ...


def to_list(ph: str or list) -> list or None:
    """
    Converts phonemes to list format.

    :param ph: Phoneme as str or list, supports nested lists
    :return: Phoneme as a flat list, or None
    """
    if ph is None:
        return None
    # Space delimited string -> list
    if isinstance(ph, str):
        return ph.split(' ')
    if isinstance(ph, list):
        # Already a flat list of phoneme strings
        if len(ph) > 0 and isinstance(ph[0], str):
            return ph
        # Empty list or leading None carries no phoneme
        if len(ph) == 0 or ph[0] is None:
            return None
        # Unwrap one nesting level and retry
        if isinstance(ph[0], list):
            return to_list(ph[0])
    raise TypeError('to_list() expects a list or string')
89
+
90
+
91
# Surrounds text with curly brackets
def with_cb(text: str) -> str:
    """
    Surrounds text with curly brackets.

    :param text: Text to surround
    :return: Surrounded text
    """
    return ''.join(('{', text, '}'))
resources/app/python/xvapitch/text/h2p_parser/h2p.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import nltk
2
+ import re
3
+ from nltk.tokenize import TweetTokenizer
4
+ from nltk import pos_tag
5
+ from nltk import pos_tag_sents
6
+ from .dictionary import Dictionary
7
+ from .filter import filter_text as ft
8
+ from . import format_ph as ph
9
+
10
+ # Check that the nltk data is downloaded, if not, download it
11
+ try:
12
+ nltk.data.find('taggers/averaged_perceptron_tagger.zip')
13
+ except LookupError:
14
+ nltk.download('averaged_perceptron_tagger')
15
+
16
+
17
# Method to use Regex to replace the first instance of a word with its phonemes
def replace_first(target, replacement, text):
    """Case-insensitively replace the first whole-word occurrence of
    *target* in *text* with *replacement*, inserted literally.

    :param target: Word to find (treated as a literal, not a regex).
    :param replacement: Replacement text (e.g. "{Y UW1 Z}").
    :param text: Text to modify.
    :return: Modified text, or *text* unchanged if target is empty/None.
    """
    # Skip if target invalid
    if not target:
        return text
    # re.escape: a heteronym word must never be interpreted as a regex
    # (fixed: the raw word was interpolated into the pattern unescaped)
    pattern = r'(?i)\b' + re.escape(target) + r'\b'
    # Callable replacement: avoids backslash-escape interpretation of the
    # phoneme string; count=1 as a keyword (positional form is deprecated)
    return re.sub(pattern, lambda _m: replacement, text, count=1)
24
+
25
+
26
class H2p:
    """Heteronym-to-phoneme parser: detects heteronyms via NLTK POS
    tagging and replaces them in text with dictionary phonemes."""

    def __init__(self, dict_path=None, preload=False, phoneme_format=''):
        """
        Creates a H2p parser

        Supported phoneme formats:
            - Space delimited
            - Space delimited surrounded by { }

        :param dict_path: Path to a heteronym dictionary json file. Built-in dictionary will be used if None
        :type dict_path: str
        :param preload: Preloads the tokenizer and tagger during initialization
        :type preload: bool
        """

        # Supported phoneme formats
        self.phoneme_format = phoneme_format
        self.dict = Dictionary(dict_path)
        self.tokenize = TweetTokenizer().tokenize
        self.get_tags = pos_tag
        if preload:
            self.preload()

    # Method to preload tokenizer and pos_tag
    def preload(self):
        """Warm up the tokenizer and tagger so the first real call is fast."""
        tokens = self.tokenize('a')
        assert tokens == ['a']
        assert pos_tag(tokens)[0][0] == 'a'

    # Method to check if a text line contains a heteronym
    def contains_het(self, text):
        """Return ``(found, hets)``: whether *text* contains any dictionary
        heteronym, and the list of matches found."""
        # Filter the text
        text = ft(text)
        # Tokenize
        words = self.tokenize(text)
        # Check match with dictionary
        hets = []
        for word in words:
            if self.dict.contains(word):
                hets.append(word)
        return len(hets)>0, hets

    # Method to replace heteronyms in a text line to phonemes
    def replace_het(self, text):
        """Replace each heteronym in *text* with its phoneme string.

        Tagging runs on a filtered copy; the replacement itself is applied
        to the original *text* so its punctuation/casing are preserved.
        """
        # Filter the text
        working_text = ft(text, preserve_case=True)
        # Tokenize
        words = self.tokenize(working_text)
        # Get pos tags
        tags = pos_tag(words)
        # Loop through words and pos tags
        for word, pos in tags:
            # Skip if word not in dictionary
            if not self.dict.contains(word):
                continue
            # Get phonemes
            phonemes = self.dict.get_phoneme(word, pos)
            # Format phonemes
            f_ph = ph.with_cb(ph.to_sds(phonemes))
            # Replace word with phonemes
            text = replace_first(word, f_ph, text)
        return text

    # Replaces heteronyms in a list of text lines
    # Slightly faster than replace_het() called on each line
    def replace_het_list(self, text_list):
        """Replace heteronyms across a list of lines.

        Batches POS tagging with ``pos_tag_sents``, which is slightly
        faster than calling :meth:`replace_het` per line.
        NOTE: mutates and returns *text_list* in place.
        """
        # Filter the text
        working_text_list = [ft(text, preserve_case=True) for text in text_list]
        # Tokenize
        list_sentence_words = [self.tokenize(text) for text in working_text_list]
        # Get pos tags list
        tags_list = pos_tag_sents(list_sentence_words)
        # Loop through lines
        for index in range(len(tags_list)):
            # Loop through words and pos tags in tags_list index
            for word, pos in tags_list[index]:
                # Skip if word not in dictionary
                if not self.dict.contains(word):
                    continue
                # Get phonemes
                phonemes = self.dict.get_phoneme(word, pos)
                # Format phonemes
                f_ph = ph.with_cb(ph.to_sds(phonemes))
                # Replace word with phonemes
                text_list[index] = replace_first(word, f_ph, text_list[index])
        return text_list

    # Method to tag a text line, returns a list of tags
    def tag(self, text):
        """Return the POS tags (tags only, not words) for *text*."""
        # Filter the text
        working_text = ft(text, preserve_case=True)
        # Tokenize
        words = self.tokenize(working_text)
        # Get pos tags
        tags = pos_tag(words)
        # Only return element 1 of each list
        return [tag[1] for tag in tags]
123
+
resources/app/python/xvapitch/text/h2p_parser/h2p_parser.egg-info/PKG-INFO ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.1
2
+ Name: h2p-parser
3
+ Version: 1.0.0
4
+ Summary: Heteronym to Phoneme Parser
5
+ Home-page: https://github.com/ionite34/h2p-parser
6
+ Author: ionite
7
+ Author-email: dev@ionite.io
8
+ License: Apache 2.0
9
+ Platform: UNKNOWN
10
+ Requires-Python: >=3.7
11
+ License-File: LICENSE
12
+
13
+ UNKNOWN
14
+
resources/app/python/xvapitch/text/h2p_parser/h2p_parser.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ LICENSE
2
+ README.md
3
+ setup.py
4
+ h2p_parser/__init__.py
5
+ h2p_parser/__main__.py
6
+ h2p_parser/cmudictext.py
7
+ h2p_parser/dict_reader.py
8
+ h2p_parser/dictionary.py
9
+ h2p_parser/filter.py
10
+ h2p_parser/format_ph.py
11
+ h2p_parser/h2p.py
12
+ h2p_parser/pos_parser.py
13
+ h2p_parser/processors.py
14
+ h2p_parser/symbols.py
15
+ h2p_parser/h2p_parser.egg-info/PKG-INFO
16
+ h2p_parser/h2p_parser.egg-info/SOURCES.txt
17
+ h2p_parser/h2p_parser.egg-info/dependency_links.txt
18
+ h2p_parser/h2p_parser.egg-info/requires.txt
19
+ h2p_parser/h2p_parser.egg-info/top_level.txt
resources/app/python/xvapitch/text/h2p_parser/h2p_parser.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
 
 
1
+
resources/app/python/xvapitch/text/h2p_parser/h2p_parser.egg-info/requires.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ nltk
2
+ inflect
resources/app/python/xvapitch/text/h2p_parser/h2p_parser.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
 
 
1
+
resources/app/python/xvapitch/text/h2p_parser/pos_parser.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Part of Speech Tag Operations
2
+
3
+ # Method to get the parent part of speech (VERB) or (NOUN) from a pos tag
4
+ # from __future__ import annotations
5
+
6
+ # def get_parent_pos(pos: str) -> str | None:
7
+ def get_parent_pos(pos):
8
+ # Get the parent part of speech from a pos tag
9
+ if pos.startswith('VB'):
10
+ return 'VERB'
11
+ elif pos.startswith('NN'):
12
+ return 'NOUN'
13
+ elif pos.startswith('RB'):
14
+ return 'ADVERB'
15
+ else:
16
+ return None
17
+
resources/app/python/xvapitch/text/h2p_parser/processors.py ADDED
@@ -0,0 +1,392 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Transformations of text sequences for matching
2
+ from __future__ import annotations
3
+ from typing import TYPE_CHECKING
4
+ from .symbols import consonants
5
+
6
+ import re
7
+
8
+ if TYPE_CHECKING:
9
+ from .cmudictext import CMUDictExt
10
+
11
+ _re_digit = re.compile(r'\d+')
12
+
13
+
14
class Processor:
    """
    Fallback resolvers for out-of-dictionary words.

    Each ``auto_*`` method detects one word-form pattern (possessive,
    contraction, hyphenation, compound, plural, stem suffix), resolves the
    base form through the extended lookup, and returns the combined
    phonemes — or None when the pattern does not apply or the base form
    cannot be resolved. Per-feature hit/resolve counters are kept.
    """

    def __init__(self, cde: CMUDictExt):
        self._lookup = cde.lookup
        self._cmu_get = cde.dict.get
        self._segment = cde.segment
        self._tag = cde.h2p.tag
        self._stem = cde.stem
        # Number of times respective methods were called
        self.stat_hits = {
            'plural': 0,
            'possessives': 0,
            'contractions': 0,
            'hyphenated': 0,
            'compound': 0,
            'compound_l2': 0,
            'stem': 0
        }
        # Number of times respective methods returned a value (not None)
        self.stat_resolves = {
            'plural': 0,
            'possessives': 0,
            'contractions': 0,
            'hyphenated': 0,
            'compound': 0,
            'compound_l2': 0,
            'stem': 0
        }
        # Holds events when features encountered unexpected language syntax
        self.stat_unexpected = {
            'plural': [],
            'possessives': [],
            'contractions': [],
            'hyphenated': [],
            'compound': [],
            'compound_l2': [],
            'stem': []
        }

    def auto_possessives(self, word: str) -> str | None:
        """
        Auto-possessives
        :param word: Input of possible possessive word
        :return: Phonemes of the word (as a list), or None if unresolvable.
            NOTE(review): annotated as str but a phoneme *list* is returned
            (as in the original); confirm callers accept a list.
        """
        if not word.endswith("'s"):
            return None
        # If the word ends with "'s", register a hit
        self.stat_hits['possessives'] += 1
        """
        There are 3 general cases:
        1. Base words ending in one of 6 special consonants (sibilants)
            - i.e. Tess's, Rose's, Butch's, Midge's, Rush's, Garage's
            - In ARPAbet: {S}, {Z}, {CH}, {JH}, {SH}, {ZH}
            - These require a suffix of {IH0 Z}
        2. Base words ending in vowels and voiced consonants:
            - i.e. Fay's, Hugh's, Bob's, Ted's, Meg's, Sam's, Dean's
            - In ARPAbet: any vowel (with stress digit), {B}, {D}, {G},
              {M}, {N}, {R}, {L}, {NG}
            - These require a suffix of {Z}
        3. Base words ending in voiceless consonants:
            - i.e. Hope's, Pat's, Clark's, Ruth's
            - In ARPAbet: {P}, {T}, {K}, {TH}
            - These require a suffix of {S}
        """

        # Method to return phoneme and increment stat
        def _resolve(phoneme: str) -> str:
            self.stat_resolves['possessives'] += 1
            return phoneme

        core = word[:-2]  # Get core word without possessive
        ph = self._lookup(core, ph_format='list')  # find core word using recursive search
        if ph is None:
            return None  # Core word not found
        # [Case 1] — sibilant ending takes the {IH0 Z} suffix
        # (fixed: was `ph += 'IH0' + 'Z'`, which appended the individual
        # characters 'I', 'H', '0', 'Z' to the phoneme list)
        if ph[-1] in {'S', 'Z', 'CH', 'JH', 'SH', 'ZH'}:
            ph += ['IH0', 'Z']
            return _resolve(ph)
        # [Case 2] — voiced ending takes {Z}; vowels are matched by their
        # trailing stress digit so all numbered variants are covered
        if ph[-1] in {'B', 'D', 'G', 'M', 'N', 'R', 'L', 'NG'} or ph[-1][-1].isdigit():
            ph += ['Z']
            return _resolve(ph)
        # [Case 3] — voiceless ending takes {S}
        if ph[-1] in ['P', 'T', 'K', 'TH']:
            ph += ['S']
            return _resolve(ph)

        return None  # No match found

    def auto_contractions(self, word: str) -> str | None:
        """
        Resolves "'ll" and "'d" contractions via the base word.
        :param word: Input of possible contraction
        :return: Phonemes (as a list), or None if unresolvable
        """
        # First, check if the word is a supported contraction
        parts = word.split("\'")  # Split on [']
        if len(parts) == 1 or parts[1] not in {'ll', 'd'}:
            return None  # No contraction found
        if len(parts) > 2:
            # (fixed: key was 'contraction' -> KeyError, and `+= word`
            # would have appended individual characters to the list)
            self.stat_unexpected['contractions'].append(word)
            return None  # More than 2 parts, can't be a contraction
        # If initial check passes, register a hit
        self.stat_hits['contractions'] += 1

        # Get the core word
        core = parts[0]
        # Get the phoneme for the core word recursively
        ph = self._lookup(core, ph_format='list')
        if ph is None:
            return None  # Core word not found
        # Append the phoneme for the contraction suffix
        if parts[1] == 'll':
            ph += ['L']
        elif parts[1] == 'd':
            ph += ['D']
        self.stat_resolves['contractions'] += 1
        return ph

    def auto_hyphenated(self, word: str) -> str | None:
        """
        Splits hyphenated words and attempts to resolve each component.
        :param word: Input of possible hyphenated word
        :return: Joined SDS phoneme string, or None if any part fails
        """
        # First, check if the word is a hyphenated word
        if '-' not in word:
            return None  # No hyphen found
        # If initial check passes, register a hit
        self.stat_hits['hyphenated'] += 1
        # Split the word into parts
        parts = word.split('-')
        # Get the phonemes for each part
        ph = []
        for part in parts:
            ph_part = self._lookup(part, ph_format='sds')
            if ph_part is None:
                return None  # Part not found
            ph.append(ph_part)
        # Join the phonemes
        ph = ' '.join(ph)
        self.stat_resolves['hyphenated'] += 1
        return ph

    def auto_compound(self, word: str) -> str | None:
        """
        Splits compound words (via the n-gram segmenter) and attempts to
        resolve each component.
        :param word: Input of possible compound word
        :return: Joined SDS phoneme string, or None if any part fails
        """
        # Split word into parts
        parts = self._segment(word)
        if len(parts) == 1:
            return None  # No compound found
        # If initial check passes, register a hit
        self.stat_hits['compound'] += 1
        # Get the phonemes for each part
        ph = []
        for part in parts:
            ph_part = self._lookup(part, ph_format='sds')
            if ph_part is None:
                return None  # Part not found
            ph.append(ph_part)
        # Join the phonemes
        ph = ' '.join(ph)
        self.stat_resolves['compound'] += 1
        return ph

    def auto_plural(self, word: str, pos: str = None) -> str | None:
        """
        Finds the singular form of a plural and resolves it separately.
        Optionally a pos tag can be provided; otherwise a single-word pos
        inference is used, which is not ideal.
        :param pos: Optional POS tag list for the word
        :param word: Input of possible plural word
        :return: Phonemes, or None if unresolvable
        """
        # Needs to end in 's' to be a candidate
        if word[-1] != 's':
            return None  # No plural found
        # Now check if the word is a plural using pos
        if pos is None:
            pos = self._tag(word)
        if pos is None or len(pos) == 0 or (pos[0] != 'NNS' and pos[0] != 'NNPS'):
            return None  # No plural tag found
        # If initial check passes, register a hit
        self.stat_hits['plural'] += 1

        # Case 1: 'oes' ending — remove 'es' to get the singular
        if len(word) > 3 and word[-3:] == 'oes':
            singular = word[:-2]
            # The possessive form has the same pronunciation as the plural
            ph = self.auto_possessives(singular + "'s")
            if ph is not None:
                self.stat_resolves['plural'] += 1
                return ph

        # Case 2: plain 's' ending — remove 's' to get the singular
        if len(word) > 1 and word[-1] == 's':
            singular = word[:-1]
            # The possessive form has the same pronunciation as the plural
            ph = self.auto_possessives(singular + "'s")
            if ph is not None:
                self.stat_resolves['plural'] += 1
                return ph

        # If no matches, return None
        return None

    def auto_stem(self, word: str) -> str | None:
        """
        Attempts to resolve using the root stem of a word.
        Supported suffixes: "ing", "ly".
        :param word: Input word
        :return: Joined SDS phoneme string, or None
        """

        # noinspection SpellCheckingInspection
        """
        'ly' has no special rules, always add phoneme 'L IY0'

        'ing' relevant rules:
        > If the original verb ended in [e], remove it and add [ing]
          - i.e. take -> taking, make -> making
        > If the input word has a repeated consonant before [ing], the
          original verb likely has only 1 of the consonants
          - i.e. running -> run, stopping -> stop
        """
        # Discontinue if word is too short or has no supported suffix
        if len(word) < 3 or (not word.endswith('ly') and not word.endswith('ing')):
            return None
        self.stat_hits['stem'] += 1  # Register hit

        # For ly case
        if word.endswith('ly'):
            # Recursively resolve the root word
            root = word[:-2]
            ph_root = self._lookup(root, ph_format='sds')
            if ph_root is None:
                return None
            ph_ly = 'L IY0'
            ph_joined = ' '.join([ph_root, ph_ly])
            self.stat_resolves['stem'] += 1
            return ph_joined

        # For ing case
        if word.endswith('ing'):
            # Recursively resolve the root word
            root = word[:-3]
            ph_root = self._lookup(root, ph_format='sds')
            if ph_root is None:
                return None
            ph_ing = 'IH0 NG'
            ph_joined = ' '.join([ph_root, ph_ing])
            self.stat_resolves['stem'] += 1
            return ph_joined

    def auto_component(self, word: str) -> str | None:
        """
        Searches for target word as component of a larger word.
        Not yet implemented.
        :raises NotImplementedError: always
        """

        """
        This processing step checks for words as a component of a larger word
        - i.e. 'synth' is not in the cmu dictionary
        - Stage 1: search for any word beginning with the target
        - Stage 2: search for any word containing the target
        """
        raise NotImplementedError

    def auto_compound_l2(self, word: str, recursive: bool = True) -> str | None:
        """
        Searches for target word as a compound word.
        > Does not use n-gram splitting like auto_compound()
        > Splits words manually into every possible two-part combination
        > Returns the match maximizing the shorter part's length
        :param recursive: True to enable recursive lookups, otherwise only use base CMU dictionary
        :param word: Input word
        :return: Joined SDS phoneme string, or None
        """
        # Word must be fully alphabetic and long enough to split
        if not word.isalpha() or len(word) < 3:
            return None
        self.stat_hits['compound_l2'] += 1  # Register hit

        # Define lookup mode
        def _lu(search_word: str) -> str | None:
            if recursive:
                return self._lookup(search_word, ph_format='sds')
            else:
                return self._cmu_get(search_word)

        # A doubled final letter is likely silent — drop one copy
        # i.e. 'Derakk' -> 'Derak'
        if word[-1] == word[-2]:
            word = word[:-1]

        # Holds all matches as tuples: (len1, len2, p1, p2, ph1, ph2)
        matches = []

        # Splits the word into every possible combination
        for i in range(1, len(word)):
            p1 = word[:i]
            p2 = word[i:]
            # Looks up both words; skip the split if either part fails
            ph1 = _lu(p1)
            if ph1 is None:
                continue
            ph2 = _lu(p2)
            if ph2 is None:
                continue
            matches.append((len(p1), len(p2), p1, p2, ph1, ph2))

        if len(matches) == 0:
            return None
        # Pick the match whose shorter part is longest (most balanced split)
        matches.sort(key=lambda x: min(x[0], x[1]))
        match = matches[-1]
        self.stat_resolves['compound_l2'] += 1  # Register resolve
        return match[4] + ' ' + match[5]
resources/app/python/xvapitch/text/h2p_parser/symbols.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Holds symbols for graphemes, phonemes, and pos-tags.
2
+ # noinspection SpellCheckingInspection,GrazieInspection
3
+ """
4
+ POS tag list:
5
+
6
+ CC coordinating conjunction
7
+ CD cardinal digit
8
+ DT determiner
9
+ EX existential there ("there is" -> "there exists")
10
+ FW foreign word
11
+ IN preposition/subordinating conjunction
12
+ JJ adjective ('big')
13
+ JJR adjective, comparative ('bigger')
14
+ JJS adjective, superlative ('biggest')
15
+ LS list marker ("1)", "2)", "3)")
16
+ MD modal ('could', 'will')
17
+ NN noun, singular
18
+ NNS noun plural
19
+ NNP proper noun, singular 'Harrison'
20
+ NNPS proper noun, plural 'Americans'
21
+ PDT predeterminer ('all' in 'all the kids')
22
+ POS possessive ending (parent's)
23
+ PRP personal pronoun (I, he, she)
24
+ PRP$ possessive pronoun (my, his, hers)
25
+ RB adverb ('very', 'silently')
26
+ RBR adverb, comparative ('better')
27
+ RBS adverb, superlative ('best')
28
+ RP particle ('give up')
29
+ TO to ("go 'to' the store.")
30
+ UH interjection ("errrrrrrrm")
31
+ VB verb, base form take
32
+ VBD verb, past tense took
33
+ VBG verb, gerund/present participle taking
34
+ VBN verb, past participle taken
35
+ VBP verb, sing. present, non-3d take
36
+ VBZ verb, 3rd person sing. present takes
37
+ WDT wh-determiner which
38
+ WP wh-pronoun who, what
39
+ WP$ possessive wh-pronoun whose
40
+ WRB wh-abverb where, when
41
+ """
42
+
43
+ from __future__ import annotations
44
+
45
+ # noinspection SpellCheckingInspection,GrazieInspection
46
+ graphemes = list("abcdefghijklmnopqrstuvwxyz")
47
+ phonemes = ['AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
48
+ 'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D', 'DH',
49
+ 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH',
50
+ 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG',
51
+ 'OW0', 'OW1', 'OW2', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH',
52
+ 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH']
53
+ pos_tags = ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNS',
54
+ 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'TO', 'UH',
55
+ 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB']
56
+ pos_type_tags = ['VERB', 'NOUN', 'PRON', 'ADJ', 'ADV']
57
+ pos_type_short_tags = ['V', 'N', 'P', 'A', 'R']
58
+ pos_type_form_dict = {'V': 'VERB', 'N': 'NOUN', 'P': 'PRON', 'A': 'ADJ', 'R': 'ADV'}
59
+ graphemes_set = set(graphemes)
60
+ phonemes_set = set(phonemes)
61
+ pos_tags_set = set(pos_tags)
62
+ pos_type_tags_set = set(pos_type_tags)
63
+ pos_type_short_tags_set = set(pos_type_short_tags)
64
+ punctuation = {'.', ',', ':', ';', '?', '!', '-', '_', '\'', '\"', '`', '~', '@', '#', '$'}
65
+ consonants = {'B', 'CH', 'D', 'DH', 'F', 'G', 'HH', 'JH', 'K', 'L', 'M', 'N', 'NG', 'P', 'R',
66
+ 'S', 'SH', 'T', 'TH', 'V', 'W', 'Y', 'Z', 'ZH'}
67
+
68
+
69
# Method to convert from short type tags to full type tags.
def to_full_type_tag(short_type_tag: str) -> str | None:
    """Expand a one-letter POS type tag to its full name.

    :param short_type_tag: One of 'V', 'N', 'P', 'A', 'R'.
    :return: Full tag ('VERB', 'NOUN', 'PRON', 'ADJ', 'ADV'), else None.
    """
    return {
        'V': 'VERB',
        'N': 'NOUN',
        'P': 'PRON',
        'A': 'ADJ',
        'R': 'ADV',
    }.get(short_type_tag)
resources/app/python/xvapitch/text/h2p_parser/text/__init__.py ADDED
File without changes
resources/app/python/xvapitch/text/h2p_parser/text/numbers.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Provides parsing of numbers to text
"""
This module provides parsing of numeric types in English to text.
Modified from https://github.com/keithito/tacotron
"""

import inflect
import re

# Magnitude words recognized after a currency amount, plus one-letter short forms.
_magnitudes = ['trillion', 'billion', 'million', 'thousand', 'hundred', 'm', 'b', 't']
# Expansion of the one-letter magnitude short forms.
_magnitudes_key = {'m': 'million', 'b': 'billion', 't': 'thousand' if False else 'trillion'}
_measurements = '(f|c|k|d|m|km|ft)'
# Spoken form of each measurement abbreviation.
# NOTE(review): 'd' is matched by _measurements but has no entry here, so a
# number followed by 'd' would raise KeyError in _expand_measurement — confirm.
_measurements_key = {'f': 'fahrenheit',
                     'c': 'celsius',
                     'k': 'thousand',
                     'm': 'meters',
                     'km': 'kilometers',
                     'ft': 'feet'}
# Spoken (singular) form of each supported currency symbol.
# NOTE(review): these keys appear encoding-mangled relative to the character
# class in _currency_re below ('Β£' vs '£', 'β‚©' vs '₩') — verify file encoding.
_currency_key = {'$': 'dollar', 'Β£': 'pound', '€': 'euro', 'β‚©': 'won'}
# Shared inflect engine used by all expanders below.
_inflect = inflect.engine()
# Numbers with comma grouping, e.g. "1,234,567".
_comma_number_re = re.compile(r'([0-9][0-9,]+[0-9])')
# Decimal numbers, e.g. "3.14".
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
# Currency symbol + amount + optional magnitude word, e.g. "$1.5 million".
_currency_re = re.compile(r'([$€£₩])([0-9.,]*[0-9]+)(?:[ ]?({})(?=[^a-zA-Z]|$))?'.format("|".join(_magnitudes)),
                          re.IGNORECASE)
# Number followed (optionally after one space) by a measurement abbreviation.
_measurement_re = re.compile(r'([0-9.,]*[0-9]+(\s)?{}\b)'.format(_measurements), re.IGNORECASE)
# Ordinals such as "1st", "2nd", "3rd", "4th".
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
# Numeric ranges like "3-5"; unused — the corresponding re.sub is commented out.
_range_re = re.compile(r'(?<=[0-9])+(-)(?=[0-9])+.*?')
_roman_re = re.compile(r'\b(?=[MDCLXVI]+\b)M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{2,3})\b')  # avoid I
# Dimension expressions such as "1920x1080".
_multiply_re = re.compile(r'(\b[0-9]+)(x)([0-9]+)')
# Plain numbers, optionally with a plural/possessive suffix ("1990s", "1990's").
_number_re = re.compile(r"[0-9]+'s|[0-9]+s|[0-9]+")
+
32
+
33
+ def _remove_commas(m):
34
+ return m.group(1).replace(',', '')
35
+
36
+
37
+ def _expand_decimal_point(m):
38
+ return m.group(1).replace('.', ' point ')
39
+
40
+
41
def _expand_currency(m):
    """Expand a currency match into spoken words.

    Group 1 is the currency symbol, group 2 the amount, group 3 an optional
    magnitude word ("million", "m", ...). Returns e.g. "five dollars" or
    "one dollar, fifty cents".
    """
    currency = _currency_key[m.group(1)]
    quantity = m.group(2)
    magnitude = m.group(3)

    # remove commas from quantity to be able to convert to numerical
    quantity = quantity.replace(',', '')

    # check for million, billion, etc...
    if magnitude is not None and magnitude.lower() in _magnitudes:
        # One-letter magnitudes ('m', 'b', 't') are expanded to full words.
        if len(magnitude) == 1:
            magnitude = _magnitudes_key[magnitude.lower()]
        return "{} {} {}".format(_expand_hundreds(quantity), magnitude, currency + 's')

    parts = quantity.split('.')
    if len(parts) > 2:
        return quantity + " " + currency + "s"  # Unexpected format

    dollars = int(parts[0]) if parts[0] else 0

    # NOTE(review): the fractional part is read as a plain integer, so
    # "$1.5" becomes 5 cents, not 50 — confirm this matches upstream intent.
    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
    if dollars and cents:
        # Singular unit when the count is exactly 1.
        dollar_unit = currency if dollars == 1 else currency + 's'
        cent_unit = 'cent' if cents == 1 else 'cents'
        return "{} {}, {} {}".format(
            _expand_hundreds(dollars), dollar_unit,
            _inflect.number_to_words(cents), cent_unit)
    elif dollars:
        dollar_unit = currency if dollars == 1 else currency + 's'
        return "{} {}".format(_expand_hundreds(dollars), dollar_unit)
    elif cents:
        cent_unit = 'cent' if cents == 1 else 'cents'
        return "{} {}".format(_inflect.number_to_words(cents), cent_unit)
    else:
        return 'zero' + ' ' + currency + 's'
76
+
77
+
78
def _expand_hundreds(text):
    """Speak 4-digit multiples of 100 (not 1000) as 'N hundred', e.g. 1500 -> 'fifteen hundred'."""
    value = float(text)
    reads_as_hundreds = 1000 < value < 10000 and value % 100 == 0 and value % 1000 != 0
    if reads_as_hundreds:
        return _inflect.number_to_words(int(value / 100)) + " hundred"
    return _inflect.number_to_words(text)
84
+
85
+
86
def _expand_ordinal(m):
    """Spell out an ordinal match such as '3rd' -> 'third'."""
    ordinal = m.group(0)
    return _inflect.number_to_words(ordinal)
88
+
89
+
90
def _expand_measurement(m):
    """Expand a number + unit-abbreviation match, e.g. '5 km' -> 'five kilometers'."""
    # Splitting on the captured numeric group yields ['', number, unit-part].
    _, number, measurement = re.split(r'(\d+(?:\.\d+)?)', m.group(0))
    number = _inflect.number_to_words(number)
    # Remove any whitespace between the number and the unit abbreviation.
    measurement = "".join(measurement.split())
    # NOTE(review): raises KeyError for abbreviations matched by the regex but
    # absent from _measurements_key (e.g. 'd') — confirm intended coverage.
    measurement = _measurements_key[measurement.lower()]
    # if measurement is plural, and number is singular, remove the 's'
    if number == "one" and str.endswith(measurement, "s"):
        # Remove the 's' from the end of the measurement
        measurement = measurement[:-1]
    return "{} {}".format(number, measurement)
100
+
101
+
102
+ def _expand_range(m):
103
+ return ' to '
104
+
105
+
106
+ def _expand_multiply(m):
107
+ left = m.group(1)
108
+ right = m.group(3)
109
+ return "{} by {}".format(left, right)
110
+
111
+
112
+ def _expand_roman(m):
113
+ # from https://stackoverflow.com/questions/19308177/converting-roman-numerals-to-integers-in-python
114
+ roman_numerals = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
115
+ result = 0
116
+ num = m.group(0)
117
+ for i, c in enumerate(num):
118
+ if (i + 1) == len(num) or roman_numerals[c] >= roman_numerals[num[i + 1]]:
119
+ result += roman_numerals[c]
120
+ else:
121
+ result -= roman_numerals[c]
122
+ return str(result)
123
+
124
+
125
def _expand_number(m):
    """Spell out a plain number match, with year-style reading for 1000-3000.

    An optional plural/possessive suffix ("1990s" / "1990's") is re-applied,
    turning e.g. "nineteen ninety" into "nineteen nineties".
    """
    # Splitting on the digit group yields ['', digits, suffix].
    _, number, suffix = re.split(r"(\d+(?:'?\d+)?)", m.group(0))
    number = int(number)
    # Fix: the original chained comparison `number > 1000 < 10000` only tested
    # `number > 1000`, so e.g. 123400 was misread as "1234 hundred". The value
    # must be bounded on both sides, as in _expand_hundreds.
    if 1000 < number < 10000 and (number % 100 == 0) and (number % 1000 != 0):
        # e.g. 1500 -> "fifteen hundred"
        text = _inflect.number_to_words(number // 100) + " hundred"
    elif 1000 < number < 3000:
        # Year-style reading.
        if number == 2000:
            text = 'two thousand'
        elif 2000 < number < 2010:
            text = 'two thousand ' + _inflect.number_to_words(number % 100)
        elif number % 100 == 0:
            text = _inflect.number_to_words(number // 100) + ' hundred'
        else:
            # Read as digit pairs, e.g. 1945 -> "nineteen forty five".
            number = _inflect.number_to_words(number, andword='', zero='oh', group=2).replace(', ', ' ')
            number = re.sub(r'-', ' ', number)
            text = number
    else:
        number = _inflect.number_to_words(number, andword='and')
        number = re.sub(r'-', ' ', number)
        number = re.sub(r',', '', number)
        text = number

    # Re-apply the plural/possessive suffix ("...ty" -> "...ties").
    if suffix in ("'s", "s"):
        if text[-1] == 'y':
            text = text[:-1] + 'ies'
        else:
            text = text + suffix

    return text
154
+
155
+
156
def normalize_numbers(text):
    """Expand all numeric expressions in *text* into spoken English words.

    The substitution order matters: commas are stripped first so later
    patterns see plain digits, and currency runs before decimals/plain
    numbers so the symbol and magnitude stay attached to their amount.
    """
    text = re.sub(_comma_number_re, _remove_commas, text)
    text = re.sub(_currency_re, _expand_currency, text)
    text = re.sub(_decimal_number_re, _expand_decimal_point, text)
    text = re.sub(_ordinal_re, _expand_ordinal, text)
    # Range expansion is disabled; the commented call is kept for reference.
    # text = re.sub(_range_re, _expand_range, text)
    text = re.sub(_measurement_re, _expand_measurement, text)
    text = re.sub(_roman_re, _expand_roman, text)
    text = re.sub(_multiply_re, _expand_multiply, text)
    text = re.sub(_number_re, _expand_number, text)
    return text
resources/app/python/xvapitch/text/h2p_parser/utils/__init__.py ADDED
File without changes
resources/app/python/xvapitch/text/h2p_parser/utils/converter.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Converts dictionary files
2
+ import json
3
+ import os
4
+
5
+ from .. import symbols
6
+ from .. import format_ph as ph
7
+ from tqdm import tqdm
8
+
9
+
10
def from_binary_delim(path, delimiter) -> dict:
    """Parse a delimited binary-state heteronym look-up file into a dict.

    Expected format: WORD|(Space Seperated Phonemes Case)|(Space Seperated Phonemes Default)|(Case)
    Example: "REJECT|R IH0 JH EH1 K T|R IY1 JH EH0 K T|V"
    Hashtag comments are allowed but only at the start of a file.

    Returns a dict of word -> {case_tag: phonemes, 'DEFAULT': phonemes}.
    Raises ValueError for any malformed line.
    """
    result_dict = {}
    # Count lines first so tqdm can show progress; use a context manager so
    # the handle is closed (the original counting pass leaked an open file).
    with open(path, 'r') as f:
        num_lines = sum(1 for _ in f)
    with open(path, 'r') as f:
        skipped_comments = False
        for line in tqdm(f, total=num_lines):
            # Skip the leading comment block (and blank lines within it).
            if not skipped_comments:
                if line.startswith('#') or line == '\n':
                    continue
                else:
                    skipped_comments = True
            # Skip empty or newline lines
            if line.strip() == '' or line.strip() == '\n':
                continue
            # Parse line using passed delimiter
            tokens = line.strip().split(delimiter)
            # Check for correct number of tokens
            if len(tokens) != 4:
                raise ValueError('Invalid number of tokens in line: ' + line)
            # Get word (token 0) and check validity (no spaces)
            word = tokens[0].lower()
            if ' ' in word:
                raise ValueError('Invalid word in line: ' + line)
            # Get phonemes and check validity (alphanumeric)
            ph_case = tokens[1]
            ph_default = tokens[2]
            if not ph_case.replace(' ', '').isalnum() or not ph_default.replace(' ', '').isalnum():
                raise ValueError('Invalid phonemes in line: ' + line)
            # Get case (token 3) and check validity (alphanumeric)
            # NOTE(review): isalnum() rejects 'PRP$'/'WP$' even though they are
            # in symbols.pos_tags_set, making those tags unreachable — confirm.
            case = tokens[3]
            if not case.isalnum():
                raise ValueError('Invalid case in line: ' + line)
            # Check if case is a full case or full type case
            if case in symbols.pos_tags_set or case in symbols.pos_type_tags_set:
                # Add to dictionary directly, building a sub-dictionary per case.
                sub_dict = result_dict.get(word, {})
                sub_dict[case] = ph.to_sds(ph_case)
                sub_dict['DEFAULT'] = ph.to_sds(ph_default)
                result_dict[word] = sub_dict
            elif case in symbols.pos_type_short_tags_set:
                # Short type tag: expand to its full form before storing.
                sub_dict = result_dict.get(word, {})
                full_case = symbols.pos_type_form_dict[case]
                sub_dict[full_case] = ph.to_sds(ph_case)
                sub_dict['DEFAULT'] = ph.to_sds(ph_default)
                result_dict[word] = sub_dict
            else:
                raise ValueError('Invalid case in line: ' + line)
    return result_dict
68
+
69
+
70
+ # Method to write a dict to a json file
71
+ def to_json(path, dict_to_write):
72
+ # Writes a dictionary to a json file
73
+ with open(path, 'w') as f:
74
+ json.dump(dict_to_write, f, indent=4, sort_keys=True)
75
+
76
+
77
# Combined method to convert binary delimited files to json
def bin_delim_to_json(path, output_path, delimiter):
    """Parse a binary-delimited dictionary file and write the result as JSON."""
    parsed = from_binary_delim(path, delimiter)
    to_json(output_path, parsed)
resources/app/python/xvapitch/text/h2p_parser/utils/parser.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Parses annotation files for conversion of sentences to phonemes
2
+ from __future__ import annotations
3
+ from h2p_parser import cmudictext
4
+ from h2p_parser.filter import filter_text
5
+ from h2p_parser.text.numbers import normalize_numbers
6
+ from h2p_parser.symbols import punctuation
7
+
8
+ # Reads a file into a list of lines
9
+ from tqdm import tqdm
10
+
11
+
12
def read_file(file_name, delimiter) -> list:
    """Read *file_name* and return the lowercased second delimiter-separated
    field of every line."""
    with open(file_name, 'r', encoding="utf-8") as src:
        # Take the second element of each split line.
        return [raw.split(delimiter)[1].lower() for raw in src]
20
+
21
+ # Method that checks if a single line is resolvable
22
+
23
+
24
+ # Checks a list of lines for unresolvable words
25
+ # Returns a list of lines with unresolvable words, or None if no unresolvable words
26
def check_lines(lines: list) -> ParseResult:
    """Check every word in *lines* for phoneme resolvability.

    Each line is filtered, number-normalized, and tokenized; each token is
    then checked against three sources in order: H2p heteronyms, the CMU
    dictionary, and the feature-based lookup. Returns a ParseResult with
    per-line/per-word coverage sets and resolution counts.
    """
    cde = cmudictext.CMUDictExt()
    # Holds result
    result = ParseResult()
    # Progress bar over the input lines.
    for line in tqdm(lines, desc='Checking lines'):
        # Record the raw line (list keeps duplicates, set keeps uniques).
        result.all_lines.append(line)
        result.lines.add(line)
        # If line contains het, add to result
        if cde.h2p.contains_het(line):
            result.all_lines_cont_het.append(line)
        # Filter the line
        f_line = filter_text(line)
        # Number converter
        f_line = normalize_numbers(f_line)
        # Tokenize
        tokens = cde.h2p.tokenize(f_line)
        for word in tokens:
            # Skip word if punctuation
            if word in punctuation:
                continue
            # Add word to result
            result.all_words.append(word)
            result.words.add(word)
            # Check if word is resolvable by any of the three sources.
            h2p_res = cde.h2p.contains_het(word)
            cmu_res = cde.dict.get(word) is not None
            fet_res = cde.lookup(word) is not None
            if not h2p_res and not cmu_res and not fet_res:
                # If word ends in "'s", remove it and add the base word.
                # NOTE(review): the stripped base word is recorded as
                # unresolved without being re-checked — confirm intended.
                if word.endswith("'s"):
                    word = word[:-2]
                result.unres_all_lines.append(line)
                result.unres_all_words.append(word)
                result.unres_lines.add(line)
                result.unres_words.add(word)
            elif h2p_res:
                result.n_words_res += 1
                result.n_words_het += 1
            elif cmu_res:
                result.n_words_res += 1
                result.n_words_cmu += 1
            elif fet_res:
                result.n_words_res += 1
                result.n_words_fet += 1

    # Also pass the feature-resolution stats collected by cmudictext.
    result.ft_stats = cde.p.stat_resolves

    return result
77
+
78
+
79
+ # Class to hold the result of a parse
80
class ParseResult:
    """Accumulates line/word coverage statistics for a parsing run."""

    def __init__(self):
        # Lines (lists keep duplicates; sets keep unique entries).
        self.all_lines = []
        self.all_lines_cont_het = []
        self.unres_all_lines = []
        self.lines = set()
        self.unres_lines = set()
        # Words
        self.all_words = []
        self.unres_all_words = []
        self.words = set()
        self.unres_words = set()
        # Numerical stats
        self.n_words_res = 0  # Number of total resolved words
        self.n_words_cmu = 0  # Resolved words from CMU
        self.n_words_fet = 0  # Resolved words from Features
        self.n_words_het = 0  # Resolved words from H2p
        # Stats from cmudictext
        self.ft_stats = None

    @staticmethod
    def _as_percent(fraction):
        # Shared rounding used by every coverage/percentage method.
        return round(fraction * 100, 2)

    def line_unique_coverage(self) -> float:
        """Percentage of unique lines with every word resolved."""
        return self._as_percent(1 - len(self.unres_lines) / len(self.lines))

    def word_unique_coverage(self) -> float:
        """Percentage of unique words resolved."""
        return self._as_percent(1 - len(self.unres_words) / len(self.words))

    def line_coverage(self) -> float:
        """Percentage of all lines (including duplicates) fully resolved."""
        return self._as_percent(1 - len(self.unres_all_lines) / len(self.all_lines))

    def word_coverage(self) -> float:
        """Percentage of all words (including duplicates) resolved."""
        return self._as_percent(1 - len(self.unres_all_words) / len(self.all_words))

    def percent_line_het(self) -> float:
        """Percentage of lines containing at least one heteronym."""
        return self._as_percent(len(self.all_lines_cont_het) / len(self.all_lines))

    def percent_word_h2p(self) -> float:
        """Percentage of resolved words handled by H2p."""
        return self._as_percent(self.n_words_het / self.n_words_res)

    def percent_word_cmu(self) -> float:
        """Percentage of resolved words handled by the CMU dictionary."""
        return self._as_percent(self.n_words_cmu / self.n_words_res)