AntonioMS committed on
Commit
94f2602
1 Parent(s): 9382c4c

Update preprocess.py

Browse files
Files changed (1) hide show
  1. preprocess.py +30 -137
preprocess.py CHANGED
@@ -9,103 +9,10 @@ from TTS.config import load_config
9
  from TTS.utils.manage import ModelManager
10
  from TTS.utils.synthesizer import Synthesizer
11
 
12
-
13
- PUNCLIST = [';', '?', '¿', ',', ':', '.', '!', '¡']
14
-
15
-
16
- def canBeNumber(n):
17
- try:
18
- int(n)
19
- return True
20
- except ValueError:
21
- # Not a number
22
- return False
23
-
24
- def accent_convert(phontrans):
25
- transcript = re.sub('a\^','á',phontrans)
26
- transcript = re.sub('e\^','é',transcript)
27
- transcript = re.sub('i\^','í',transcript)
28
- transcript = re.sub('o\^','ó',transcript)
29
- transcript = re.sub('u\^','ú',transcript)
30
- transcript = re.sub('E\^','É',transcript)
31
- transcript = re.sub('O\^','Ó',transcript)
32
- return transcript
33
-
34
- def remove_tra3_tags(phontrans):
35
- s = re.sub(r'#(.+?)#', r'', phontrans)
36
- s = re.sub(r'%(.+?)%', r'', s)
37
- s = re.sub(' +',' ',s)
38
- s = re.sub('-','',s)
39
- return s.strip()
40
-
41
  def sanitize_filename(filename):
42
  """Remove or replace any characters that are not allowed in file names."""
43
  return ''.join(c for c in filename if c.isalnum() or c in (' ', '_', '-')).rstrip()
44
 
45
- def is_number(index, text):
46
- if index == 0:
47
- return False
48
- elif index == len(text) - 1:
49
- return False
50
- else:
51
- return canBeNumber(text[index - 1]) and canBeNumber(text[index + 1])
52
-
53
- #Splits text from punctuation marks, gives list of segments in between and the punctuation marks. Skips punctuation not present in training.
54
- def split_punc(text):
55
- segments = []
56
- puncs = []
57
- curr_seg = ""
58
- previous_punc = False
59
- for i, c in enumerate(text):
60
- if c in PUNCLIST and not previous_punc and not is_number(i, text):
61
- curr_seg += c
62
- segments.append(curr_seg.strip())
63
- puncs.append(c)
64
- curr_seg = ""
65
- previous_punc = True
66
- elif c in PUNCLIST and previous_punc:
67
- curr_seg += c
68
- puncs[-1] += c
69
- else:
70
- curr_seg += c
71
- previous_punc = False
72
-
73
- segments.append(curr_seg.strip())
74
-
75
- # print("Split Segments: ", segments)
76
-
77
- #Remove empty segments in the list
78
- segments = filter(None, segments)
79
-
80
- # store segments as a list
81
- segments = list(segments)
82
-
83
- # print("Split Segments: ", segments)
84
- # print("Split Puncs: ", puncs)
85
-
86
- return segments, puncs
87
-
88
- def merge_punc(text_segs, puncs):
89
- merged_str = ""
90
- # print("Text segs: ", text_segs)
91
- # print("Puncs: ", puncs)
92
- for i, seg in enumerate(text_segs):
93
- merged_str += seg + " "
94
-
95
- if i < len(puncs):
96
- merged_str += puncs[i] + " "
97
-
98
- # remove spaces before , . ! ? ; : ) ] of the merged string
99
- merged_str = re.sub(r"\s+([.,!?;:)\]])", r"\1", merged_str)
100
-
101
- # remove spaces after ( [ ¡ ¿ of the merged string
102
- merged_str = re.sub(r"([\(\[¡¿])\s+", r"\1", merged_str)
103
-
104
- # print("Merged str: ", merged_str)
105
-
106
- return merged_str.strip()
107
-
108
-
109
  # función que engade a puntuación orixinal á extensión de números de cotovía (opción p)
110
  def punctuate_p(str_ext):
111
 
@@ -146,82 +53,68 @@ def punctuate_p(str_ext):
146
  return str_ext
147
 
148
 
149
- def to_cotovia(text_segments):
150
- # Input and output Cotovía files
151
- res = ''.join(random.choices(string.ascii_lowercase + string.digits, k=5))
152
- COTOVIA_IN_TXT_PATH = res + '.txt'
153
- COTOVIA_IN_TXT_PATH_ISO = 'iso8859-1' + res + '.txt'
154
- COTOVIA_OUT_PRE_PATH = 'iso8859-1' + res + '.tra'
155
- COTOVIA_OUT_PRE_PATH_UTF8 = 'utf8' + res + '.tra'
156
-
157
-
158
- # print("Text segments: ", text_segments)
159
- # Initial text preprocessing
160
  # substitute ' M€' by 'millóns de euros' and 'somewordM€' by 'someword millóns de euros'
161
- text_segments = [re.sub(r"(\w+)\s*M€", r"\1 millóns de euros", seg) for seg in text_segments]
162
 
163
  # substitute ' €' by 'euros' and 'someword€' by 'someword euros'
164
- text_segments = [re.sub(r"(\w+)\s*€", r"\1 euros", seg) for seg in text_segments]
165
-
166
  # substitute ' ºC' by 'graos centígrados' and 'somewordºC' by 'someword graos centígrados'
167
- text_segments = [re.sub(r"(\w+)\s*ºC", r"\1 graos centígrados", seg) for seg in text_segments]
168
 
 
 
169
 
170
- text_segments = [subprocess.run(["sed", "-e", "s/₂//g", "-e", "s/⸺//g", "-e", "s/ //g", "-e", "s///g", "-e", "s/č/c/g", "-e", "s/ț/t/g", "-e", "s/ğ/g/g", "-e", "s/ș/s/g",
171
  "-e", "s/ş/s/g", "-e", "s/Ž/Z/g", "-e", "s/ž/z/g", "-e", "s/ț/t/g", "-e", "s/ğ/g/g", "-e", "s/ș/s/g", "-e", "s/ş/s/g", "-e", "s/«//g", "-e", "s/»//g",
172
  "-e", "s/<<//g", "-e", "s/>>//g", "-e", "s/“/\"/g", "-e", "s/”/'\"'/g", "-e", "s/\'//g", "-e", "s/‘//g", "-e", "s/’//g", "-e", "s/…//g",
173
- "-e", "s/-/-/g", "-e", "s/–/-/g", "-e", "s/—/-/g", "-e", "s/―/-/g", "-e", "s/−/-/g", "-e", "s/‒/-/g", "-e", "s/─/-/g", "-e", "s/^Si$/Si\./g"],
174
- input=seg, text=True, capture_output=True).stdout for seg in text_segments]
175
-
176
- # print("Text segments after sed: ", text_segments)
 
 
 
 
177
 
178
  with open(COTOVIA_IN_TXT_PATH, 'w') as f:
179
- for seg in text_segments:
180
- if seg:
181
- f.write(seg + '\n')
182
- else:
183
- f.write(',' + '\n')
184
 
185
  # utf-8 to iso8859-1
186
  subprocess.run(["iconv", "-f", "utf-8", "-t", "iso8859-1", COTOVIA_IN_TXT_PATH, "-o", COTOVIA_IN_TXT_PATH_ISO], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
187
- # call cotovia with -t3 option
188
- subprocess.run(["cotovia", "-i", COTOVIA_IN_TXT_PATH_ISO, "-t3", "-n"], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
189
- # iso8859-1 to utf-8
190
  subprocess.run(["iconv", "-f", "iso8859-1", "-t", "utf-8", COTOVIA_OUT_PRE_PATH, "-o", COTOVIA_OUT_PRE_PATH_UTF8], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
191
 
192
  segs = []
193
  try:
194
  with open(COTOVIA_OUT_PRE_PATH_UTF8, 'r') as f:
195
  segs = [line.rstrip() for line in f]
196
- segs = [remove_tra3_tags(line) for line in segs]
 
197
  except:
198
  print("ERROR: Couldn't read cotovia output")
199
 
200
  subprocess.run(["rm", COTOVIA_IN_TXT_PATH, COTOVIA_IN_TXT_PATH_ISO, COTOVIA_OUT_PRE_PATH, COTOVIA_OUT_PRE_PATH_UTF8], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
201
 
202
- # print("Cotovia segments: ", segs)
203
-
204
  return segs
205
 
206
  def text_preprocess(text):
207
 
208
- #Split from punc
209
- text_segments, puncs = split_punc(text)
210
-
211
- cotovia_phon_segs = to_cotovia(text_segments)
212
 
213
- cotovia_phon_str = merge_punc(cotovia_phon_segs, puncs)
214
-
215
- phon_str = accent_convert(cotovia_phon_str)
216
 
217
- # remove extra spaces
218
- phon_str = re.sub(r"\s+", r" ", phon_str)
 
219
 
220
- # add final punctuation mark if it is not present
221
- if not re.match(r"[.!?]", phon_str[-1]):
222
- phon_str = phon_str + "."
223
 
224
- return phon_str
225
 
226
  def main():
227
  parser = argparse.ArgumentParser(description='Cotovia phoneme transcription.')
 
9
  from TTS.utils.manage import ModelManager
10
  from TTS.utils.synthesizer import Synthesizer
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
def sanitize_filename(filename):
    """Return *filename* reduced to characters that are safe in file names.

    Only alphanumeric characters, spaces, underscores and hyphens are kept;
    everything else is dropped (nothing is substituted). Trailing whitespace
    is stripped from the result; leading whitespace is left as is.
    """
    extra_allowed = (' ', '_', '-')
    kept_chars = [ch for ch in filename if ch.isalnum() or ch in extra_allowed]
    cleaned = ''.join(kept_chars)
    return cleaned.rstrip()
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  # función que engade a puntuación orixinal á extensión de números de cotovía (opción p)
17
  def punctuate_p(str_ext):
18
 
 
53
  return str_ext
54
 
55
 
56
def to_cotovia(text):
    """Run *text* through the Cotovía preprocessor and return its output lines.

    Steps, in order:
      1. Spell out ``M€``, ``€`` and ``ºC`` in Galician with regexes.
      2. Drop or replace a fixed set of characters with ``sed``.
      3. Write the result to a scratch file in the current working directory,
         convert it to ISO-8859-1 with ``iconv`` and call ``cotovia -p`` on it.
      4. Convert the ``.pre`` output back to UTF-8, read it line by line and
         pass every line through ``punctuate_p``.

    The scratch files are always removed, even if one of the steps raises.

    Args:
        text: Input text as a ``str``.

    Returns:
        A list with one string per line of Cotovía output. If the output file
        cannot be read the list is empty; if ``punctuate_p`` fails the raw,
        right-stripped lines are returned instead. In both cases an error
        message is printed.
    """
    # Function-scope imports: the file's top-level import block is not
    # guaranteed to provide these two modules.
    import contextlib
    import os

    ## Initial text preprocessing
    # substitute ' M€' by 'millóns de euros' and 'somewordM€' by 'someword millóns de euros'
    text = re.sub(r"(\w+)\s*M€", r"\1 millóns de euros", text)

    # substitute ' €' by 'euros' and 'someword€' by 'someword euros'
    text = re.sub(r"(\w+)\s*€", r"\1 euros", text)

    # substitute ' ºC' by 'graos centígrados' and 'somewordºC' by 'someword graos centígrados'
    text = re.sub(r"(\w+)\s*ºC", r"\1 graos centígrados", text)

    # NOTE(review): some of the sed patterns below (e.g. the third and fourth,
    # and the first hyphen rule) appear to contain non-printing or look-alike
    # characters. Verify them byte-for-byte against the repository copy
    # before merging; they are meant to be unchanged here.
    text = subprocess.run(["sed", "-e", "s/₂//g", "-e", "s/⸺//g", "-e", "s/ //g", "-e", "s///g", "-e", "s/č/c/g", "-e", "s/ț/t/g", "-e", "s/ğ/g/g", "-e", "s/ș/s/g",
                           "-e", "s/ş/s/g", "-e", "s/Ž/Z/g", "-e", "s/ž/z/g", "-e", "s/ț/t/g", "-e", "s/ğ/g/g", "-e", "s/ș/s/g", "-e", "s/ş/s/g", "-e", "s/«//g", "-e", "s/»//g",
                           "-e", "s/<<//g", "-e", "s/>>//g", "-e", "s/“/\"/g", "-e", "s/”/'\"'/g", "-e", "s/\'//g", "-e", "s/‘//g", "-e", "s/’//g", "-e", "s/…//g",
                           "-e", "s/-/-/g", "-e", "s/–/-/g", "-e", "s/—/-/g", "-e", "s/―/-/g", "-e", "s/−/-/g", "-e", "s/‒/-/g", "-e", "s/─/-/g"],
                          input=text, text=True, capture_output=True).stdout

    # Random suffix so that concurrent calls are unlikely to share scratch files.
    token = ''.join(random.choices(string.ascii_lowercase + string.digits, k=5))

    # Input and output Cotovía files.
    # Presumably cotovia writes its -p output next to the input file with a
    # ".pre" extension, which is why the output name mirrors the ISO input
    # name -- TODO confirm against the Cotovía documentation.
    in_txt_path = token + '.txt'
    in_txt_path_iso = 'iso8859-1' + token + '.txt'
    out_pre_path = 'iso8859-1' + token + '.pre'
    out_pre_path_utf8 = 'utf8' + token + '.pre'
    scratch_paths = (in_txt_path, in_txt_path_iso, out_pre_path, out_pre_path_utf8)

    segs = []
    try:
        # iconv is told this file is UTF-8, so write it as UTF-8 explicitly
        # instead of relying on the locale's default encoding.
        with open(in_txt_path, 'w', encoding='utf-8') as f:
            f.write(text + '\n')

        # utf-8 to iso8859-1
        subprocess.run(["iconv", "-f", "utf-8", "-t", "iso8859-1", in_txt_path, "-o", in_txt_path_iso], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
        # call cotovia with the -p option
        subprocess.run(["cotovia", "-i", in_txt_path_iso, "-p"], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
        # iso8859-1 to utf-8
        subprocess.run(["iconv", "-f", "iso8859-1", "-t", "utf-8", out_pre_path, "-o", out_pre_path_utf8], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)

        try:
            # The file was produced by "iconv -t utf-8", so decode it as UTF-8.
            with open(out_pre_path_utf8, 'r', encoding='utf-8') as f:
                segs = [line.rstrip() for line in f]
            # Kept as a separate step so the raw lines survive in `segs`
            # if punctuate_p raises.
            segs = [punctuate_p(line) for line in segs]
        except Exception:
            # Best effort: report and fall back to whatever was collected,
            # but let KeyboardInterrupt/SystemExit propagate.
            print("ERROR: Couldn't read cotovia output")
    finally:
        # Remove the scratch files even when a step above failed; files that
        # were never created are ignored.
        for path in scratch_paths:
            with contextlib.suppress(OSError):
                os.remove(path)

    return segs
103
 
104
def text_preprocess(text):
    """Preprocess *text* with Cotovía and return it as a single string.

    The lines returned by ``to_cotovia`` are joined with single spaces and a
    final ``.`` is appended when the result does not already end in an ASCII
    punctuation character (``string.punctuation``).

    Args:
        text: Input text as a ``str``.

    Returns:
        The preprocessed text. An empty string is returned when
        ``to_cotovia`` produced no output, instead of raising ``IndexError``.
    """
    cotovia_lines = to_cotovia(text)

    # convert list to string; a trailing empty line would otherwise leave a
    # trailing space and hide punctuation that is already there
    joined = ' '.join(cotovia_lines).rstrip()

    # to_cotovia returns an empty list when the Cotovía output cannot be
    # read; there is nothing to punctuate in that case
    if not joined:
        return joined

    # add final punctuation if missing
    if joined[-1] not in string.punctuation:
        joined += '.'

    return joined
 
 
117
 
 
118
 
119
  def main():
120
  parser = argparse.ArgumentParser(description='Cotovia phoneme transcription.')