AntonioMS commited on
Commit
9382c4c
1 Parent(s): 25a3bda

add preprocessing

Browse files
Files changed (2) hide show
  1. README.md +22 -21
  2. preprocess.py +255 -0
README.md CHANGED
@@ -21,7 +21,7 @@ This model was trained from scratch using the [Coqui TTS](https://github.com/coq
21
 
22
  A live inference demo can be found in our official page, [here](https://tts.nos.gal/).
23
 
24
- This model was trained using graphemes, so no preprocessing is needed for the input text.
25
 
26
  ## Intended uses and limitations
27
 
@@ -30,38 +30,39 @@ You can use this model to generate synthetic speech in Galician.
30
  ## How to use
31
  ### Usage
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  Required libraries:
34
 
35
  ```bash
36
  pip install TTS
37
  ```
38
 
39
- Synthesize a speech using python:
40
 
41
  ```bash
42
- import tempfile
43
- import numpy as np
44
- import os
45
- import json
46
-
47
- from typing import Optional
48
- from TTS.config import load_config
49
- from TTS.utils.manage import ModelManager
50
- from TTS.utils.synthesizer import Synthesizer
51
- model_path = # Absolute path to the model checkpoint.pth
52
- config_path = # Absolute path to the model config.json
53
- text = "Text to synthetize"
54
- synthesizer = Synthesizer(
55
- model_path, config_path, None, None, None, None,
56
- )
57
- wavs = synthesizer.tts(text)
58
  ```
59
 
 
60
 
61
- ## Training
62
- ### Training Procedure
63
- ### Data preparation
64
 
 
65
 
66
  ### Hyperparameter
67
 
 
21
 
22
  A live inference demo can be found in our official page, [here](https://tts.nos.gal/).
23
 
24
+ This model was trained using graphemes. A preprocessing with the [Cotovía](http://gtm.uvigo.es/en/transfer/software/cotovia/) tool is needed for the input text.
25
 
26
  ## Intended uses and limitations
27
 
 
30
  ## How to use
31
  ### Usage
32
 
33
+ #### Cotovía preprocessor
34
+
35
+ To generate phonetic transcriptions, the Cotovía tool is needed. The tool can be downloaded from the [SourceForge](https://sourceforge.net/projects/cotovia/files/Debian%20packages/) website. The required Debian packages are `cotovia_0.5_amd64.deb` and `cotovia-lang-gl_0.5_all.deb`, which can be installed with the following commands:
36
+
37
+ ```bash
38
+ sudo dpkg -i cotovia_0.5_amd64.deb
39
+ sudo dpkg -i cotovia-lang-gl_0.5_all.deb
40
+ ```
41
+
42
+ The tool can be used to generate the phonetic transcription of the text. The following command can be used to generate the phonetic transcription of a text string:
43
+
44
+ ```bash
45
+ echo "Era unha avioneta... O piloto era pequeno, que se chega a ser dos grandes, tómbate!" | cotovia -p -n -S | iconv -f iso88591 -t utf8
46
+ ```
47
+
48
+ The output of the command is the phonetic transcription of the input text. This string may be used in the inference part, as shown next.
49
+
50
  Required libraries:
51
 
52
  ```bash
53
  pip install TTS
54
  ```
55
 
56
+ Synthesize speech using Python and the script preprocess.py, available in this repository:
57
 
58
  ```bash
59
+ python preprocess.py text model_path config_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  ```
61
 
62
+ This script takes a text input, preprocesses it with the Cotovía tool, synthesizes speech from the preprocessed text, and saves the output as a .wav file.
63
 
 
 
 
64
 
65
+ ## Training
66
 
67
  ### Hyperparameter
68
 
preprocess.py ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import tempfile
3
+ import random
4
+ import re
5
+ import string
6
+ import subprocess
7
+ from typing import Optional
8
+ from TTS.config import load_config
9
+ from TTS.utils.manage import ModelManager
10
+ from TTS.utils.synthesizer import Synthesizer
11
+
12
+
13
# Punctuation marks seen in training; split_punc() splits the input text on these.
PUNCLIST = [';', '?', '¿', ',', ':', '.', '!', '¡']
14
+
15
+
16
def canBeNumber(n):
    """Return True when *n* can be parsed as a base-10 integer."""
    try:
        int(n)
    except ValueError:
        # int() rejected the value, so it is not a number.
        return False
    return True
23
+
24
def accent_convert(phontrans):
    """Replace Cotovía's caret accent marks (e.g. 'a^') with accented vowels."""
    # Each mapping touches a distinct two-character sequence, so order is irrelevant.
    caret_to_accent = (
        ('a^', 'á'), ('e^', 'é'), ('i^', 'í'), ('o^', 'ó'),
        ('u^', 'ú'), ('E^', 'É'), ('O^', 'Ó'),
    )
    converted = phontrans
    for mark, vowel in caret_to_accent:
        converted = converted.replace(mark, vowel)
    return converted
33
+
34
def remove_tra3_tags(phontrans):
    """Strip Cotovía -t3 markup: '#...#' and '%...%' tags and hyphens,
    collapsing runs of spaces and trimming the ends."""
    # Order matters: tags are removed before spaces are collapsed.
    cleanup_rules = (
        (r'#(.+?)#', r''),
        (r'%(.+?)%', r''),
        (' +', ' '),
        ('-', ''),
    )
    for pattern, replacement in cleanup_rules:
        phontrans = re.sub(pattern, replacement, phontrans)
    return phontrans.strip()
40
+
41
def sanitize_filename(filename):
    """Keep only alphanumerics, spaces, underscores and hyphens, then drop
    trailing whitespace, so the result is a safe file name."""
    extra_allowed = {' ', '_', '-'}
    kept = [ch for ch in filename if ch.isalnum() or ch in extra_allowed]
    return ''.join(kept).rstrip()
44
+
45
def is_number(index, text):
    """True when text[index] sits strictly between two digit characters
    (e.g. the comma in '1,5'), so it should not be treated as punctuation."""
    # A separator at either edge of the string cannot be inside a number.
    if index == 0 or index == len(text) - 1:
        return False
    return canBeNumber(text[index - 1]) and canBeNumber(text[index + 1])
52
+
53
def split_punc(text):
    """Split *text* at the punctuation marks used in training.

    Returns (segments, puncs): each segment keeps its trailing mark, and a
    run of consecutive marks is folded into the previous entry of *puncs*.
    Separators inside numbers (e.g. the comma in '1,5') are left alone.
    """
    segments = []
    puncs = []
    buffer = ""
    in_punc_run = False

    for pos, ch in enumerate(text):
        is_punc = ch in PUNCLIST
        if is_punc and not in_punc_run and not is_number(pos, text):
            # Close the current segment at this mark (segment keeps the mark).
            buffer += ch
            segments.append(buffer.strip())
            puncs.append(ch)
            buffer = ""
            in_punc_run = True
        elif is_punc and in_punc_run:
            # Extra marks right after a split: attach to the previous punc.
            buffer += ch
            puncs[-1] += ch
        else:
            buffer += ch
            in_punc_run = False

    # Whatever is left after the last mark is its own segment.
    segments.append(buffer.strip())

    # Drop empty segments produced by leading/trailing punctuation.
    segments = [seg for seg in segments if seg]

    return segments, puncs
87
+
88
def merge_punc(text_segs, puncs):
    """Rejoin the output of split_punc: interleave segments with their
    punctuation marks, then normalise spacing around the punctuation."""
    pieces = []
    for idx, seg in enumerate(text_segs):
        pieces.append(seg)
        # puncs may be shorter than text_segs (no mark after the last segment).
        if idx < len(puncs):
            pieces.append(puncs[idx])

    merged_str = "".join(piece + " " for piece in pieces)

    # No space allowed before closing punctuation , . ! ? ; : ) ]
    merged_str = re.sub(r"\s+([.,!?;:)\]])", r"\1", merged_str)

    # No space allowed after opening punctuation ( [ ¡ ¿
    merged_str = re.sub(r"([\(\[¡¿])\s+", r"\1", merged_str)

    return merged_str.strip()
107
+
108
+
109
# Re-adds the original punctuation to Cotovía's number expansion (option -p).
def punctuate_p(str_ext):
    """Normalise punctuation and spacing in Cotovía's -p output.

    The substitutions are order-sensitive and are applied one after another.
    """
    rules = (
        # ' ·' is Cotovía's ellipsis marker.
        (r" ·", r"..."),
        # no space before closing punctuation , . ! ? ; : ) ]
        (r"\s+([.,!?;:)\]])", r"\1"),
        # no space after opening punctuation ( [ ¡ ¿
        (r"([\(\[¡¿])\s+", r"\1"),
        # trim spaces just inside double quotes
        (r'"\s*([^"]*?)\s*"', r'"\1"'),
        # '- text -' -> '-text-'
        (r"-\s*([^-]*?)\s*-", r"-\1-"),
        # drop inverted question/exclamation marks
        (r"[¿¡]", r""),
        # collapse whitespace runs
        (r"\s+", r" "),
        # '12-34' -> '12 34'
        (r"(\d+)\s*-\s*(\d+)", r"\1 \2"),
        # 'word -aside- ...' -> 'word, aside, '
        (r"(\w+)\s+-([^-]*?)-\s+([^-]*?)", r"\1, \2, "),
        # 'word - ' -> 'word, '
        (r"(\w+[!\?]?)\s+-\s*", r"\1, "),
        # 'word ( text )' -> 'word, text,'
        (r"(\w+)\s*\(\s*([^\(\)]*?)\s*\)", r"\1, \2,"),
    )
    for pattern, replacement in rules:
        str_ext = re.sub(pattern, replacement, str_ext)
    return str_ext
147
+
148
+
149
def to_cotovia(text_segments):
    """Run the external Cotovía front-end over *text_segments* and return the
    raw -t3 transcription of each segment, cleaned with remove_tra3_tags.

    Side effects: creates four temporary files (named after a random 5-char
    token) in the current working directory and removes them before
    returning.  Requires the `cotovia`, `sed`, `iconv` and `rm` binaries on
    PATH; on failure an empty list is returned.
    """
    # Input and output Cotovía files
    res = ''.join(random.choices(string.ascii_lowercase + string.digits, k=5))
    COTOVIA_IN_TXT_PATH = res + '.txt'
    COTOVIA_IN_TXT_PATH_ISO = 'iso8859-1' + res + '.txt'
    COTOVIA_OUT_PRE_PATH = 'iso8859-1' + res + '.tra'
    COTOVIA_OUT_PRE_PATH_UTF8 = 'utf8' + res + '.tra'

    # Initial text preprocessing: expand currency/temperature symbols that
    # Cotovía does not verbalise on its own.
    # substitute ' M€' by 'millóns de euros' and 'somewordM€' by 'someword millóns de euros'
    text_segments = [re.sub(r"(\w+)\s*M€", r"\1 millóns de euros", seg) for seg in text_segments]

    # substitute ' €' by 'euros' and 'someword€' by 'someword euros'
    text_segments = [re.sub(r"(\w+)\s*€", r"\1 euros", seg) for seg in text_segments]

    # substitute ' ºC' by 'graos centígrados' and 'somewordºC' by 'someword graos centígrados'
    text_segments = [re.sub(r"(\w+)\s*ºC", r"\1 graos centígrados", seg) for seg in text_segments]

    # Shell out to sed to strip/normalise characters outside Cotovía's
    # ISO-8859-1 repertoire (typographic dashes and quotes, subscripts, …).
    # NOTE(review): some expressions ("s/ //g", "s///g") appear to target
    # invisible/non-breaking characters — confirm against the original file.
    text_segments = [subprocess.run(["sed", "-e", "s/₂//g", "-e", "s/⸺//g", "-e", "s/ //g", "-e", "s///g", "-e", "s/č/c/g", "-e", "s/ț/t/g", "-e", "s/ğ/g/g", "-e", "s/ș/s/g",
                                     "-e", "s/ş/s/g", "-e", "s/Ž/Z/g", "-e", "s/ž/z/g", "-e", "s/ț/t/g", "-e", "s/ğ/g/g", "-e", "s/ș/s/g", "-e", "s/ş/s/g", "-e", "s/«//g", "-e", "s/»//g",
                                     "-e", "s/<<//g", "-e", "s/>>//g", "-e", "s/“/\"/g", "-e", "s/”/'\"'/g", "-e", "s/\'//g", "-e", "s/‘//g", "-e", "s/’//g", "-e", "s/…//g",
                                     "-e", "s/-/-/g", "-e", "s/–/-/g", "-e", "s/—/-/g", "-e", "s/―/-/g", "-e", "s/−/-/g", "-e", "s/‒/-/g", "-e", "s/─/-/g", "-e", "s/^Si$/Si\./g"],
                                    input=seg, text=True, capture_output=True).stdout for seg in text_segments]

    # One segment per line; an empty segment becomes a bare comma so the
    # line count (and thus segment/punc alignment) is preserved.
    with open(COTOVIA_IN_TXT_PATH, 'w') as f:
        for seg in text_segments:
            if seg:
                f.write(seg + '\n')
            else:
                f.write(',' + '\n')

    # utf-8 to iso8859-1 (Cotovía only consumes Latin-1 input)
    subprocess.run(["iconv", "-f", "utf-8", "-t", "iso8859-1", COTOVIA_IN_TXT_PATH, "-o", COTOVIA_IN_TXT_PATH_ISO], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
    # call cotovia with -t3 option; presumably it writes '<input>.tra'
    # next to the input file — TODO confirm against the cotovia man page.
    subprocess.run(["cotovia", "-i", COTOVIA_IN_TXT_PATH_ISO, "-t3", "-n"], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
    # iso8859-1 to utf-8
    subprocess.run(["iconv", "-f", "iso8859-1", "-t", "utf-8", COTOVIA_OUT_PRE_PATH, "-o", COTOVIA_OUT_PRE_PATH_UTF8], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)

    segs = []
    try:
        with open(COTOVIA_OUT_PRE_PATH_UTF8, 'r') as f:
            segs = [line.rstrip() for line in f]
            segs = [remove_tra3_tags(line) for line in segs]
    # NOTE(review): bare except hides the real error (e.g. FileNotFoundError
    # when cotovia is missing); consider narrowing to OSError.
    except:
        print("ERROR: Couldn't read cotovia output")

    # Clean up all four temporary files.
    subprocess.run(["rm", COTOVIA_IN_TXT_PATH, COTOVIA_IN_TXT_PATH_ISO, COTOVIA_OUT_PRE_PATH, COTOVIA_OUT_PRE_PATH_UTF8], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)

    return segs
205
+
206
def text_preprocess(text):
    """Convert raw text to a Cotovía phonetic transcription string.

    Pipeline: split at punctuation, transcribe each segment with the
    external `cotovia` tool, re-attach the punctuation, convert caret
    accents (a^ -> á), normalise whitespace and guarantee a final
    sentence-ending mark.  Requires the Cotovía binary (see to_cotovia).
    """
    # Split into segments between punctuation marks (marks kept aside).
    text_segments, puncs = split_punc(text)

    # Phonetic transcription of each segment via the external tool.
    cotovia_phon_segs = to_cotovia(text_segments)

    # Re-attach the original punctuation between segments.
    cotovia_phon_str = merge_punc(cotovia_phon_segs, puncs)

    # Convert Cotovía caret accents to accented vowels.
    phon_str = accent_convert(cotovia_phon_str)

    # Collapse whitespace runs.
    phon_str = re.sub(r"\s+", r" ", phon_str)

    # Add a final punctuation mark if one is not present.  Guard the empty
    # string: the previous phon_str[-1] check raised IndexError on it.
    if not phon_str or phon_str[-1] not in ".!?":
        phon_str = phon_str + "."

    return phon_str
225
+
226
def main():
    """CLI entry point: preprocess a text with Cotovía and synthesize speech.

    Usage: preprocess.py TEXT MODEL_PATH CONFIG_PATH.  The synthesized audio
    is written to '<first-word>.wav' in the current directory.
    """
    parser = argparse.ArgumentParser(description='Cotovia phoneme transcription.')
    parser.add_argument('text', type=str, help='Text to synthesize')
    parser.add_argument('model_path', type=str, help='Absolute path to the model checkpoint.pth')
    parser.add_argument('config_path', type=str, help='Absolute path to the model config.json')

    args = parser.parse_args()

    print("Text before preprocessing: ", args.text)
    text = text_preprocess(args.text)
    print("Text after preprocessing: ", text)

    synthesizer = Synthesizer(
        args.model_path, args.config_path, None, None, None, None,
    )

    # Name the output file after the first word of the input (or 'audio'),
    # sanitized so it is a valid file name.
    first_word = args.text.split()[0] if args.text.split() else "audio"
    first_word = sanitize_filename(first_word)

    # Synthesize exactly once (the original called tts() twice, doubling
    # the synthesis work) and save the result with the synthesizer helper.
    wavs = synthesizer.tts(text)
    filename = f"{first_word}.wav"
    synthesizer.save_wav(wavs, filename)

    # Fixed broken f-string: the original printed a literal placeholder
    # instead of the actual output file name.
    print(f"Audio file saved as: {filename}")


if __name__ == "__main__":
    main()