add preprocessing
- README.md +22 -21
- preprocess.py +255 -0

README.md CHANGED
@@ -21,7 +21,7 @@ This model was trained from scratch using the [Coqui TTS](https://github.com/coq
 
 A live inference demo can be found in our official page, [here](https://tts.nos.gal/).
 
-This model was trained using graphemes
+This model was trained using graphemes. The input text must first be preprocessed with the [Cotovía](http://gtm.uvigo.es/en/transfer/software/cotovia/) tool.
 
 ## Intended uses and limitations
 
@@ -30,38 +30,39 @@ You can use this model to generate synthetic speech in Galician.
 ## How to use
 ### Usage
 
+#### Cotovía preprocessor
+
+To generate phonetic transcriptions, the Cotovía tool is needed. It can be downloaded from [SourceForge](https://sourceforge.net/projects/cotovia/files/Debian%20packages/). The required Debian packages are `cotovia_0.5_amd64.deb` and `cotovia-lang-gl_0.5_all.deb`, which can be installed with the following commands:
+
+```bash
+sudo dpkg -i cotovia_0.5_amd64.deb
+sudo dpkg -i cotovia-lang-gl_0.5_all.deb
+```
+
+The following command generates the phonetic transcription of a text string:
+
+```bash
+echo "Era unha avioneta... O piloto era pequeno, que se chega a ser dos grandes, tómbate!" | cotovia -p -n -S | iconv -f iso88591 -t utf8
+```
+
+The command outputs the phonetic transcription of the input text, which can then be used for inference, as shown next.
+
 Required libraries:
 
 ```bash
 pip install TTS
 ```
 
-Synthesize
+Synthesize speech using Python and the `preprocess.py` script, available in this repository:
 
 ```bash
-
-import numpy as np
-import os
-import json
-
-from typing import Optional
-from TTS.config import load_config
-from TTS.utils.manage import ModelManager
-from TTS.utils.synthesizer import Synthesizer
-model_path = # Absolute path to the model checkpoint.pth
-config_path = # Absolute path to the model config.json
-text = "Text to synthetize"
-synthesizer = Synthesizer(
-    model_path, config_path, None, None, None, None,
-)
-wavs = synthesizer.tts(text)
+python preprocess.py text model_path config_path
 ```
 
+This script takes a text input, preprocesses it with the Cotovía tool, synthesizes speech from the preprocessed text, and saves the output as a .wav file.
 
-## Training
-### Training Procedure
-### Data preparation
 
+## Training
 
 ### Hyperparameter
 
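For readers who want to script the Cotovía step shown in the README diff above rather than run it by hand, the sketch below wraps the same `cotovia -p -n -S` pipeline in Python. It is a minimal illustration, not part of this commit: the ISO-8859-1 encode/decode round-trip mirrors the `iconv` conversions used elsewhere in the commit, and `cotovia` is assumed to be on the PATH.

```python
# Minimal sketch (not part of this commit): run the Cotovía pipeline from Python.
# Mirrors: echo "..." | cotovia -p -n -S | iconv -f iso88591 -t utf8
import subprocess


def cotovia_transcribe(text: str) -> str:
    # Cotovía works in ISO-8859-1, so encode the input and decode the output
    # accordingly (the same conversions preprocess.py performs with iconv).
    result = subprocess.run(
        ["cotovia", "-p", "-n", "-S"],
        input=text.encode("iso8859-1"),
        capture_output=True,
    )
    return result.stdout.decode("iso8859-1").strip()


if __name__ == "__main__":
    print(cotovia_transcribe("Era unha avioneta... O piloto era pequeno."))
```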
preprocess.py ADDED
@@ -0,0 +1,255 @@
+import argparse
+import tempfile
+import random
+import re
+import string
+import subprocess
+from typing import Optional
+from TTS.config import load_config
+from TTS.utils.manage import ModelManager
+from TTS.utils.synthesizer import Synthesizer
+
+
+PUNCLIST = [';', '?', '¿', ',', ':', '.', '!', '¡']
+
+
+def canBeNumber(n):
+    try:
+        int(n)
+        return True
+    except ValueError:
+        # Not a number
+        return False
+
+def accent_convert(phontrans):
+    transcript = re.sub('a\^','á',phontrans)
+    transcript = re.sub('e\^','é',transcript)
+    transcript = re.sub('i\^','í',transcript)
+    transcript = re.sub('o\^','ó',transcript)
+    transcript = re.sub('u\^','ú',transcript)
+    transcript = re.sub('E\^','É',transcript)
+    transcript = re.sub('O\^','Ó',transcript)
+    return transcript
+
+def remove_tra3_tags(phontrans):
+    s = re.sub(r'#(.+?)#', r'', phontrans)
+    s = re.sub(r'%(.+?)%', r'', s)
+    s = re.sub(' +',' ',s)
+    s = re.sub('-','',s)
+    return s.strip()
+
+def sanitize_filename(filename):
+    """Remove or replace any characters that are not allowed in file names."""
+    return ''.join(c for c in filename if c.isalnum() or c in (' ', '_', '-')).rstrip()
+
+def is_number(index, text):
+    if index == 0:
+        return False
+    elif index == len(text) - 1:
+        return False
+    else:
+        return canBeNumber(text[index - 1]) and canBeNumber(text[index + 1])
+
+# Splits text from punctuation marks, gives list of segments in between and the punctuation marks. Skips punctuation not present in training.
+def split_punc(text):
+    segments = []
+    puncs = []
+    curr_seg = ""
+    previous_punc = False
+    for i, c in enumerate(text):
+        if c in PUNCLIST and not previous_punc and not is_number(i, text):
+            curr_seg += c
+            segments.append(curr_seg.strip())
+            puncs.append(c)
+            curr_seg = ""
+            previous_punc = True
+        elif c in PUNCLIST and previous_punc:
+            curr_seg += c
+            puncs[-1] += c
+        else:
+            curr_seg += c
+            previous_punc = False
+
+    segments.append(curr_seg.strip())
+
+    # print("Split Segments: ", segments)
+
+    # Remove empty segments in the list
+    segments = filter(None, segments)
+
+    # store segments as a list
+    segments = list(segments)
+
+    # print("Split Segments: ", segments)
+    # print("Split Puncs: ", puncs)
+
+    return segments, puncs
+
+def merge_punc(text_segs, puncs):
+    merged_str = ""
+    # print("Text segs: ", text_segs)
+    # print("Puncs: ", puncs)
+    for i, seg in enumerate(text_segs):
+        merged_str += seg + " "
+
+        if i < len(puncs):
+            merged_str += puncs[i] + " "
+
+    # remove spaces before , . ! ? ; : ) ] of the merged string
+    merged_str = re.sub(r"\s+([.,!?;:)\]])", r"\1", merged_str)
+
+    # remove spaces after ( [ ¡ ¿ of the merged string
+    merged_str = re.sub(r"([\(\[¡¿])\s+", r"\1", merged_str)
+
+    # print("Merged str: ", merged_str)
+
+    return merged_str.strip()
+
+
+# function that adds the original punctuation back to Cotovía's number expansion (option p)
+def punctuate_p(str_ext):
+
+    # substitute ' ·\n' by ...
+    str_ext = re.sub(r" ·", r"...", str_ext)
+
+    # remove spaces before , . ! ? ; : ) ] of the extended string
+    str_ext = re.sub(r"\s+([.,!?;:)\]])", r"\1", str_ext)
+
+    # remove spaces after ( [ ¡ ¿ of the extended string
+    str_ext = re.sub(r"([\(\[¡¿])\s+", r"\1", str_ext)
+
+    # remove unwanted spaces between quotations marks
+    str_ext = re.sub(r'"\s*([^"]*?)\s*"', r'"\1"', str_ext)
+
+    # substitute '- text -' to '-text-'
+    str_ext = re.sub(r"-\s*([^-]*?)\s*-", r"-\1-", str_ext)
+
+    # remove initial question marks
+    str_ext = re.sub(r"[¿¡]", r"", str_ext)
+
+    # eliminate extra spaces
+    str_ext = re.sub(r"\s+", r" ", str_ext)
+
+    str_ext = re.sub(r"(\d+)\s*-\s*(\d+)", r"\1 \2", str_ext)
+
+    ### - , ' and () by commas
+    # substitute '- text -' to ', text,'
+    str_ext = re.sub(r"(\w+)\s+-([^-]*?)-\s+([^-]*?)", r"\1, \2, ", str_ext)
+
+    # substitute ' - ' by ', '
+    str_ext = re.sub(r"(\w+[!\?]?)\s+-\s*", r"\1, ", str_ext)
+
+    # substitute ' ( text )' to ', text,'
+    str_ext = re.sub(r"(\w+)\s*\(\s*([^\(\)]*?)\s*\)", r"\1, \2,", str_ext)
+
+
+    return str_ext
+
+
+def to_cotovia(text_segments):
+    # Input and output Cotovía files
+    res = ''.join(random.choices(string.ascii_lowercase + string.digits, k=5))
+    COTOVIA_IN_TXT_PATH = res + '.txt'
+    COTOVIA_IN_TXT_PATH_ISO = 'iso8859-1' + res + '.txt'
+    COTOVIA_OUT_PRE_PATH = 'iso8859-1' + res + '.tra'
+    COTOVIA_OUT_PRE_PATH_UTF8 = 'utf8' + res + '.tra'
+
+
+    # print("Text segments: ", text_segments)
+    # Initial text preprocessing
+    # substitute ' M€' by 'millóns de euros' and 'somewordM€' by 'someword millóns de euros'
+    text_segments = [re.sub(r"(\w+)\s*M€", r"\1 millóns de euros", seg) for seg in text_segments]
+
+    # substitute ' €' by 'euros' and 'someword€' by 'someword euros'
+    text_segments = [re.sub(r"(\w+)\s*€", r"\1 euros", seg) for seg in text_segments]
+
+    # substitute ' ºC' by 'graos centígrados' and 'somewordºC' by 'someword graos centígrados'
+    text_segments = [re.sub(r"(\w+)\s*ºC", r"\1 graos centígrados", seg) for seg in text_segments]
+
+
+    text_segments = [subprocess.run(["sed", "-e", "s/₂//g", "-e", "s/⸺//g", "-e", "s/ //g", "-e", "s///g", "-e", "s/č/c/g", "-e", "s/ț/t/g", "-e", "s/ğ/g/g", "-e", "s/ș/s/g",
+                                     "-e", "s/ş/s/g", "-e", "s/Ž/Z/g", "-e", "s/ž/z/g", "-e", "s/ț/t/g", "-e", "s/ğ/g/g", "-e", "s/ș/s/g", "-e", "s/ş/s/g", "-e", "s/«//g", "-e", "s/»//g",
+                                     "-e", "s/<<//g", "-e", "s/>>//g", "-e", "s/“/\"/g", "-e", "s/”/'\"'/g", "-e", "s/\'//g", "-e", "s/‘//g", "-e", "s/’//g", "-e", "s/…//g",
+                                     "-e", "s/-/-/g", "-e", "s/–/-/g", "-e", "s/—/-/g", "-e", "s/―/-/g", "-e", "s/−/-/g", "-e", "s/‒/-/g", "-e", "s/─/-/g", "-e", "s/^Si$/Si\./g"],
+                                    input=seg, text=True, capture_output=True).stdout for seg in text_segments]
+
+    # print("Text segments after sed: ", text_segments)
+
+    with open(COTOVIA_IN_TXT_PATH, 'w') as f:
+        for seg in text_segments:
+            if seg:
+                f.write(seg + '\n')
+            else:
+                f.write(',' + '\n')
+
+    # utf-8 to iso8859-1
+    subprocess.run(["iconv", "-f", "utf-8", "-t", "iso8859-1", COTOVIA_IN_TXT_PATH, "-o", COTOVIA_IN_TXT_PATH_ISO], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
+    # call cotovia with -t3 option
+    subprocess.run(["cotovia", "-i", COTOVIA_IN_TXT_PATH_ISO, "-t3", "-n"], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
+    # iso8859-1 to utf-8
+    subprocess.run(["iconv", "-f", "iso8859-1", "-t", "utf-8", COTOVIA_OUT_PRE_PATH, "-o", COTOVIA_OUT_PRE_PATH_UTF8], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
+
+    segs = []
+    try:
+        with open(COTOVIA_OUT_PRE_PATH_UTF8, 'r') as f:
+            segs = [line.rstrip() for line in f]
+            segs = [remove_tra3_tags(line) for line in segs]
+    except:
+        print("ERROR: Couldn't read cotovia output")
+
+    subprocess.run(["rm", COTOVIA_IN_TXT_PATH, COTOVIA_IN_TXT_PATH_ISO, COTOVIA_OUT_PRE_PATH, COTOVIA_OUT_PRE_PATH_UTF8], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
+
+    # print("Cotovia segments: ", segs)
+
+    return segs
+
+def text_preprocess(text):
+
+    # Split from punc
+    text_segments, puncs = split_punc(text)
+
+    cotovia_phon_segs = to_cotovia(text_segments)
+
+    cotovia_phon_str = merge_punc(cotovia_phon_segs, puncs)
+
+    phon_str = accent_convert(cotovia_phon_str)
+
+    # remove extra spaces
+    phon_str = re.sub(r"\s+", r" ", phon_str)
+
+    # add final punctuation mark if it is not present
+    if not re.match(r"[.!?]", phon_str[-1]):
+        phon_str = phon_str + "."
+
+    return phon_str
+
+def main():
+    parser = argparse.ArgumentParser(description='Cotovia phoneme transcription.')
+    parser.add_argument('text', type=str, help='Text to synthesize')
+    parser.add_argument('model_path', type=str, help='Absolute path to the model checkpoint.pth')
+    parser.add_argument('config_path', type=str, help='Absolute path to the model config.json')
+
+    args = parser.parse_args()
+
+    print("Text before preprocessing: ", args.text)
+    text = text_preprocess(args.text)
+    print("Text after preprocessing: ", text)
+
+    synthesizer = Synthesizer(
+        args.model_path, args.config_path, None, None, None, None,
+    )
+
+    # Step 1: Extract the first word from the text
+    first_word = args.text.split()[0] if args.text.split() else "audio"
+    first_word = sanitize_filename(first_word)  # Sanitize to make it a valid filename
+
+    # Step 2: Use synthesizer's built-in function to synthesize and save the audio
+    wavs = synthesizer.tts(text)
+    filename = f"{first_word}.wav"
+    synthesizer.save_wav(wavs, filename)
+
+    print(f"Audio file saved as: {filename}")
+
+if __name__ == "__main__":
+    main()
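The helpers above can also be reused from other Python code instead of going through the command line. The following is a small, hypothetical example (not part of the commit) that imports `text_preprocess` from `preprocess.py` and feeds the result to the same `Synthesizer` call used in `main()`; the checkpoint and config paths are placeholders.

```python
# Hypothetical reuse of preprocess.py from another script (not part of this commit).
# Assumes preprocess.py sits in the working directory and cotovia/TTS are installed.
from TTS.utils.synthesizer import Synthesizer

from preprocess import text_preprocess

model_path = "/path/to/checkpoint.pth"   # placeholder: model checkpoint
config_path = "/path/to/config.json"     # placeholder: model config

# Preprocess the Galician text with Cotovía, then synthesize and save it.
phonemes = text_preprocess("Era unha avioneta... O piloto era pequeno, que se chega a ser dos grandes, tómbate!")
synthesizer = Synthesizer(model_path, config_path, None, None, None, None)
wavs = synthesizer.tts(phonemes)
synthesizer.save_wav(wavs, "demo.wav")
```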