Commit 2045faa (Parent: 55d46a2)
Francis0917 committed: Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
- .gitattributes +3 -0
- README.md +111 -8
- checkpoint_results/checkpoint_gctc_clap/20240725-154258/checkpoint +2 -0
- checkpoint_results/checkpoint_gctc_clap/20240725-154258/ckpt-29.data-00000-of-00001 +3 -0
- checkpoint_results/checkpoint_gctc_clap/20240725-154258/ckpt-29.index +0 -0
- checkpoint_results/checkpoint_guided_ctc/20240725-011006/checkpoint +2 -0
- checkpoint_results/checkpoint_guided_ctc/20240725-011006/ckpt-23.data-00000-of-00001 +3 -0
- checkpoint_results/checkpoint_guided_ctc/20240725-011006/ckpt-23.index +0 -0
- criterion/__pycache__/total.cpython-37.pyc +0 -0
- criterion/__pycache__/total_ctc1_clap.cpython-37.pyc +0 -0
- criterion/__pycache__/utils.cpython-37.pyc +0 -0
- criterion/total.py +69 -0
- criterion/total_CLKWS.py +100 -0
- criterion/total_ctc1.py +97 -0
- criterion/total_ctc1_clap.py +125 -0
- criterion/utils.py +32 -0
- dataset/__pycache__/dataloader_demo.cpython-37.pyc +0 -0
- dataset/__pycache__/dataloader_infe.cpython-37.pyc +0 -0
- dataset/__pycache__/google.cpython-37.pyc +0 -0
- dataset/__pycache__/google_infe202405.cpython-37.pyc +0 -0
- dataset/__pycache__/libriphrase.cpython-37.pyc +0 -0
- dataset/__pycache__/libriphrase_ctc1.cpython-37.pyc +0 -0
- dataset/__pycache__/qualcomm.cpython-37.pyc +0 -0
- dataset/dataloader_demo.py +182 -0
- dataset/dataloader_infe.py +164 -0
- dataset/g2p/LICENSE.txt +201 -0
- dataset/g2p/g2p_en/__init__.py +1 -0
- dataset/g2p/g2p_en/__pycache__/__init__.cpython-37.pyc +0 -0
- dataset/g2p/g2p_en/__pycache__/expand.cpython-37.pyc +0 -0
- dataset/g2p/g2p_en/__pycache__/g2p.cpython-37.pyc +0 -0
- dataset/g2p/g2p_en/checkpoint20.npz +3 -0
- dataset/g2p/g2p_en/expand.py +79 -0
- dataset/g2p/g2p_en/g2p.py +249 -0
- dataset/g2p/g2p_en/homographs.en +379 -0
- dataset/google.py +188 -0
- dataset/google_infe202405.py +192 -0
- dataset/libriphrase.py +331 -0
- dataset/libriphrase_ctc1.py +346 -0
- dataset/qualcomm.py +180 -0
- demo.py +168 -0
- docker/Dockerfile +25 -0
- flagged/Sound/c129aef35ba4cb66620f813cd7268c4be510a66d/ok_google-183000.wav +0 -0
- flagged/Sound/d35a5cf80a9403828bc601a0a761a5f88da06f00/realtek_go-183033.wav +0 -0
- flagged/log.csv +8 -0
- inference.py +141 -0
- model/__pycache__/discriminator.cpython-37.pyc +0 -0
- model/__pycache__/encoder.cpython-37.pyc +0 -0
- model/__pycache__/extractor.cpython-37.pyc +0 -0
- model/__pycache__/log_melspectrogram.cpython-37.pyc +0 -0
- model/__pycache__/speech_embedding.cpython-37.pyc +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+checkpoint_results/checkpoint_gctc_clap/20240725-154258/ckpt-29.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
+checkpoint_results/checkpoint_guided_ctc/20240725-011006/ckpt-23.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
+model/google_speech_embedding/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,12 +1,115 @@
 ---
-title: CL-
-
-colorFrom: blue
-colorTo: green
+title: CL-KWS_202408_v1
+app_file: demo.py
 sdk: gradio
-sdk_version:
-app_file: app.py
-pinned: false
+sdk_version: 3.34.0
 ---
+### Datasets
 
-
+* [LibriPhrase]
+  LibriSpeech corpus: https://www.openslr.org/12
+  Recipe for LibriPhrase: https://github.com/gusrud1103/LibriPhrase
+
+* [Google Speech Commands]
+  http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz
+  http://download.tensorflow.org/data/speech_commands_test_set_v0.02.tar.gz
+  https://www.tensorflow.org/datasets/catalog/speech_commands
+
+* [Qualcomm Keyword Speech]
+  https://www.qualcomm.com/developer/software/keyword-speech-dataset
+
+* [noise][musan]
+  https://www.openslr.org/17/
+
+## Getting started
+
+### Environment
+
+```bash
+# python=3.7
+conda create --name [name] python=3.7
+conda install -c "nvidia/label/cuda-11.6.0" cuda-nvcc
+conda install -c conda-forge cudnn=8.2.1.32
+pip install -r requirements.txt
+pip install numpy==1.18.5
+pip install tensorflow-model-optimization==0.6.0
+cd /miniconda3/envs/[name]/lib
+ln -s libcusolver.so.11 libcusolver.so.10
+# export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/share/homes/yiting/miniconda3/envs/pho/lib
+```
+
+### Training
+```bash
+python train_guided_CTC.py \
+    --epoch 23 \
+    --lr 1e-3 \
+    --loss_weight 1.0 1.0 0.2 \
+    --audio_input both \
+    --text_input phoneme \
+    --comment 'user comments for each experiment'
+```
+
+```bash
+python train.py \
+    --epoch 18 \
+    --lr 1e-3 \
+    --loss_weight 1.0 1.0 \
+    --audio_input both \
+    --text_input phoneme \
+    --comment 'user comments for each experiment'
+```
+
+### Fine-tuning
+Checkpoint: ./checkpoint_results/checkpoint_guided_ctc/20240725-011006
+```bash
+python train_guided_ctc_clap.py \
+    --epoch 5 \
+    --lr 1e-3 \
+    --loss_weight 1.0 1.0 0.01 0.01 \
+    --audio_input both \
+    --text_input phoneme \
+    --load_checkpoint_path '/home/DB/checkpoint_results/checkpoint_guided_ctc/date-time' \
+    --comment 'user comments for each experiment'
+```
+
+```bash
+python train_CLKWS.py \
+    --epoch 4 \
+    --lr 1e-3 \
+    --loss_weight 1.0 1.0 \
+    --audio_input both \
+    --text_input phoneme \
+    --load_checkpoint_path '/home/DB/checkpoint_results/checkpoint/date-time' \
+    --comment 'user comments for each experiment'
+```
+
+### Inference
+The keyword list is `target_list` in google_infe202405.py.
+
+```bash
+python inference.py --audio_input both --text_input phoneme --load_checkpoint_path 'home/DB/checkpoint_results/checkpoint/20240515-111757'
+```
+
+### Demo
+Checkpoints: ./checkpoint_results/checkpoint_guided_ctc/20240725-011006
+             ./checkpoint_results/checkpoint_gctc_clap/20240725-154258
+
+```bash
+python demo.py --audio_input both --text_input phoneme --load_checkpoint_path '/home/DB/checkpoint_results/checkpoint_guided_ctc/20240725-011006' --keyword_list_length 8
+```
+
+Demo website: Gradio prints "Running on public URL" at launch.
+Upload file: mono WAV, 256 kbps, 22050 Hz.
+Maximum clip length: dataset/dataloader_demo.py sets self.maxlen_a = 56000.
+
+### Monitoring
+```bash
+tensorboard --logdir ./log/ --bind_all
+```
+
+### Acknowledgment
+We acknowledge the following code repositories:
+https://github.com/ncsoft/PhonMatchNet
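The Demo notes above pin down the expected upload format. A minimal pre-flight check sketch, assuming scipy (already a repo dependency); the helper name `check_demo_wav` is hypothetical and not part of this commit:

```python
# Hypothetical pre-flight check for a demo upload; mirrors the README's
# stated constraints (mono WAV, 22050 Hz) and the dataloader's maxlen_a.
from scipy.io import wavfile

def check_demo_wav(path, expected_fs=22050, maxlen_a=56000):
    fs, data = wavfile.read(path)
    if data.ndim != 1:
        raise ValueError("demo expects MONO audio")
    if fs != expected_fs:
        print(f"warning: sample rate is {fs}, README suggests {expected_fs}")
    if data.shape[0] > maxlen_a:
        print(f"clip has {data.shape[0]} samples; dataloader limit is {maxlen_a}")
    return fs, data
```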
checkpoint_results/checkpoint_gctc_clap/20240725-154258/checkpoint
ADDED
@@ -0,0 +1,2 @@
model_checkpoint_path: "ckpt-29"
all_model_checkpoint_paths: "ckpt-29"

checkpoint_results/checkpoint_gctc_clap/20240725-154258/ckpt-29.data-00000-of-00001
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:25da31f91bcff94540bf57296b058d07aaaa804c85ad59d5eaf9bc3f9803c62f
size 1211835

checkpoint_results/checkpoint_gctc_clap/20240725-154258/ckpt-29.index
ADDED
Binary file (2.23 kB)

checkpoint_results/checkpoint_guided_ctc/20240725-011006/checkpoint
ADDED
@@ -0,0 +1,2 @@
model_checkpoint_path: "ckpt-23"
all_model_checkpoint_paths: "ckpt-23"

checkpoint_results/checkpoint_guided_ctc/20240725-011006/ckpt-23.data-00000-of-00001
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e0228d6d9c71e767409ff8d2a300eda7d8d115185d3b793699cae730715424aa
size 3630878

checkpoint_results/checkpoint_guided_ctc/20240725-011006/ckpt-23.index
ADDED
Binary file (6.37 kB)

criterion/__pycache__/total.cpython-37.pyc
ADDED
Binary file (2.78 kB)

criterion/__pycache__/total_ctc1_clap.cpython-37.pyc
ADDED
Binary file (4.29 kB)

criterion/__pycache__/utils.cpython-37.pyc
ADDED
Binary file (1.52 kB)
criterion/total.py
ADDED
@@ -0,0 +1,69 @@
import os, sys
import tensorflow as tf
import numpy as np
from tensorflow.keras.losses import Loss, MeanSquaredError

seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)

def sequence_cross_entropy(speech_label, text_label, logits, reduction='sum'):
    """
    args
        speech_label : [B, Ls]
        text_label   : [B, Lt]
        logits       : [B, Lt]
        logits._keras_mask : [B, Lt]
    """
    # Data pre-processing
    if tf.shape(text_label)[1] > tf.shape(speech_label)[1]:
        speech_label = tf.pad(speech_label, [[0, 0], [0, tf.shape(text_label)[1] - tf.shape(speech_label)[1]]], 'CONSTANT', constant_values=0)
    elif tf.shape(text_label)[1] < tf.shape(speech_label)[1]:
        speech_label = speech_label[:, :text_label.shape[1]]

    # Make paired data between text and speech phonemes
    paired_label = tf.math.equal(text_label, speech_label)
    paired_label = tf.cast(tf.math.logical_and(tf.cast(paired_label, tf.bool), tf.cast(logits._keras_mask, tf.bool)), tf.float32)
    paired_label = tf.reshape(tf.ragged.boolean_mask(paired_label, tf.cast(logits._keras_mask, tf.bool)).flat_values, [-1, 1])
    logits = tf.reshape(tf.ragged.boolean_mask(logits, tf.cast(logits._keras_mask, tf.bool)).flat_values, [-1, 1])

    # Get BinaryCrossEntropy loss
    BCE = tf.keras.losses.BinaryCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.SUM)
    loss = BCE(paired_label, logits)

    if reduction == 'sum':
        loss = tf.math.divide_no_nan(loss, tf.cast(tf.shape(logits)[0], loss.dtype))
        loss = tf.math.multiply_no_nan(loss, tf.cast(tf.shape(speech_label)[0], loss.dtype))

    return loss

def detection_loss(y_true, y_pred):
    BFC = tf.keras.losses.BinaryCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.SUM)
    return BFC(y_true, y_pred)

class TotalLoss(Loss):
    def __init__(self, weight=1.0):
        super().__init__()
        self.weight = weight

    def __call__(self, y_true, y_pred, reduction='sum'):
        LD = detection_loss(y_true, y_pred)

        return self.weight * LD, LD


class TotalLoss_SCE(Loss):
    def __init__(self, weight=[1.0, 1.0]):
        super().__init__()
        self.weight = weight

    def __call__(self, y_true, y_pred, speech_label, text_label, logit, reduction='sum'):
        if self.weight[0] != 0.0:
            LD = detection_loss(y_true, y_pred)
        else:
            LD = 0
        if self.weight[1] != 0.0:
            LC = sequence_cross_entropy(speech_label, text_label, logit, reduction=reduction)
        else:
            LC = 0
        return self.weight[0] * LD + self.weight[1] * LC, LD, LC
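For orientation, a minimal standalone sketch of the detection term LD defined above: a summed binary cross-entropy over raw detection logits. The dummy tensors are illustrative, not values from this repo:

```python
# Standalone sketch of detection_loss (TensorFlow 2.x assumed);
# y_true marks keyword presence, y_pred are raw (pre-sigmoid) logits.
import tensorflow as tf

y_true = tf.constant([[1.0], [0.0], [1.0]])
y_pred = tf.constant([[2.3], [-1.7], [0.4]])

bce = tf.keras.losses.BinaryCrossentropy(
    from_logits=True, reduction=tf.keras.losses.Reduction.SUM)
print(float(bce(y_true, y_pred)))  # summed BCE over the batch, as in LD
```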
criterion/total_CLKWS.py
ADDED
@@ -0,0 +1,100 @@
import os, sys
import tensorflow as tf
import numpy as np
from tensorflow.keras.losses import Loss, MeanSquaredError
import math
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)

def sequence_cross_entropy(speech_label, text_label, logits, reduction='sum'):
    """
    args
        speech_label : [B, Ls]
        text_label   : [B, Lt]
        logits       : [B, Lt]
        logits._keras_mask : [B, Lt]
    """
    # Data pre-processing
    if tf.shape(text_label)[1] > tf.shape(speech_label)[1]:
        speech_label = tf.pad(speech_label, [[0, 0], [0, tf.shape(text_label)[1] - tf.shape(speech_label)[1]]], 'CONSTANT', constant_values=0)
    elif tf.shape(text_label)[1] < tf.shape(speech_label)[1]:
        speech_label = speech_label[:, :text_label.shape[1]]

    # Make paired data between text and speech phonemes
    paired_label = tf.math.equal(text_label, speech_label)
    paired_label = tf.cast(tf.math.logical_and(tf.cast(paired_label, tf.bool), tf.cast(logits._keras_mask, tf.bool)), tf.float32)
    paired_label = tf.reshape(tf.ragged.boolean_mask(paired_label, tf.cast(logits._keras_mask, tf.bool)).flat_values, [-1, 1])
    logits = tf.reshape(tf.ragged.boolean_mask(logits, tf.cast(logits._keras_mask, tf.bool)).flat_values, [-1, 1])

    # Get BinaryCrossEntropy loss
    BCE = tf.keras.losses.BinaryCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.SUM)
    loss = BCE(paired_label, logits)

    if reduction == 'sum':
        loss = tf.math.divide_no_nan(loss, tf.cast(tf.shape(logits)[0], loss.dtype))
        loss = tf.math.multiply_no_nan(loss, tf.cast(tf.shape(speech_label)[0], loss.dtype))

    return loss

def detection_loss(y_true, y_pred):
    BFC = tf.keras.losses.BinaryCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.SUM)
    return BFC(y_true, y_pred)

def matrix_loss_0(y_true, y_pred):
    MBC_0 = tf.keras.losses.CategoricalCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.SUM)
    return MBC_0(y_true, y_pred)

def matrix_loss_1(y_true, y_pred):
    MBC_1 = tf.keras.losses.CategoricalCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.SUM)
    return MBC_1(y_true, y_pred)


class TotalLoss(Loss):
    def __init__(self, weight=1.0):
        super().__init__()
        self.weight = weight

    def __call__(self, y_true, y_pred, reduction='sum'):
        LD = detection_loss(y_true, y_pred)

        return self.weight * LD, LD


class TotalLoss_SCE(Loss):
    def __init__(self, weight=[1.0, 1.0]):
        super().__init__()
        self.weight = weight

    def __call__(self, y_true, y_pred, speech_label, text_label, logit, prob, reduction='sum'):
        if self.weight[0] != 0.0:
            LD = detection_loss(y_true, y_pred)
        else:
            LD = 0
        if self.weight[1] != 0.0:
            LC = sequence_cross_entropy(speech_label, text_label, logit, reduction=reduction)
        else:
            LC = 0

        number_1 = 5
        number_2 = int(y_pred.shape[0] // number_1)
        number_3 = int(y_pred.shape[0] // (number_1 * number_1))

        y_pred_1 = tf.reshape(prob, [number_2, number_1])
        y_true_1 = tf.reshape(y_true, [number_2, number_1])

        loss_audio = matrix_loss_0(y_true_1, y_pred_1)

        x = tf.reshape(prob, [number_3, number_1, number_1])
        x_transposed = tf.transpose(x, perm=[0, 2, 1])
        y_pred_2 = tf.reshape(x_transposed, [number_2, number_1])
        y = tf.reshape(y_true, [number_3, number_1, number_1])
        y_transposed = tf.transpose(y, perm=[0, 2, 1])
        y_true_2 = tf.reshape(y_transposed, [number_2, number_1])
        loss_text = matrix_loss_1(y_true_2, y_pred_2)
        loss = 0.5 * loss_audio + 0.5 * loss_text

        return self.weight[0] * LD + self.weight[1] * LC + loss, LD, LC
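The block loss added in TotalLoss_SCE above scores the similarity matrix both ways (audio-to-text and text-to-audio) with categorical cross-entropy. A minimal sketch, assuming a batch laid out as 5 x 5 keyword/utterance pairs; the diagonal targets are an assumption made for illustration only:

```python
# Sketch of the symmetric block loss: rows score audio->text, the
# transpose scores text->audio; matches assumed on the diagonal.
import tensorflow as tf

n = 5
prob = tf.random.normal([n * n])          # flattened similarity logits
targets = tf.eye(n)                       # one matched pair per row/column

cce = tf.keras.losses.CategoricalCrossentropy(
    from_logits=True, reduction=tf.keras.losses.Reduction.SUM)

rows = tf.reshape(prob, [n, n])
loss_audio = cce(targets, rows)
loss_text = cce(tf.transpose(targets), tf.transpose(rows))
print(float(0.5 * loss_audio + 0.5 * loss_text))
```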
criterion/total_ctc1.py
ADDED
@@ -0,0 +1,97 @@
import os, sys
import tensorflow as tf
import numpy as np
from tensorflow.keras.losses import Loss, MeanSquaredError

seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)

def sequence_cross_entropy(speech_label, text_label, logits, reduction='sum'):
    """
    args
        speech_label : [B, Ls]
        text_label   : [B, Lt]
        logits       : [B, Lt]
        logits._keras_mask : [B, Lt]
    """
    # Data pre-processing
    if tf.shape(text_label)[1] > tf.shape(speech_label)[1]:
        speech_label = tf.pad(speech_label, [[0, 0], [0, tf.shape(text_label)[1] - tf.shape(speech_label)[1]]], 'CONSTANT', constant_values=0)
    elif tf.shape(text_label)[1] < tf.shape(speech_label)[1]:
        speech_label = speech_label[:, :text_label.shape[1]]

    # Make paired data between text and speech phonemes
    paired_label = tf.math.equal(text_label, speech_label)
    paired_label = tf.cast(tf.math.logical_and(tf.cast(paired_label, tf.bool), tf.cast(logits._keras_mask, tf.bool)), tf.float32)
    paired_label = tf.reshape(tf.ragged.boolean_mask(paired_label, tf.cast(logits._keras_mask, tf.bool)).flat_values, [-1, 1])
    logits = tf.reshape(tf.ragged.boolean_mask(logits, tf.cast(logits._keras_mask, tf.bool)).flat_values, [-1, 1])

    # Get BinaryCrossEntropy loss
    BCE = tf.keras.losses.BinaryCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.SUM)
    loss = BCE(paired_label, logits)

    if reduction == 'sum':
        loss = tf.math.divide_no_nan(loss, tf.cast(tf.shape(logits)[0], loss.dtype))
        loss = tf.math.multiply_no_nan(loss, tf.cast(tf.shape(speech_label)[0], loss.dtype))

    return loss

def detection_loss(y_true, y_pred):
    BFC = tf.keras.losses.BinaryCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.SUM)
    return BFC(y_true, y_pred)

def ctc_loss(affinity_matrix, speech_labels, text_labels, n_speech):
    # logit_length
    # n_speech = tf.math.reduce_sum(tf.cast(affinity_matrix._keras_mask, tf.float32), -1)

    # logit
    transposed_logits = tf.transpose(affinity_matrix, perm=[0, 2, 1])
    # log_probs = tf.math.log(transposed_logits + 1e-8)
    # logits_approx = log_probs - tf.reduce_max(log_probs, axis=-1, keepdims=True)

    # label
    matches = tf.equal(speech_labels, text_labels)
    indices = tf.range(text_labels.shape[1], dtype=tf.int32)
    selected_indices = tf.where(matches, indices, tf.fill(tf.shape(text_labels), 0))
    labels = tf.where(tf.equal(text_labels, 0), text_labels, selected_indices)

    # label_length
    label_length = tf.math.count_nonzero(labels, axis=1)

    ctc_loss = tf.nn.ctc_loss(labels, transposed_logits, label_length, n_speech,
                              logits_time_major=False,
                              unique=None,
                              blank_index=0,
                              name=None)

    return ctc_loss

class TotalLoss(Loss):
    def __init__(self, weight=1.0):
        super().__init__()
        self.weight = weight

    def __call__(self, y_true, y_pred, reduction='sum'):
        LD = detection_loss(y_true, y_pred)

        return self.weight * LD, LD


class TotalLoss_SCE(Loss):
    def __init__(self, weight=[1.0, 1.0, 0.2]):
        super().__init__()
        self.weight = weight

    def __call__(self, y_true, y_pred, speech_label, text_label, logit, affinity_matrix, n_speech, reduction='sum'):
        ctc = ctc_loss(affinity_matrix, speech_label, text_label, n_speech)

        if self.weight[0] != 0.0:
            LD = detection_loss(y_true, y_pred)
        else:
            LD = 0
        if self.weight[1] != 0.0:
            LC = sequence_cross_entropy(speech_label, text_label, logit, reduction=reduction)
        else:
            LC = 0
        return self.weight[0] * LD + self.weight[1] * LC + self.weight[2] * ctc, LD, LC
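The ctc_loss helper above feeds a transposed affinity matrix into tf.nn.ctc_loss. A shape-only sketch with made-up sizes (B utterances, T speech frames, V classes, blank_index=0); the dense labels here are illustrative, not the matched-index labels built above:

```python
# Shape sketch for the tf.nn.ctc_loss call in ctc_loss.
import tensorflow as tf

B, T, V = 2, 6, 5                              # batch, speech frames, classes
logits = tf.random.normal([B, T, V])           # like transposed_logits
labels = tf.constant([[1, 2, 0, 0], [3, 3, 4, 0]], dtype=tf.int32)
label_length = tf.math.count_nonzero(labels, axis=1, dtype=tf.int32)
logit_length = tf.fill([B], T)                 # like n_speech

loss = tf.nn.ctc_loss(labels, logits, label_length, logit_length,
                      logits_time_major=False, blank_index=0)
print(loss.numpy())  # per-example CTC negative log-likelihood
```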
criterion/total_ctc1_clap.py
ADDED
@@ -0,0 +1,125 @@
import os, sys
import tensorflow as tf
import numpy as np
from tensorflow.keras.losses import Loss, MeanSquaredError

seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)

def sequence_cross_entropy(speech_label, text_label, logits, reduction='sum'):
    """
    args
        speech_label : [B, Ls]
        text_label   : [B, Lt]
        logits       : [B, Lt]
        logits._keras_mask : [B, Lt]
    """
    # Data pre-processing
    if tf.shape(text_label)[1] > tf.shape(speech_label)[1]:
        speech_label = tf.pad(speech_label, [[0, 0], [0, tf.shape(text_label)[1] - tf.shape(speech_label)[1]]], 'CONSTANT', constant_values=0)
    elif tf.shape(text_label)[1] < tf.shape(speech_label)[1]:
        speech_label = speech_label[:, :text_label.shape[1]]

    # Make paired data between text and speech phonemes
    paired_label = tf.math.equal(text_label, speech_label)
    paired_label = tf.cast(tf.math.logical_and(tf.cast(paired_label, tf.bool), tf.cast(logits._keras_mask, tf.bool)), tf.float32)
    paired_label = tf.reshape(tf.ragged.boolean_mask(paired_label, tf.cast(logits._keras_mask, tf.bool)).flat_values, [-1, 1])
    logits = tf.reshape(tf.ragged.boolean_mask(logits, tf.cast(logits._keras_mask, tf.bool)).flat_values, [-1, 1])

    # Get BinaryCrossEntropy loss
    BCE = tf.keras.losses.BinaryCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.SUM)
    loss = BCE(paired_label, logits)

    if reduction == 'sum':
        loss = tf.math.divide_no_nan(loss, tf.cast(tf.shape(logits)[0], loss.dtype))
        loss = tf.math.multiply_no_nan(loss, tf.cast(tf.shape(speech_label)[0], loss.dtype))

    return loss

def matrix_loss_0(y_true, y_pred):
    MBC_0 = tf.keras.losses.CategoricalCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.SUM)
    return MBC_0(y_true, y_pred)

def matrix_loss_1(y_true, y_pred):
    MBC_1 = tf.keras.losses.CategoricalCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.SUM)
    return MBC_1(y_true, y_pred)

def detection_loss(y_true, y_pred):
    BFC = tf.keras.losses.BinaryCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.SUM)
    return BFC(y_true, y_pred)

def ctc_loss(affinity_matrix, speech_labels, text_labels, n_speech):
    # logit_length
    # n_speech = tf.math.reduce_sum(tf.cast(affinity_matrix._keras_mask, tf.float32), -1)

    # logit
    transposed_logits = tf.transpose(affinity_matrix, perm=[0, 2, 1])
    # log_probs = tf.math.log(transposed_logits + 1e-8)
    # logits_approx = log_probs - tf.reduce_max(log_probs, axis=-1, keepdims=True)

    # label
    matches = tf.equal(speech_labels, text_labels)
    indices = tf.range(text_labels.shape[1], dtype=tf.int32)
    selected_indices = tf.where(matches, indices, tf.fill(tf.shape(text_labels), 0))
    labels = tf.where(tf.equal(text_labels, 0), text_labels, selected_indices)

    # label_length
    label_length = tf.math.count_nonzero(labels, axis=1)

    # mask = tf.not_equal(labels, 0)
    # # apply the mask; tf.ragged.boolean_mask handles variable-length sequences
    # labels = tf.ragged.boolean_mask(labels, mask)

    ctc_loss = tf.nn.ctc_loss(labels, transposed_logits, label_length, n_speech,
                              logits_time_major=False,
                              unique=None,
                              blank_index=0,
                              name=None)

    return ctc_loss

class TotalLoss(Loss):
    def __init__(self, weight=1.0):
        super().__init__()
        self.weight = weight

    def __call__(self, y_true, y_pred, reduction='sum'):
        LD = detection_loss(y_true, y_pred)

        return self.weight * LD, LD


class TotalLoss_SCE(Loss):
    def __init__(self, weight=[1.0, 1.0, 0.01, 0.01]):
        super().__init__()
        self.weight = weight

    def __call__(self, y_true, y_pred, speech_label, text_label, logit, prob, affinity_matrix, n_speech, reduction='sum'):
        ctc = ctc_loss(affinity_matrix, speech_label, text_label, n_speech)

        number_1 = 5
        number_2 = int(y_pred.shape[0] // number_1)
        number_3 = int(y_pred.shape[0] // (number_1 * number_1))
        y_pred_1 = tf.reshape(prob, [number_2, number_1])
        y_true_1 = tf.reshape(y_true, [number_2, number_1])

        loss_audio = matrix_loss_0(y_true_1, y_pred_1)
        x = tf.reshape(prob, [number_3, number_1, number_1])
        x_transposed = tf.transpose(x, perm=[0, 2, 1])
        y_pred_2 = tf.reshape(x_transposed, [number_2, number_1])
        y = tf.reshape(y_true, [number_3, number_1, number_1])
        y_transposed = tf.transpose(y, perm=[0, 2, 1])
        y_true_2 = tf.reshape(y_transposed, [number_2, number_1])
        loss_text = matrix_loss_1(y_true_2, y_pred_2)
        loss = 0.5 * loss_audio + 0.5 * loss_text

        if self.weight[0] != 0.0:
            LD = detection_loss(y_true, y_pred)
        else:
            LD = 0
        if self.weight[1] != 0.0:
            LC = sequence_cross_entropy(speech_label, text_label, logit, reduction=reduction)
        else:
            LC = 0
        return self.weight[0] * LD + self.weight[1] * LC + self.weight[2] * ctc + self.weight[3] * loss, LD, LC
criterion/utils.py
ADDED
@@ -0,0 +1,32 @@
import numpy as np
import sklearn.metrics
import tensorflow as tf

def compute_eer(label, pred):
    # fpr, tpr, fnr, and threshold are all lists (in the format of np.array)
    fpr, tpr, threshold = sklearn.metrics.roc_curve(label, pred)
    fnr = 1 - tpr

    # the threshold where fnr == fpr
    eer_threshold = threshold[np.nanargmin(np.absolute(fnr - fpr))]

    # theoretically the EER from fpr and the EER from fnr should be identical, but in practice they can differ slightly
    eer_1 = fpr[np.nanargmin(np.absolute(fnr - fpr))]
    eer_2 = fnr[np.nanargmin(np.absolute(fnr - fpr))]

    # return the mean of the EER from fpr and from fnr
    eer = (eer_1 + eer_2) / 2
    return eer

class eer(tf.keras.metrics.Metric):
    def __init__(self, name='equal_error_rate', **kwargs):
        super(eer, self).__init__(name=name, **kwargs)
        self.score = self.add_weight(name='eer', initializer='zeros')
        self.count = self.add_weight(name='count', initializer='zeros')

    def update_state(self, y_true, y_pred):
        self.score.assign_add(tf.reduce_sum(tf.py_function(func=compute_eer, inp=[y_true, y_pred], Tout=tf.float32, name='compute_eer')))
        self.count.assign_add(1)

    def result(self):
        return tf.math.divide_no_nan(self.score, self.count)
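A quick synthetic check of compute_eer; the import path assumes the repo root is on sys.path:

```python
# Synthetic check: higher scores for positives should give a low EER.
import numpy as np
from criterion.utils import compute_eer

labels = np.array([1, 1, 1, 0, 0, 0])
scores = np.array([0.9, 0.8, 0.6, 0.45, 0.3, 0.1])
print(compute_eer(labels, scores))  # equal error rate in [0, 1]
```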
dataset/__pycache__/dataloader_demo.cpython-37.pyc
ADDED
Binary file (7.73 kB)

dataset/__pycache__/dataloader_infe.cpython-37.pyc
ADDED
Binary file (6.73 kB)

dataset/__pycache__/google.cpython-37.pyc
ADDED
Binary file (8.57 kB)

dataset/__pycache__/google_infe202405.cpython-37.pyc
ADDED
Binary file (8.64 kB)

dataset/__pycache__/libriphrase.cpython-37.pyc
ADDED
Binary file (13.6 kB)

dataset/__pycache__/libriphrase_ctc1.cpython-37.pyc
ADDED
Binary file (14.3 kB)

dataset/__pycache__/qualcomm.cpython-37.pyc
ADDED
Binary file (8.06 kB)
dataset/dataloader_demo.py
ADDED
@@ -0,0 +1,182 @@
import math, os, re, sys
from pathlib import Path
import numpy as np
import pandas as pd
from multiprocessing import Pool
from scipy.io import wavfile
import tensorflow as tf
from pydub import AudioSegment
from tensorflow.keras.utils import Sequence, OrderedEnqueuer
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences

sys.path.append(os.path.dirname(__file__))
from g2p.g2p_en.g2p import G2p

import warnings
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)
np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)

class GoogleCommandsDataloader(Sequence):
    def __init__(self,
                 batch_size,
                 fs=16000,
                 keyword=['realtek go', 'ok google', 'vintage', 'hackney', 'crocodile', 'surroundings', 'oversaw', 'northwestern'],
                 wav_path_or_object='/share/nas165/yiting/recording/ok_google/Default_20240725-183008.wav',
                 features='g2p_embed',  # phoneme, g2p_embed, both ...
                 ):

        phonemes = ["<pad>", ] + ['AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
                                  'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH',
                                  'D', 'DH', 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1',
                                  'EY2', 'F', 'G', 'HH', 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2',
                                  'JH', 'K', 'L', 'M', 'N', 'NG', 'OW0', 'OW1', 'OW2', 'OY0',
                                  'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH0', 'UH1',
                                  'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH',
                                  ' ']

        self.p2idx = {p: idx for idx, p in enumerate(phonemes)}
        self.idx2p = {idx: p for idx, p in enumerate(phonemes)}

        self.batch_size = batch_size
        self.fs = fs
        self.features = features
        self.nPhoneme = len(phonemes)
        self.g2p = G2p()
        self.keyword = keyword
        self.wav = wav_path_or_object
        self.__prep__()
        self.on_epoch_end()

    def __prep__(self):
        self.data = pd.DataFrame(columns=['wav', 'text', 'duration', 'label'])
        anchor = ' '
        target_dict = {}
        if isinstance(self.wav, str):
            anchor = self.wav.split('/')[-2].lower().replace('_', ' ')
            duration = float(wavfile.read(self.wav)[1].shape[-1]) / self.fs
        else:
            duration = float(self.wav[1].shape[-1]) / self.fs

        # duration = float(wavfile.read(self.wav)[1].shape[-1]) / self.fs
        # duration = float(self.wav_path_or_object.shape[-1]) / self.fs

        for i, comparison_text in enumerate(self.keyword):
            label = 1 if comparison_text == anchor else 0
            target_dict[i] = {
                'wav': self.wav,
                'text': comparison_text,
                'duration': duration,
                'label': label
            }

        print(target_dict)
        self.data = self.data.append(pd.DataFrame.from_dict(target_dict, 'index'), ignore_index=True)
        print(self.data)
        # g2p & p2idx by g2p_en package
        print(">> Convert word to phoneme")
        self.data['phoneme'] = self.data['text'].apply(lambda x: self.g2p(re.sub(r"[^a-zA-Z0-9]+", ' ', x)))
        print(">> Convert phoneme to index")
        self.data['pIndex'] = self.data['phoneme'].apply(lambda x: [self.p2idx[t] for t in x])
        print(">> Compute phoneme embedding")
        self.data['g2p_embed'] = self.data['text'].apply(lambda x: self.g2p.embedding(x))

        # if (self.pkl is not None) and (not os.path.isfile(self.pkl)):
        #     self.data.to_pickle(self.pkl)

        # Get longest data
        self.wav_list = self.data['wav'].values
        self.idx_list = self.data['pIndex'].values
        # self.idx_list = [np.insert(lst, 0, 0) for lst in self.idx_list]
        # self.sIdx_list = [np.insert(lst, 0, 0) for lst in self.sIdx_list]
        self.emb_list = self.data['g2p_embed'].values
        self.lab_list = self.data['label'].values
        self.data = self.data.sort_values(by='duration').reset_index(drop=True)

        # Set dataloader params.
        self.len = len(self.data)
        self.maxlen_t = int((int(self.data['text'].apply(lambda x: len(x)).max() / 10) + 1) * 10)
        # self.maxlen_a = int(((int(self.data['duration'].values[-1] / 0.5) + 1) * self.fs / 2) * 1.2)
        # print(self.maxlen_a)
        self.maxlen_a = 56000

    def __len__(self):
        # return total batch-wise length
        return math.ceil(self.len / self.batch_size)

    def _load_wav(self, wav):
        return np.array(wavfile.read(wav)[1]).astype(np.float32) / 32768.0

    def __getitem__(self, idx):
        # chunking
        indices = self.indices[idx * self.batch_size : (idx + 1) * self.batch_size]

        # load inputs
        if isinstance(self.wav, str):
            batch_x = [np.array(wavfile.read(self.wav_list[i])[1]).astype(np.float32) / 32768.0 for i in indices]
        else:
            batch_x = [np.array((self.wav_list[i])[1]).astype(np.float32) / 32768.0 for i in indices]
        # batch_x = [np.array(wavfile.read(self.wav_list[i])[1]).astype(np.float32) / 32768.0 for i in indices]
        if self.features == 'both':
            batch_p = [np.array(self.idx_list[i]).astype(np.int32) for i in indices]
            batch_e = [np.array(self.emb_list[i]).astype(np.float32) for i in indices]
        else:
            if self.features == 'phoneme':
                batch_y = [np.array(self.idx_list[i]).astype(np.int32) for i in indices]
            elif self.features == 'g2p_embed':
                batch_y = [np.array(self.emb_list[i]).astype(np.float32) for i in indices]
        # load outputs
        batch_z = [np.array([self.lab_list[i]]).astype(np.float32) for i in indices]

        # padding and masking
        pad_batch_x = pad_sequences(np.array(batch_x), maxlen=self.maxlen_a, value=0.0, padding='post', dtype=batch_x[0].dtype)
        if self.features == 'both':
            pad_batch_p = pad_sequences(np.array(batch_p), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_p[0].dtype)
            pad_batch_e = pad_sequences(np.array(batch_e), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_e[0].dtype)
        else:
            pad_batch_y = pad_sequences(np.array(batch_y), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_y[0].dtype)
        pad_batch_z = pad_sequences(np.array(batch_z), value=0.0, padding='post', dtype=batch_z[0].dtype)

        if self.features == 'both':
            return pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z
        else:
            return pad_batch_x, pad_batch_y, pad_batch_z

    def on_epoch_end(self):
        self.indices = np.arange(self.len)
        # if self.shuffle == True:
        #     np.random.shuffle(self.indices)

def convert_sequence_to_dataset(dataloader):
    def data_generator():
        for i in range(dataloader.__len__()):
            if dataloader.features == 'both':
                pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z = dataloader[i]
                yield pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z
            else:
                pad_batch_x, pad_batch_y, pad_batch_z = dataloader[i]
                yield pad_batch_x, pad_batch_y, pad_batch_z

    if dataloader.features == 'both':
        data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
            tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
            tf.TensorSpec(shape=(None, dataloader.maxlen_t), dtype=tf.int32),
            tf.TensorSpec(shape=(None, dataloader.maxlen_t, 256), dtype=tf.float32),
            tf.TensorSpec(shape=(None, 1), dtype=tf.float32),)
        )
    else:
        data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
            tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
            tf.TensorSpec(shape=(None, dataloader.maxlen_t) if dataloader.features == 'phoneme' else (None, dataloader.maxlen_t, 256),
                          dtype=tf.int32 if dataloader.features == 'phoneme' else tf.float32),
            tf.TensorSpec(shape=(None, 1), dtype=tf.float32),)
        )
    # data_dataset = data_dataset.cache()
    # data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=output_signature)
    data_dataset = data_dataset.prefetch(1)

    return data_dataset

if __name__ == '__main__':
    dataloader = GoogleCommandsDataloader(2048, features='g2p_embed')
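A hedged usage sketch for this dataloader; the wav path and keyword list are placeholders, not files or settings shipped with this commit:

```python
# Build the demo dataloader for one clip against a small keyword list,
# then wrap it as a tf.data pipeline (repo root assumed on sys.path).
from dataset.dataloader_demo import GoogleCommandsDataloader, convert_sequence_to_dataset

loader = GoogleCommandsDataloader(
    batch_size=8,
    keyword=['ok google', 'vintage'],
    wav_path_or_object='path/to/ok_google/sample.wav',  # placeholder
    features='both')
ds = convert_sequence_to_dataset(loader)
for batch in ds.take(1):
    # shapes: (B, maxlen_a), (B, maxlen_t), (B, maxlen_t, 256), (B, 1)
    print([t.shape for t in batch])
```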
dataset/dataloader_infe.py
ADDED
@@ -0,0 +1,164 @@
import math, os, re, sys
from pathlib import Path
import numpy as np
import pandas as pd
from multiprocessing import Pool
from scipy.io import wavfile
import tensorflow as tf

from tensorflow.keras.utils import Sequence, OrderedEnqueuer
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences

sys.path.append(os.path.dirname(__file__))
from g2p.g2p_en.g2p import G2p

import warnings
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)
np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)


def dataloader(fs=16000, keyword='', wav_path_or_object=None, g2p=None,
               features='both'  # phoneme, g2p_embed, both ...
               ):

    phonemes = ["<pad>", ] + ['AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
                              'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH',
                              'D', 'DH', 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1',
                              'EY2', 'F', 'G', 'HH', 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2',
                              'JH', 'K', 'L', 'M', 'N', 'NG', 'OW0', 'OW1', 'OW2', 'OY0',
                              'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH0', 'UH1',
                              'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH',
                              ' ']

    p2idx = {p: idx for idx, p in enumerate(phonemes)}
    idx2p = {idx: p for idx, p in enumerate(phonemes)}

    fs = fs
    wav_path_or_object = wav_path_or_object
    keyword = keyword

    features = features
    # g2p = G2p()

    data = pd.DataFrame(columns=['wav', 'wav_label', 'text', 'duration', 'label'])

    target_dict = {}
    idx = 0

    wav = wav_path_or_object
    keyword = keyword
    if isinstance(wav_path_or_object, str):
        duration = float(wavfile.read(wav)[1].shape[-1]) / fs
    else:
        duration = float(wav_path_or_object.shape[-1]) / fs
    label = 1
    anchor_text = wav.split('/')[-2].lower()
    target_dict[idx] = {
        'wav': wav,
        'wav_label': anchor_text,
        'text': keyword,
        'duration': duration,
        'label': label
    }
    data = data.append(pd.DataFrame.from_dict(target_dict, 'index'), ignore_index=True)

    # g2p & p2idx by g2p_en package
    # print(">> Convert word to phoneme")
    data['phoneme'] = data['text'].apply(lambda x: g2p(re.sub(r"[^a-zA-Z0-9]+", ' ', x)))
    # print(">> Convert phoneme to index")
    data['pIndex'] = data['phoneme'].apply(lambda x: [p2idx[t] for t in x])
    # print(">> Compute phoneme embedding")
    data['g2p_embed'] = data['text'].apply(lambda x: g2p.embedding(x))
    data['wav_phoneme'] = data['wav_label'].apply(lambda x: g2p(re.sub(r"[^a-zA-Z0-9]+", ' ', x)))
    data['wav_pIndex'] = data['wav_phoneme'].apply(lambda x: [p2idx[t] for t in x])
    # print(data['phoneme'])
    # Get longest data
    data = data.sort_values(by='duration').reset_index(drop=True)
    wav_list = data['wav'].values
    idx_list = data['pIndex'].values
    emb_list = data['g2p_embed'].values
    lab_list = data['label'].values
    sIdx_list = data['wav_pIndex'].values
    # Set dataloader params.
    # len = len(data)
    maxlen_t = int((int(data['text'].apply(lambda x: len(x)).max() / 10) + 1) * 10)
    maxlen_a = int((int(data['duration'].values[-1] / 0.5) + 1) * fs / 2)
    maxlen_l = int((int(data['wav_label'].apply(lambda x: len(x)).max() / 10) + 1) * 10)
    indices = [0]

    # load inputs
    if isinstance(wav_path_or_object, str):
        batch_x = [np.array(wavfile.read(wav_list[i])[1]).astype(np.float32) / 32768.0 for i in indices]
    else:
        batch_x = [wav_list[i] / 32768.0 for i in indices]
    if features == 'both':
        batch_p = [np.array(idx_list[i]).astype(np.int32) for i in indices]
        batch_e = [np.array(emb_list[i]).astype(np.float32) for i in indices]
    else:
        if features == 'phoneme':
            batch_y = [np.array(idx_list[i]).astype(np.int32) for i in indices]
        elif features == 'g2p_embed':
            batch_y = [np.array(emb_list[i]).astype(np.float32) for i in indices]
    # load outputs
    batch_z = [np.array([lab_list[i]]).astype(np.float32) for i in indices]
    batch_l = [np.array(sIdx_list[i]).astype(np.int32) for i in indices]
    # padding and masking
    pad_batch_x = pad_sequences(np.array(batch_x), maxlen=maxlen_a, value=0.0, padding='post', dtype=batch_x[0].dtype)
    if features == 'both':
        pad_batch_p = pad_sequences(np.array(batch_p), maxlen=maxlen_t, value=0.0, padding='post', dtype=batch_p[0].dtype)
        pad_batch_e = pad_sequences(np.array(batch_e), maxlen=maxlen_t, value=0.0, padding='post', dtype=batch_e[0].dtype)
    else:
        pad_batch_y = pad_sequences(np.array(batch_y), maxlen=maxlen_t, value=0.0, padding='post', dtype=batch_y[0].dtype)
    pad_batch_z = pad_sequences(np.array(batch_z), value=0.0, padding='post', dtype=batch_z[0].dtype)
    pad_batch_l = pad_sequences(np.array(batch_l), maxlen=maxlen_l, value=0.0, padding='post', dtype=batch_l[0].dtype)
    if features == 'both':
        return pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z, batch_l
    else:
        return pad_batch_x, pad_batch_y, pad_batch_z, batch_l

# def _load_wav(self, wav):
#     return np.array(wavfile.read(wav)[1]).astype(np.float32) / 32768.0


def convert_sequence_to_dataset(dataloader, wav, text, features):
    fs = 16000
    features = features
    duration = float(wavfile.read(wav)[1].shape[-1]) / fs
    maxlen_t = int((int(len(text) / 10) + 1) * 10)
    maxlen_a = int((int(duration / 0.5) + 1) * fs / 2)
    wav_label = wav.split('/')[-2].lower()

    def data_generator():
        if features == 'both':
            pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z, pad_batch_l = dataloader
            yield pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z, pad_batch_l
        else:
            pad_batch_x, pad_batch_y, pad_batch_z, pad_batch_l = dataloader
            yield pad_batch_x, pad_batch_y, pad_batch_z, pad_batch_l

    if features == 'both':
        data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
            tf.TensorSpec(shape=(None, maxlen_a), dtype=tf.float32),
            tf.TensorSpec(shape=(None, maxlen_t), dtype=tf.int32),
            tf.TensorSpec(shape=(None, maxlen_t, 256), dtype=tf.float32),
            tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
            tf.TensorSpec(shape=(None, None), dtype=tf.int32),)
        )
    else:
        data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
            tf.TensorSpec(shape=(None, maxlen_a), dtype=tf.float32),
            tf.TensorSpec(shape=(None, maxlen_t) if features == 'phoneme' else (None, maxlen_t, 256),
                          dtype=tf.int32 if features == 'phoneme' else tf.float32),
            tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
            tf.TensorSpec(shape=(None, None), dtype=tf.int32),)
        )
    # data_dataset = data_dataset.cache()
    data_dataset = data_dataset.prefetch(1)

    return data_dataset
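A hedged usage sketch for this single-pair inference dataloader; again the wav path and keyword are placeholders, and G2p comes from the bundled dataset/g2p package:

```python
# Run one utterance/keyword pair through dataloader() (repo root assumed).
from dataset.g2p.g2p_en.g2p import G2p
from dataset.dataloader_infe import dataloader

batch = dataloader(keyword='ok google',
                   wav_path_or_object='path/to/ok_google/sample.wav',  # placeholder
                   g2p=G2p(), features='both')
pad_x, pad_p, pad_e, pad_z, batch_l = batch
print(pad_x.shape, pad_p.shape, pad_e.shape, pad_z.shape)
```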
dataset/g2p/LICENSE.txt
ADDED
@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "{}"
|
182 |
+
replaced with your own identifying information. (Don't include
|
183 |
+
the brackets!) The text should be enclosed in the appropriate
|
184 |
+
comment syntax for the file format. We also recommend that a
|
185 |
+
file or class name and description of purpose be included on the
|
186 |
+
same "printed page" as the copyright notice for easier
|
187 |
+
identification within third-party archives.
|
188 |
+
|
189 |
+
Copyright {yyyy} {name of copyright owner}
|
190 |
+
|
191 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
192 |
+
you may not use this file except in compliance with the License.
|
193 |
+
You may obtain a copy of the License at
|
194 |
+
|
195 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
196 |
+
|
197 |
+
Unless required by applicable law or agreed to in writing, software
|
198 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
199 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
200 |
+
See the License for the specific language governing permissions and
|
201 |
+
limitations under the License.
|
dataset/g2p/g2p_en/__init__.py
ADDED
@@ -0,0 +1 @@
+from .g2p import G2p
dataset/g2p/g2p_en/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (186 Bytes)
dataset/g2p/g2p_en/__pycache__/expand.cpython-37.pyc
ADDED
Binary file (2.39 kB)
dataset/g2p/g2p_en/__pycache__/g2p.cpython-37.pyc
ADDED
Binary file (8.05 kB)
dataset/g2p/g2p_en/checkpoint20.npz
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b8af35e4596d8dd5836dfd3fe9b2ba4f97b9c311efe8879544cbcfcbd566d8c6
+size 3342298
dataset/g2p/g2p_en/expand.py
ADDED
@@ -0,0 +1,79 @@
+# -*- coding: utf-8 -*-
+#/usr/bin/python2
+'''
+Borrowed
+from https://github.com/keithito/tacotron/blob/master/text/numbers.py
+By kyubyong park. kbpark.linguist@gmail.com.
+https://www.github.com/kyubyong/g2p
+'''
+from __future__ import print_function
+import inflect
+import re
+
+
+
+_inflect = inflect.engine()
+_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
+_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
+_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
+_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
+_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
+_number_re = re.compile(r'[0-9]+')
+
+
+def _remove_commas(m):
+    return m.group(1).replace(',', '')
+
+
+def _expand_decimal_point(m):
+    return m.group(1).replace('.', ' point ')
+
+
+def _expand_dollars(m):
+    match = m.group(1)
+    parts = match.split('.')
+    if len(parts) > 2:
+        return match + ' dollars'  # Unexpected format
+    dollars = int(parts[0]) if parts[0] else 0
+    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
+    if dollars and cents:
+        dollar_unit = 'dollar' if dollars == 1 else 'dollars'
+        cent_unit = 'cent' if cents == 1 else 'cents'
+        return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
+    elif dollars:
+        dollar_unit = 'dollar' if dollars == 1 else 'dollars'
+        return '%s %s' % (dollars, dollar_unit)
+    elif cents:
+        cent_unit = 'cent' if cents == 1 else 'cents'
+        return '%s %s' % (cents, cent_unit)
+    else:
+        return 'zero dollars'
+
+
+def _expand_ordinal(m):
+    return _inflect.number_to_words(m.group(0))
+
+
+def _expand_number(m):
+    num = int(m.group(0))
+    if num > 1000 and num < 3000:
+        if num == 2000:
+            return 'two thousand'
+        elif num > 2000 and num < 2010:
+            return 'two thousand ' + _inflect.number_to_words(num % 100)
+        elif num % 100 == 0:
+            return _inflect.number_to_words(num // 100) + ' hundred'
+        else:
+            return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
+    else:
+        return _inflect.number_to_words(num, andword='')
+
+
+def normalize_numbers(text):
+    text = re.sub(_comma_number_re, _remove_commas, text)
+    text = re.sub(_pounds_re, r'\1 pounds', text)
+    text = re.sub(_dollars_re, _expand_dollars, text)
+    text = re.sub(_decimal_number_re, _expand_decimal_point, text)
+    text = re.sub(_ordinal_re, _expand_ordinal, text)
+    text = re.sub(_number_re, _expand_number, text)
+    return text
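
Note: a minimal usage sketch of normalize_numbers above, not part of the commit; the import path assumes this repository layout, and the spelled-out forms are what the rules above should produce.

from dataset.g2p.g2p_en.expand import normalize_numbers

print(normalize_numbers("I have $250 in my pocket."))  # dollars then digits expanded, e.g. "two hundred fifty dollars"
print(normalize_numbers("chapter 3, page 1,024"))      # commas stripped before numbers are spelled out
print(normalize_numbers("the 2nd of May 1984"))        # ordinals expanded; 1984 read year-style ("nineteen eighty-four")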
dataset/g2p/g2p_en/g2p.py
ADDED
@@ -0,0 +1,249 @@
+# -*- coding: utf-8 -*-
+# /usr/bin/python
+'''
+By kyubyong park(kbpark.linguist@gmail.com) and Jongseok Kim(https://github.com/ozmig77)
+https://www.github.com/kyubyong/g2p
+'''
+from nltk import pos_tag
+from nltk.corpus import cmudict
+import nltk
+from nltk.tokenize import TweetTokenizer
+word_tokenize = TweetTokenizer().tokenize
+import numpy as np
+import codecs
+import re
+import os, sys
+import unicodedata
+from builtins import str as unicode
+
+sys.path.append(os.path.dirname(__file__))
+from expand import normalize_numbers
+
+try:
+    nltk.data.find('taggers/averaged_perceptron_tagger.zip')
+except LookupError:
+    nltk.download('averaged_perceptron_tagger')
+try:
+    nltk.data.find('corpora/cmudict.zip')
+except LookupError:
+    nltk.download('cmudict')
+
+dirname = os.path.dirname(__file__)
+
+def construct_homograph_dictionary():
+    f = os.path.join(dirname, 'homographs.en')
+    homograph2features = dict()
+    for line in codecs.open(f, 'r', 'utf8').read().splitlines():
+        if line.startswith("#"): continue  # comment
+        headword, pron1, pron2, pos1 = line.strip().split("|")
+        homograph2features[headword.lower()] = (pron1.split(), pron2.split(), pos1)
+    return homograph2features
+
+# def segment(text):
+#     '''
+#     Splits text into `tokens`.
+#     :param text: A string.
+#     :return: A list of tokens (string).
+#     '''
+#     print(text)
+#     text = re.sub('([.,?!]( |$))', r' \1', text)
+#     print(text)
+#     return text.split()
+
+class G2p(object):
+    def __init__(self):
+        super().__init__()
+        self.graphemes = ["<pad>", "<unk>", "</s>"] + list("abcdefghijklmnopqrstuvwxyz")
+        self.phonemes = ["<pad>", "<unk>", "<s>", "</s>"] + ['AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
+                                                             'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D', 'DH',
+                                                             'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1',
+                                                             'EY2', 'F', 'G', 'HH',
+                                                             'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L',
+                                                             'M', 'N', 'NG', 'OW0', 'OW1',
+                                                             'OW2', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH',
+                                                             'UH0', 'UH1', 'UH2', 'UW',
+                                                             'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH']
+        self.g2idx = {g: idx for idx, g in enumerate(self.graphemes)}
+        self.idx2g = {idx: g for idx, g in enumerate(self.graphemes)}
+
+        self.p2idx = {p: idx for idx, p in enumerate(self.phonemes)}
+        self.idx2p = {idx: p for idx, p in enumerate(self.phonemes)}
+
+        self.cmu = cmudict.dict()
+        self.load_variables()
+        self.homograph2features = construct_homograph_dictionary()
+
+    def load_variables(self):
+        self.variables = np.load(os.path.join(dirname, 'checkpoint20.npz'))
+        self.enc_emb = self.variables["enc_emb"]  # (29, 64). (len(graphemes), emb)
+        self.enc_w_ih = self.variables["enc_w_ih"]  # (3*128, 64)
+        self.enc_w_hh = self.variables["enc_w_hh"]  # (3*128, 128)
+        self.enc_b_ih = self.variables["enc_b_ih"]  # (3*128,)
+        self.enc_b_hh = self.variables["enc_b_hh"]  # (3*128,)
+
+        self.dec_emb = self.variables["dec_emb"]  # (74, 64). (len(phonemes), emb)
+        self.dec_w_ih = self.variables["dec_w_ih"]  # (3*128, 64)
+        self.dec_w_hh = self.variables["dec_w_hh"]  # (3*128, 128)
+        self.dec_b_ih = self.variables["dec_b_ih"]  # (3*128,)
+        self.dec_b_hh = self.variables["dec_b_hh"]  # (3*128,)
+        self.fc_w = self.variables["fc_w"]  # (74, 128)
+        self.fc_b = self.variables["fc_b"]  # (74,)
+
+    def sigmoid(self, x):
+        return 1 / (1 + np.exp(-x))
+
+    def grucell(self, x, h, w_ih, w_hh, b_ih, b_hh):
+        rzn_ih = np.matmul(x, w_ih.T) + b_ih
+        rzn_hh = np.matmul(h, w_hh.T) + b_hh
+
+        rz_ih, n_ih = rzn_ih[:, :rzn_ih.shape[-1] * 2 // 3], rzn_ih[:, rzn_ih.shape[-1] * 2 // 3:]
+        rz_hh, n_hh = rzn_hh[:, :rzn_hh.shape[-1] * 2 // 3], rzn_hh[:, rzn_hh.shape[-1] * 2 // 3:]
+
+        rz = self.sigmoid(rz_ih + rz_hh)
+        r, z = np.split(rz, 2, -1)
+
+        n = np.tanh(n_ih + r * n_hh)
+        h = (1 - z) * n + z * h
+
+        return h
+
+    def gru(self, x, steps, w_ih, w_hh, b_ih, b_hh, h0=None):
+        if h0 is None:
+            h0 = np.zeros((x.shape[0], w_hh.shape[1]), np.float32)
+        h = h0  # initial hidden state
+        outputs = np.zeros((x.shape[0], steps, w_hh.shape[1]), np.float32)
+        for t in range(steps):
+            h = self.grucell(x[:, t, :], h, w_ih, w_hh, b_ih, b_hh)  # (b, h)
+            outputs[:, t, ::] = h
+        return outputs
+
+    def encode(self, word):
+        chars = list(word) + ["</s>"]
+        x = [self.g2idx.get(char, self.g2idx["<unk>"]) for char in chars]
+        x = np.take(self.enc_emb, np.expand_dims(x, 0), axis=0)
+
+        return x
+
+    def predict(self, word):
+        # encoder
+        enc = self.encode(word)
+        enc = self.gru(enc, len(word) + 1, self.enc_w_ih, self.enc_w_hh,
+                       self.enc_b_ih, self.enc_b_hh, h0=np.zeros((1, self.enc_w_hh.shape[-1]), np.float32))
+        last_hidden = enc[:, -1, :]
+
+        # decoder
+        dec = np.take(self.dec_emb, [2], axis=0)  # 2: <s>
+        h = last_hidden
+
+        preds = []
+        for i in range(20):
+            h = self.grucell(dec, h, self.dec_w_ih, self.dec_w_hh, self.dec_b_ih, self.dec_b_hh)  # (b, h)
+            logits = np.matmul(h, self.fc_w.T) + self.fc_b
+            pred = logits.argmax()
+            if pred == 3: break  # 3: </s>
+            preds.append(pred)
+            dec = np.take(self.dec_emb, [pred], axis=0)
+
+        preds = [self.idx2p.get(idx, "<unk>") for idx in preds]
+
+        return preds
+
+    def __call__(self, text):
+        # preprocessing
+        text = unicode(text)
+        text = normalize_numbers(text)
+        text = ''.join(char for char in unicodedata.normalize('NFD', text)
+                       if unicodedata.category(char) != 'Mn')  # Strip accents
+        text = text.lower()
+        text = text.replace("_", " ")
+        text = re.sub("[^ a-z'.,?!\-]", "", text)
+        text = text.replace("i.e.", "that is")
+        text = text.replace("e.g.", "for example")
+
+        # tokenization
+        words = word_tokenize(text)
+        tokens = pos_tag(words)  # tuples of (word, tag)
+
+        # steps
+        prons = []
+        for word in words:
+            if re.search("[a-z]", word) is None:
+                continue
+
+            # elif word in self.homograph2features:  # Check homograph
+            #     pron1, pron2, pos1 = self.homograph2features[word]
+            #     if pos.startswith(pos1):
+            #         pron = pron1
+            #     else:
+            #         pron = pron2
+            # elif word in self.cmu:  # lookup CMU dict
+            #     pron = self.cmu[word][0]
+            # else:  # predict for oov
+
+            pron = self.predict(word)
+
+            prons.extend(pron)
+            prons.extend([" "])
+
+        return prons[:-1]
+
+    def embedding(self, text):
+        # preprocessing
+        text = unicode(text)
+        text = normalize_numbers(text)
+        text = ''.join(char for char in unicodedata.normalize('NFD', text)
+                       if unicodedata.category(char) != 'Mn')  # Strip accents
+        text = text.lower()
+        text = re.sub("[^ a-z'.,?!\-]", "", text)
+        text = text.replace("i.e.", "that is")
+        text = text.replace("e.g.", "for example")
+
+        # tokenization
+        words = word_tokenize(text)
+
+        # embedding func.
+        def _get(self, word):
+            # encoder
+            enc = self.encode(word)
+            enc = self.gru(enc, len(word) + 1, self.enc_w_ih, self.enc_w_hh,
+                           self.enc_b_ih, self.enc_b_hh, h0=np.zeros((1, self.enc_w_hh.shape[-1]), np.float32))
+            last_hidden = enc[:, -1, :]
+
+            # decoder
+            dec = np.take(self.dec_emb, [2], axis=0)  # 2: <s>
+            h = last_hidden
+
+            preds = []
+            emb = np.empty((0, self.dec_emb[0, :].shape[-1]))
+            for i in range(20):
+                h = self.grucell(dec, h, self.dec_w_ih, self.dec_w_hh, self.dec_b_ih, self.dec_b_hh)  # (b, h)
+                logits = np.matmul(h, self.fc_w.T) + self.fc_b
+                pred = logits.argmax()
+                if pred == 3: break  # 3: </s>
+                dec = np.take(self.dec_emb, [pred], axis=0)
+                emb = np.append(emb, h, axis=0)
+
+            return emb
+
+        # steps
+        embed = np.empty((0, self.dec_emb[0, :].shape[-1]))
+        for word in words:
+            if re.search("[a-z]", word) is None:
+                continue
+            embed = np.append(embed, _get(self, word), axis=0)
+            embed = np.append(embed, np.take(self.dec_emb, [0], axis=0), axis=0)
+
+        return embed[:-1, :]
+
+if __name__ == '__main__':
+    texts = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'hey_android', 'hey_snapdragon', 'hi_galaxy', 'hi_lumina']
+    # "I have $250 in my pocket.",  # number -> spell-out
+    # "popular pets, e.g. cats and dogs",  # e.g. -> for example
+    # "I refuse to collect the refuse around here.",  # homograph
+    # "I'm an activationist."]  # newly coined word
+    g2p = G2p()
+    for text in texts:
+        out = g2p(text)
+        emb = g2p.embedding(text)
+        print(out)
+        print(emb.shape)
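
Note: as committed, G2p routes every word through the neural predict() path (the homograph and CMU-dictionary branches are commented out above), and embedding() returns the decoder's hidden states rather than phoneme symbols. A minimal usage sketch, not part of the commit; the import path is an assumption, and the 256-dim hidden size is inferred from the dataloaders below rather than the (possibly stale) shape comments in load_variables.

from dataset.g2p.g2p_en import G2p

g2p = G2p()
print(g2p("hey snapdragon"))           # ARPAbet symbols with a " " token between words
emb = g2p.embedding("hey snapdragon")  # one decoder hidden state per predicted phoneme
print(emb.shape)                       # (num_phonemes, hidden_dim), 256 per the dataloaders' TensorSpec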
dataset/g2p/g2p_en/homographs.en
ADDED
@@ -0,0 +1,379 @@
+#This is based on http://www.minpairs.talktalk.net/graph.html
+#Each line is formatted as follows:
+#HEADWORD|PRONUNCIATION1|PRONUNCIATION2|POS
+#HEADWORD should have PRONUNCIATION1 only if its part-of-speech is POS
+#Otherwise PRONUNCIATION2 is applied
+#May, 2018
+#Kyubyong Park
+#https://github.com/kyubyong/g2p
+ABSENT|AH1 B S AE1 N T|AE1 B S AH0 N T|V
+ABSTRACT|AE0 B S T R AE1 K T|AE1 B S T R AE2 K T|V
+ABSTRACTS|AE0 B S T R AE1 K T S|AE1 B S T R AE0 K T S|V
+ABUSE|AH0 B Y UW1 Z|AH0 B Y UW1 S|V
+ABUSES|AH0 B Y UW1 Z IH0 Z|AH0 B Y UW1 S IH0 Z|V
+ACCENT|AH0 K S EH1 N T|AE1 K S EH2 N T|V
+ACCENTS|AE1 K S EH0 N T S|AE1 K S EH0 N T S|V
+ADDICT|AH0 D IH1 K T|AE1 D IH2 K T|V
+ADDICTS|AH0 D IH1 K T S|AE1 D IH2 K T S|V
+ADVOCATE|AE1 D V AH0 K EY2 T|AE1 D V AH0 K AH0 T|V
+ADVOCATES|AE1 D V AH0 K EY2 T S|AE1 D V AH0 K AH0 T S|V
+AFFECT|AH0 F EH1 K T|AE1 F EH0 K T|V
+AFFECTS|AH0 F EH1 K T S|AE1 F EH0 K T S|V
+AFFIX|AH0 F IH1 K S|AE1 F IH0 K S|V
+AFFIXES|AH0 F IH1 K S IH0 Z|AE1 F IH0 K S IH0 Z|V
+AGGLOMERATE|AH0 G L AA1 M ER0 EY2 T|AH0 G L AA1 M ER0 AH0 T|V
+AGGREGATE|AE1 G R AH0 G EY0 T|AE1 G R AH0 G AH0 T|V
+AGGREGATES|AE1 G R AH0 G EY2 T S|AE1 G R AH0 G IH0 T S|V
+ALLIES|AH0 L AY1 Z|AE1 L AY0 Z|V
+ALLOY|AH0 L OY1|AE1 L OY2|V
+ALLOYS|AH0 L OY1 Z|AE1 L OY2 Z|V
+ALLY|AH0 L AY1|AE1 L AY0|V
+ALTERNATE|AO1 L T ER0 N EY2 T|AO0 L T ER1 N AH0 T|V
+ANALYSES|AH0 N AE1 L IH0 S IY2 Z|AE1 N AH0 L AY0 Z IH2 Z|V
+ANIMATE|AE1 N AH0 M EY2 T|AE1 N AH0 M AH0 T|V
+ANNEX|AH0 N EH1 K S|AE1 N EH2 K S|V
+ANNEXES|AH0 N EH1 K S IH0 Z|AE1 N EH2 K S IH0 Z|V
+APPROPRIATE|AH0 P R OW1 P R IY0 EY2 T|AH0 P R OW1 P R IY0 AH0 T|V
+APPROXIMATE|AH0 P R AA1 K S AH0 M EY2 T|AH0 P R AA1 K S AH0 M AH0 T|V
+ARTICULATE|AA0 R T IH1 K Y AH0 L AH0 T|AA0 R T IH1 K Y AH0 L EY2 T|V
+ASPIRATE|AE1 S P ER0 EY2 T|AE1 S P ER0 AH0 T|V
+ASPIRATES|AE1 S P ER0 EY2 T S|AE1 S P ER0 AH0 T S|V
+ASSOCIATE|AH0 S OW1 S IY0 EY2 T|AH0 S OW1 S IY0 AH0 T|V
+ASSOCIATES|AH0 S OW1 S IY0 EY2 T S|AH0 S OW1 S IY0 AH0 T S|V
+ATTRIBUTE|AH0 T R IH1 B Y UW2 T|AE1 T R IH0 B Y UW0 T|V
+ATTRIBUTES|AH0 T R IH1 B Y UW2 T S|AE1 T R IH0 B Y UW0 T S|V
+BATHS|B AE1 TH S|B AE1 DH Z|V
+BLESSED|B L EH1 S IH0 D|B L EH1 S T|V
+CERTIFICATE|S ER0 T IH1 F IH0 K AH0 T|S ER0 T IH1 F IH0 K EY2 T|V
+CERTIFICATES|S ER0 T IH1 F IH0 K EY2 T S|S ER0 T IH1 F IH0 K AH0 T S|V
+CLOSE|K L OW1 Z|K L OW1 S|V
+CLOSER|K L OW1 Z ER0|K L OW1 S ER0|N
+CLOSES|K L OW1 Z IH0 Z|K L OW1 S IH0 Z|V
+COLLECT|K AH0 L EH1 K T|K AA1 L EH0 K T|V
+COLLECTS|K AH0 L EH1 K T S|K AA1 L EH0 K T S|V
+COMBAT|K AH0 M B AE1 T|K AA1 M B AE0 T|V
+COMBATS|K AH0 M B AE1 T S|K AH1 M B AE0 T S|V
+COMBINE|K AH0 M B AY1 N|K AA1 M B AY0 N|V
+COMMUNE|K AH0 M Y UW1 N|K AA1 M Y UW0 N|V
+COMMUNES|K AH0 M Y UW1 N Z|K AA1 M Y UW0 N Z|V
+COMPACT|K AH0 M P AE1 K T|K AA1 M P AE0 K T|V
+COMPACTS|K AH0 M P AE1 K T S|K AA1 M P AE0 K T S|V
+COMPLEX|K AH0 M P L EH1 K S| K AA1 M P L EH0 K S|ADJ
+COMPLIMENT|K AA1 M P L AH0 M EH0 N T|K AA1 M P L AH0 M AH0 N T|V
+COMPLIMENTS|K AA1 M P L AH0 M EH0 N T S|K AA1 M P L AH0 M AH0 N T S|V
+COMPOUND|K AH0 M P AW1 N D|K AA1 M P AW0 N D|V
+COMPOUNDS|K AH0 M P AW1 N D Z|K AA1 M P AW0 N D Z|V
+COMPRESS|K AH0 M P R EH1 S|K AA1 M P R EH0 S|V
+COMPRESSES|K AH0 M P R EH1 S IH0 Z|K AA1 M P R EH0 S AH0 Z|V
+CONCERT|K AH0 N S ER1 T|K AA1 N S ER0 T|V
+CONCERTS|K AH0 N S ER1 T S|K AA1 N S ER0 T S|V
+CONDUCT|K AA0 N D AH1 K T|K AA1 N D AH0 K T|V
+CONFEDERATE|K AH0 N F EH1 D ER0 EY2 T|K AH0 N F EH1 D ER0 AH0 T|V
+CONFEDERATES|K AH0 N F EH1 D ER0 EY2 T S|K AH0 N F EH1 D ER0 AH0 T S|V
+CONFINES|K AH0 N F AY1 N Z|K AA1 N F AY2 N Z|V
+CONFLICT|K AH0 N F L IH1 K T|K AA1 N F L IH0 K T|V
+CONFLICTS|K AH0 N F L IH1 K T S|K AA1 N F L IH0 K T S|V
+CONGLOMERATE|K AH0 N G L AA1 M ER0 EY2 T|K AH0 N G L AA1 M ER0 AH0 T|V
+CONGLOMERATES|K AH0 N G L AA1 M ER0 EY2 T S|K AH0 N G L AA1 M ER0 AH0 T S|V
+CONSCRIPT|K AH0 N S K R IH1 P T|K AA1 N S K R IH0 P T|V
+CONSCRIPTS|K AH0 N S K R IH1 P T S|K AA1 N S K R IH0 P T S|V
+CONSOLE|K AH0 N S OW1 L|K AA1 N S OW0 L|V
+CONSOLES|K AH0 N S OW1 L Z|K AA1 N S OW0 L Z|V
+CONSORT|K AH0 N S AO1 R T|K AA1 N S AO0 R T|V
+CONSTRUCT|K AH0 N S T R AH1 K T|K AA1 N S T R AH0 K T|V
+CONSTRUCTS|K AH0 N S T R AH1 K T S|K AA1 N S T R AH0 K T S|V
+CONSUMMATE|K AA1 N S AH0 M EY2 T|K AA0 N S AH1 M AH0 T|V
+CONTENT|K AA1 N T EH0 N T|K AH0 N T EH1 N T|N
+CONTENTS|K AH0 N T EH1 N T S|K AA1 N T EH0 N T S|V
+CONTEST|K AH0 N T EH1 S T|K AA1 N T EH0 S T|V
+CONTESTS|K AH0 N T EH1 S T S|K AA1 N T EH0 S T S|V
+CONTRACT|K AH0 N T R AE1 K T|K AA1 N T R AE2 K T|V
+CONTRACTS|K AH0 N T R AE1 K T S|K AA1 N T R AE2 K T S|V
+CONTRAST|K AH0 N T R AE1 S T|K AA1 N T R AE0 S T|V
+CONTRASTS|K AH0 N T R AE1 S T S|K AA1 N T R AE0 S T S|V
+CONVERSE|K AH0 N V ER1 S|K AA1 N V ER0 S|V
+CONVERT|K AH0 N V ER1 T|K AA1 N V ER0 T|V
+CONVERTS|K AH0 N V ER1 T S|K AA1 N V ER0 T S|V
+CONVICT|K AH0 N V IH1 K T|K AA1 N V IH0 K T|V
+CONVICTS|K AH0 N V IH1 K T S|K AA1 N V IH0 K T S|V
+COORDINATE|K OW0 AO1 R D AH0 N EY2 T|K OW0 AO1 R D AH0 N AH0 T|V
+COORDINATES|K OW0 AO1 R D AH0 N EY2 T S|K OW0 AO1 R D AH0 N AH0 T S|V
+COUNTERBALANCE|K AW1 N T ER0 B AE2 L AH0 N S|K AW2 N T ER0 B AE1 L AH0 N S|V
+COUNTERBALANCES|K AW2 N T ER0 B AE1 L AH0 N S IH0 Z|K AW1 N T ER0 B AE2 L AH0 N S IH0 Z|V
+CRABBED|K R AE1 B D|K R AE1 B IH0 D|V
+CROOKED|K R UH1 K T|K R UH1 K AH0 D|V
+CURATE|K Y UH0 R AH1 T|K Y UH1 R AH0 T|V
+CURSED|K ER1 S T|K ER1 S IH0 D|V
+DECOY|D IY0 K OY1|D IY1 K OY0|V
+DECOYS|D IY0 K OY1 Z|D IY1 K OY0 Z|V
+DECREASE|D IH0 K R IY1 S|D IY1 K R IY2 S|V
+DECREASES|D IH0 K R IY1 S IH0 Z|D IY1 K R IY2 S IH0 Z|V
+DEFECT|D IH0 F EH1 K T|D IY1 F EH0 K T|V
+DEFECTS|D IH0 F EH1 K T S|D IY1 F EH0 K T S|V
+DEGENERATE|D IH0 JH EH1 N ER0 EY2 T|D IH0 JH EH1 N ER0 AH0 T|V
+DEGENERATES|D IH0 JH EH1 N ER0 EY2 T S|D IH0 JH EH1 N ER0 AH0 T S|V
+DELEGATE|D EH1 L AH0 G EY2 T|D EH1 L AH0 G AH0 T|V
+DELEGATES|D EH1 L AH0 G EY2 T S|D EH1 L AH0 G AH0 T S|V
+DELIBERATE|D IH0 L IH1 B ER0 EY2 T|D IH0 L IH1 B ER0 AH0 T|V
+DESERT|D IH0 Z ER1 T|D EH1 Z ER0 T|V
+DESERTS|D IH0 Z ER1 T S|D EH1 Z ER0 T S|V
+DESOLATE|D EH1 S AH0 L EY2 T|D EH1 S AH0 L AH0 T|V
+DIAGNOSES|D AY1 AH0 G N OW2 Z IY0 Z|D AY2 AH0 G N OW1 S IY0 Z|V
+DICTATE|D IH0 K T EY1 T|D IH1 K T EY2 T|V
+DICTATES|D IH0 K T EY1 T S|D IH1 K T EY2 T S|V
+DIFFUSE|D IH0 F Y UW1 Z|D IH0 F Y UW1 S|V
+DIGEST|D AY0 JH EH1 S T|D AY1 JH EH0 S T|V
+DIGESTS|D AY2 JH EH1 S T S|D AY1 JH EH0 S T S|V
+DISCARD|D IH0 S K AA1 R D|D IH1 S K AA0 R D|V
+DISCARDS|D IH0 S K AA1 R D Z|D IH1 S K AA0 R D Z|V
+DISCHARGE|D IH0 S CH AA1 R JH|D IH1 S CH AA2 R JH|V
+DISCHARGES|D IH0 S CH AA1 R JH AH0 Z|D IH1 S CH AA2 R JH AH0 Z|V
+DISCOUNT|D IH0 S K AW1 N T|D IH1 S K AW0 N T|V
+DISCOUNTS|D IH0 S K AW1 N T S|D IH1 S K AW2 N T S|V
+DISCOURSE|D IH0 S K AO1 R S|D IH1 S K AO0 R S|V
+DISCOURSES|D IH0 S K AO1 R S IH0 Z|D IH1 S K AO0 R S IH0 Z|V
+DOCUMENT|D AA1 K Y UW0 M EH0 N T|D AA1 K Y AH0 M AH0 N T|V
+DOCUMENTS|D AA1 K Y UW0 M EH0 N T S|D AA1 K Y AH0 M AH0 N T S|V
+DOGGED|D AO1 G IH0 D|D AO1 G D|V
+DUPLICATE|D UW1 P L AH0 K EY2 T|D UW1 P L AH0 K AH0 T|V
+DUPLICATES|D UW1 P L AH0 K EY2 T S|D UW1 P L AH0 K AH0 T S|V
+EJACULATE|IH0 JH AE1 K Y UW0 L EY2 T|IH0 JH AE1 K Y UW0 L AH0 T|V
+EJACULATES|IH0 JH AE1 K Y UW0 L EY2 T S|IH0 JH AE1 K Y UW0 L AH0 T S|V
+ELABORATE|IH0 L AE1 B ER0 EY2 T|IH0 L AE1 B R AH0 T|V
+ENTRANCE|IH0 N T R AH1 N S|EH1 N T R AH0 N S|V
+ENTRANCES|IH0 N T R AH1 N S AH0 Z|EH1 N T R AH0 N S AH0 Z|V
+ENVELOPE|IH0 N V EH1 L AH0 P|EH1 N V AH0 L OW2 P|V
+ENVELOPES|IH0 N V EH1 L AH0 P S|EH1 N V AH0 L OW2 P S|V
+ESCORT|EH0 S K AO1 R T|EH1 S K AO0 R T|V
+ESCORTS|EH0 S K AO1 R T S|EH1 S K AO0 R T S|V
+ESSAY|EH0 S EY1|EH1 S EY2|V
+ESSAYS|EH0 S EY1 Z|EH1 S EY2 Z|V
+ESTIMATE|EH1 S T AH0 M EY2 T|EH1 S T AH0 M AH0 T|V
+ESTIMATES|EH1 S T AH0 M EY2 T S|EH1 S T AH0 M AH0 T S|V
+EXCESS|IH0 K S EH1 S|EH1 K S EH2 S|V
+EXCISE|EH0 K S AY1 S|EH1 K S AY0 Z|V
+EXCUSE|IH0 K S K Y UW1 Z|IH0 K S K Y UW1 S|V
+EXCUSES|IH0 K S K Y UW1 Z IH0 Z|IH0 K S K Y UW1 S IH0 Z|V
+EXPATRIATE|EH0 K S P EY1 T R IY0 EY2 T|EH0 K S P EY1 T R IY0 AH0 T|V
+EXPATRIATES|EH0 K S P EY1 T R IY0 EY2 T S|EH0 K S P EY1 T R IY0 AH0 T S|V
+EXPLOIT|EH1 K S P L OY2 T|EH2 K S P L OY1 T|V
+EXPLOITS|EH1 K S P L OY2 T S|EH2 K S P L OY1 T S|V
+EXPORT|IH0 K S P AO1 R T|EH1 K S P AO0 R T|V
+EXPORTS|IH0 K S P AO1 R T S|EH1 K S P AO0 R T S|V
+EXTRACT|IH0 K S T R AE1 K T|EH1 K S T R AE2 K T|V
+EXTRACTS|IH0 K S T R AE1 K T S|EH1 K S T R AE2 K T S|V
+FERMENT|F ER0 M EH1 N T|F ER1 M EH0 N T|V
+FERMENTS|F ER0 M EH1 N T S|F ER1 M EH0 N T S|V
+FRAGMENT|F R AE1 G M AH0 N T|F R AE0 G M EH1 N T|V
+FRAGMENTS|F R AE0 G M EH1 N T S|F R AE1 G M AH0 N T S|V
+FREQUENT|F R IY1 K W EH2 N T|F R IY1 K W AH0 N T|V
+GRADUATE|G R AE1 JH AH0 W EY2 T|G R AE1 JH AH0 W AH0 T|V
+GRADUATES|G R AE1 JH AH0 W EY2 T S|G R AE1 JH AH0 W AH0 T S|V
+HOUSE|HH AW1 Z|HH AW1 S|V
+IMPACT|IH2 M P AE1 K T|IH1 M P AE0 K T|V
+IMPACTS|IH2 M P AE1 K T S|IH1 M P AE0 K T S|V
+IMPLANT|IH2 M P L AE1 N T|IH1 M P L AE2 N T|V
+IMPLANTS|IH2 M P L AE1 N T S|IH1 M P L AE2 N T S|V
+IMPLEMENT|IH1 M P L AH0 M EH0 N T|IH1 M P L AH0 M AH0 N T|V
+IMPLEMENTS|IH1 M P L AH0 M EH0 N T S|IH1 M P L AH0 M AH0 N T S|V
+IMPORT|IH2 M P AO1 R T|IH1 M P AO2 R T|V
+IMPORTS|IH2 M P AO1 R T S|IH1 M P AO2 R T S|V
+IMPRESS|IH0 M P R EH1 S|IH1 M P R EH0 S|V
+IMPRINT|IH1 M P R IH0 N T|IH2 M P R IH1 N T|V
+IMPRINTS|IH2 M P R IH1 N T S|IH1 M P R IH0 N T S|V
+INCENSE|IH2 N S EH1 N S|IH1 N S EH2 N S|V
+INCLINE|IH2 N K L AY1 N|IH1 N K L AY0 N|V
+INCLINES|IH2 N K L AY1 N Z|IH1 N K L AY0 N Z|V
+INCORPORATE|IH2 N K AO1 R P ER0 EY2 T|IH2 N K AO1 R P ER0 AH0 T|V
+INCREASE|IH2 N K R IY1 S|IH1 N K R IY2 S|V
+INCREASES|IH2 N K R IY1 S IH0 Z|IH1 N K R IY2 S IH0 Z|V
+INDENT|IH2 N D EH1 N T|IH1 N D EH0 N T|V
+INDENTS|IH2 N D EH1 N T S|IH1 N D EH0 N T S|V
+INEBRIATE|IH2 N EH1 B R IY0 EY2 T|IH2 N EH1 B R IY0 AH0 T|V
+INEBRIATES|IH2 N EH1 B R IY0 EY2 T S|IH2 N EH1 B R IY0 AH0 T S|V
+INITIATE|IH2 N IH1 SH IY0 EY2 T|IH2 N IH1 SH IY0 AH0 T|V
+INITIATES|IH2 N IH1 SH IY0 EY2 T S|IH2 N IH1 SH IY0 AH0 T S|V
+INLAY|IH2 N L EY1|IH1 N L EY2|V
+INLAYS|IH2 N L EY1 Z|IH1 N L EY2 Z|V
+INSERT|IH2 N S ER1 T|IH1 N S ER2 T|V
+INSERTS|IH2 N S ER1 T S|IH1 N S ER2 T S|V
+INSET|IH2 N S EH1 T|IH1 N S EH2 T|V
+INSETS|IH2 N S EH1 T S|IH1 N S EH2 T S|V
+INSTINCT|IH2 N S T IH1 NG K T|IH1 N S T IH0 NG K T|V
+INSULT|IH2 N S AH1 L T|IH1 N S AH2 L T|V
+INSULTS|IH2 N S AH1 L T S|IH1 N S AH2 L T S|V
+INTERCHANGE|IH2 T ER0 CH EY1 N JH|IH1 N T ER0 CH EY2 N JH|V
+INTERCHANGES|IH2 T ER0 CH EY1 N JH IH0 Z|IH1 N T ER0 CH EY2 N JH IH0 Z|V
+INTERDICT|IH2 N T ER0 D IH1 K T|IH1 N T ER0 D IH2 K T|V
+INTERDICTS|IH2 N T ER0 D IH1 K T S|IH1 N T ER0 D IH2 K T S|V
+INTERN|IH0 N T ER1 N|IH1 N T ER0 N|V
+INTERNS|IH0 N T ER1 N Z|IH1 N T ER0 N Z|V
+INTIMATE|IH1 N T IH0 M EY2 T|IH1 N T AH0 M AH0 T|V
+INTIMATES|IH1 N T IH0 M EY2 T S|IH1 N T AH0 M AH0 T S|V
+INTROVERT|IH2 N T R AO0 V ER1 T|IH1 N T R AO0 V ER2 T|V
+INTROVERTS|IH2 N T R AO0 V ER1 T S|IH1 N T R AO0 V ER2 T S|V
+INVERSE|IH1 N V ER0 S|IH2 N V ER1 S|V
+INVITE|IH2 N V AY1 T|IH1 N V AY0 T|V
+INVITES|IH2 N V AY1 T S|IH1 N V AY0 T S|V
+JAGGED|JH AE1 G D|JH AE1 G IH0 D|V
+LEARNED|L ER1 N IH0 D|L ER1 N D|V
+LEGITIMATE|L AH0 JH IH1 T AH0 M EY2 T|L AH0 JH IH1 T AH0 M AH0 T|V
+MANDATE|M AE1 N D EY2 T|M AE2 N D EY1 T|V
+MISCONDUCT|M IH2 S K AA1 N D AH0 K T|M IH2 S K AA0 N D AH1 K T|V
+MISPRINT|M IH2 S P R IH1 N T|M IH1 S P R IH0 N T|V
+MISPRINTS|M IH2 S P R IH1 N T S|M IH1 S P R IH0 N T S|V
+MISUSE|M IH0 S Y UW1 S|M IH0 S Y UW1 Z|V
+MISUSES|M IH0 S Y UW1 Z IH0 Z|M IH0 S Y UW1 S IH0 Z|V
+MODERATE|M AA1 D ER0 EY2 T|M AA1 D ER0 AH0 T|V
+MODERATES|M AA1 D ER0 EY2 T S|M AA1 D ER0 AH0 T S|V
+MOUTH|M AW1 TH|M AW1 DH|V
+MOUTHS|M AW1 DH Z|M AW1 TH S|V
+OBJECT|AA1 B JH EH0 K T|AH0 B JH EH1 K T|V
+OBJECTS|AH0 B JH EH1 K T S|AA1 B JH EH0 K T S|V
+ORNAMENT|AO1 R N AH0 M EH0 N T|AO1 R N AH0 M AH0 N T|V
+ORNAMENTS|AO1 R N AH0 M EH0 N T S|AO1 R N AH0 M AH0 N T S|V
+OVERCHARGE|OW2 V ER0 CH AA1 R JH|OW1 V ER0 CH AA2 R JH|V
+OVERCHARGES|OW2 V ER0 CH AA1 R JH IH0 Z|OW1 V ER0 CH AA2 R JH IH0 Z|V
+OVERFLOW|OW2 V ER0 F L OW1|OW1 V ER0 F L OW2|V
+OVERFLOWS|OW2 V ER0 F L OW1 Z|OW1 V ER0 F L OW2 Z|V
+OVERHANG|OW2 V ER0 HH AE1 NG|OW1 V ER0 HH AE2 NG|V
+OVERHANGS|OW2 V ER0 HH AE1 NG Z|OW1 V ER0 HH AE2 NG Z|V
+OVERHAUL|OW2 V ER0 HH AO1 L|OW1 V ER0 HH AO2 L|V
+OVERHAULS|OW2 V ER0 HH AO1 L Z|OW1 V ER0 HH AO2 L Z|V
+OVERLAP|OW2 V ER0 L AE1 P|OW1 V ER0 L AE2 P|V
+OVERLAPS|OW2 V ER0 L AE1 P S|OW1 V ER0 L AE2 P S|V
+OVERLAY|OW2 V ER0 L EY1|OW1 V ER0 L EY2|V
+OVERLAYS|OW2 V ER0 L EY1 Z|OW1 V ER0 L EY2 Z|V
+OVERWORK|OW2 V ER0 W ER1 K|OW1 V ER0 W ER2 K|V
+PERFECT|P ER0 F EH1 K T|P ER1 F IH2 K T|V
+PERFUME|P ER0 F Y UW1 M|P ER1 F Y UW0 M|V
+PERFUMES|P ER0 F Y UW1 M Z|P ER1 F Y UW0 M Z|V
+PERMIT|P ER0 M IH1 T|P ER1 M IH2 T|V
+PERMITS|P ER0 M IH1 T S|P ER1 M IH2 T S|V
+PERVERT|P ER0 V ER1 T|P ER1 V ER0 T|V
+PERVERTS|P ER0 V ER1 T S|P ER1 V ER0 T S|V
+PONTIFICATE|P AA0 N T IH1 F AH0 K AH0 T|P AA0 N T IH1 F AH0 K EY2 T|V
+PONTIFICATES|P AA0 N T IH1 F AH0 K EY2 T S|P AA0 N T IH1 F AH0 K AH0 T S|V
+PRECIPITATE|P R IH0 S IH1 P IH0 T AH0 T|P R IH0 S IH1 P IH0 T EY2 T|V
+PREDICATE|P R EH1 D IH0 K AH0 T|P R EH1 D AH0 K EY2 T|V
+PREDICATES|P R EH1 D AH0 K EY2 T S|P R EH1 D IH0 K AH0 T S|V
+PREFIX|P R IY2 F IH1 K S|P R IY1 F IH0 K S|V
+PREFIXES|P R IY2 F IH1 K S IH0 JH|P R IY1 F IH0 K S IH0 JH|V
+PRESAGE|P R EH2 S IH1 JH|P R EH1 S IH0 JH|V
+PRESAGES|P R EH2 S IH1 JH IH0 JH|P R EH1 S IH0 JH IH0 JH|V
+PRESENT|P R IY0 Z EH1 N T|P R EH1 Z AH0 N T|V
+PRESENTS|P R IY0 Z EH1 N T S|P R EH1 Z AH0 N T S|V
+PROCEEDS|P R AH0 S IY1 D Z|P R OW1 S IY0 D Z|V
+PROCESS|P R AO2 S EH1 S|P R AA1 S EH2 S|V
+PROCESSES|P R AA1 S EH0 S AH0 Z|P R AO2 S EH1 S AH0 Z|V
+PROCESSING|P R AA0 S EH1 S IH0 NG|P R AA1 S EH0 S IH0 NG|V
+PRODUCE|P R AH0 D UW1 S|P R OW1 D UW0 S|V
+PROGRESS|P R AH0 G R EH1 S|P R AA1 G R EH2 S|V
+PROGRESSES|P R OW0 G R EH1 S AH0 Z|P R AA1 G R EH2 S AH0 Z|V
+PROJECT|P R AA0 JH EH1 K T|P R AA1 JH EH0 K T|V
+PROJECTS|P R AA0 JH EH1 K T S|P R AA1 JH EH0 K T S|V
+PROSPECT|P R AH2 S P EH1 K T|P R AA1 S P EH0 K T|V
+PROSPECTS|P R AH2 S P EH1 K T S|P R AA1 S P EH0 K T S|V
+PROSTRATE|P R AA0 S T R EY1 T|P R AA1 S T R EY0 T|V
+PROTEST|P R AH0 T EH1 S T|P R OW1 T EH2 S T|V
+PROTESTS|P R AH0 T EH1 S T S|P R OW1 T EH2 S T S|V
+PURPORT|P ER0 P AO1 R T|P ER1 P AO2 R T|V
+QUADRUPLE|K W AA1 D R UW0 P AH0 L|K W AA0 D R UW1 P AH0 L|V
+QUADRUPLES|K W AA0 D R UW1 P AH0 L Z|K W AA1 D R UW0 P AH0 L Z|V
+RAGGED|R AE1 G D|R AE1 G AH0 D|V
+RAMPAGE|R AE2 M P EY1 JH|R AE1 M P EY2 JH|V
+RAMPAGES|R AE2 M P EY1 JH IH0 Z|R AE1 M P EY2 JH IH0 Z|V
+READ|R IY1 D|R EH1 D|VBD
+REBEL|R EH1 B AH0 L|R IH0 B EH1 L|V
+REBELS|R IH0 B EH1 L Z|R EH1 B AH0 L Z|V
+REBOUND|R IY0 B AW1 N D|R IY1 B AW0 N D|V
+REBOUNDS|R IY0 B AW1 N D Z|R IY1 B AW0 N D Z|V
+RECALL|R IH0 K AO1 L|R IY1 K AO2 L|V
+RECALLS|R IH0 K AO1 L Z|R IY1 K AO2 L Z|V
+RECAP|R IH0 K AE1 P|R IY1 K AE2 P|V
+RECAPPED|R IH0 K AE1 P T|R IY1 K AE2 P T|V
+RECAPPING|R IH0 K AE1 P IH0 NG|R IY1 K AE2 P IH0 NG|V
+RECAPS|R IH0 K AE1 P S|R IY1 K AE2 P S|V
+RECOUNT|R IY2 K AW1 N T| R IH1 K AW0 N T|V
+RECOUNTS|R IY2 K AW1 N T S| R IH1 K AW0 N T S|V
+RECORD|R IH0 K AO1 R D|R EH1 K ER0 D|V
+RECORDS|R IH0 K AO1 R D Z|R EH1 K ER0 D Z|V
+REFILL|R IY0 F IH1 L|R IY1 F IH0 L|V
+REFILLS|R IY0 F IH1 L Z|R IY1 F IH0 L Z|V
+REFIT|R IY0 F IH1 T|R IY1 F IH0 T|V
+REFITS|R IY0 F IH1 T S|R IY1 F IH0 T S|V
+REFRESH|R IH0 F R EH1 SH|R IH1 F R EH0 SH|V
+REFUND|R IH0 F AH1 N D|R IY1 F AH2 N D|V
+REFUNDS|R IH0 F AH1 N D Z|R IY1 F AH2 N D Z|V
+REFUSE|R IH0 F Y UW1 Z|R EH1 F Y UW2 Z|V
+REGENERATE|R IY0 JH EH1 N ER0 EY2 T|R IY0 JH EH1 N ER0 AH0 T|V
+REHASH|R IY0 HH AE1 SH|R IY1 HH AE0 SH|V
+REHASHES|R IY0 HH AE1 SH IH0 Z|R IY1 HH AE0 SH IH0 Z|V
+REINCARNATE|R IY2 IH0 N K AA1 R N EY2 T|R IY2 IH0 N K AA1 R N AH0 T|V
+REJECT|R IH0 JH EH1 K T|R IY1 JH EH0 K T|V
+REJECTS|R IH0 JH EH1 K T S|R IY1 JH EH0 K T S|V
+RELAY|R IY2 L EY1|R IY1 L EY2|V
+RELAYING|R IY2 L EY1 IH0 NG|R IY1 L EY2 IH0 NG|V
+RELAYS|R IY2 L EY1 Z|R IY1 L EY2 Z|V
+REMAKE|R IY2 M EY1 K|R IY1 M EY0 K|V
+REMAKES|R IY2 M EY1 K S|R IY1 M EY0 K S|V
+REPLAY|R IY0 P L EY1|R IY1 P L EY0|V
+REPLAYS|R IY0 P L EY1 Z|R IY1 P L EY0 Z|V
+REPRINT|R IY0 P R IH1 N T|R IY1 P R IH0 N T|V
+REPRINTS|R IY0 P R IH1 N T S|R IY1 P R IH0 N T S|V
+RERUN|R IY2 R AH1 N|R IY1 R AH0 N|V
+RERUNS|R IY2 R AH1 N Z|R IY1 R AH0 N Z|V
+RESUME|R IY0 Z UW1 M|R EH1 Z AH0 M EY2|V
+RETAKE|R IY0 T EY1 K|R IY1 T EY0 K|V
+RETAKES|R IY0 T EY1 K S|R IY1 T EY0 K S|V
+RETHINK|R IY2 TH IH1 NG K|R IY1 TH IH0 NG K|V
+RETHINKS|R IY2 TH IH1 NG K S|R IY1 TH IH0 NG K S|V
+RETREAD|R IY2 T R EH1 D|R IY1 T R EH0 D|V
+RETREADS|R IY2 T R EH1 D Z|R IY1 T R EH0 D Z|V
+REWRITE|R IY0 R AY1 T|R IY1 R AY2 T|V
+REWRITES|R IY0 R AY1 T S|R IY1 R AY2 T S|V
+SEGMENT|S EH1 G M AH0 N T|S EH2 G M EH1 N T|V
+SEGMENTS|S EH2 G M EH1 N T S|S EH1 G M AH0 N T S|V
+SEPARATE|S EH1 P ER0 EY2 T|S EH1 P ER0 IH0 T|V
+SEPARATES|S EH1 P ER0 EY2 T S|S EH1 P ER0 IH0 T S|V
+SUBCONTRACT|S AH0 B K AA1 N T R AE2 K T|S AH2 B K AA0 N T R AE1 K T|V
+SUBCONTRACTS|S AH2 B K AA0 N T R AE1 K T S|S AH0 B K AA1 N T R AE2 K T S|V
+SUBJECT|S AH0 B JH EH1 K T|S AH1 B JH IH0 K T|V
+SUBJECTS|S AH0 B JH EH1 K T S|S AH1 B JH IH0 K T S|V
+SUBORDINATE|S AH0 B AO1 R D AH0 N EY2 T|S AH0 B AO1 R D AH0 N AH0 T|V
+SUBORDINATES|S AH0 B AO1 R D AH0 N EY2 T S|S AH0 B AO1 R D AH0 N AH0 T S|V
+SUPPLEMENT|S AH1 P L AH0 M EH0 N T|S AH1 P L AH0 M AH0 N T|V
+SUPPLEMENTS|S AH1 P L AH0 M EH0 N T S|S AH1 P L AH0 M AH0 N T S|V
+SURMISE|S ER0 M AY1 Z|S ER1 M AY0 Z|V
+SURMISES|S ER0 M AY1 Z IH0 Z|S ER1 M AY0 Z IH0 Z|V
+SURVEY|S ER0 V EY1|S ER1 V EY2|V
+SURVEYS|S ER0 V EY1 Z|S ER1 V EY2 Z|V
+SUSPECT|S AH0 S P EH1 K T|S AH1 S P EH2 K T|V
+SUSPECTS|S AH0 S P EH1 K T S|S AH1 S P EH2 K T S|V
+SYNDICATE|S IH1 N D AH0 K EY2 T|S IH1 N D IH0 K AH0 T|V
+SYNDICATES|S IH1 N D IH0 K EY2 T S|S IH1 N D IH0 K AH0 T S|V
+TORMENT|T AO1 R M EH2 N T|T AO0 R M EH1 N T|V
+TRANSFER|T R AE0 N S F ER1|T R AE1 N S F ER0|V
+TRANSFERS|T R AE0 N S F ER1 Z|T R AE1 N S F ER0 Z|V
+TRANSPLANT|T R AE0 N S P L AE1 N T|T R AE1 N S P L AE0 N T|V
+TRANSPLANTS|T R AE0 N S P L AE1 N T S|T R AE1 N S P L AE0 N T S|V
+TRANSPORT|T R AE0 N S P AO1 R T|T R AE1 N S P AO0 R T|V
+TRANSPORTS|T R AE0 N S P AO1 R T S|T R AE1 N S P AO0 R T S|V
+TRIPLICATE|T R IH1 P L IH0 K EY2 T|T R IH1 P L IH0 K AH0 T|V
+TRIPLICATES|T R IH1 P L IH0 K EY2 T S|T R IH1 P L IH0 K AH0 T S|V
+UNDERCUT|AH2 N D ER0 K AH1 T|AH1 N D ER0 K AH2 T|V
+UNDERESTIMATE|AH1 N D ER0 EH1 S T AH0 M EY2 T|AH1 N D ER0 EH1 S T AH0 M AH0 T|V
+UNDERESTIMATES|AH1 N D ER0 EH1 S T AH0 M EY2 T S|AH1 N D ER0 EH1 S T AH0 M AH0 T S|V
+UNDERLINE|AH2 N D ER0 L AY1 N|AH1 N D ER0 L AY2 N|V
+UNDERLINES|AH2 N D ER0 L AY1 N Z|AH1 N D ER0 L AY2 N Z|V
+UNDERTAKING|AH2 N D ER0 T EY1 K IH0 NG|AH1 N D ER0 T EY2 K IH0 NG|V
+UNDERTAKINGS|AH2 N D ER0 T EY1 K IH0 NG Z|AH1 N D ER0 T EY2 K IH0 NG Z|V
+UNUSED|AH0 N Y UW1 Z D|AH0 N Y UW1 S T|V
+UPGRADE|AH0 P G R EY1 D|AH1 P G R EY0 D|V
+UPGRADES|AH0 P G R EY1 D Z|AH1 P G R EY0 D Z|V
+UPLIFT|AH2 P L IH1 F T|AH1 P L IH0 F T|V
+UPSET|AH0 P S EH1 T|AH1 P S EH2 T|V
+UPSETS|AH0 P S EH1 T S|AH1 P S EH2 T S|V
+USE|Y UW1 Z|Y UW1 S|V
+USED|Y UW1 Z D|Y UW1 S T|VBN
+USES|Y UW1 Z IH0 Z|Y UW1 S IH0 Z|V
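
Note: each non-comment row above is HEADWORD|PRONUNCIATION1|PRONUNCIATION2|POS. A sketch of the parsing logic, the same as construct_homograph_dictionary() in g2p.py (the example entry is copied from the table):

entry = "READ|R IY1 D|R EH1 D|VBD"
headword, pron1, pron2, pos1 = entry.strip().split("|")
# pron1 is meant to apply when the word's POS tag starts with pos1,
# pron2 otherwise (see the commented-out homograph branch in g2p.py).
print(headword.lower(), pron1.split(), pron2.split(), pos1)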
dataset/google.py
ADDED
@@ -0,0 +1,188 @@
+import math, os, re, sys
+from pathlib import Path
+import numpy as np
+import pandas as pd
+from multiprocessing import Pool
+from scipy.io import wavfile
+import tensorflow as tf
+
+from tensorflow.keras.utils import Sequence, OrderedEnqueuer
+from tensorflow.keras import layers
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+
+sys.path.append(os.path.dirname(__file__))
+from g2p.g2p_en.g2p import G2p
+
+import warnings
+warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)
+np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)
+
+class GoogleCommandsDataloader(Sequence):
+    def __init__(self,
+                 batch_size,
+                 fs=16000,
+                 wav_dir='/home/DB/google_speech_commands',
+                 target_list=['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go'],
+                 features='g2p_embed',  # phoneme, g2p_embed, both ...
+                 shuffle=True,
+                 testset_only=False,
+                 pkl=None,
+                 ):
+
+        phonemes = ["<pad>", ] + ['AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
+                                  'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH',
+                                  'D', 'DH', 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1',
+                                  'EY2', 'F', 'G', 'HH', 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2',
+                                  'JH', 'K', 'L', 'M', 'N', 'NG', 'OW0', 'OW1', 'OW2', 'OY0',
+                                  'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH0', 'UH1',
+                                  'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH',
+                                  ' ']
+
+        self.p2idx = {p: idx for idx, p in enumerate(phonemes)}
+        self.idx2p = {idx: p for idx, p in enumerate(phonemes)}
+
+        self.batch_size = batch_size
+        self.fs = fs
+        self.wav_dir = wav_dir
+        self.target_list = [x.lower() for x in target_list]
+        self.testset_only = testset_only
+        self.features = features
+        self.shuffle = shuffle
+        self.pkl = pkl
+        self.nPhoneme = len(phonemes)
+        self.g2p = G2p()
+
+        self.__prep__()
+        self.on_epoch_end()
+
+    def __prep__(self):
+        self.data = pd.DataFrame(columns=['wav', 'text', 'duration', 'label'])
+
+        if (self.pkl is not None) and (os.path.isfile(self.pkl)):
+            print(">> Load dataset from {}".format(self.pkl))
+            self.data = pd.read_pickle(self.pkl)
+        else:
+            print(">> Make dataset from {}".format(self.wav_dir))
+            target_dict = {}
+            idx = 0
+            for target in self.target_list:
+                print(">> Extract from {}".format(target))
+                if self.testset_only:
+                    test_list = os.path.join(self.wav_dir, 'testing_list.txt')
+                    with open(test_list, "r") as f:
+                        wav_list = f.readlines()
+                        wav_list = [os.path.join(self.wav_dir, x.strip()) for x in wav_list]
+                        wav_list = [x for x in wav_list if target == x.split('/')[-2]]
+                else:
+                    wav_list = [str(x) for x in Path(os.path.join(self.wav_dir, target)).rglob('*.wav')]
+                for wav in wav_list:
+                    anchor_text = wav.split('/')[-2].lower()
+                    duration = float(wavfile.read(wav)[1].shape[-1])/self.fs
+                    for comparison_text in self.target_list:
+                        label = 1 if anchor_text == comparison_text else 0
+                        target_dict[idx] = {
+                            'wav': wav,
+                            'text': comparison_text,
+                            'duration': duration,
+                            'label': label
+                        }
+                        idx += 1
+            self.data = self.data.append(pd.DataFrame.from_dict(target_dict, 'index'), ignore_index=True)
+
+            # g2p & p2idx by g2p_en package
+            print(">> Convert word to phoneme")
+            self.data['phoneme'] = self.data['text'].apply(lambda x: self.g2p(re.sub(r"[^a-zA-Z0-9]+", ' ', x)))
+            print(">> Convert phoneme to index")
+            self.data['pIndex'] = self.data['phoneme'].apply(lambda x: [self.p2idx[t] for t in x])
+            print(">> Compute phoneme embedding")
+            self.data['g2p_embed'] = self.data['text'].apply(lambda x: self.g2p.embedding(x))
+
+            if (self.pkl is not None) and (not os.path.isfile(self.pkl)):
+                self.data.to_pickle(self.pkl)
+
+        # Get longest data
+        self.data = self.data.sort_values(by='duration').reset_index(drop=True)
+        self.wav_list = self.data['wav'].values
+        self.idx_list = self.data['pIndex'].values
+        self.emb_list = self.data['g2p_embed'].values
+        self.lab_list = self.data['label'].values
+
+        # Set dataloader params.
+        self.len = len(self.data)
+        self.maxlen_t = int((int(self.data['text'].apply(lambda x: len(x)).max() / 10) + 1) * 10)
+        self.maxlen_a = int((int(self.data['duration'].values[-1] / 0.5) + 1) * self.fs / 2)
+
+    def __len__(self):
+        # return total batch-wise length
+        return math.ceil(self.len / self.batch_size)
+
+    def _load_wav(self, wav):
+        return np.array(wavfile.read(wav)[1]).astype(np.float32) / 32768.0
+
+    def __getitem__(self, idx):
+        # chunking
+        indices = self.indices[idx * self.batch_size : (idx + 1) * self.batch_size]
+
+        # load inputs
+        batch_x = [np.array(wavfile.read(self.wav_list[i])[1]).astype(np.float32) / 32768.0 for i in indices]
+        if self.features == 'both':
+            batch_p = [np.array(self.idx_list[i]).astype(np.int32) for i in indices]
+            batch_e = [np.array(self.emb_list[i]).astype(np.float32) for i in indices]
+        else:
+            if self.features == 'phoneme':
+                batch_y = [np.array(self.idx_list[i]).astype(np.int32) for i in indices]
+            elif self.features == 'g2p_embed':
+                batch_y = [np.array(self.emb_list[i]).astype(np.float32) for i in indices]
+        # load outputs
+        batch_z = [np.array([self.lab_list[i]]).astype(np.float32) for i in indices]
+
+        # padding and masking
+        pad_batch_x = pad_sequences(np.array(batch_x), maxlen=self.maxlen_a, value=0.0, padding='post', dtype=batch_x[0].dtype)
+        if self.features == 'both':
+            pad_batch_p = pad_sequences(np.array(batch_p), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_p[0].dtype)
+            pad_batch_e = pad_sequences(np.array(batch_e), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_e[0].dtype)
+        else:
+            pad_batch_y = pad_sequences(np.array(batch_y), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_y[0].dtype)
+        pad_batch_z = pad_sequences(np.array(batch_z), value=0.0, padding='post', dtype=batch_z[0].dtype)
+
+        if self.features == 'both':
+            return pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z
+        else:
+            return pad_batch_x, pad_batch_y, pad_batch_z
+
+    def on_epoch_end(self):
+        self.indices = np.arange(self.len)
+        if self.shuffle == True:
+            np.random.shuffle(self.indices)
+
+def convert_sequence_to_dataset(dataloader):
+    def data_generator():
+        for i in range(dataloader.__len__()):
+            if dataloader.features == 'both':
+                pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z = dataloader[i]
+                yield pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z
+            else:
+                pad_batch_x, pad_batch_y, pad_batch_z = dataloader[i]
+                yield pad_batch_x, pad_batch_y, pad_batch_z
+
+    if dataloader.features == 'both':
+        data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
+            tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
+            tf.TensorSpec(shape=(None, dataloader.maxlen_t), dtype=tf.int32),
+            tf.TensorSpec(shape=(None, dataloader.maxlen_t, 256), dtype=tf.float32),
+            tf.TensorSpec(shape=(None, 1), dtype=tf.float32),)
+        )
+    else:
+        data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
+            tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
+            tf.TensorSpec(shape=(None, dataloader.maxlen_t) if dataloader.features == 'phoneme' else (None, dataloader.maxlen_t, 256),
+                          dtype=tf.int32 if dataloader.features == 'phoneme' else tf.float32),
+            tf.TensorSpec(shape=(None, 1), dtype=tf.float32),)
+        )
+    # data_dataset = data_dataset.cache()
+    data_dataset = data_dataset.prefetch(1)
+
+    return data_dataset
+
+if __name__ == '__main__':
+    dataloader = GoogleCommandsDataloader(2048, testset_only=True, pkl='/home/DB/google_speech_commands/google_testset.pkl', features='g2p_embed')
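
Note: a minimal sketch, not part of the commit, of feeding this Sequence into a tf.data pipeline via convert_sequence_to_dataset; the batch size and dataset path are assumptions mirroring the __main__ block above.

from dataset.google import GoogleCommandsDataloader, convert_sequence_to_dataset

loader = GoogleCommandsDataloader(
    batch_size=2048,
    wav_dir="/home/DB/google_speech_commands",  # assumed local copy of Google Speech Commands
    features="g2p_embed",
    testset_only=True,
    pkl=None,                                   # set a .pkl path to cache the prepared table
)
dataset = convert_sequence_to_dataset(loader)
for wavs, texts, labels in dataset.take(1):
    # wavs:   (batch, maxlen_a) zero-padded waveforms
    # texts:  (batch, maxlen_t, 256) padded g2p embeddings
    # labels: (batch, 1), 1.0 if the text matches the utterance, else 0.0
    print(wavs.shape, texts.shape, labels.shape)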
dataset/google_infe202405.py
ADDED
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+import math, os, re, sys
+from pathlib import Path
+import numpy as np
+import pandas as pd
+from multiprocessing import Pool
+from scipy.io import wavfile
+import tensorflow as tf
+
+from tensorflow.keras.utils import Sequence, OrderedEnqueuer
+from tensorflow.keras import layers
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+
+sys.path.append(os.path.dirname(__file__))
+from g2p.g2p_en.g2p import G2p
+
+import warnings
+warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)
+np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)
+
+class GoogleCommandsDataloader(Sequence):
+    def __init__(self,
+                 batch_size,
+                 fs=16000,
+                 wav_dir='/home/DB/kws_google/data2',
+                 target_list=['bed', 'three', 'bird', 'cat', 'dog', 'eight', 'five', 'four', 'happy', 'house', 'marvin', 'nine',
+                              'one', 'seven', 'sheila', 'six', 'tree', 'two', 'wow', 'zero'],
+                 features='g2p_embed',  # phoneme, g2p_embed, both ...
+                 shuffle=True,
+                 testset_only=False,
+                 pkl=None,
+                 ):
+
+        phonemes = ["<pad>", ] + ['AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
+                                  'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH',
+                                  'D', 'DH', 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1',
+                                  'EY2', 'F', 'G', 'HH', 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2',
+                                  'JH', 'K', 'L', 'M', 'N', 'NG', 'OW0', 'OW1', 'OW2', 'OY0',
+                                  'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH0', 'UH1',
+                                  'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH',
+                                  ' ']
+
+        self.p2idx = {p: idx for idx, p in enumerate(phonemes)}
+        self.idx2p = {idx: p for idx, p in enumerate(phonemes)}
+
+        self.batch_size = batch_size
+        self.fs = fs
+        self.wav_dir = wav_dir
+        self.target_list = [x.lower() for x in target_list]
+        self.testset_only = testset_only
+        self.features = features
+        self.shuffle = shuffle
+        self.pkl = pkl
+        self.nPhoneme = len(phonemes)
+        self.g2p = G2p()
+
+        self.__prep__()
+        self.on_epoch_end()
+
+    def __prep__(self):
+        self.data = pd.DataFrame(columns=['wav', 'text', 'duration', 'label'])
+
+        if (self.pkl is not None) and (os.path.isfile(self.pkl)):
+            print(">> Load dataset from {}".format(self.pkl))
+            self.data = pd.read_pickle(self.pkl)
+        else:
+            print(">> Make dataset from {}".format(self.wav_dir))
+            target_dict = {}
+            idx = 0
+            for target in self.target_list:
+                print(">> Extract from {}".format(target))
+                if self.testset_only:
+                    test_list = os.path.join(self.wav_dir, 'testing_list.txt')
+                    with open(test_list, "r") as f:
+                        wav_list = f.readlines()
+                    wav_list = [os.path.join(self.wav_dir, x.strip()) for x in wav_list]
+                    wav_list = [x for x in wav_list if target == x.split('/')[-2]]
+                else:
+                    wav_list = [str(x) for x in Path(os.path.join(self.wav_dir, target)).rglob('*.wav')]
+
+                for wav in wav_list:
+                    anchor_text = wav.split('/')[-2].lower()
+                    duration = float(wavfile.read(wav)[1].shape[-1]) / self.fs
+                    for comparison_text in self.target_list:
+                        label = 1 if anchor_text == comparison_text else 0
+                        target_dict[idx] = {
+                            'wav': wav,
+                            'text': comparison_text,
+                            'duration': duration,
+                            'label': label
+                        }
+                        idx += 1
+            self.data = self.data.append(pd.DataFrame.from_dict(target_dict, 'index'), ignore_index=True)
+
+            # g2p & p2idx by g2p_en package
+            print(">> Convert word to phoneme")
+            self.data['phoneme'] = self.data['text'].apply(lambda x: self.g2p(re.sub(r"[^a-zA-Z0-9]+", ' ', x)))
+            print(">> Convert phoneme to index")
+            self.data['pIndex'] = self.data['phoneme'].apply(lambda x: [self.p2idx[t] for t in x])
+            print(">> Compute phoneme embedding")
+            self.data['g2p_embed'] = self.data['text'].apply(lambda x: self.g2p.embedding(x))
+
+            if (self.pkl is not None) and (not os.path.isfile(self.pkl)):
+                self.data.to_pickle(self.pkl)
+
+        # Get longest data
+        self.wav_list = self.data['wav'].values
+        self.idx_list = self.data['pIndex'].values
+        self.emb_list = self.data['g2p_embed'].values
+        self.lab_list = self.data['label'].values
+        self.data = self.data.sort_values(by='duration').reset_index(drop=True)
+
+        # Set dataloader params.
+        self.len = len(self.data)
+        self.maxlen_t = int((int(self.data['text'].apply(lambda x: len(x)).max() / 10) + 1) * 10)
+        self.maxlen_a = int((int(self.data['duration'].values[-1] / 0.5) + 1) * self.fs / 2)
+
+    def __len__(self):
+        # return total batch-wise length
+        return math.ceil(self.len / self.batch_size)
+
+    def _load_wav(self, wav):
+        return np.array(wavfile.read(wav)[1]).astype(np.float32) / 32768.0
+
+    def __getitem__(self, idx):
+        # chunking
+        indices = self.indices[idx * self.batch_size : (idx + 1) * self.batch_size]
+
+        # load inputs
+        batch_x = [np.array(wavfile.read(self.wav_list[i])[1]).astype(np.float32) / 32768.0 for i in indices]
+        if self.features == 'both':
+            batch_p = [np.array(self.idx_list[i]).astype(np.int32) for i in indices]
+            batch_e = [np.array(self.emb_list[i]).astype(np.float32) for i in indices]
+        else:
+            if self.features == 'phoneme':
+                batch_y = [np.array(self.idx_list[i]).astype(np.int32) for i in indices]
+            elif self.features == 'g2p_embed':
+                batch_y = [np.array(self.emb_list[i]).astype(np.float32) for i in indices]
+        # load outputs
+        batch_z = [np.array([self.lab_list[i]]).astype(np.float32) for i in indices]
+
+        # padding and masking
+        pad_batch_x = pad_sequences(np.array(batch_x), maxlen=self.maxlen_a, value=0.0, padding='post', dtype=batch_x[0].dtype)
+        if self.features == 'both':
+            pad_batch_p = pad_sequences(np.array(batch_p), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_p[0].dtype)
+            pad_batch_e = pad_sequences(np.array(batch_e), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_e[0].dtype)
+        else:
+            pad_batch_y = pad_sequences(np.array(batch_y), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_y[0].dtype)
+        pad_batch_z = pad_sequences(np.array(batch_z), value=0.0, padding='post', dtype=batch_z[0].dtype)
+
+        if self.features == 'both':
+            return pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z
+        else:
+            return pad_batch_x, pad_batch_y, pad_batch_z
+
+    def on_epoch_end(self):
+        self.indices = np.arange(self.len)
+        # if self.shuffle == True:
+        #     np.random.shuffle(self.indices)
+
+def convert_sequence_to_dataset(dataloader):
+    def data_generator():
+        for i in range(dataloader.__len__()):
+            if dataloader.features == 'both':
+                pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z = dataloader[i]
+                yield pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z
+            else:
+                pad_batch_x, pad_batch_y, pad_batch_z = dataloader[i]
+                yield pad_batch_x, pad_batch_y, pad_batch_z
+
+    if dataloader.features == 'both':
+        data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
+            tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
+            tf.TensorSpec(shape=(None, dataloader.maxlen_t), dtype=tf.int32),
+            tf.TensorSpec(shape=(None, dataloader.maxlen_t, 256), dtype=tf.float32),
+            tf.TensorSpec(shape=(None, 1), dtype=tf.float32),)
+        )
+    else:
+        data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
+            tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
+            tf.TensorSpec(shape=(None, dataloader.maxlen_t) if dataloader.features == 'phoneme' else (None, dataloader.maxlen_t, 256),
+                          dtype=tf.int32 if dataloader.features == 'phoneme' else tf.float32),
+            tf.TensorSpec(shape=(None, 1), dtype=tf.float32),)
+        )
+    # data_dataset = data_dataset.cache()
+    # data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=output_signature)
+    data_dataset = data_dataset.prefetch(1)
+
+    return data_dataset
+
+if __name__ == '__main__':
+    dataloader = GoogleCommandsDataloader(2048, testset_only=True, pkl='/home/DB/google_speech_commands/google_testset.pkl', features='g2p_embed')
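
A minimal usage sketch for the loader above (batch size, pkl=None, and the default wav_dir are illustrative assumptions, not values fixed by this commit):

# Hypothetical usage; assumes the Google Speech Commands tree at the default wav_dir.
from dataset.google_infe202405 import GoogleCommandsDataloader, convert_sequence_to_dataset

loader = GoogleCommandsDataloader(batch_size=2048, testset_only=True, pkl=None, features='g2p_embed')
dataset = convert_sequence_to_dataset(loader)          # tf.data.Dataset with prefetch(1)
for wavs, g2p_embeds, labels in dataset.take(1):
    print(wavs.shape, g2p_embeds.shape, labels.shape)  # (B, maxlen_a), (B, maxlen_t, 256), (B, 1)

Note that maxlen_t rounds the longest keyword up to a multiple of 10 characters and maxlen_a rounds the longest clip up to a multiple of half a second (0.5 * fs samples), so every batch is padded to one fixed shape.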
dataset/libriphrase.py
ADDED
@@ -0,0 +1,331 @@
+import math, os, re, sys
+from pathlib import Path
+import numpy as np
+import pandas as pd
+import Levenshtein
+from multiprocessing import Pool
+from scipy.io import wavfile
+import tensorflow as tf
+
+from tensorflow.keras.utils import Sequence, OrderedEnqueuer
+from tensorflow.keras import layers
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+
+sys.path.append(os.path.dirname(__file__))
+from g2p.g2p_en.g2p import G2p
+
+import warnings
+warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)
+np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)
+
+class LibriPhraseDataloader(Sequence):
+    def __init__(self,
+                 batch_size,
+                 fs=16000,
+                 wav_dir='/home/DB/LibriPhrase/wav_dir',
+                 noise_dir='/home/DB/noise',
+                 csv_dir='/home/DB/LibriPhrase/data',
+                 train_csv=['train_100h', 'train_360h'],
+                 test_csv=['train_500h', ],
+                 types='both',           # easy, hard
+                 features='g2p_embed',   # phoneme, g2p_embed, both ...
+                 train=True,
+                 shuffle=True,
+                 pkl=None,
+                 edit_dist=False,
+                 ):
+
+        phonemes = ["<pad>", ] + ['AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
+                                  'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH',
+                                  'D', 'DH', 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1',
+                                  'EY2', 'F', 'G', 'HH', 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2',
+                                  'JH', 'K', 'L', 'M', 'N', 'NG', 'OW0', 'OW1', 'OW2', 'OY0',
+                                  'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH0', 'UH1',
+                                  'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH',
+                                  ' ']
+
+        self.p2idx = {p: idx for idx, p in enumerate(phonemes)}
+        self.idx2p = {idx: p for idx, p in enumerate(phonemes)}
+
+        self.batch_size = batch_size
+        self.fs = fs
+        self.wav_dir = wav_dir
+        self.csv_dir = csv_dir
+        self.noise_dir = noise_dir
+        self.train_csv = train_csv
+        self.test_csv = test_csv
+        self.types = types
+        self.features = features
+        self.train = train
+        self.shuffle = shuffle
+        self.pkl = pkl
+        self.edit_dist = edit_dist
+        self.nPhoneme = len(phonemes)
+        self.g2p = G2p()
+
+        self.__prep__()
+        self.on_epoch_end()
+
+    def __prep__(self):
+        if self.train:
+            print(">> Preparing noise DB")
+            noise_list = [str(x) for x in Path(self.noise_dir).rglob('*.wav')]
+            self.noise = np.array([])
+            for noise in noise_list:
+                fs, data = wavfile.read(noise)
+                assert fs == self.fs, ">> Error : Un-match sampling freq.\n{} -> {}".format(noise, fs)
+                data = data.astype(np.float32) / 32768.0
+                data = (data / np.max(data)) * 0.5
+                self.noise = np.append(self.noise, data)
+
+        self.data = pd.DataFrame(columns=['wav_label', 'wav', 'text', 'duration', 'label', 'type'])
+
+        if (self.pkl is not None) and (os.path.isfile(self.pkl)):
+            print(">> Load dataset from {}".format(self.pkl))
+            self.data = pd.read_pickle(self.pkl)
+        else:
+            for db in self.train_csv if self.train else self.test_csv:
+                csv_list = [str(x) for x in Path(self.csv_dir).rglob('*' + db + '*word*')]
+                for n_word in csv_list:
+                    print(">> processing : {} ".format(n_word))
+                    df = pd.read_csv(n_word)
+                    # Split train dataset to match & unmatch case
+                    anc_pos = df[['anchor_text', 'anchor', 'anchor_text', 'anchor_dur']]
+                    anc_neg = df[['anchor_text', 'anchor', 'comparison_text', 'anchor_dur', 'target', 'type']]
+                    com_pos = df[['comparison_text', 'comparison', 'comparison_text', 'comparison_dur']]
+                    com_neg = df[['comparison_text', 'comparison', 'anchor_text', 'comparison_dur', 'target', 'type']]
+                    anc_pos.columns = ['wav_label', 'anchor', 'anchor_text', 'anchor_dur']
+                    com_pos.columns = ['wav_label', 'comparison', 'comparison_text', 'comparison_dur']
+                    anc_pos['label'] = 1
+                    anc_pos['type'] = df['type']
+                    com_pos['label'] = 1
+                    com_pos['type'] = df['type']
+                    # Concat
+                    self.data = self.data.append(anc_pos.rename(columns={y: x for x, y in zip(self.data.columns, anc_pos.columns)}), ignore_index=True)
+                    self.data = self.data.append(anc_neg.rename(columns={y: x for x, y in zip(self.data.columns, anc_neg.columns)}), ignore_index=True)
+                    self.data = self.data.append(com_pos.rename(columns={y: x for x, y in zip(self.data.columns, com_pos.columns)}), ignore_index=True)
+                    self.data = self.data.append(com_neg.rename(columns={y: x for x, y in zip(self.data.columns, com_neg.columns)}), ignore_index=True)
+
+            # Append wav directory path
+            self.data['wav'] = self.data['wav'].apply(lambda x: os.path.join(self.wav_dir, x))
+            # g2p & p2idx by g2p_en package
+            print(">> Convert word to phoneme")
+            self.data['phoneme'] = self.data['text'].apply(lambda x: self.g2p(re.sub(r"[^a-zA-Z0-9]+", ' ', x)))
+            print(">> Convert speech word to phoneme")
+            self.data['wav_phoneme'] = self.data['wav_label'].apply(lambda x: self.g2p(re.sub(r"[^a-zA-Z0-9]+", ' ', x)))
+            print(">> Convert phoneme to index")
+            self.data['pIndex'] = self.data['phoneme'].apply(lambda x: [self.p2idx[t] for t in x])
+            print(">> Convert speech phoneme to index")
+            self.data['wav_pIndex'] = self.data['wav_phoneme'].apply(lambda x: [self.p2idx[t] for t in x])
+            print(">> Compute phoneme embedding")
+            self.data['g2p_embed'] = self.data['text'].apply(lambda x: self.g2p.embedding(x))
+            print(">> Calculate Edit distance ratio")
+            self.data['dist'] = self.data.apply(lambda x: Levenshtein.ratio(re.sub(r"[^a-zA-Z0-9]+", ' ', x['wav_label']), re.sub(r"[^a-zA-Z0-9]+", ' ', x['text'])), axis=1)
+
+            if (self.pkl is not None) and (not os.path.isfile(self.pkl)):
+                self.data.to_pickle(self.pkl)
+
+        # Masking dataset type
+        if self.types == 'both':
+            pass
+        elif self.types == 'easy':
+            self.data = self.data.loc[self.data['type'] == 'diffspk_easyneg']
+        elif self.types == 'hard':
+            self.data = self.data.loc[self.data['type'] == 'diffspk_hardneg']
+
+        # Get longest data
+        self.data = self.data.sort_values(by='duration').reset_index(drop=True)
+        self.wav_list = self.data['wav'].values
+        self.idx_list = self.data['pIndex'].values
+        self.sIdx_list = self.data['wav_pIndex'].values
+        self.emb_list = self.data['g2p_embed'].values
+        self.lab_list = self.data['label'].values
+        if self.edit_dist:
+            self.dist_list = self.data['dist'].values
+
+        # Set dataloader params.
+        self.len = len(self.data)
+        self.maxlen_t = int((int(self.data['text'].apply(lambda x: len(x)).max() / 10) + 1) * 10)
+        self.maxlen_a = int((int(self.data['duration'].values[-1] / 0.5) + 1) * self.fs / 2)
+        self.maxlen_l = int((int(self.data['wav_label'].apply(lambda x: len(x)).max() / 10) + 1) * 10)
+
+    def __len__(self):
+        # return total batch-wise length
+        return math.ceil(self.len / self.batch_size)
+
+    def _load_wav(self, wav):
+        return np.array(wavfile.read(wav)[1]).astype(np.float32) / 32768.0
+
+    def _mixing_snr(self, clean, snr=[5, 15]):
+        def _cal_adjusted_rms(clean_rms, snr):
+            a = float(snr) / 20
+            noise_rms = clean_rms / (10 ** a)
+            return noise_rms
+
+        def _cal_rms(amp):
+            return np.sqrt(np.mean(np.square(amp), axis=-1))
+
+        start = np.random.randint(0, len(self.noise) - len(clean))
+        divided_noise = self.noise[start: start + len(clean)]
+        clean_rms = _cal_rms(clean)
+        noise_rms = _cal_rms(divided_noise)
+        adj_noise_rms = _cal_adjusted_rms(clean_rms, np.random.randint(snr[0], snr[1]))
+
+        adj_noise_amp = divided_noise * (adj_noise_rms / (noise_rms + 1e-7))
+        noisy = clean + adj_noise_amp
+
+        if np.max(noisy) > 1:
+            noisy = noisy / np.max(noisy)
+
+        return noisy
+
+    def __getitem__(self, idx):
+        # chunking
+        indices = self.indices[idx * self.batch_size : (idx + 1) * self.batch_size]
+
+        # load inputs
+        batch_x = [np.array(wavfile.read(self.wav_list[i])[1]).astype(np.float32) / 32768.0 for i in indices]
+        if self.features == 'both':
+            batch_p = [np.array(self.idx_list[i]).astype(np.int32) for i in indices]
+            batch_e = [np.array(self.emb_list[i]).astype(np.float32) for i in indices]
+        else:
+            if self.features == 'phoneme':
+                batch_y = [np.array(self.idx_list[i]).astype(np.int32) for i in indices]
+            elif self.features == 'g2p_embed':
+                batch_y = [np.array(self.emb_list[i]).astype(np.float32) for i in indices]
+        # load outputs
+        batch_z = [np.array([self.lab_list[i]]).astype(np.float32) for i in indices]
+        batch_l = [np.array(self.sIdx_list[i]).astype(np.int32) for i in indices]
+        batch_t = [np.array(self.idx_list[i]).astype(np.int32) for i in indices]
+        if self.edit_dist:
+            batch_d = [np.array([self.dist_list[i]]).astype(np.float32) for i in indices]
+
+        # padding and masking
+        pad_batch_x = pad_sequences(np.array(batch_x), maxlen=self.maxlen_a, value=0.0, padding='post', dtype=batch_x[0].dtype)
+        if self.features == 'both':
+            pad_batch_p = pad_sequences(np.array(batch_p), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_p[0].dtype)
+            pad_batch_e = pad_sequences(np.array(batch_e), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_e[0].dtype)
+        else:
+            pad_batch_y = pad_sequences(np.array(batch_y), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_y[0].dtype)
+        pad_batch_z = pad_sequences(np.array(batch_z), value=0.0, padding='post', dtype=batch_z[0].dtype)
+        pad_batch_l = pad_sequences(np.array(batch_l), maxlen=self.maxlen_l, value=0.0, padding='post', dtype=batch_l[0].dtype)
+        pad_batch_t = pad_sequences(np.array(batch_t), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_t[0].dtype)
+        if self.edit_dist:
+            pad_batch_d = pad_sequences(np.array(batch_d), value=0.0, padding='post', dtype=batch_d[0].dtype)
+
+        # Noisy option
+        if self.train:
+            batch_x_noisy = [self._mixing_snr(x) for x in batch_x]
+            pad_batch_x_noisy = pad_sequences(np.array(batch_x_noisy), maxlen=self.maxlen_a, value=0.0, padding='post', dtype=batch_x_noisy[0].dtype)
+
+        if self.train:
+            if self.features == 'both':
+                return pad_batch_x, pad_batch_x_noisy, pad_batch_p, pad_batch_e, pad_batch_z, pad_batch_l, pad_batch_t
+            else:
+                return pad_batch_x, pad_batch_x_noisy, pad_batch_y, pad_batch_z, pad_batch_l, pad_batch_t
+        else:
+            if self.features == 'both':
+                if self.edit_dist:
+                    return pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z, pad_batch_d
+                else:
+                    return pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z
+            else:
+                if self.edit_dist:
+                    return pad_batch_x, pad_batch_y, pad_batch_z, pad_batch_d
+                else:
+                    return pad_batch_x, pad_batch_y, pad_batch_z
+
+    def on_epoch_end(self):
+        self.indices = np.arange(self.len)
+        if self.shuffle == True:
+            np.random.shuffle(self.indices)
+
+def convert_sequence_to_dataset(dataloader):
+    def data_generator():
+        for i in range(dataloader.__len__()):
+            if dataloader.train:
+                if dataloader.features == 'both':
+                    pad_batch_x, pad_batch_x_noisy, pad_batch_p, pad_batch_e, pad_batch_z, pad_batch_l, pad_batch_t = dataloader[i]
+                    yield pad_batch_x, pad_batch_x_noisy, pad_batch_p, pad_batch_e, pad_batch_z, pad_batch_l, pad_batch_t
+                else:
+                    pad_batch_x, pad_batch_x_noisy, pad_batch_y, pad_batch_z, pad_batch_l, pad_batch_t = dataloader[i]
+                    yield pad_batch_x, pad_batch_x_noisy, pad_batch_y, pad_batch_z, pad_batch_l, pad_batch_t
+            else:
+                if dataloader.features == 'both':
+                    if dataloader.edit_dist:
+                        pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z, pad_batch_d = dataloader[i]
+                        yield pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z, pad_batch_d
+                    else:
+                        pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z = dataloader[i]
+                        yield pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z
+                else:
+                    if dataloader.edit_dist:
+                        pad_batch_x, pad_batch_y, pad_batch_z, pad_batch_d = dataloader[i]
+                        yield pad_batch_x, pad_batch_y, pad_batch_z, pad_batch_d
+                    else:
+                        pad_batch_x, pad_batch_y, pad_batch_z = dataloader[i]
+                        yield pad_batch_x, pad_batch_y, pad_batch_z
+
+    if dataloader.train:
+        if dataloader.features == 'both':
+            data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
+                tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
+                tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
+                tf.TensorSpec(shape=(None, dataloader.maxlen_t), dtype=tf.int32),
+                tf.TensorSpec(shape=(None, dataloader.maxlen_t, 256), dtype=tf.float32),
+                tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
+                tf.TensorSpec(shape=(None, dataloader.maxlen_l), dtype=tf.int32),
+                tf.TensorSpec(shape=(None, dataloader.maxlen_t), dtype=tf.int32),)
+            )
+        else:
+            data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
+                tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
+                tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
+                tf.TensorSpec(shape=(None, dataloader.maxlen_t) if dataloader.features == 'phoneme' else (None, dataloader.maxlen_t, 256),
+                              dtype=tf.int32 if dataloader.features == 'phoneme' else tf.float32),
+                tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
+                tf.TensorSpec(shape=(None, dataloader.maxlen_l), dtype=tf.int32),
+                tf.TensorSpec(shape=(None, dataloader.maxlen_t), dtype=tf.int32),)
+            )
+    else:
+        if dataloader.features == 'both':
+            if dataloader.edit_dist:
+                data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
+                    tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
+                    tf.TensorSpec(shape=(None, dataloader.maxlen_t), dtype=tf.int32),
+                    tf.TensorSpec(shape=(None, dataloader.maxlen_t, 256), dtype=tf.float32),
+                    tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
+                    tf.TensorSpec(shape=(None, 1), dtype=tf.float32),)
+                )
+            else:
+                data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
+                    tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
+                    tf.TensorSpec(shape=(None, dataloader.maxlen_t), dtype=tf.int32),
+                    tf.TensorSpec(shape=(None, dataloader.maxlen_t, 256), dtype=tf.float32),
+                    tf.TensorSpec(shape=(None, 1), dtype=tf.float32),)
+                )
+        else:
+            if dataloader.edit_dist:
+                data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
+                    tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
+                    tf.TensorSpec(shape=(None, dataloader.maxlen_t) if dataloader.features == 'phoneme' else (None, dataloader.maxlen_t, 256),
+                                  dtype=tf.int32 if dataloader.features == 'phoneme' else tf.float32),
+                    tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
+                    tf.TensorSpec(shape=(None, 1), dtype=tf.float32),)
+                )
+            else:
+                data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
+                    tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
+                    tf.TensorSpec(shape=(None, dataloader.maxlen_t) if dataloader.features == 'phoneme' else (None, dataloader.maxlen_t, 256),
+                                  dtype=tf.int32 if dataloader.features == 'phoneme' else tf.float32),
+                    tf.TensorSpec(shape=(None, 1), dtype=tf.float32),)
+                )
+    # data_dataset = data_dataset.cache()
+    data_dataset = data_dataset.prefetch(1)
+
+    return data_dataset
+
+if __name__ == '__main__':
+    GLOBAL_BATCH_SIZE = 2048
+    train_dataset = LibriPhraseDataloader(batch_size=GLOBAL_BATCH_SIZE, train=True, types='both', shuffle=True, pkl='/home/DB/LibriPhrase/data/train_both.pkl', features='g2p_embed')
+    test_dataset = LibriPhraseDataloader(batch_size=GLOBAL_BATCH_SIZE, train=False, edit_dist=True, types='both', shuffle=False, pkl='/home/DB/LibriPhrase/data/test_both.pkl', features='g2p_embed')
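
_mixing_snr above scales the sampled noise segment so that 20*log10(clean_rms / noise_rms) lands on the sampled SNR. A standalone check of that rule, independent of the loader (all values are illustrative):

import numpy as np

snr_db = 10.0                                         # the loader samples an integer from [5, 15)
clean = np.random.randn(16000).astype(np.float32) * 0.1
noise = np.random.randn(16000).astype(np.float32)

clean_rms = np.sqrt(np.mean(clean ** 2))
noise_rms = np.sqrt(np.mean(noise ** 2))
adj_noise_rms = clean_rms / (10 ** (snr_db / 20))     # same rule as _cal_adjusted_rms
scaled = noise * (adj_noise_rms / (noise_rms + 1e-7))

achieved = 20 * np.log10(clean_rms / np.sqrt(np.mean(scaled ** 2)))
print(round(float(achieved), 2))                      # ~10.0 dB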
dataset/libriphrase_ctc1.py
ADDED
@@ -0,0 +1,346 @@
+import math, os, re, sys
+from pathlib import Path
+import numpy as np
+import pandas as pd
+import Levenshtein
+from multiprocessing import Pool
+from scipy.io import wavfile
+import tensorflow as tf
+
+from tensorflow.keras.utils import Sequence, OrderedEnqueuer
+from tensorflow.keras import layers
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+
+sys.path.append(os.path.dirname(__file__))
+from g2p.g2p_en.g2p import G2p
+
+import warnings
+warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)
+np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)
+
+class LibriPhraseDataloader(Sequence):
+    def __init__(self,
+                 batch_size,
+                 fs=16000,
+                 wav_dir='/share/nas165/yiting/LibriPhrase/LibriPhrase_data',
+                 noise_dir='/share/nas165/yiting/EEND/corpora/JHU/musan/musan/noise/sound-bible',
+                 csv_dir='/share/nas165/yiting/LibriPhrase/data',
+                 train_csv=['train_100h', 'train_360h'],
+                 test_csv=['train_500h', ],
+                 types='both',           # easy, hard
+                 features='g2p_embed',   # phoneme, g2p_embed, both ...
+                 train=True,
+                 shuffle=True,
+                 pkl=None,
+                 edit_dist=False,
+                 ):
+
+        phonemes = ["<pad>", ] + ['AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
+                                  'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH',
+                                  'D', 'DH', 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1',
+                                  'EY2', 'F', 'G', 'HH', 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2',
+                                  'JH', 'K', 'L', 'M', 'N', 'NG', 'OW0', 'OW1', 'OW2', 'OY0',
+                                  'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH0', 'UH1',
+                                  'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH',
+                                  ' ']
+
+        self.p2idx = {p: idx for idx, p in enumerate(phonemes)}
+        self.idx2p = {idx: p for idx, p in enumerate(phonemes)}
+
+        self.batch_size = batch_size
+        self.fs = fs
+        self.wav_dir = wav_dir
+        self.csv_dir = csv_dir
+        self.noise_dir = noise_dir
+        self.train_csv = train_csv
+        self.test_csv = test_csv
+        self.types = types
+        self.features = features
+        self.train = train
+        self.shuffle = shuffle
+        self.pkl = pkl
+        self.edit_dist = edit_dist
+        self.nPhoneme = len(phonemes)
+        self.g2p = G2p()
+
+        self.__prep__()
+        self.on_epoch_end()
+
+    def __prep__(self):
+        if self.train:
+            print(">> Preparing noise DB")
+            noise_list = [str(x) for x in Path(self.noise_dir).rglob('*.wav')]
+            self.noise = np.array([])
+            for noise in noise_list:
+                fs, data = wavfile.read(noise)
+                assert fs == self.fs, ">> Error : Un-match sampling freq.\n{} -> {}".format(noise, fs)
+                data = data.astype(np.float32) / 32768.0
+                data = (data / np.max(data)) * 0.5
+                self.noise = np.append(self.noise, data)
+
+        self.data = pd.DataFrame(columns=['wav_label', 'wav', 'text', 'duration', 'label', 'type'])
+        def process_text(self, x):
+            if isinstance(x, str):
+                # Only apply re.sub if x is a string
+                return re.sub(r"[^a-zA-Z0-9]+", ' ', x)
+            else:
+                # Handle other cases, e.g., return x as is or convert to string
+                return str(x)
+        if (self.pkl is not None) and (os.path.isfile(self.pkl)):
+            print(">> Load dataset from {}".format(self.pkl))
+            self.data = pd.read_pickle(self.pkl)
+        else:
+            for db in self.train_csv if self.train else self.test_csv:
+                csv_list = [str(x) for x in Path(self.csv_dir).rglob('*' + db + '*word*')]
+                for n_word in csv_list:
+                    print(">> processing : {} ".format(n_word))
+                    df = pd.read_csv(n_word)
+                    # Split train dataset to match & unmatch case
+                    anc_pos = df[['anchor_text', 'anchor', 'anchor_text', 'anchor_dur']]
+                    anc_neg = df[['anchor_text', 'anchor', 'comparison_text', 'anchor_dur', 'target', 'type']]
+                    com_pos = df[['comparison_text', 'comparison', 'comparison_text', 'comparison_dur']]
+                    com_neg = df[['comparison_text', 'comparison', 'anchor_text', 'comparison_dur', 'target', 'type']]
+                    anc_pos.columns = ['wav_label', 'anchor', 'anchor_text', 'anchor_dur']
+                    com_pos.columns = ['wav_label', 'comparison', 'comparison_text', 'comparison_dur']
+                    anc_pos['label'] = 1
+                    anc_pos['type'] = df['type']
+                    com_pos['label'] = 1
+                    com_pos['type'] = df['type']
+                    # Concat
+                    self.data = self.data.append(anc_pos.rename(columns={y: x for x, y in zip(self.data.columns, anc_pos.columns)}), ignore_index=True)
+                    self.data = self.data.append(anc_neg.rename(columns={y: x for x, y in zip(self.data.columns, anc_neg.columns)}), ignore_index=True)
+                    self.data = self.data.append(com_pos.rename(columns={y: x for x, y in zip(self.data.columns, com_pos.columns)}), ignore_index=True)
+                    self.data = self.data.append(com_neg.rename(columns={y: x for x, y in zip(self.data.columns, com_neg.columns)}), ignore_index=True)
+
+            # Append wav directory path
+            self.data['wav'] = self.data['wav'].apply(lambda x: os.path.join(self.wav_dir, x))
+            # g2p & p2idx by g2p_en package
+            print(">> Convert word to phoneme")
+            self.data['phoneme'] = self.data['text'].apply(lambda x: self.g2p(re.sub(r"[^a-zA-Z0-9]+", ' ', x)))
+            print(">> Convert speech word to phoneme")
+            self.data['wav_phoneme'] = self.data['wav_label'].apply(lambda x: self.g2p(re.sub(r"[^a-zA-Z0-9]+", ' ', x)))
+            print(">> Convert phoneme to index")
+            self.data['pIndex'] = self.data['phoneme'].apply(lambda x: [self.p2idx[t] for t in x])
+            print(">> Convert speech phoneme to index")
+            self.data['wav_pIndex'] = self.data['wav_phoneme'].apply(lambda x: [self.p2idx[t] for t in x])
+            print(">> Compute phoneme embedding")
+            self.data['g2p_embed'] = self.data['text'].apply(lambda x: self.g2p.embedding(x))
+
+            print('wav_label', self.data['wav_label'])
+            print('text', self.data['text'])
+
+            self.data['dist'] = self.data.apply(lambda x: Levenshtein.ratio(re.sub(r"[^a-zA-Z0-9]+", ' ', x['wav_label']), re.sub(r"[^a-zA-Z0-9]+", ' ', x['text'])), axis=1)
+
+            # Note: previously commented-out section
+            if (self.pkl is not None) and (not os.path.isfile(self.pkl)):
+                self.data.to_pickle(self.pkl)
+
+        # Masking dataset type
+        if self.types == 'both':
+            pass
+        elif self.types == 'easy':
+            self.data = self.data.loc[self.data['type'] == 'diffspk_easyneg']
+        elif self.types == 'hard':
+            self.data = self.data.loc[self.data['type'] == 'diffspk_hardneg']
+
+        # Get longest data
+        self.data = self.data.sort_values(by='duration').reset_index(drop=True)
+        self.wav_list = self.data['wav'].values
+        self.idx_list = self.data['pIndex'].values
+        self.sIdx_list = self.data['wav_pIndex'].values
+        self.idx_list = [np.insert(lst, 0, 0) for lst in self.idx_list]
+        self.sIdx_list = [np.insert(lst, 0, 0) for lst in self.sIdx_list]
+        self.emb_list = self.data['g2p_embed'].values
+        self.lab_list = self.data['label'].values
+        if self.edit_dist:
+            self.dist_list = self.data['dist'].values
+
+        # Set dataloader params.
+        self.len = len(self.data)
+        self.maxlen_t = int((int(self.data['text'].apply(lambda x: len(x)).max() / 10) + 1) * 10)
+        self.maxlen_a = int((int(self.data['duration'].values[-1] / 0.5) + 1) * self.fs / 2)
+        self.maxlen_l = int((int(self.data['wav_label'].apply(lambda x: len(x)).max() / 10) + 1) * 10)
+
+    def __len__(self):
+        # return total batch-wise length
+        return math.ceil(self.len / self.batch_size)
+
+    def _load_wav(self, wav):
+        return np.array(wavfile.read(wav)[1]).astype(np.float32) / 32768.0
+
+    def _mixing_snr(self, clean, snr=[5, 15]):
+        def _cal_adjusted_rms(clean_rms, snr):
+            a = float(snr) / 20
+            noise_rms = clean_rms / (10 ** a)
+            return noise_rms
+
+        def _cal_rms(amp):
+            return np.sqrt(np.mean(np.square(amp), axis=-1))
+
+        start = np.random.randint(0, len(self.noise) - len(clean))
+        divided_noise = self.noise[start: start + len(clean)]
+        clean_rms = _cal_rms(clean)
+        noise_rms = _cal_rms(divided_noise)
+        adj_noise_rms = _cal_adjusted_rms(clean_rms, np.random.randint(snr[0], snr[1]))
+
+        adj_noise_amp = divided_noise * (adj_noise_rms / (noise_rms + 1e-7))
+        noisy = clean + adj_noise_amp
+
+        if np.max(noisy) > 1:
+            noisy = noisy / np.max(noisy)
+
+        return noisy
+
+    def __getitem__(self, idx):
+        # chunking
+        indices = self.indices[idx * self.batch_size : (idx + 1) * self.batch_size]
+
+        # load inputs
+        batch_x = [np.array(wavfile.read(self.wav_list[i])[1]).astype(np.float32) / 32768.0 for i in indices]
+        if self.features == 'both':
+            batch_p = [np.array(self.idx_list[i]).astype(np.int32) for i in indices]
+            batch_e = [np.array(self.emb_list[i]).astype(np.float32) for i in indices]
+        else:
+            if self.features == 'phoneme':
+                batch_y = [np.array(self.idx_list[i]).astype(np.int32) for i in indices]
+            elif self.features == 'g2p_embed':
+                batch_y = [np.array(self.emb_list[i]).astype(np.float32) for i in indices]
+        # load outputs
+        batch_z = [np.array([self.lab_list[i]]).astype(np.float32) for i in indices]
+        batch_l = [np.array(self.sIdx_list[i]).astype(np.int32) for i in indices]
+        batch_t = [np.array(self.idx_list[i]).astype(np.int32) for i in indices]
+        if self.edit_dist:
+            batch_d = [np.array([self.dist_list[i]]).astype(np.float32) for i in indices]
+
+        # padding and masking
+        pad_batch_x = pad_sequences(np.array(batch_x), maxlen=self.maxlen_a, value=0.0, padding='post', dtype=batch_x[0].dtype)
+        if self.features == 'both':
+            pad_batch_p = pad_sequences(np.array(batch_p), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_p[0].dtype)
+            pad_batch_e = pad_sequences(np.array(batch_e), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_e[0].dtype)
+        else:
+            pad_batch_y = pad_sequences(np.array(batch_y), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_y[0].dtype)
+        pad_batch_z = pad_sequences(np.array(batch_z), value=0.0, padding='post', dtype=batch_z[0].dtype)
+        pad_batch_l = pad_sequences(np.array(batch_l), maxlen=self.maxlen_l, value=0.0, padding='post', dtype=batch_l[0].dtype)
+        pad_batch_t = pad_sequences(np.array(batch_t), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_t[0].dtype)
+
+        if self.edit_dist:
+            pad_batch_d = pad_sequences(np.array(batch_d), value=0.0, padding='post', dtype=batch_d[0].dtype)
+
+        # Noisy option
+        if self.train:
+            batch_x_noisy = [self._mixing_snr(x) for x in batch_x]
+            pad_batch_x_noisy = pad_sequences(np.array(batch_x_noisy), maxlen=self.maxlen_a, value=0.0, padding='post', dtype=batch_x_noisy[0].dtype)
+
+        if self.train:
+            if self.features == 'both':
+                return pad_batch_x, pad_batch_x_noisy, pad_batch_p, pad_batch_e, pad_batch_z, pad_batch_l, pad_batch_t
+            else:
+                return pad_batch_x, pad_batch_x_noisy, pad_batch_y, pad_batch_z, pad_batch_l, pad_batch_t
+        else:
+            if self.features == 'both':
+                if self.edit_dist:
+                    return pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z, pad_batch_d
+                else:
+                    return pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z
+            else:
+                if self.edit_dist:
+                    return pad_batch_x, pad_batch_y, pad_batch_z, pad_batch_d
+                else:
+                    return pad_batch_x, pad_batch_y, pad_batch_z
+
+    def on_epoch_end(self):
+        self.indices = np.arange(self.len)
+        if self.shuffle == True:
+            np.random.shuffle(self.indices)
+
+def convert_sequence_to_dataset(dataloader):
+    def data_generator():
+        for i in range(dataloader.__len__()):
+            if dataloader.train:
+                if dataloader.features == 'both':
+                    pad_batch_x, pad_batch_x_noisy, pad_batch_p, pad_batch_e, pad_batch_z, pad_batch_l, pad_batch_t = dataloader[i]
+                    yield pad_batch_x, pad_batch_x_noisy, pad_batch_p, pad_batch_e, pad_batch_z, pad_batch_l, pad_batch_t
+                else:
+                    pad_batch_x, pad_batch_x_noisy, pad_batch_y, pad_batch_z, pad_batch_l, pad_batch_t = dataloader[i]
+                    yield pad_batch_x, pad_batch_x_noisy, pad_batch_y, pad_batch_z, pad_batch_l, pad_batch_t
+            else:
+                if dataloader.features == 'both':
+                    if dataloader.edit_dist:
+                        pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z, pad_batch_d = dataloader[i]
+                        yield pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z, pad_batch_d
+                    else:
+                        pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z = dataloader[i]
+                        yield pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z
+                else:
+                    if dataloader.edit_dist:
+                        pad_batch_x, pad_batch_y, pad_batch_z, pad_batch_d = dataloader[i]
+                        yield pad_batch_x, pad_batch_y, pad_batch_z, pad_batch_d
+                    else:
+                        pad_batch_x, pad_batch_y, pad_batch_z = dataloader[i]
+                        yield pad_batch_x, pad_batch_y, pad_batch_z
+
+    if dataloader.train:
+        if dataloader.features == 'both':
+            data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
+                tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
+                tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
+                tf.TensorSpec(shape=(None, dataloader.maxlen_t), dtype=tf.int32),
+                tf.TensorSpec(shape=(None, dataloader.maxlen_t, 256), dtype=tf.float32),
+                tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
+                tf.TensorSpec(shape=(None, dataloader.maxlen_l), dtype=tf.int32),
+                tf.TensorSpec(shape=(None, dataloader.maxlen_t), dtype=tf.int32),)
+            )
+        else:
+            data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
+                tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
+                tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
+                tf.TensorSpec(shape=(None, dataloader.maxlen_t) if dataloader.features == 'phoneme' else (None, dataloader.maxlen_t, 256),
+                              dtype=tf.int32 if dataloader.features == 'phoneme' else tf.float32),
+                tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
+                tf.TensorSpec(shape=(None, dataloader.maxlen_l), dtype=tf.int32),
+                tf.TensorSpec(shape=(None, dataloader.maxlen_t), dtype=tf.int32),)
+            )
+    else:
+        if dataloader.features == 'both':
+            if dataloader.edit_dist:
+                data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
+                    tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
+                    tf.TensorSpec(shape=(None, dataloader.maxlen_t), dtype=tf.int32),
+                    tf.TensorSpec(shape=(None, dataloader.maxlen_t, 256), dtype=tf.float32),
+                    tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
+                    tf.TensorSpec(shape=(None, 1), dtype=tf.float32),)
+                )
+            else:
+                data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
+                    tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
+                    tf.TensorSpec(shape=(None, dataloader.maxlen_t), dtype=tf.int32),
+                    tf.TensorSpec(shape=(None, dataloader.maxlen_t, 256), dtype=tf.float32),
+                    tf.TensorSpec(shape=(None, 1), dtype=tf.float32),)
+                )
+        else:
+            if dataloader.edit_dist:
+                data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
+                    tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
+                    tf.TensorSpec(shape=(None, dataloader.maxlen_t) if dataloader.features == 'phoneme' else (None, dataloader.maxlen_t, 256),
+                                  dtype=tf.int32 if dataloader.features == 'phoneme' else tf.float32),
+                    tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
+                    tf.TensorSpec(shape=(None, 1), dtype=tf.float32),)
+                )
+            else:
+                data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
+                    tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
+                    tf.TensorSpec(shape=(None, dataloader.maxlen_t) if dataloader.features == 'phoneme' else (None, dataloader.maxlen_t, 256),
+                                  dtype=tf.int32 if dataloader.features == 'phoneme' else tf.float32),
+                    tf.TensorSpec(shape=(None, 1), dtype=tf.float32),)
+                )
+    # data_dataset = data_dataset.cache()
+    data_dataset = data_dataset.prefetch(1)
+
+    return data_dataset
+
+if __name__ == '__main__':
+    GLOBAL_BATCH_SIZE = 2048
+    train_dataset = LibriPhraseDataloader(batch_size=GLOBAL_BATCH_SIZE, train=True, types='both', shuffle=True, features='g2p_embed')
+    test_dataset = LibriPhraseDataloader(batch_size=GLOBAL_BATCH_SIZE, train=False, edit_dist=True, types='both', shuffle=False, features='g2p_embed')
+    train_dataset = LibriPhraseDataloader(batch_size=GLOBAL_BATCH_SIZE, train=True, types='both', shuffle=True, pkl='/share/nas165/yiting/PhonMatchNet/data/train_both.pkl', features='g2p_embed')
+    test_dataset = LibriPhraseDataloader(batch_size=GLOBAL_BATCH_SIZE, train=False, edit_dist=True, types='both', shuffle=False, pkl='/share/nas165/yiting/PhonMatchNet/data/test_both.pkl', features='g2p_embed')
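
The main functional change in this file relative to dataset/libriphrase.py is in __prep__: after sorting, every phoneme-index sequence gets index 0 (the <pad> symbol, reused here as the CTC blank) prepended before padding. A tiny illustration (the indices below are made up):

import numpy as np
seq = np.array([31, 17, 2])      # hypothetical phoneme indices
print(np.insert(seq, 0, 0))      # [ 0 31 17  2], what __prep__ applies to each sequence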
dataset/qualcomm.py
ADDED
@@ -0,0 +1,180 @@
+import math, os, re, sys
+from pathlib import Path
+import numpy as np
+import pandas as pd
+from multiprocessing import Pool
+from scipy.io import wavfile
+import tensorflow as tf
+
+from tensorflow.keras.utils import Sequence, OrderedEnqueuer
+from tensorflow.keras import layers
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+
+sys.path.append(os.path.dirname(__file__))
+from g2p.g2p_en.g2p import G2p
+
+import warnings
+warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)
+np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)
+
+class QualcommKeywordSpeechDataloader(Sequence):
+    def __init__(self,
+                 batch_size,
+                 fs=16000,
+                 wav_dir='/home/DB/qualcomm_keyword_speech_dataset',
+                 target_list=['hey_android', 'hey_snapdragon', 'hi_galaxy', 'hi_lumina'],
+                 features='g2p_embed',  # phoneme, g2p_embed, both ...
+                 shuffle=True,
+                 pkl=None,
+                 ):
+
+        phonemes = ["<pad>", ] + ['AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
+                                  'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH',
+                                  'D', 'DH', 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1',
+                                  'EY2', 'F', 'G', 'HH', 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2',
+                                  'JH', 'K', 'L', 'M', 'N', 'NG', 'OW0', 'OW1', 'OW2', 'OY0',
+                                  'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH0', 'UH1',
+                                  'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH',
+                                  ' ']
+
+        self.p2idx = {p: idx for idx, p in enumerate(phonemes)}
+        self.idx2p = {idx: p for idx, p in enumerate(phonemes)}
+
+        self.batch_size = batch_size
+        self.fs = fs
+        self.wav_dir = wav_dir
+        self.target_list = target_list
+        self.features = features
+        self.shuffle = shuffle
+        self.pkl = pkl
+        self.nPhoneme = len(phonemes)
+        self.g2p = G2p()
+
+        self.__prep__()
+        self.on_epoch_end()
+
+    def __prep__(self):
+        self.data = pd.DataFrame(columns=['wav', 'text', 'duration', 'label'])
+
+        if (self.pkl is not None) and (os.path.isfile(self.pkl)):
+            print(">> Load dataset from {}".format(self.pkl))
+            self.data = pd.read_pickle(self.pkl)
+        else:
+            print(">> Make dataset from {}".format(self.wav_dir))
+            target_dict = {}
+            idx = 0
+            for target in self.target_list:
+                print(">> Extract from {}".format(target))
+                wav_list = [str(x) for x in Path(os.path.join(self.wav_dir, target)).rglob('*.wav')]
+                for wav in wav_list:
+                    anchor_text = wav.split('/')[-3].lower().replace('_', ' ')
+                    duration = float(wavfile.read(wav)[1].shape[-1]) / self.fs
+                    for comparison_text in self.target_list:
+                        comparison_text = comparison_text.replace('_', ' ')
+                        label = 1 if anchor_text == comparison_text else 0
+                        target_dict[idx] = {
+                            'wav': wav,
+                            'text': comparison_text,
+                            'duration': duration,
+                            'label': label
+                        }
+                        idx += 1
+            self.data = self.data.append(pd.DataFrame.from_dict(target_dict, 'index'), ignore_index=True)
+
+            # g2p & p2idx by g2p_en package
+            print(">> Convert word to phoneme")
+            self.data['phoneme'] = self.data['text'].apply(lambda x: self.g2p(re.sub(r"[^a-zA-Z0-9]+", ' ', x)))
+            print(">> Convert phoneme to index")
+            self.data['pIndex'] = self.data['phoneme'].apply(lambda x: [self.p2idx[t] for t in x])
+            print(">> Compute phoneme embedding")
+            self.data['g2p_embed'] = self.data['text'].apply(lambda x: self.g2p.embedding(x))
+
+            if (self.pkl is not None) and (not os.path.isfile(self.pkl)):
+                self.data.to_pickle(self.pkl)
+
+        # Get longest data
+        self.data = self.data.sort_values(by='duration').reset_index(drop=True)
+        self.wav_list = self.data['wav'].values
+        self.idx_list = self.data['pIndex'].values
+        self.emb_list = self.data['g2p_embed'].values
+        self.lab_list = self.data['label'].values
+
+        # Set dataloader params.
+        self.len = len(self.data)
+        self.maxlen_t = int((int(self.data['text'].apply(lambda x: len(x)).max() / 10) + 1) * 10)
+        self.maxlen_a = int((int(self.data['duration'].values[-1] / 0.5) + 1) * self.fs / 2)
+
+    def __len__(self):
+        # return total batch-wise length
+        return math.ceil(self.len / self.batch_size)
+
+    def _load_wav(self, wav):
+        return np.array(wavfile.read(wav)[1]).astype(np.float32) / 32768.0
+
+    def __getitem__(self, idx):
+        # chunking
+        indices = self.indices[idx * self.batch_size : (idx + 1) * self.batch_size]
+
+        # load inputs
+        batch_x = [np.array(wavfile.read(self.wav_list[i])[1]).astype(np.float32) / 32768.0 for i in indices]
+        if self.features == 'both':
+            batch_p = [np.array(self.idx_list[i]).astype(np.int32) for i in indices]
+            batch_e = [np.array(self.emb_list[i]).astype(np.float32) for i in indices]
+        else:
+            if self.features == 'phoneme':
+                batch_y = [np.array(self.idx_list[i]).astype(np.int32) for i in indices]
+            elif self.features == 'g2p_embed':
+                batch_y = [np.array(self.emb_list[i]).astype(np.float32) for i in indices]
+        # load outputs
+        batch_z = [np.array([self.lab_list[i]]).astype(np.float32) for i in indices]
+
+        # padding and masking
+        pad_batch_x = pad_sequences(np.array(batch_x), maxlen=self.maxlen_a, value=0.0, padding='post', dtype=batch_x[0].dtype)
+        if self.features == 'both':
+            pad_batch_p = pad_sequences(np.array(batch_p), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_p[0].dtype)
+            pad_batch_e = pad_sequences(np.array(batch_e), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_e[0].dtype)
+        else:
+            pad_batch_y = pad_sequences(np.array(batch_y), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_y[0].dtype)
+        pad_batch_z = pad_sequences(np.array(batch_z), value=0.0, padding='post', dtype=batch_z[0].dtype)
+
+        if self.features == 'both':
+            return pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z
+        else:
+            return pad_batch_x, pad_batch_y, pad_batch_z
+
+    def on_epoch_end(self):
+        self.indices = np.arange(self.len)
+        if self.shuffle == True:
+            np.random.shuffle(self.indices)
+
+def convert_sequence_to_dataset(dataloader):
+    def data_generator():
+        for i in range(dataloader.__len__()):
+            if dataloader.features == 'both':
+                pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z = dataloader[i]
+                yield pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z
+            else:
+                pad_batch_x, pad_batch_y, pad_batch_z = dataloader[i]
+                yield pad_batch_x, pad_batch_y, pad_batch_z
+
+    if dataloader.features == 'both':
+        data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
+            tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
+            tf.TensorSpec(shape=(None, dataloader.maxlen_t), dtype=tf.int32),
+            tf.TensorSpec(shape=(None, dataloader.maxlen_t, 256), dtype=tf.float32),
+            tf.TensorSpec(shape=(None, 1), dtype=tf.float32),)
+        )
+    else:
+        data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
+            tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
+            tf.TensorSpec(shape=(None, dataloader.maxlen_t) if dataloader.features == 'phoneme' else (None, dataloader.maxlen_t, 256),
+                          dtype=tf.int32 if dataloader.features == 'phoneme' else tf.float32),
+            tf.TensorSpec(shape=(None, 1), dtype=tf.float32),)
+        )
+    # data_dataset = data_dataset.cache()
+    data_dataset = data_dataset.prefetch(1)
+
+    return data_dataset
+
+if __name__ == '__main__':
+    dataloader = QualcommKeywordSpeechDataloader(2048, pkl='/home/DB/qualcomm_keyword_speech_dataset/qualcomm.pkl', features='g2p_embed')
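
A minimal sketch of driving this loader (batch size and pkl=None are illustrative; it assumes the Qualcomm keyword dataset at the default wav_dir). Each recording is paired with all four keywords (underscores become spaces), so one utterance yields four labelled examples:

from dataset.qualcomm import QualcommKeywordSpeechDataloader, convert_sequence_to_dataset

loader = QualcommKeywordSpeechDataloader(batch_size=4, features='g2p_embed', pkl=None)
dataset = convert_sequence_to_dataset(loader)
for wavs, g2p_embeds, labels in dataset.take(1):
    print(wavs.shape, g2p_embeds.shape, labels.shape)   # (4, maxlen_a), (4, maxlen_t, 256), (4, 1)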
demo.py
ADDED
@@ -0,0 +1,168 @@
+import os, warnings, argparse
+import tensorflow as tf
+import numpy as np
+from model import ukws
+from dataset import dataloader_demo
+import gradio as gr
+# import librosa
+
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
+tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
+warnings.filterwarnings('ignore')
+warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)
+np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)
+warnings.simplefilter("ignore")
+
+seed = 42
+tf.random.set_seed(seed)
+np.random.seed(seed)
+
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument('--text_input', required=False, type=str, default='g2p_embed')
+parser.add_argument('--audio_input', required=False, type=str, default='both')
+parser.add_argument('--load_checkpoint_path', required=True, type=str)
+parser.add_argument('--keyword_list_length', required=True, type=int)
+parser.add_argument('--stack_extractor', action='store_true')
+parser.add_argument('--comment', required=False, type=str)
+args = parser.parse_args()
+
+gpus = tf.config.experimental.list_physical_devices('GPU')
+if gpus:
+    try:
+        for gpu in gpus:
+            tf.config.experimental.set_memory_growth(gpu, True)
+    except RuntimeError as e:
+        print(e)
+
+strategy = tf.distribute.MirroredStrategy()
+batch_size = args.keyword_list_length
+# Batch size per GPU
+GLOBAL_BATCH_SIZE = batch_size * strategy.num_replicas_in_sync
+# BATCH_SIZE_PER_REPLICA = GLOBAL_BATCH_SIZE / strategy.num_replicas_in_sync
+
+# Make Dataloader
+text_input = args.text_input
+audio_input = args.audio_input
+load_checkpoint_path = args.load_checkpoint_path
+
+phonemes = ["<pad>", ] + ['AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
+                          'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH',
+                          'D', 'DH', 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1',
+                          'EY2', 'F', 'G', 'HH', 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2',
+                          'JH', 'K', 'L', 'M', 'N', 'NG', 'OW0', 'OW1', 'OW2', 'OY0',
+                          'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH0', 'UH1',
+                          'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH',
+                          ' ']
+# Number of phonemes
+vocab = len(phonemes)
+
+# Model params.
+kwargs = {
+    'vocab' : vocab,
+    'text_input' : text_input,
+    'audio_input' : audio_input,
+    'frame_length' : 400,
+    'hop_length' : 160,
+    'num_mel' : 40,
+    'sample_rate' : 16000,
+    'log_mel' : False,
+    'stack_extractor' : args.stack_extractor,
+}
+
+
+
+# Make tensorboard dict.
+global keyword
+param = kwargs
+param['comment'] = args.comment
+
+
+with strategy.scope():
+
+
+    model = ukws.BaseUKWS(**kwargs)
+    if args.load_checkpoint_path:
+        checkpoint_dir = args.load_checkpoint_path
+        checkpoint = tf.train.Checkpoint(model=model)
+        checkpoint_manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=5)
+        latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
+        if latest_checkpoint:
+            checkpoint.restore(latest_checkpoint)
+            print("Checkpoint restored!")
+        else:
+            print("No checkpoint found.")
+
+def inference(audio, keyword):
+
+    if isinstance(keyword, str):
+        keyword = [kw.strip() for kw in keyword.split(',')]
+
+    test_google_dataset = dataloader_demo.GoogleCommandsDataloader(batch_size=GLOBAL_BATCH_SIZE, features=text_input, wav_path_or_object=audio, keyword=keyword)
+
+    test_google_dataset = dataloader_demo.convert_sequence_to_dataset(test_google_dataset)
+
+    test_google_dist_dataset = strategy.experimental_distribute_dataset(test_google_dataset)
+
+
+    # @tf.function
+    def test_step_metric_only(inputs, keyword_list):
+        clean_speech = inputs[0]
+        text = inputs[1]
+        labels = inputs[2]
+        prob, affinity_matrix = model(clean_speech, text, training=False)[:2]
+        prob = tf.round(prob * 1000) / 1000
+        prob = prob.numpy().flatten()
+        max_indices = np.argmax(prob, axis=0)
+        if prob[max_indices] >= 0.8:
+            keyword = keyword_list[max_indices]
+        else:
+            keyword = 'no keyword'
+
+        print('keyword:', keyword_list)
+        print('prob', prob)
+        msg = ''
+        for k, p in zip(keyword_list, prob):
+            msg += '{} | {:.2f} \n'.format(k, p)
+
+        return keyword, msg
+
+    for x in test_google_dist_dataset:
+        keyword, prob = test_step_metric_only(x, keyword)
+
+
+    return keyword, prob
+
+# keyword = ['realtek go','ok google','vintage','hackney','crocodile','surroundings','oversaw','northwestern']
+# audio = '/share/nas165/yiting/recording/ok_google/Default_20240725-183000.wav'
+# inference(audio,keyword)
+
+demo = gr.Interface(
+    fn=inference,
+    inputs=[gr.Audio(source="upload", label="Sound"),
+            gr.Textbox(placeholder="Keyword List Here...", label="keyword_list")],
+    examples=[
+        ["./recording/ok_google/ok_google-183000.wav", 'realtek go,ok google,vintage,hackney,crocodile,surroundings,oversaw,northwestern'],
+        ["./recording/ok_google/ok_google-183005.wav", 'realtek go,ok google,vintage,hackney,crocodile,surroundings,oversaw,northwestern'],
+        ["./recording/ok_google/ok_google-183008.wav", 'realtek go,ok google,vintage,hackney,crocodile,surroundings,oversaw,northwestern'],
+        ["./recording/ok_google/ok_google-183011.wav", 'realtek go,ok google,vintage,hackney,crocodile,surroundings,oversaw,northwestern'],
+        ["./recording/ok_google/ok_google-183015.wav", 'realtek go,ok google,vintage,hackney,crocodile,surroundings,oversaw,northwestern'],
+        ["./recording/realtek_go/realtek_go-183029.wav", 'realtek go,ok google,vintage,hackney,crocodile,surroundings,oversaw,northwestern'],
+        ["./recording/realtek_go/realtek_go-183033.wav", 'realtek go,ok google,vintage,hackney,crocodile,surroundings,oversaw,northwestern'],
+        ["./recording/realtek_go/realtek_go-183036.wav", 'realtek go,ok google,vintage,hackney,crocodile,surroundings,oversaw,northwestern'],
+        ["./recording/realtek_go/realtek_go-183039.wav", 'realtek go,ok google,vintage,hackney,crocodile,surroundings,oversaw,northwestern'],
+        ["./recording/realtek_go/realtek_go-183043.wav", 'realtek go,ok google,vintage,hackney,crocodile,surroundings,oversaw,northwestern'],
+    ],
+    outputs=[gr.Textbox(label="keyword"), gr.Textbox(label="Confidence Score of keyword")],
+)
+
+demo.launch(server_name='0.0.0.0', server_port=7860, share=True)
+
+
+
+
+
+
+
+
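
demo.py's inference() can also be driven without the Gradio UI, as the commented-out example above suggests; a minimal sketch, assuming the module-level model, strategy, and checkpoint restore above have already run (paths are illustrative):

# Sketch mirroring the commented-out example in demo.py; run from within demo.py's context.
keyword_list = ['realtek go', 'ok google', 'vintage', 'hackney',
                'crocodile', 'surroundings', 'oversaw', 'northwestern']
audio_path = './recording/ok_google/ok_google-183000.wav'

# inference() returns the best-scoring keyword ('no keyword' when the top
# probability falls below the 0.8 threshold) plus a per-keyword score table.
detected, scores = inference(audio_path, keyword_list)
print(detected)
print(scores)

Note that --keyword_list_length must match the number of keywords, since the per-replica batch size is derived from it.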
docker/Dockerfile
ADDED
@@ -0,0 +1,25 @@
+FROM tensorflow/tensorflow:2.4.1-gpu
+
+# Install dependency
+RUN apt-key adv --keyserver keyserver.ubuntu.com --recv A4B469963BF863CC
+RUN apt-get update -y && apt-get install -y \
+    git \
+    libsndfile1
+
+# Install python packages
+RUN python -m pip install --upgrade pip && pip install \
+    levenshtein \
+    six \
+    audioread \
+    librosa \
+    PySoundFile \
+    scipy \
+    tqdm \
+    pandas \
+    nltk \
+    inflect
+
+RUN python -m pip uninstall -y numpy
+RUN python -m pip install numpy==1.18.5
+
+WORKDIR /home
flagged/Sound/c129aef35ba4cb66620f813cd7268c4be510a66d/ok_google-183000.wav
ADDED
Binary file (96.3 kB).
flagged/Sound/d35a5cf80a9403828bc601a0a761a5f88da06f00/realtek_go-183033.wav
ADDED
Binary file (101 kB).
flagged/log.csv
ADDED
@@ -0,0 +1,8 @@
+Sound,keyword_list,keyword,Confidence Score of keyword,flag,username,timestamp
+/share/nas165/yiting/CL-KWS_202408_v1/flagged/Sound/c129aef35ba4cb66620f813cd7268c4be510a66d/ok_google-183000.wav,"realtek go,ok google,vintage,hackney,crocodile,surroundings,oversaw,northwestern",,,,,2024-09-11 09:54:49.824521
+/share/nas165/yiting/CL-KWS_202408_v1/flagged/Sound/d35a5cf80a9403828bc601a0a761a5f88da06f00/realtek_go-183033.wav,"realtek go,ok google,vintage,hackney,crocodile,surroundings,oversaw,northwestern",ok google,"ok cortana | 0.11
+ok google | 0.97
+hey google | 0.46
+oh come google | 0.87
+ok gogo | 0.91
+",,,2024-09-11 10:23:11.972172
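
One note on the log above: the "Confidence Score of keyword" column embeds a multi-line score table inside a single quoted field, so the file should be read with a CSV parser rather than split on newlines; a minimal sketch:

# Sketch only: csv.reader handles the quoted multi-line field in flagged/log.csv.
import csv

with open('flagged/log.csv', newline='') as f:
    for row in csv.reader(f):
        # row: Sound, keyword_list, keyword, score table, flag, username, timestamp
        print(row[0], '->', row[2])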
inference.py
ADDED
@@ -0,0 +1,141 @@
+import sys, os, datetime, warnings, argparse
+import tensorflow as tf
+import numpy as np
+
+from model import ukws
+from dataset import google_infe202405
+
+
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
+tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
+warnings.filterwarnings('ignore')
+warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)
+np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)
+warnings.simplefilter("ignore")
+
+seed = 42
+tf.random.set_seed(seed)
+np.random.seed(seed)
+
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument('--text_input', required=False, type=str, default='g2p_embed')
+parser.add_argument('--audio_input', required=False, type=str, default='both')
+parser.add_argument('--load_checkpoint_path', required=True, type=str)
+
+parser.add_argument('--google_pkl', required=False, type=str, default='/home/DB/data/google_test_all.pkl')
+parser.add_argument('--stack_extractor', action='store_true')
+args = parser.parse_args()
+
+gpus = tf.config.experimental.list_physical_devices('GPU')
+if gpus:
+    try:
+        for gpu in gpus:
+            tf.config.experimental.set_memory_growth(gpu, True)
+    except RuntimeError as e:
+        print(e)
+
+strategy = tf.distribute.MirroredStrategy()
+
+# Batch size per GPU
+GLOBAL_BATCH_SIZE = 1000 * strategy.num_replicas_in_sync
+BATCH_SIZE_PER_REPLICA = GLOBAL_BATCH_SIZE / strategy.num_replicas_in_sync
+
+# Make Dataloader
+text_input = args.text_input
+audio_input = args.audio_input
+load_checkpoint_path = args.load_checkpoint_path
+
+
+test_google_dataset = google_infe202405.GoogleCommandsDataloader(batch_size=GLOBAL_BATCH_SIZE, features=text_input, shuffle=False, pkl=args.google_pkl)
+
+test_google_dataset = google_infe202405.convert_sequence_to_dataset(test_google_dataset)
+
+test_google_dist_dataset = strategy.experimental_distribute_dataset(test_google_dataset)
+
+phonemes = ["<pad>", ] + ['AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
+                          'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH',
+                          'D', 'DH', 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1',
+                          'EY2', 'F', 'G', 'HH', 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2',
+                          'JH', 'K', 'L', 'M', 'N', 'NG', 'OW0', 'OW1', 'OW2', 'OY0',
+                          'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH0', 'UH1',
+                          'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH',
+                          ' ']
+# Number of phonemes
+vocab = len(phonemes)
+
+# Model params.
+kwargs = {
+    'vocab' : vocab,
+    'text_input' : text_input,
+    'audio_input' : audio_input,
+    'frame_length' : 400,
+    'hop_length' : 160,
+    'num_mel' : 40,
+    'sample_rate' : 16000,
+    'log_mel' : False,
+    'stack_extractor' : args.stack_extractor,
+}
+
+
+# Make tensorboard dict.
+param = kwargs
+
+
+with strategy.scope():
+
+
+    model = ukws.BaseUKWS(**kwargs)
+
+
+    if args.load_checkpoint_path:
+        checkpoint_dir = args.load_checkpoint_path
+        checkpoint = tf.train.Checkpoint(model=model)
+        checkpoint_manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=5)
+        latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
+        if latest_checkpoint:
+            checkpoint.restore(latest_checkpoint)
+            print("Checkpoint restored!")
+
+
+
+# @tf.function
+def test_step_metric_only(inputs):
+
+    clean_speech = inputs[0]
+    text = inputs[1]
+    labels = inputs[2]
+
+    prob = model(clean_speech, text, training=False)[0]
+
+    dim1 = labels.shape[0] // 20
+    prob = tf.reshape(prob, [dim1, 20])
+    labels = tf.reshape(labels, [dim1, 20])
+    predictions = tf.math.argmax(prob, axis=1)
+    actuals = tf.math.argmax(labels, axis=1)
+
+    true_count = tf.reduce_sum(tf.cast(tf.math.equal(predictions, actuals), tf.float32)).numpy()
+    num_testdata = dim1
+    return true_count, num_testdata
+
+
+def distributed_test_step_metric_only(dataset_inputs):
+    true_count, num_testdata = strategy.run(test_step_metric_only, args=(dataset_inputs,))
+    return true_count, num_testdata
+
+
+total_true_count = 0
+total_num_testdata = 0
+for x in test_google_dist_dataset:
+    true_count, num_testdata = distributed_test_step_metric_only(x)
+    total_true_count += true_count
+    total_num_testdata += num_testdata
+accuracy = total_true_count / total_num_testdata * 100.0
+print("Accuracy:", accuracy, "%")
+
+
+
+
+
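
The evaluation loop above treats every utterance as a 20-way choice: probabilities for flattened (utterance, candidate text) pairs are reshaped to [N, 20], and a hit is counted when the argmax over each row matches the labeled candidate. A self-contained sketch of that grouping logic, with toy tensors standing in for model outputs and labels:

# Sketch only: toy tensors replace model probabilities and dataset labels.
import tensorflow as tf

prob = tf.random.uniform([40, 1])                        # 2 utterances x 20 candidates, flattened
labels = tf.reshape(tf.one_hot([3, 17], depth=20), [40, 1])  # true candidate index per utterance

dim1 = labels.shape[0] // 20                             # number of utterances
prob = tf.reshape(prob, [dim1, 20])
labels = tf.reshape(labels, [dim1, 20])
predictions = tf.math.argmax(prob, axis=1)               # best-scoring candidate per utterance
actuals = tf.math.argmax(labels, axis=1)                 # labeled candidate per utterance
true_count = tf.reduce_sum(tf.cast(tf.math.equal(predictions, actuals), tf.float32))
print(float(true_count) / dim1 * 100.0, '% accuracy on this toy batch')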
model/__pycache__/discriminator.cpython-37.pyc
ADDED
Binary file (2.35 kB).
model/__pycache__/encoder.cpython-37.pyc
ADDED
Binary file (5.6 kB).
model/__pycache__/extractor.cpython-37.pyc
ADDED
Binary file (3.82 kB).
model/__pycache__/log_melspectrogram.cpython-37.pyc
ADDED
Binary file (2.17 kB).
model/__pycache__/speech_embedding.cpython-37.pyc
ADDED
Binary file (1.75 kB).