Adds config and weights of UNIVERSE++
Browse files
- config.yaml +74 -36
- weights.ckpt +2 -2
config.yaml
CHANGED
|
@@ -36,49 +36,45 @@ datamodule:
|
|
| 36 |
fs: 16000
|
| 37 |
split: train
|
| 38 |
audio_len: 2.0
|
| 39 |
-
augmentation: false
|
| 40 |
vb-val-16k:
|
| 41 |
_target_: open_universe.datasets.NoisyDataset
|
| 42 |
audio_path: ${..vb-train-16k.audio_path}
|
| 43 |
fs: ${..vb-train-16k.fs}
|
| 44 |
split: val
|
| 45 |
audio_len: null
|
| 46 |
-
augmentation: false
|
| 47 |
vb-test-16k:
|
| 48 |
_target_: open_universe.datasets.NoisyDataset
|
| 49 |
audio_path: ${..vb-train-16k.audio_path}
|
| 50 |
fs: ${..vb-train-16k.fs}
|
| 51 |
split: test
|
| 52 |
audio_len: null
|
| 53 |
-
augmentation: false
|
| 54 |
vb-train-24k:
|
| 55 |
_target_: open_universe.datasets.NoisyDataset
|
| 56 |
audio_path: data/voicebank_demand/24k
|
| 57 |
fs: 24000
|
| 58 |
split: train
|
| 59 |
audio_len: 2.0
|
| 60 |
-
augmentation: false
|
| 61 |
vb-val-24k:
|
| 62 |
_target_: open_universe.datasets.NoisyDataset
|
| 63 |
audio_path: ${..vb-train-24k.audio_path}
|
| 64 |
fs: ${..vb-train-24k.fs}
|
| 65 |
split: val
|
| 66 |
audio_len: null
|
| 67 |
-
augmentation: false
|
| 68 |
vb-test-24k:
|
| 69 |
_target_: open_universe.datasets.NoisyDataset
|
| 70 |
audio_path: ${..vb-train-24k.audio_path}
|
| 71 |
fs: ${..vb-train-24k.fs}
|
| 72 |
split: test
|
| 73 |
audio_len: null
|
| 74 |
-
augmentation: false
|
| 75 |
model:
|
| 76 |
-
_target_: open_universe.networks.universe.
|
| 77 |
fs: 16000
|
| 78 |
normalization_norm: 2
|
| 79 |
normalization_kwargs:
|
| 80 |
ref: both
|
| 81 |
level_db: -26.0
|
|
|
|
|
|
|
| 82 |
score_model:
|
| 83 |
_target_: open_universe.networks.universe.ScoreNetwork
|
| 84 |
fb_kernel_size: 3
|
|
@@ -93,9 +89,9 @@ model:
|
|
| 93 |
encoder_gru_conv_sandwich: false
|
| 94 |
extra_conv_block: true
|
| 95 |
decoder_act_type: prelu
|
| 96 |
-
use_weight_norm:
|
| 97 |
-
|
| 98 |
-
|
| 99 |
condition_model:
|
| 100 |
_target_: open_universe.networks.universe.ConditionerNetwork
|
| 101 |
fb_kernel_size: ${model.score_model.fb_kernel_size}
|
|
@@ -107,7 +103,6 @@ model:
|
|
| 107 |
extra_conv_block: ${model.score_model.extra_conv_block}
|
| 108 |
decoder_act_type: prelu
|
| 109 |
use_weight_norm: ${model.score_model.use_weight_norm}
|
| 110 |
-
seq_model: ${model.score_model.seq_model}
|
| 111 |
use_antialiasing: false
|
| 112 |
diffusion:
|
| 113 |
schedule: geometric
|
|
@@ -116,17 +111,39 @@ model:
|
|
| 116 |
n_steps: 8
|
| 117 |
epsilon: 1.3
|
| 118 |
losses:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
weights:
|
|
|
|
| 120 |
score: 1.0
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
mdn_n_comp: 3
|
| 124 |
-
mdn_alpha_per_sample: true
|
| 125 |
score_loss:
|
| 126 |
_target_: torch.nn.MSELoss
|
| 127 |
training:
|
| 128 |
audio_len: ${datamodule.datasets.vb-train-16k.audio_len}
|
| 129 |
-
time_sampling:
|
| 130 |
dynamic_mixing: false
|
| 131 |
ema_decay: 0.999
|
| 132 |
validation:
|
|
@@ -134,31 +151,52 @@ model:
|
|
| 134 |
main_loss_mode: max
|
| 135 |
n_bins: 5
|
| 136 |
max_enh_batches: 4
|
| 137 |
-
num_tb_samples: 0
|
| 138 |
enh_losses:
|
| 139 |
val/:
|
| 140 |
_target_: open_universe.metrics.EvalMetrics
|
| 141 |
audio_fs: ${model.fs}
|
| 142 |
optimizer:
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
scheduler:
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
grad_clipper:
|
| 163 |
_target_: open_universe.utils.FixedClipper
|
| 164 |
max_norm: 1000.0
|
|
@@ -167,7 +205,7 @@ trainer:
|
|
| 167 |
accumulate_grad_batches: 1
|
| 168 |
min_epochs: 1
|
| 169 |
max_epochs: -1
|
| 170 |
-
max_steps:
|
| 171 |
deterministic: warn
|
| 172 |
accelerator: gpu
|
| 173 |
devices: -1
|
|
|
|
| 36 |
fs: 16000
|
| 37 |
split: train
|
| 38 |
audio_len: 2.0
|
|
|
|
| 39 |
vb-val-16k:
|
| 40 |
_target_: open_universe.datasets.NoisyDataset
|
| 41 |
audio_path: ${..vb-train-16k.audio_path}
|
| 42 |
fs: ${..vb-train-16k.fs}
|
| 43 |
split: val
|
| 44 |
audio_len: null
|
|
|
|
| 45 |
vb-test-16k:
|
| 46 |
_target_: open_universe.datasets.NoisyDataset
|
| 47 |
audio_path: ${..vb-train-16k.audio_path}
|
| 48 |
fs: ${..vb-train-16k.fs}
|
| 49 |
split: test
|
| 50 |
audio_len: null
|
|
|
|
| 51 |
vb-train-24k:
|
| 52 |
_target_: open_universe.datasets.NoisyDataset
|
| 53 |
audio_path: data/voicebank_demand/24k
|
| 54 |
fs: 24000
|
| 55 |
split: train
|
| 56 |
audio_len: 2.0
|
|
|
|
| 57 |
vb-val-24k:
|
| 58 |
_target_: open_universe.datasets.NoisyDataset
|
| 59 |
audio_path: ${..vb-train-24k.audio_path}
|
| 60 |
fs: ${..vb-train-24k.fs}
|
| 61 |
split: val
|
| 62 |
audio_len: null
|
|
|
|
| 63 |
vb-test-24k:
|
| 64 |
_target_: open_universe.datasets.NoisyDataset
|
| 65 |
audio_path: ${..vb-train-24k.audio_path}
|
| 66 |
fs: ${..vb-train-24k.fs}
|
| 67 |
split: test
|
| 68 |
audio_len: null
|
|
|
|
| 69 |
model:
|
| 70 |
+
_target_: open_universe.networks.universe.UniverseGAN
|
| 71 |
fs: 16000
|
| 72 |
normalization_norm: 2
|
| 73 |
normalization_kwargs:
|
| 74 |
ref: both
|
| 75 |
level_db: -26.0
|
| 76 |
+
edm:
|
| 77 |
+
noise: 0.25
|
| 78 |
score_model:
|
| 79 |
_target_: open_universe.networks.universe.ScoreNetwork
|
| 80 |
fb_kernel_size: 3
|
|
|
|
| 89 |
encoder_gru_conv_sandwich: false
|
| 90 |
extra_conv_block: true
|
| 91 |
decoder_act_type: prelu
|
| 92 |
+
use_weight_norm: true
|
| 93 |
+
use_antialiasing: true
|
| 94 |
+
time_embedding: simple
|
| 95 |
condition_model:
|
| 96 |
_target_: open_universe.networks.universe.ConditionerNetwork
|
| 97 |
fb_kernel_size: ${model.score_model.fb_kernel_size}
|
|
|
|
| 103 |
extra_conv_block: ${model.score_model.extra_conv_block}
|
| 104 |
decoder_act_type: prelu
|
| 105 |
use_weight_norm: ${model.score_model.use_weight_norm}
|
|
|
|
| 106 |
use_antialiasing: false
|
| 107 |
diffusion:
|
| 108 |
schedule: geometric
|
|
|
|
| 111 |
n_steps: 8
|
| 112 |
epsilon: 1.3
|
| 113 |
losses:
|
| 114 |
+
multi_period_discriminator:
|
| 115 |
+
mpd_reshapes:
|
| 116 |
+
- 2
|
| 117 |
+
- 3
|
| 118 |
+
- 5
|
| 119 |
+
- 7
|
| 120 |
+
- 11
|
| 121 |
+
use_spectral_norm: false
|
| 122 |
+
discriminator_channel_mult: 1
|
| 123 |
+
multi_resolution_discriminator:
|
| 124 |
+
resolutions:
|
| 125 |
+
- - 1024
|
| 126 |
+
- 120
|
| 127 |
+
- 600
|
| 128 |
+
- - 2048
|
| 129 |
+
- 240
|
| 130 |
+
- 1200
|
| 131 |
+
- - 512
|
| 132 |
+
- 50
|
| 133 |
+
- 240
|
| 134 |
+
use_spectral_norm: false
|
| 135 |
+
discriminator_channel_mult: 1
|
| 136 |
+
disc_freeze_step: 0
|
| 137 |
weights:
|
| 138 |
+
mel_l1: 45.0
|
| 139 |
score: 1.0
|
| 140 |
+
use_signal_decoupling: true
|
| 141 |
+
signal_decoupling_act: snake
|
|
|
|
|
|
|
| 142 |
score_loss:
|
| 143 |
_target_: torch.nn.MSELoss
|
| 144 |
training:
|
| 145 |
audio_len: ${datamodule.datasets.vb-train-16k.audio_len}
|
| 146 |
+
time_sampling: time_normal_0.95
|
| 147 |
dynamic_mixing: false
|
| 148 |
ema_decay: 0.999
|
| 149 |
validation:
|
|
|
|
| 151 |
main_loss_mode: max
|
| 152 |
n_bins: 5
|
| 153 |
max_enh_batches: 4
|
|
|
|
| 154 |
enh_losses:
|
| 155 |
val/:
|
| 156 |
_target_: open_universe.metrics.EvalMetrics
|
| 157 |
audio_fs: ${model.fs}
|
| 158 |
optimizer:
|
| 159 |
+
accumulate_grad_batches: 1
|
| 160 |
+
generator:
|
| 161 |
+
_target_: torch.optim.AdamW
|
| 162 |
+
lr: 0.0002
|
| 163 |
+
weight_decay: 0.01
|
| 164 |
+
betas:
|
| 165 |
+
- 0.8
|
| 166 |
+
- 0.99
|
| 167 |
+
weight_decay_exclude:
|
| 168 |
+
- prelu
|
| 169 |
+
- bias
|
| 170 |
+
discriminator:
|
| 171 |
+
_target_: torch.optim.AdamW
|
| 172 |
+
lr: 0.0002
|
| 173 |
+
betas:
|
| 174 |
+
- 0.8
|
| 175 |
+
- 0.99
|
| 176 |
+
grad_clip_vals:
|
| 177 |
+
mrd: 1000.0
|
| 178 |
+
mpd: 1000.0
|
| 179 |
+
score: 1000.0
|
| 180 |
+
cond: 1000.0
|
| 181 |
scheduler:
|
| 182 |
+
generator:
|
| 183 |
+
scheduler:
|
| 184 |
+
_target_: open_universe.utils.schedulers.LinearWarmupCosineAnnealingLR
|
| 185 |
+
T_warmup: 20000
|
| 186 |
+
T_cosine: 400000
|
| 187 |
+
eta_min: 1.6e-06
|
| 188 |
+
T_max: ${trainer.max_steps}
|
| 189 |
+
interval: step
|
| 190 |
+
frequency: 1
|
| 191 |
+
discriminator:
|
| 192 |
+
scheduler:
|
| 193 |
+
_target_: open_universe.utils.schedulers.LinearWarmupCosineAnnealingLR
|
| 194 |
+
T_warmup: 20000
|
| 195 |
+
T_cosine: 400000
|
| 196 |
+
eta_min: 1.6e-06
|
| 197 |
+
T_max: ${trainer.max_steps}
|
| 198 |
+
interval: step
|
| 199 |
+
frequency: 1
|
| 200 |
grad_clipper:
|
| 201 |
_target_: open_universe.utils.FixedClipper
|
| 202 |
max_norm: 1000.0
|
|
|
|
| 205 |
accumulate_grad_batches: 1
|
| 206 |
min_epochs: 1
|
| 207 |
max_epochs: -1
|
| 208 |
+
max_steps: 600000
|
| 209 |
deterministic: warn
|
| 210 |
accelerator: gpu
|
| 211 |
devices: -1
|
weights.ckpt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0d90ab343c86501a23d5dd0011242d1129ad2f54d8cebec68c55dd387037879c
|
| 3 |
+
size 1025936580
|