novateur committed on
Commit
af1973e
1 Parent(s): 58940de

Upload 2 files

Browse files
wavtokenizer_smalldata_frame40_3s_nq1_code4096_dim512_kmeans200_attn.yaml ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ seed_everything: 3407
2
+
3
+ data:
4
+ class_path: decoder.dataset.VocosDataModule
5
+ init_args:
6
+ train_params:
7
+ filelist_path: ./WavTokenizer/data/train/libritts_train
8
+ sampling_rate: 24000
9
+ num_samples: 72000
10
+ batch_size: 40 # 20
11
+ num_workers: 8
12
+
13
+ val_params:
14
+ filelist_path: ./WavTokenizer/data/infer/librttts_val
15
+ sampling_rate: 24000
16
+ num_samples: 72000
17
+ batch_size: 5 # 10
18
+ num_workers: 8
19
+
20
+ model:
21
+ class_path: decoder.experiment.WavTokenizer
22
+ init_args:
23
+ sample_rate: 24000
24
+ initial_learning_rate: 2e-4
25
+ mel_loss_coeff: 45
26
+ mrd_loss_coeff: 1.0
27
+ num_warmup_steps: 0 # Optimizer warmup steps
28
+ pretrain_mel_steps: 0 # 0 means GAN objective from the first iteration
29
+
30
+ # automatic evaluation
31
+ evaluate_utmos: true
32
+ evaluate_pesq: true
33
+ evaluate_periodicty: true
34
+
35
+ resume: false
36
+ resume_config: ./WavTokenizer/configs/wavtokenizer_smalldata_frame40_3s_nq1_code16384_dim512_kmeans800_attn.yaml
37
+ resume_model: ./version_3/checkpoints/vocos_checkpoint_epoch=31_step=157696_val_loss=5.9855.ckpt
38
+
39
+ feature_extractor:
40
+ class_path: decoder.feature_extractors.EncodecFeatures
41
+ init_args:
42
+ encodec_model: encodec_24khz
43
+ bandwidths: [6.6, 6.6, 6.6, 6.6]
44
+ train_codebooks: true
45
+ num_quantizers: 1
46
+ dowmsamples: [6, 5, 5, 4]
47
+ vq_bins: 4096
48
+ vq_kmeans: 200
49
+
50
+ backbone:
51
+ class_path: decoder.models.VocosBackbone
52
+ init_args:
53
+ input_channels: 512
54
+ dim: 768
55
+ intermediate_dim: 2304
56
+ num_layers: 12
57
+ adanorm_num_embeddings: 4
58
+
59
+ head:
60
+ class_path: decoder.heads.ISTFTHead
61
+ init_args:
62
+ dim: 768
63
+ n_fft: 2400
64
+ hop_length: 600
65
+ padding: same
66
+
67
+ trainer:
68
+ logger:
69
+ class_path: pytorch_lightning.loggers.TensorBoardLogger
70
+ init_args:
71
+ save_dir: ./WavTokenizer/result/train/wavtokenizer_smalldata_frame40_3s_nq1_code4096_dim512_kmeans200_attn/
72
+ callbacks:
73
+ - class_path: pytorch_lightning.callbacks.LearningRateMonitor
74
+ - class_path: pytorch_lightning.callbacks.ModelSummary
75
+ init_args:
76
+ max_depth: 2
77
+ - class_path: pytorch_lightning.callbacks.ModelCheckpoint
78
+ init_args:
79
+ monitor: val_loss
80
+ filename: vocos_checkpoint_{epoch}_{step}_{val_loss:.4f}
81
+ save_top_k: 10
82
+ save_last: true
83
+ - class_path: decoder.helpers.GradNormCallback
84
+
85
+ # Lightning calculates max_steps across all optimizer steps (rather than number of batches)
86
+ # This equals 10M steps for the generator and 10M for the discriminator
87
+ max_steps: 20000000
88
+ # You might want to limit val batches when evaluating all the metrics, as they are time-consuming
89
+ limit_val_batches: 200
90
+ accelerator: gpu
91
+ strategy: ddp
92
+ devices: [0,1,2,3,4,5,6,7]
93
+ log_every_n_steps: 1000
wavtokenizer_smalldata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ seed_everything: 3407
2
+
3
+ data:
4
+ class_path: decoder.dataset.VocosDataModule
5
+ init_args:
6
+ train_params:
7
+ filelist_path: ./WavTokenizer/data/train/libritts_train
8
+ sampling_rate: 24000
9
+ num_samples: 72000
10
+ batch_size: 40 # 20
11
+ num_workers: 8
12
+
13
+ val_params:
14
+ filelist_path: ./WavTokenizer/data/infer/librttts_val
15
+ sampling_rate: 24000
16
+ num_samples: 72000
17
+ batch_size: 5 # 10
18
+ num_workers: 8
19
+
20
+ model:
21
+ class_path: decoder.experiment.WavTokenizer
22
+ init_args:
23
+ sample_rate: 24000
24
+ initial_learning_rate: 2e-4
25
+ mel_loss_coeff: 45
26
+ mrd_loss_coeff: 1.0
27
+ num_warmup_steps: 0 # Optimizer warmup steps
28
+ pretrain_mel_steps: 0 # 0 means GAN objective from the first iteration
29
+
30
+ # automatic evaluation
31
+ evaluate_utmos: true
32
+ evaluate_pesq: true
33
+ evaluate_periodicty: true
34
+
35
+ resume: false
36
+ resume_config: ./WavTokenizer/configs/wavtokenizer_smalldata_frame75_3s_nq1_code16384_dim512_kmeans800_attn.yaml
37
+ resume_model: ./WavTokenizer/result/train/wavtokenizer_smalldata_frame75_3s_nq1_code16384_dim512_kmeans800_attn/vocos_checkpoint_epoch=35_step=187848_val_loss=4.1611.ckpt
38
+
39
+ feature_extractor:
40
+ class_path: decoder.feature_extractors.EncodecFeatures
41
+ init_args:
42
+ encodec_model: encodec_24khz
43
+ bandwidths: [6.6, 6.6, 6.6, 6.6]
44
+ train_codebooks: true
45
+ num_quantizers: 1
46
+ dowmsamples: [8, 5, 4, 2]
47
+ vq_bins: 4096
48
+ vq_kmeans: 200
49
+
50
+ backbone:
51
+ class_path: decoder.models.VocosBackbone
52
+ init_args:
53
+ input_channels: 512
54
+ dim: 768
55
+ intermediate_dim: 2304
56
+ num_layers: 12
57
+ adanorm_num_embeddings: 4
58
+
59
+ head:
60
+ class_path: decoder.heads.ISTFTHead
61
+ init_args:
62
+ dim: 768
63
+ n_fft: 1280
64
+ hop_length: 320
65
+ padding: same
66
+
67
+ trainer:
68
+ logger:
69
+ class_path: pytorch_lightning.loggers.TensorBoardLogger
70
+ init_args:
71
+ save_dir: ./WavTokenizer/result/train/wavtokenizer_smalldata_frame75_3s_nq1_code4096_dim512_kmeans200_attn/
72
+ callbacks:
73
+ - class_path: pytorch_lightning.callbacks.LearningRateMonitor
74
+ - class_path: pytorch_lightning.callbacks.ModelSummary
75
+ init_args:
76
+ max_depth: 2
77
+ - class_path: pytorch_lightning.callbacks.ModelCheckpoint
78
+ init_args:
79
+ monitor: val_loss
80
+ filename: vocos_checkpoint_{epoch}_{step}_{val_loss:.4f}
81
+ save_top_k: 10
82
+ save_last: true
83
+ - class_path: decoder.helpers.GradNormCallback
84
+
85
+ # Lightning calculates max_steps across all optimizer steps (rather than number of batches)
86
+ # This equals 10M steps for the generator and 10M for the discriminator
87
+ max_steps: 20000000
88
+ # You might want to limit val batches when evaluating all the metrics, as they are time-consuming
89
+ limit_val_batches: 100
90
+ accelerator: gpu
91
+ strategy: ddp
92
+ devices: [0,1,2,3,4,5,6,7]
93
+ log_every_n_steps: 1000