File size: 2,359 Bytes
4a3768d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
audio:
  chunk_size: 131584
  dim_f: 1024
  dim_t: 256
  hop_length: 512
  n_fft: 2048
  num_channels: 2
  sample_rate: 44100
  min_mean_abs: 0.001

model:
  dim: 384
  depth: 12
  stereo: true
  num_stems: 1
  time_transformer_depth: 1
  freq_transformer_depth: 1
  linear_transformer_depth: 0
  freqs_per_bands: !!python/tuple
    - 2
    - 2
    - 2
    - 2
    - 2
    - 2
    - 2
    - 2
    - 2
    - 2
    - 2
    - 2
    - 2
    - 2
    - 2
    - 2
    - 2
    - 2
    - 2
    - 2
    - 2
    - 2
    - 2
    - 2
    - 4
    - 4
    - 4
    - 4
    - 4
    - 4
    - 4
    - 4
    - 4
    - 4
    - 4
    - 4
    - 12
    - 12
    - 12
    - 12
    - 12
    - 12
    - 12
    - 12
    - 24
    - 24
    - 24
    - 24
    - 24
    - 24
    - 24
    - 24
    - 48
    - 48
    - 48
    - 48
    - 48
    - 48
    - 48
    - 48
    - 128
    - 129
  dim_head: 64
  heads: 8
  attn_dropout: 0.1
  ff_dropout: 0.1
  flash_attn: true
  dim_freqs_in: 1025
  stft_n_fft: 2048
  stft_hop_length: 512
  stft_win_length: 2048
  stft_normalized: false
  mask_estimator_depth: 2
  multi_stft_resolution_loss_weight: 1.0
  multi_stft_resolutions_window_sizes: !!python/tuple
  - 4096
  - 2048
  - 1024
  - 512
  - 256
  multi_stft_hop_size: 147
  multi_stft_normalized: False

training:
  batch_size: 4
  gradient_accumulation_steps: 1
  grad_clip: 0
  instruments:
  - Vocals
  - Instrumental
  lr: 5.0e-05
  patience: 2
  reduce_factor: 0.95
  target_instrument: Vocals
  num_epochs: 1000
  num_steps: 1000
  q: 0.95
  coarse_loss_clip: true
  ema_momentum: 0.999
  optimizer: adam
  other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
  use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true

augmentations:
  enable: true # enable or disable all augmentations (to fast disable if needed)
  loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
  loudness_min: 0.5
  loudness_max: 1.5
  mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
  mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
    - 0.2
    - 0.02
  mixup_loudness_min: 0.5
  mixup_loudness_max: 1.5

inference:
  batch_size: 1
  dim_t: 512
  num_overlap: 4