Model/w2v2-vits/1026_epochs.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3f61e221e36af355dba89f20f70215d3a93dbe9fd497172ce46c950f757ccce0
3
- size 159675849
 
 
 
 
Model/w2v2-vits/config.json DELETED
@@ -1,36 +0,0 @@
1
- {
2
- "train": {
3
- "segment_size": 8192
4
- },
5
- "data": {
6
- "text_cleaners":["zh_ja_mixture_cleaners"],
7
- "max_wav_value": 32768.0,
8
- "sampling_rate": 22050,
9
- "filter_length": 1024,
10
- "hop_length": 256,
11
- "win_length": 1024,
12
- "add_blank": true,
13
- "n_speakers": 5,
14
- "emotion_embedding": true
15
- },
16
- "model": {
17
- "inter_channels": 192,
18
- "hidden_channels": 192,
19
- "filter_channels": 768,
20
- "n_heads": 2,
21
- "n_layers": 6,
22
- "kernel_size": 3,
23
- "p_dropout": 0.1,
24
- "resblock": "1",
25
- "resblock_kernel_sizes": [3,7,11],
26
- "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
27
- "upsample_rates": [8,8,2,2],
28
- "upsample_initial_channel": 512,
29
- "upsample_kernel_sizes": [16,16,4,4],
30
- "n_layers_q": 3,
31
- "use_spectral_norm": false,
32
- "gin_channels": 256
33
- },
34
- "speakers": ["\u7dbe\u5730\u5be7\u3005", "\u5728\u539f\u4e03\u6d77", "\u5c0f\u8338", "\u5510\u4e50\u541f"],
35
- "symbols": ["_", ",", ".", "!", "?", "-", "~", "\u2026", "A", "E", "I", "N", "O", "Q", "U", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "y", "z", "\u0283", "\u02a7", "\u02a6", "\u026f", "\u0279", "\u0259", "\u0265", "\u207c", "\u02b0", "`", "\u2192", "\u2193", "\u2191", " "]
36
- }