mrfakename commited on
Commit
494c78a
0 Parent(s):

Super-squash branch 'main' using huggingface_hub

Browse files
Files changed (5) hide show
  1. .gitattributes +35 -0
  2. README.md +67 -0
  3. config.json +163 -0
  4. preprocessor_config.json +14 -0
  5. pytorch_model.bin +3 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license:
3
+ - mit
4
+ - apache-2.0
5
+ language:
6
+ - en
7
+ library_name: transformers
8
+ pipeline_tag: audio-classification
9
+ tags:
10
+ - audio
11
+ - tts
12
+ ---
13
+
14
+ # StyleTTS 2 Detector
15
+
16
+ This is a model trained for audio classification on a dataset of almost 10,000 samples of human and StyleTTS 2-generated audio clips. The model is based on [Whisper](https://huggingface.co/openai/whisper-base).
17
+
18
+ **NOTE: This model is not affiliated with the author(s) of StyleTTS 2 in any way.**
19
+
20
+ **NOTE: This model only aims to detect audio generated by StyleTTS 2 and DOES NOT work for audio generated by other TTS models or fine-tunes. I'm aiming to create a universal classifier in the future.**
21
+
22
+ ## Online Demo
23
+
24
+ An online demo is available [here](https://huggingface.co/spaces/mrfakename/styletts2-detector).
25
+
26
+ ## Usage
27
+
28
+ **IMPORTANT:** Please read the license, disclaimer, and model card before using the model. You may not use the model if you do not agree to the license and disclaimer.
29
+
30
+ ```python
31
+ from transformers import pipeline
32
+ import torch
33
+
34
+ pipe = pipeline('audio-classification', model='mrfakename/styletts2-detector', device='cuda' if torch.cuda.is_available() else 'cpu')
35
+
36
+ result = pipe('audio.wav')
37
+
38
+ print(result)
39
+ ```
40
+
41
+ ## Tags
42
+
43
+ The audio will be classified as either `real` or `fake` (human-spoken or StyleTTS 2-generated, respectively).
44
+
45
+ ## Disclaimer
46
+
47
+ The author(s) of this model cannot guarantee complete accuracy. False positives or negatives may occur.
48
+
49
+ Usage of this model should not replace other precautions, such as invisible watermarking or audio watermarking.
50
+
51
+ This model has been trained on outputs from the StyleTTS 2 base model, not fine-tunes. The model may not identify fine-tunes properly.
52
+
53
+ The author(s) of this model disclaim all liability related to or in connection with the usage of this model.
54
+
55
+ ## Training Data
56
+
57
+ This model was trained on the following data:
58
+
59
+ * A dataset of real human audio and synthetic audio generated by StyleTTS 2
60
+ * A subset of the LibriTTS-R dataset, which is licensed under the CC-BY 4.0 license and includes public domain audio.
61
+ * A custom synthetic dataset derived from a subset of the LibriTTS-R dataset and synthesized using StyleTTS 2. Text from the LibriTTS-R dataset was used as prompts for StyleTTS 2. The StyleTTS 2 model used was trained on the LibriTTS dataset.
62
+
63
+ ## License
64
+
65
+ You may use this model under either the **MIT** or **Apache 2.0** license, at your choice, so long as you include the disclaimer above in all redistributions, and require all future redistributions to include the disclaimer.
66
+
67
+ This model was trained partially on data from the [LibriTTS dataset](http://www.openslr.org/60/), the [LibriTTS-R dataset](https://google.github.io/df-conformer/librittsr/), and/or data generated using [StyleTTS 2](https://arxiv.org/abs/2306.07691) (which was trained on the LibriTTS dataset).
config.json ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "openai/whisper-base",
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "gelu",
5
+ "apply_spec_augment": false,
6
+ "architectures": [
7
+ "WhisperForAudioClassification"
8
+ ],
9
+ "attention_dropout": 0.0,
10
+ "begin_suppress_tokens": [
11
+ 220,
12
+ 50257
13
+ ],
14
+ "bos_token_id": 50257,
15
+ "classifier_proj_size": 256,
16
+ "d_model": 512,
17
+ "decoder_attention_heads": 8,
18
+ "decoder_ffn_dim": 2048,
19
+ "decoder_layerdrop": 0.0,
20
+ "decoder_layers": 6,
21
+ "decoder_start_token_id": 50258,
22
+ "dropout": 0.0,
23
+ "encoder_attention_heads": 8,
24
+ "encoder_ffn_dim": 2048,
25
+ "encoder_layerdrop": 0.0,
26
+ "encoder_layers": 6,
27
+ "eos_token_id": 50257,
28
+ "finetuning_task": "audio-classification",
29
+ "forced_decoder_ids": [
30
+ [
31
+ 1,
32
+ 50259
33
+ ],
34
+ [
35
+ 2,
36
+ 50359
37
+ ],
38
+ [
39
+ 3,
40
+ 50363
41
+ ]
42
+ ],
43
+ "id2label": {
44
+ "0": "real",
45
+ "1": "fake"
46
+ },
47
+ "init_std": 0.02,
48
+ "is_encoder_decoder": true,
49
+ "label2id": {
50
+ "fake": "1",
51
+ "real": "0"
52
+ },
53
+ "mask_feature_length": 10,
54
+ "mask_feature_min_masks": 0,
55
+ "mask_feature_prob": 0.0,
56
+ "mask_time_length": 10,
57
+ "mask_time_min_masks": 2,
58
+ "mask_time_prob": 0.05,
59
+ "max_length": 448,
60
+ "max_source_positions": 1500,
61
+ "max_target_positions": 448,
62
+ "median_filter_width": 7,
63
+ "model_type": "whisper",
64
+ "num_hidden_layers": 6,
65
+ "num_mel_bins": 80,
66
+ "pad_token_id": 50257,
67
+ "scale_embedding": false,
68
+ "suppress_tokens": [
69
+ 1,
70
+ 2,
71
+ 7,
72
+ 8,
73
+ 9,
74
+ 10,
75
+ 14,
76
+ 25,
77
+ 26,
78
+ 27,
79
+ 28,
80
+ 29,
81
+ 31,
82
+ 58,
83
+ 59,
84
+ 60,
85
+ 61,
86
+ 62,
87
+ 63,
88
+ 90,
89
+ 91,
90
+ 92,
91
+ 93,
92
+ 359,
93
+ 503,
94
+ 522,
95
+ 542,
96
+ 873,
97
+ 893,
98
+ 902,
99
+ 918,
100
+ 922,
101
+ 931,
102
+ 1350,
103
+ 1853,
104
+ 1982,
105
+ 2460,
106
+ 2627,
107
+ 3246,
108
+ 3253,
109
+ 3268,
110
+ 3536,
111
+ 3846,
112
+ 3961,
113
+ 4183,
114
+ 4667,
115
+ 6585,
116
+ 6647,
117
+ 7273,
118
+ 9061,
119
+ 9383,
120
+ 10428,
121
+ 10929,
122
+ 11938,
123
+ 12033,
124
+ 12331,
125
+ 12562,
126
+ 13793,
127
+ 14157,
128
+ 14635,
129
+ 15265,
130
+ 15618,
131
+ 16553,
132
+ 16604,
133
+ 18362,
134
+ 18956,
135
+ 20075,
136
+ 21675,
137
+ 22520,
138
+ 26130,
139
+ 26161,
140
+ 26435,
141
+ 28279,
142
+ 29464,
143
+ 31650,
144
+ 32302,
145
+ 32470,
146
+ 36865,
147
+ 42863,
148
+ 47425,
149
+ 49870,
150
+ 50254,
151
+ 50258,
152
+ 50358,
153
+ 50359,
154
+ 50360,
155
+ 50361,
156
+ 50362
157
+ ],
158
+ "torch_dtype": "float32",
159
+ "transformers_version": "4.31.0",
160
+ "use_cache": true,
161
+ "use_weighted_layer_sum": false,
162
+ "vocab_size": 51865
163
+ }
preprocessor_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "chunk_length": 30,
3
+ "feature_extractor_type": "WhisperFeatureExtractor",
4
+ "feature_size": 80,
5
+ "hop_length": 160,
6
+ "n_fft": 400,
7
+ "n_samples": 480000,
8
+ "nb_max_frames": 3000,
9
+ "padding_side": "right",
10
+ "padding_value": 0.0,
11
+ "processor_class": "WhisperProcessor",
12
+ "return_attention_mask": false,
13
+ "sampling_rate": 16000
14
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6371663d68bb05bf9829ed9b799ae100f211c15ed7346322b9200f5751bd9bff
3
+ size 82923230