Xenova
/

speecht5_tts

Text-to-Speech

Transformers.js

ONNX

speecht5

text-to-audio

Model card Files Files and versions Community

jrsimuix committed on Nov 19, 2024

Commit

7c6c321

verified ·

1 Parent(s): fda0ee8

Upload 2 files

Browse files

Files changed (2) hide show

config.json +14 -94
preprocessor_config.json +7 -19

config.json CHANGED Viewed

@@ -1,94 +1,14 @@
-{
-  "_name_or_path": "microsoft/speecht5_tts",
-  "activation_dropout": 0.1,
-  "apply_spec_augment": true,
-  "architectures": [
-    "SpeechT5ForTextToSpeech"
-  ],
-  "attention_dropout": 0.1,
-  "bos_token_id": 0,
-  "conv_bias": false,
-  "conv_dim": [
-    512,
-    512,
-    512,
-    512,
-    512,
-    512,
-    512
-  ],
-  "conv_kernel": [
-    10,
-    3,
-    3,
-    3,
-    3,
-    2,
-    2
-  ],
-  "conv_stride": [
-    5,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2
-  ],
-  "decoder_attention_heads": 12,
-  "decoder_ffn_dim": 3072,
-  "decoder_layerdrop": 0.1,
-  "decoder_layers": 6,
-  "decoder_start_token_id": 2,
-  "encoder_attention_heads": 12,
-  "encoder_ffn_dim": 3072,
-  "encoder_layerdrop": 0.1,
-  "encoder_layers": 12,
-  "encoder_max_relative_position": 160,
-  "eos_token_id": 2,
-  "feat_extract_activation": "gelu",
-  "feat_extract_norm": "group",
-  "feat_proj_dropout": 0.0,
-  "guided_attention_loss_num_heads": 2,
-  "guided_attention_loss_scale": 10.0,
-  "guided_attention_loss_sigma": 0.4,
-  "hidden_act": "gelu",
-  "hidden_dropout": 0.1,
-  "hidden_size": 768,
-  "initializer_range": 0.02,
-  "is_encoder_decoder": true,
-  "layer_norm_eps": 1e-05,
-  "mask_feature_length": 10,
-  "mask_feature_min_masks": 0,
-  "mask_feature_prob": 0.0,
-  "mask_time_length": 10,
-  "mask_time_min_masks": 2,
-  "mask_time_prob": 0.05,
-  "max_length": 1876,
-  "max_speech_positions": 1876,
-  "max_text_positions": 600,
-  "model_type": "speecht5",
-  "num_conv_pos_embedding_groups": 16,
-  "num_conv_pos_embeddings": 128,
-  "num_feat_extract_layers": 7,
-  "num_mel_bins": 80,
-  "pad_token_id": 1,
-  "positional_dropout": 0.1,
-  "reduction_factor": 2,
-  "scale_embedding": false,
-  "speaker_embedding_dim": 512,
-  "speech_decoder_postnet_dropout": 0.5,
-  "speech_decoder_postnet_kernel": 5,
-  "speech_decoder_postnet_layers": 5,
-  "speech_decoder_postnet_units": 256,
-  "speech_decoder_prenet_dropout": 0.5,
-  "speech_decoder_prenet_layers": 2,
-  "speech_decoder_prenet_units": 256,
-  "transformers_version": "4.33.2",
-  "transformers.js_config": {
-    "dtype": "fp32"
-  },
-  "use_cache": true,
-  "use_guided_attention_loss": true,
-  "vocab_size": 81
-}

+{
+  "model_type": "vit",
+  "hidden_size": 768,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "intermediate_size": 3072,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "attention_probs_dropout_prob": 0.1,
+  "image_size": 128,
+  "patch_size": 16,
+  "num_channels": 3,
+  "num_labels": 2
+}

preprocessor_config.json CHANGED Viewed

@@ -1,19 +1,7 @@
-{
-  "do_normalize": false,
-  "feature_extractor_type": "SpeechT5FeatureExtractor",
-  "feature_size": 1,
-  "fmax": 7600,
-  "fmin": 80,
-  "frame_signal_scale": 1.0,
-  "hop_length": 16,
-  "mel_floor": 1e-10,
-  "num_mel_bins": 80,
-  "padding_side": "right",
-  "padding_value": 0.0,
-  "processor_class": "SpeechT5Processor",
-  "reduction_factor": 2,
-  "return_attention_mask": true,
-  "sampling_rate": 16000,
-  "win_function": "hann_window",
-  "win_length": 64
-}

+{
+  "feature_extractor_type": "ViTFeatureExtractor",
+  "image_mean": [0.5, 0.5, 0.5],
+  "image_std": [0.5, 0.5, 0.5],
+  "size": 128
+}