# Space: Tacatron2-TTS (status: Running)
# File: pretrained_models/GraphemeToPhoneme-9b27d6eb840bf95c5aedf15ae8ed1172/hyperparams.yaml
# Generated 2022-07-09 from:
# /notebooks/speechbrain/recipes/LibriSpeech/G2P/hparams/hparams_g2p_rnn.yaml
# yamllint disable
# ################################
# Model: LSTM (encoder) + GRU (decoder) (tokenized)
# Authors:
#   Loren Lugosch & Mirco Ravanelli 2020
#   Artem Ploujnikov 2021
# ################################
# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1234
# Calls torch.manual_seed with <seed>.
__set_seed: !apply:torch.manual_seed [!ref <seed>]
# Tokenizers
# Grapheme (char_*) settings
char_tokenize: False
char_token_type: unigram  # ["unigram", "bpe", "char"]
char_token_output: 512
char_token_wordwise: True
# Phoneme (phn_*) settings
phn_tokenize: False
phn_token_type: unigram  # ["unigram", "bpe", "char"]
phn_token_output: 512  # index(blank/eos/bos/unk) = 0
phn_token_wordwise: True
character_coverage: 1.0
# Sizes selected by <output_neurons> / <enc_num_embeddings> when the
# corresponding *_tokenize switch is False
phonemes_count: 43
graphemes_count: 31
phonemes_enable_space: True
# CTC and homograph settings
ctc_weight: 0.5
ctc_window_size: 0
homograph_loss_weight: 2.0
# Model parameters
# <phn_token_output> + 1 when <phn_tokenize> is True, otherwise <phonemes_count>
output_neurons: !apply:speechbrain.utils.hparams.choice
  value: !ref <phn_tokenize>
  choices:
    True: !ref <phn_token_output> + 1
    False: !ref <phonemes_count>
# <char_token_output> + 1 when <char_tokenize> is True, otherwise <graphemes_count>
enc_num_embeddings: !apply:speechbrain.utils.hparams.choice
  value: !ref <char_tokenize>
  choices:
    True: !ref <char_token_output> + 1
    False: !ref <graphemes_count>
# Encoder sizes
enc_dropout: 0.5
enc_neurons: 512
enc_num_layers: 4
# Decoder sizes
dec_dropout: 0.5
dec_neurons: 512
dec_att_neurons: 256
dec_num_layers: 4
embedding_dim: 512
# Determines whether to use BOS (beginning-of-sequence) or EOS (end-of-sequence) tokens
# Available modes:
#   raw: no BOS/EOS tokens are added
#   bos: a beginning-of-sequence token is added
#   eos: an end-of-sequence token is added
grapheme_sequence_mode: bos
phoneme_sequence_mode: bos
# Special Token information
bos_index: 0
eos_index: 1
# blank and unk share index 2
blank_index: 2
unk_index: 2
token_space_index: 512
# Language Model
lm_emb_dim: 256  # dimension of the embeddings
lm_rnn_size: 512  # dimension of hidden layers
lm_layers: 2  # number of hidden layers
lm_output_neurons: 43  # literal; same value as <phonemes_count>
# Beam Searcher
# Scalar settings referenced by the scorer and beam searcher objects
beam_search_min_decode_ratio: 0
beam_search_max_decode_ratio: 1.0
beam_search_beam_size: 16
beam_search_beam_size_valid: 16
beam_search_eos_threshold: 10.0
beam_search_using_max_attn_shift: false
beam_search_max_attn_shift: 10
beam_search_coverage_penalty: 5.0
beam_search_lm_weight: 0.5
beam_search_ctc_weight_decode: 0.4
beam_search_temperature: 1.25
beam_search_temperature_lm: 1.0
# Word embeddings
use_word_emb: true
word_emb_model: bert-base-uncased
word_emb_dim: 768
word_emb_enc_dim: 256
word_emb_norm_type: batch
# Grapheme inventory: A-Z, apostrophe and space.
# 'N' and 'Y' are quoted so that no YAML 1.1 loader can read them as booleans;
# the loaded values remain the strings "N" and "Y".
graphemes:
  - A
  - B
  - C
  - D
  - E
  - F
  - G
  - H
  - I
  - J
  - K
  - L
  - M
  - 'N'
  - O
  - P
  - Q
  - R
  - S
  - T
  - U
  - V
  - W
  - X
  - 'Y'
  - Z
  - "'"
  - ' '
# Phoneme inventory, ending with a space token.
# 'N' and 'Y' are quoted so that no YAML 1.1 loader can read them as booleans;
# the loaded values remain the strings "N" and "Y".
phonemes:
  - AA
  - AE
  - AH
  - AO
  - AW
  - AY
  - B
  - CH
  - D
  - DH
  - EH
  - ER
  - EY
  - F
  - G
  - HH
  - IH
  - IY
  - JH
  - K
  - L
  - M
  - 'N'
  - NG
  - OW
  - OY
  - P
  - R
  - S
  - SH
  - T
  - TH
  - UH
  - UW
  - V
  - W
  - 'Y'
  - Z
  - ZH
  - ' '
# Encoder input width, computed by input_dim from the embedding settings
enc_input_dim: !apply:speechbrain.lobes.models.g2p.model.input_dim
  use_word_emb: !ref <use_word_emb>
  word_emb_enc_dim: !ref <word_emb_enc_dim>
  embedding_dim: !ref <embedding_dim>
# Map built from the phoneme list by build_token_char_map
phn_char_map: !apply:speechbrain.lobes.models.g2p.dataio.build_token_char_map
  tokens: !ref <phonemes>
# <phn_char_map> passed through flip_map
char_phn_map: !apply:speechbrain.lobes.models.g2p.dataio.flip_map
  map_dict: !ref <phn_char_map>
# Encoder: bidirectional LSTM
enc: !new:speechbrain.nnet.RNN.LSTM
  input_shape: [null, null, !ref <enc_input_dim>]
  bidirectional: True
  hidden_size: !ref <enc_neurons>
  num_layers: !ref <enc_num_layers>
  dropout: !ref <enc_dropout>
# Linear layer from <dec_neurons> inputs to <output_neurons> outputs, no bias
lin: !new:speechbrain.nnet.linear.Linear
  input_size: !ref <dec_neurons>
  n_neurons: !ref <output_neurons>
  bias: false
# Linear layer for CTC; input is 2 * <enc_neurons>, matching the
# bidirectional encoder configured in <enc>
ctc_lin: !new:speechbrain.nnet.linear.Linear
  input_size: !ref 2 * <enc_neurons>
  n_neurons: !ref <output_neurons>
# Embedding table with <enc_num_embeddings> entries
encoder_emb: !new:speechbrain.nnet.embedding.Embedding
  num_embeddings: !ref <enc_num_embeddings>
  embedding_dim: !ref <embedding_dim>
# Embedding table with <output_neurons> entries
emb: !new:speechbrain.nnet.embedding.Embedding
  num_embeddings: !ref <output_neurons>
  embedding_dim: !ref <embedding_dim>
# Decoder: GRU with content attention; enc_dim is <enc_neurons> * 2
dec: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
  enc_dim: !ref <enc_neurons> * 2
  input_size: !ref <embedding_dim>
  rnn_type: gru
  attn_type: content
  dropout: !ref <dec_dropout>
  hidden_size: !ref <dec_neurons>
  attn_dim: !ref <dec_att_neurons>
  num_layers: !ref <dec_num_layers>
# Word-embedding encoder; norm_type references <word_emb_norm_type>
# (resolves to "batch", the value previously written inline here)
word_emb_enc: !new:speechbrain.lobes.models.g2p.model.WordEmbeddingEncoder
  word_emb_dim: !ref <word_emb_dim>
  word_emb_enc_dim: !ref <word_emb_enc_dim>
  norm_type: !ref <word_emb_norm_type>
# Word-embedding model wrapped in lazy_init; model references <word_emb_model>
# (resolves to "bert-base-uncased", the value previously written inline here)
word_emb: !apply:speechbrain.lobes.models.g2p.dataio.lazy_init
  init: !name:speechbrain.wordemb.transformer.TransformerWordEmbeddings
    model: !ref <word_emb_model>
# Softmax with apply_log enabled
log_softmax: !new:speechbrain.nnet.activations.Softmax
  apply_log: true
# Top-level seq2seq model assembled from the components defined in this file
model: !new:speechbrain.lobes.models.g2p.model.AttentionSeq2Seq
  enc: !ref <enc>
  encoder_emb: !ref <encoder_emb>
  emb: !ref <emb>
  dec: !ref <dec>
  lin: !ref <lin>
  out: !ref <log_softmax>
  use_word_emb: !ref <use_word_emb>
  word_emb_enc: !ref <word_emb_enc>
# Name -> object mapping of the model and its components
modules:
  model: !ref <model>
  enc: !ref <enc>
  encoder_emb: !ref <encoder_emb>
  emb: !ref <emb>
  dec: !ref <dec>
  lin: !ref <lin>
  ctc_lin: !ref <ctc_lin>
  out: !ref <log_softmax>
  word_emb: !ref <word_emb>
  word_emb_enc: !ref <word_emb_enc>
# RNN language model configured from the lm_* settings
lm_model: !new:speechbrain.lobes.models.RNNLM.RNNLM
  embedding_dim: !ref <lm_emb_dim>
  rnn_layers: !ref <lm_layers>
  rnn_neurons: !ref <lm_rnn_size>
  output_neurons: !ref <lm_output_neurons>
  return_hidden: True
# CTC scorer built on <ctc_lin>
ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer
  eos_index: !ref <eos_index>
  blank_index: !ref <blank_index>
  ctc_fc: !ref <ctc_lin>
  ctc_window_size: !ref <ctc_window_size>
# Coverage scorer sized to <output_neurons>
coverage_scorer: !new:speechbrain.decoders.scorer.CoverageScorer
  vocab_size: !ref <output_neurons>
# Combines both scorers with per-scorer weights
scorer: !new:speechbrain.decoders.scorer.ScorerBuilder
  full_scorers: [!ref <coverage_scorer>, !ref <ctc_scorer>]
  weights:
    coverage: !ref <beam_search_coverage_penalty>
    # NOTE(review): <beam_search_ctc_weight_decode> is defined but not
    # referenced in the visible part of this file; <ctc_weight> is used
    # here instead. Confirm this is intended before changing it.
    ctc: !ref <ctc_weight>
# Beam searcher over <emb>, <dec> and <lin>, scored by <scorer>
beam_searcher: !new:speechbrain.decoders.S2SRNNBeamSearcher
  embedding: !ref <emb>
  decoder: !ref <dec>
  linear: !ref <lin>
  bos_index: !ref <bos_index>
  eos_index: !ref <eos_index>
  min_decode_ratio: !ref <beam_search_min_decode_ratio>
  max_decode_ratio: !ref <beam_search_max_decode_ratio>
  beam_size: !ref <beam_search_beam_size>
  eos_threshold: !ref <beam_search_eos_threshold>
  using_max_attn_shift: !ref <beam_search_using_max_attn_shift>
  max_attn_shift: !ref <beam_search_max_attn_shift>
  temperature: !ref <beam_search_temperature>
  scorer: !ref <scorer>
# Second beam searcher with the same components as <beam_searcher>.
# beam_size references the dedicated <beam_search_beam_size_valid> setting
# (16, the same value as <beam_search_beam_size>), so that setting takes effect.
beam_searcher_valid: !new:speechbrain.decoders.S2SRNNBeamSearcher
  embedding: !ref <emb>
  decoder: !ref <dec>
  linear: !ref <lin>
  bos_index: !ref <bos_index>
  eos_index: !ref <eos_index>
  min_decode_ratio: !ref <beam_search_min_decode_ratio>
  max_decode_ratio: !ref <beam_search_max_decode_ratio>
  beam_size: !ref <beam_search_beam_size_valid>
  eos_threshold: !ref <beam_search_eos_threshold>
  using_max_attn_shift: !ref <beam_search_using_max_attn_shift>
  max_attn_shift: !ref <beam_search_max_attn_shift>
  temperature: !ref <beam_search_temperature>
  scorer: !ref <scorer>
# Constructed with no arguments
homograph_extractor: !new:speechbrain.lobes.models.g2p.homograph.SubsequenceExtractor
model_output_keys:
  - p_seq
  - char_lens
  - encoder_out
# Text encoders, constructed with no arguments. The &id027 / &id024 anchors
# are kept as generated; no alias to them appears in the visible file.
grapheme_encoder: &id027 !new:speechbrain.dataio.encoder.TextEncoder
phoneme_encoder: &id024 !new:speechbrain.dataio.encoder.TextEncoder
# SentencePiece tokenizer for graphemes, wrapped in lazy_init.
# All settings below are keyword arguments of the !name under `init`.
grapheme_tokenizer: !apply:speechbrain.lobes.models.g2p.dataio.lazy_init
  init: !name:speechbrain.tokenizers.SentencePiece.SentencePiece
    model_dir: grapheme_tokenizer
    bos_id: !ref <bos_index>
    eos_id: !ref <eos_index>
    unk_id: !ref <unk_index>
    vocab_size: !ref <char_token_output>
    annotation_train: null
    annotation_read: char
    model_type: !ref <char_token_type>  # ["unigram", "bpe", "char"]
    character_coverage: !ref <character_coverage>
    annotation_format: json
    text_file: grapheme_annotations.txt
# SentencePiece tokenizer for phonemes, wrapped in lazy_init.
# All settings below are keyword arguments of the !name under `init`.
phoneme_tokenizer: !apply:speechbrain.lobes.models.g2p.dataio.lazy_init
  init: !name:speechbrain.tokenizers.SentencePiece.SentencePiece
    model_dir: phoneme_tokenizer
    bos_id: !ref <bos_index>
    eos_id: !ref <eos_index>
    unk_id: !ref <unk_index>
    vocab_size: !ref <phn_token_output>
    annotation_train: null
    annotation_read: phn
    model_type: !ref <phn_token_type>  # ["unigram", "bpe", "char"]
    character_coverage: !ref <character_coverage>
    annotation_format: json
    text_file: null
# Decoder variant that goes through <phoneme_tokenizer> and <char_phn_map>
out_phoneme_decoder_tok: !apply:speechbrain.lobes.models.g2p.dataio.char_map_detokenize
  tokenizer: !ref <phoneme_tokenizer>
  char_map: !ref <char_phn_map>
  token_space_index: !ref <token_space_index>
  wordwise: !ref <phn_token_wordwise>
# Decoder variant that uses <phoneme_encoder> directly
out_phoneme_decoder_raw: !name:speechbrain.lobes.models.g2p.dataio.text_decode
  encoder: !ref <phoneme_encoder>
# Picks one of the two variants above. The selector follows <phn_tokenize>
# (currently False, the same value that was previously written inline),
# matching how <output_neurons> is selected.
out_phoneme_decoder: !apply:speechbrain.utils.hparams.choice
  value: !ref <phn_tokenize>
  choices:
    True: !ref <out_phoneme_decoder_tok>
    False: !ref <out_phoneme_decoder_raw>
# Encoding pipeline definition.
#   init:  entries whose `func` is a !name with its keyword arguments nested
#          beneath it
#   steps: each entry has a `func` (a !name with nested keyword arguments)
#          plus the keys it `takes` and the keys it `provides`
encode_pipeline:
  batch: false
  use_padded_data: true
  output_keys:
    - grapheme_list
    - grapheme_encoded_list
    - grapheme_encoded
    - word_emb
  init:
    - func: !name:speechbrain.lobes.models.g2p.dataio.enable_eos_bos
        encoder: !ref <grapheme_encoder>
        tokens: !ref <graphemes>
        bos_index: !ref <bos_index>
        eos_index: !ref <eos_index>
    - func: !name:speechbrain.lobes.models.g2p.dataio.enable_eos_bos
        encoder: !ref <phoneme_encoder>
        tokens: !ref <phonemes>
        bos_index: !ref <bos_index>
        eos_index: !ref <eos_index>
  steps:
    - func: !name:speechbrain.lobes.models.g2p.dataio.clean_pipeline
        graphemes: !ref <graphemes>
      takes: txt
      provides: txt_cleaned
    - func: !name:speechbrain.lobes.models.g2p.dataio.grapheme_pipeline
        grapheme_encoder: !ref <grapheme_encoder>
      takes: txt_cleaned
      provides:
        - grapheme_list
        - grapheme_encoded_list
        - grapheme_encoded_raw
    - func: !name:speechbrain.lobes.models.g2p.dataio.add_bos_eos
        encoder: !ref <grapheme_encoder>
      takes: grapheme_encoded_list
      provides:
        - grapheme_encoded
        - grapheme_len
        - grapheme_encoded_eos
        - grapheme_len_eos
    - func: !name:speechbrain.lobes.models.g2p.dataio.word_emb_pipeline
        word_emb: !ref <word_emb>
        grapheme_encoder: !ref <grapheme_encoder>
        use_word_emb: !ref <use_word_emb>
      takes:
        - txt
        - grapheme_encoded
        - grapheme_len
      provides: word_emb
# Decoding pipeline definition: beam search, then phoneme decoding.
# Each step has a `func` plus the keys it `takes` and `provides`.
decode_pipeline:
  batch: true
  output_keys:
    - phonemes
  steps:
    - func: !name:speechbrain.lobes.models.g2p.dataio.beam_search_pipeline
        beam_searcher: !ref <beam_searcher>
      takes:
        - char_lens
        - encoder_out
      provides:
        - hyps
        - scores
    # The selector follows <phn_tokenize> (currently False, the same value
    # that was previously written inline), matching <output_neurons>.
    - func: !apply:speechbrain.utils.hparams.choice
        value: !ref <phn_tokenize>
        choices:
          True: !apply:speechbrain.lobes.models.g2p.dataio.char_map_detokenize
            tokenizer: !ref <phoneme_tokenizer>
            char_map: !ref <char_phn_map>
            token_space_index: !ref <token_space_index>
            wordwise: !ref <phn_token_wordwise>
          False: !name:speechbrain.lobes.models.g2p.dataio.phoneme_decoder_pipeline
            phoneme_encoder: !ref <phoneme_encoder>
      takes:
        - hyps
      provides:
        - phonemes
# Pretrainer with <model> and <ctc_lin> registered as loadables
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
  loadables:
    model: !ref <model>
    ctc_lin: !ref <ctc_lin>