train:
  seed: 1234
  epochs: 20
  batch_size: 8
  save_every_n_epoch: 1
  precision: 16-mixed
  gradient_clip: 1.0
optimizer:
  lr: 0.01
  lr_init: 0.00001
  lr_end: 0.0001
  warmup_steps: 2000
  decay_steps: 40000
data:
  max_eval_sample: 8
  max_sec: 54
  num_workers: 4
  pad_val: 1024 # same with EOS in model
model:
  vocab_size: 1025
  phoneme_vocab_size: 512
  embedding_dim: 512
  hidden_dim: 512
  head: 16
  linear_units: 2048
  n_layer: 24
  dropout: 0
  EOS: 1024
  random_bert: 0
inference:
  top_k: 5