---
# Training configuration for a Lao→Vietnamese transformer NMT model.
experiment_name: 'runs/transformer_big'

# Data, tokenization, and evaluation corpora.
dataset:
  src_lang: 'lo'
  src_tokenizer: 'BPE'
  src_max_seq_len: 400
  tgt_lang: 'vi'
  tgt_tokenizer: 'WordLevel'
  tgt_max_seq_len: 350
  train_dataset: 'train_clean.dat'
  validate_dataset: 'dev_clean.dat'
  # "{0}" is filled with the language code at runtime.
  tokenizer_file: "tokenizer_{0}.json"
  bleu_dataset: 'test2023'

# Model architecture and checkpointing.
model:  # 42688527 parameters
  d_model: 512
  num_heads: 8
  d_ff: 2048
  dropout_p: 0.3
  num_encoder_layers: 4
  num_decoder_layers: 2
  model_folder: "weights"
  model_basename: "transformer_"
  preload: "big"

# Optimizer / training-loop hyperparameters.
train:
  lr: 0.0001  # 1e-4
  batch_size: 16
  num_epochs: 40
  label_smoothing: 0.1
  on_colab: true  # are you training on Colab?
  patience: 100  # (steps)
  warm_up_steps: 700