|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_09.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_en_msp_09.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
val epoch=1 loss=76.57510 PPL=1803619835086933004964966285967360.00000 |
|
val epoch=1 loss=3.55529 PPL=34.99814 |
|
train epoch=1 loss=3.58229 PPL=35.95572 |
|
[trained] 0.0[H] 41.41847747564316[M] 2485.109[sec] |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_09.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_ja_msp_09.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
val epoch=1 loss=8.62410 PPL=5564.13037 |
|
val epoch=1 loss=3.48060 PPL=32.47906 |
|
train epoch=1 loss=2.05416 PPL=7.80031 |
|
[trained] 0.0[H] 45.51669268210729[M] 2731.002[sec] |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_08.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_en_msp_08.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
val epoch=1 loss=5.33357 PPL=207.17598 |
|
val epoch=1 loss=2.69441 PPL=14.79680 |
|
train epoch=1 loss=1.59283 PPL=4.91763 |
|
[trained] 0.0[H] 41.46436125040054[M] 2487.862[sec] |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_08.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_ja_msp_08.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
val epoch=1 loss=5.03823 PPL=154.19640 |
|
val epoch=1 loss=3.20544 PPL=24.66638 |
|
train epoch=1 loss=1.61361 PPL=5.02092 |
|
[trained] 0.0[H] 45.251987334092455[M] 2715.119[sec] |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_07.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_en_msp_07.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
val epoch=1 loss=4.14368 PPL=63.03437 |
|
val epoch=1 loss=2.43705 PPL=11.43929 |
|
train epoch=1 loss=1.37564 PPL=3.95763 |
|
[trained] 0.0[H] 41.47204469839732[M] 2488.323[sec] |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_07.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_ja_msp_07.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
val epoch=1 loss=4.28832 PPL=72.84402 |
|
val epoch=1 loss=3.02900 PPL=20.67647 |
|
train epoch=1 loss=1.48900 PPL=4.43266 |
|
[trained] 0.0[H] 45.57923027674357[M] 2734.754[sec] |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_06.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_en_msp_06.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
val epoch=1 loss=3.70968 PPL=40.84082 |
|
val epoch=1 loss=2.28623 PPL=9.83775 |
|
train epoch=1 loss=1.27682 PPL=3.58522 |
|
[trained] 0.0[H] 41.4678033153216[M] 2488.068[sec] |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_06.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_ja_msp_06.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
val epoch=1 loss=3.83498 PPL=46.29248 |
|
val epoch=1 loss=2.79002 PPL=16.28134 |
|
train epoch=1 loss=1.41784 PPL=4.12821 |
|
[trained] 0.0[H] 45.09872035185496[M] 2705.923[sec] |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_05.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_en_msp_05.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
val epoch=1 loss=3.38932 PPL=29.64582 |
|
val epoch=1 loss=2.20471 PPL=9.06766 |
|
train epoch=1 loss=1.22078 PPL=3.38983 |
|
[trained] 0.0[H] 41.52079544067383[M] 2491.248[sec] |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_05.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_ja_msp_05.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
val epoch=1 loss=3.77504 PPL=43.59935 |
|
val epoch=1 loss=2.75377 PPL=15.70175 |
|
train epoch=1 loss=1.37220 PPL=3.94404 |
|
[trained] 0.0[H] 45.1388335108757[M] 2708.330[sec] |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_04.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_en_msp_04.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
val epoch=1 loss=3.05649 PPL=21.25283 |
|
val epoch=1 loss=2.06552 PPL=7.88940 |
|
train epoch=1 loss=1.18322 PPL=3.26485 |
|
[trained] 0.0[H] 41.343922030925754[M] 2480.635[sec] |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_04.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_ja_msp_04.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
val epoch=1 loss=3.63945 PPL=38.07071 |
|
val epoch=1 loss=2.74634 PPL=15.58548 |
|
train epoch=1 loss=1.34129 PPL=3.82397 |
|
[trained] 0.0[H] 44.50069724321365[M] 2670.042[sec] |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_03.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_en_msp_03.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
val epoch=1 loss=2.99219 PPL=19.92924 |
|
val epoch=1 loss=2.11169 PPL=8.26216 |
|
train epoch=1 loss=1.15597 PPL=3.17710 |
|
[trained] 0.0[H] 41.03153887987137[M] 2461.892[sec] |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_03.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_ja_msp_03.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
val epoch=1 loss=3.69122 PPL=40.09384 |
|
val epoch=1 loss=2.79154 PPL=16.30605 |
|
train epoch=1 loss=1.31323 PPL=3.71816 |
|
[trained] 0.0[H] 45.27243907054265[M] 2716.346[sec] |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_02.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_en_msp_02.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
val epoch=1 loss=2.89017 PPL=17.99635 |
|
val epoch=1 loss=2.05285 PPL=7.79006 |
|
train epoch=1 loss=1.13480 PPL=3.11056 |
|
[trained] 0.0[H] 41.108288780848184[M] 2466.497[sec] |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_02.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_ja_msp_02.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
val epoch=1 loss=3.43673 PPL=31.08512 |
|
val epoch=1 loss=2.64907 PPL=14.14095 |
|
train epoch=1 loss=1.29298 PPL=3.64363 |
|
[trained] 0.0[H] 44.97415177822113[M] 2698.449[sec] |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_01.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_en_msp_01.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
val epoch=1 loss=2.77340 PPL=16.01299 |
|
val epoch=1 loss=1.99160 PPL=7.32726 |
|
train epoch=1 loss=1.11733 PPL=3.05667 |
|
[trained] 0.0[H] 41.14810743729274[M] 2468.886[sec] |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_01.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_ja_msp_01.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
val epoch=1 loss=3.39647 PPL=29.85852 |
|
val epoch=1 loss=2.36330 PPL=10.62593 |
|
train epoch=1 loss=1.27496 PPL=3.57856 |
|
[trained] 0.0[H] 44.73817230463028[M] 2684.290[sec] |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_00.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_en_msp_00.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
val epoch=1 loss=2.74622 PPL=15.58363 |
|
val epoch=1 loss=2.00091 PPL=7.39578 |
|
train epoch=1 loss=1.10269 PPL=3.01226 |
|
[trained] 0.0[H] 41.041836047172545[M] 2462.510[sec] |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_00.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_ja_msp_00.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
val epoch=1 loss=3.45477 PPL=31.65103 |
|
val epoch=1 loss=2.73762 PPL=15.45019 |
|
train epoch=1 loss=1.25830 PPL=3.51942 |
|
[trained] 0.0[H] 45.509643785158794[M] 2730.579[sec] |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_09.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_en_msp_09.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
val epoch=1 loss=2.71395 PPL=15.08881 |
|
val epoch=1 loss=2.00103 PPL=7.39668 |
|
train epoch=1 loss=1.09001 PPL=2.97429 |
|
[trained] 0.0[H] 41.28162391185761[M] 2476.897[sec] |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_09.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_ja_msp_09.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
val epoch=1 loss=3.26547 PPL=26.19238 |
|
val epoch=1 loss=2.69914 PPL=14.86692 |
|
train epoch=1 loss=1.24174 PPL=3.46165 |
|
[trained] 0.0[H] 45.42912646929423[M] 2725.748[sec] |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_08.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_en_msp_08.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
val epoch=1 loss=2.68335 PPL=14.63405 |
|
val epoch=1 loss=2.00004 PPL=7.38934 |
|
train epoch=1 loss=1.07841 PPL=2.94001 |
|
[trained] 0.0[H] 41.447514899571736[M] 2486.851[sec] |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_08.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_ja_msp_08.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
val epoch=1 loss=3.27115 PPL=26.34149 |
|
val epoch=1 loss=2.72310 PPL=15.22747 |
|
train epoch=1 loss=1.23098 PPL=3.42457 |
|
[trained] 0.0[H] 45.18751840988795[M] 2711.251[sec] |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_07.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_en_msp_07.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
val epoch=1 loss=2.57214 PPL=13.09387 |
|
val epoch=1 loss=1.95365 PPL=7.05438 |
|
train epoch=1 loss=1.06908 PPL=2.91269 |
|
[trained] 0.0[H] 40.959261027971905[M] 2457.556[sec] |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_07.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_ja_msp_07.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
val epoch=1 loss=3.18254 PPL=24.10785 |
|
val epoch=1 loss=2.68020 PPL=14.58803 |
|
train epoch=1 loss=1.22046 PPL=3.38875 |
|
[trained] 0.0[H] 45.264945685863495[M] 2715.897[sec] |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_06.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_en_msp_06.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
val epoch=1 loss=2.57132 PPL=13.08305 |
|
val epoch=1 loss=1.94033 PPL=6.96107 |
|
train epoch=1 loss=1.06083 PPL=2.88875 |
|
[trained] 0.0[H] 41.00604948997498[M] 2460.363[sec] |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_06.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_ja_msp_06.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
val epoch=1 loss=2.99903 PPL=20.06612 |
|
val epoch=1 loss=2.42283 PPL=11.27773 |
|
train epoch=1 loss=1.20782 PPL=3.34619 |
|
[trained] 0.0[H] 45.244081223011015[M] 2714.645[sec] |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_05.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_en_msp_05.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
val epoch=1 loss=2.55513 PPL=12.87301 |
|
val epoch=1 loss=1.93933 PPL=6.95411 |
|
train epoch=1 loss=1.05271 PPL=2.86539 |
|
[trained] 0.0[H] 41.11795919736226[M] 2467.078[sec] |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_05.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_ja_msp_05.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
[failed] ['/groups/gcc50582/MSP/mc4_ja_msp_05.jsonl'] [Errno 28] No space left on device |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_04.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_en_msp_04.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
[failed] ['/groups/gcc50582/MSP/mc4_en_msp_04.jsonl'] [Errno 28] No space left on device |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_04.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_ja_msp_04.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
[failed] ['/groups/gcc50582/MSP/mc4_ja_msp_04.jsonl'] [Errno 28] No space left on device |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_03.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_en_msp_03.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
[failed] ['/groups/gcc50582/MSP/mc4_en_msp_03.jsonl'] [Errno 28] No space left on device |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_03.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_ja_msp_03.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
[failed] ['/groups/gcc50582/MSP/mc4_ja_msp_03.jsonl'] [Errno 28] No space left on device |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_02.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_en_msp_02.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
[failed] ['/groups/gcc50582/MSP/mc4_en_msp_02.jsonl'] [Errno 28] No space left on device |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_02.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_ja_msp_02.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
[failed] ['/groups/gcc50582/MSP/mc4_ja_msp_02.jsonl'] [Errno 28] No space left on device |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_01.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_en_msp_01.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
[failed] ['/groups/gcc50582/MSP/mc4_en_msp_01.jsonl'] [Errno 28] No space left on device |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_01.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_ja_msp_01.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
[failed] ['/groups/gcc50582/MSP/mc4_ja_msp_01.jsonl'] [Errno 28] No space left on device |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_00.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_en_msp_00.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
[failed] ['/groups/gcc50582/MSP/mc4_en_msp_00.jsonl'] [Errno 28] No space left on device |
|
[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_00.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) |
|
[train] ['/groups/gcc50582/MSP/mc4_ja_msp_00.jsonl'] |
|
[batch_size] 256 |
|
[accumulate_grad_batches] 2 |
|
[failed] ['/groups/gcc50582/MSP/mc4_ja_msp_00.jsonl'] [Errno 28] No space left on device |
|
|