mt5-mini9L / train_log.txt

first version

14f0f69 almost 2 years ago

35.8 kB

	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_09.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_09.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=76.57510 PPL=1803619835086933004964966285967360.00000
	val epoch=1 loss=3.55529 PPL=34.99814
	train epoch=1 loss=3.58229 PPL=35.95572
	[trained] 0.0[H] 41.41847747564316[M] 2485.109[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_09.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_09.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=8.62410 PPL=5564.13037
	val epoch=1 loss=3.48060 PPL=32.47906
	train epoch=1 loss=2.05416 PPL=7.80031
	[trained] 0.0[H] 45.51669268210729[M] 2731.002[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_08.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_08.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=5.33357 PPL=207.17598
	val epoch=1 loss=2.69441 PPL=14.79680
	train epoch=1 loss=1.59283 PPL=4.91763
	[trained] 0.0[H] 41.46436125040054[M] 2487.862[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_08.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_08.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=5.03823 PPL=154.19640
	val epoch=1 loss=3.20544 PPL=24.66638
	train epoch=1 loss=1.61361 PPL=5.02092
	[trained] 0.0[H] 45.251987334092455[M] 2715.119[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_07.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_07.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=4.14368 PPL=63.03437
	val epoch=1 loss=2.43705 PPL=11.43929
	train epoch=1 loss=1.37564 PPL=3.95763
	[trained] 0.0[H] 41.47204469839732[M] 2488.323[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_07.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_07.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=4.28832 PPL=72.84402
	val epoch=1 loss=3.02900 PPL=20.67647
	train epoch=1 loss=1.48900 PPL=4.43266
	[trained] 0.0[H] 45.57923027674357[M] 2734.754[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_06.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_06.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=3.70968 PPL=40.84082
	val epoch=1 loss=2.28623 PPL=9.83775
	train epoch=1 loss=1.27682 PPL=3.58522
	[trained] 0.0[H] 41.4678033153216[M] 2488.068[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_06.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_06.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=3.83498 PPL=46.29248
	val epoch=1 loss=2.79002 PPL=16.28134
	train epoch=1 loss=1.41784 PPL=4.12821
	[trained] 0.0[H] 45.09872035185496[M] 2705.923[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_05.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_05.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=3.38932 PPL=29.64582
	val epoch=1 loss=2.20471 PPL=9.06766
	train epoch=1 loss=1.22078 PPL=3.38983
	[trained] 0.0[H] 41.52079544067383[M] 2491.248[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_05.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_05.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=3.77504 PPL=43.59935
	val epoch=1 loss=2.75377 PPL=15.70175
	train epoch=1 loss=1.37220 PPL=3.94404
	[trained] 0.0[H] 45.1388335108757[M] 2708.330[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_04.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_04.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=3.05649 PPL=21.25283
	val epoch=1 loss=2.06552 PPL=7.88940
	train epoch=1 loss=1.18322 PPL=3.26485
	[trained] 0.0[H] 41.343922030925754[M] 2480.635[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_04.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_04.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=3.63945 PPL=38.07071
	val epoch=1 loss=2.74634 PPL=15.58548
	train epoch=1 loss=1.34129 PPL=3.82397
	[trained] 0.0[H] 44.50069724321365[M] 2670.042[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_03.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_03.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.99219 PPL=19.92924
	val epoch=1 loss=2.11169 PPL=8.26216
	train epoch=1 loss=1.15597 PPL=3.17710
	[trained] 0.0[H] 41.03153887987137[M] 2461.892[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_03.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_03.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=3.69122 PPL=40.09384
	val epoch=1 loss=2.79154 PPL=16.30605
	train epoch=1 loss=1.31323 PPL=3.71816
	[trained] 0.0[H] 45.27243907054265[M] 2716.346[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_02.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_02.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.89017 PPL=17.99635
	val epoch=1 loss=2.05285 PPL=7.79006
	train epoch=1 loss=1.13480 PPL=3.11056
	[trained] 0.0[H] 41.108288780848184[M] 2466.497[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_02.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_02.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=3.43673 PPL=31.08512
	val epoch=1 loss=2.64907 PPL=14.14095
	train epoch=1 loss=1.29298 PPL=3.64363
	[trained] 0.0[H] 44.97415177822113[M] 2698.449[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_01.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_01.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.77340 PPL=16.01299
	val epoch=1 loss=1.99160 PPL=7.32726
	train epoch=1 loss=1.11733 PPL=3.05667
	[trained] 0.0[H] 41.14810743729274[M] 2468.886[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_01.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_01.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=3.39647 PPL=29.85852
	val epoch=1 loss=2.36330 PPL=10.62593
	train epoch=1 loss=1.27496 PPL=3.57856
	[trained] 0.0[H] 44.73817230463028[M] 2684.290[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_00.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_00.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.74622 PPL=15.58363
	val epoch=1 loss=2.00091 PPL=7.39578
	train epoch=1 loss=1.10269 PPL=3.01226
	[trained] 0.0[H] 41.041836047172545[M] 2462.510[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_00.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_00.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=3.45477 PPL=31.65103
	val epoch=1 loss=2.73762 PPL=15.45019
	train epoch=1 loss=1.25830 PPL=3.51942
	[trained] 0.0[H] 45.509643785158794[M] 2730.579[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_09.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_09.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.71395 PPL=15.08881
	val epoch=1 loss=2.00103 PPL=7.39668
	train epoch=1 loss=1.09001 PPL=2.97429
	[trained] 0.0[H] 41.28162391185761[M] 2476.897[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_09.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_09.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=3.26547 PPL=26.19238
	val epoch=1 loss=2.69914 PPL=14.86692
	train epoch=1 loss=1.24174 PPL=3.46165
	[trained] 0.0[H] 45.42912646929423[M] 2725.748[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_08.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_08.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.68335 PPL=14.63405
	val epoch=1 loss=2.00004 PPL=7.38934
	train epoch=1 loss=1.07841 PPL=2.94001
	[trained] 0.0[H] 41.447514899571736[M] 2486.851[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_08.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_08.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=3.27115 PPL=26.34149
	val epoch=1 loss=2.72310 PPL=15.22747
	train epoch=1 loss=1.23098 PPL=3.42457
	[trained] 0.0[H] 45.18751840988795[M] 2711.251[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_07.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_07.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.57214 PPL=13.09387
	val epoch=1 loss=1.95365 PPL=7.05438
	train epoch=1 loss=1.06908 PPL=2.91269
	[trained] 0.0[H] 40.959261027971905[M] 2457.556[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_07.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_07.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=3.18254 PPL=24.10785
	val epoch=1 loss=2.68020 PPL=14.58803
	train epoch=1 loss=1.22046 PPL=3.38875
	[trained] 0.0[H] 45.264945685863495[M] 2715.897[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_06.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_06.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.57132 PPL=13.08305
	val epoch=1 loss=1.94033 PPL=6.96107
	train epoch=1 loss=1.06083 PPL=2.88875
	[trained] 0.0[H] 41.00604948997498[M] 2460.363[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_06.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_06.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.99903 PPL=20.06612
	val epoch=1 loss=2.42283 PPL=11.27773
	train epoch=1 loss=1.20782 PPL=3.34619
	[trained] 0.0[H] 45.244081223011015[M] 2714.645[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_05.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_05.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.55513 PPL=12.87301
	val epoch=1 loss=1.93933 PPL=6.95411
	train epoch=1 loss=1.05271 PPL=2.86539
	[trained] 0.0[H] 41.11795919736226[M] 2467.078[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_05.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_05.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	[failed] ['/groups/gcc50582/MSP/mc4_ja_msp_05.jsonl'] [Errno 28] No space left on device
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_04.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_04.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	[failed] ['/groups/gcc50582/MSP/mc4_en_msp_04.jsonl'] [Errno 28] No space left on device
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_04.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_04.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	[failed] ['/groups/gcc50582/MSP/mc4_ja_msp_04.jsonl'] [Errno 28] No space left on device
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_03.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_03.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	[failed] ['/groups/gcc50582/MSP/mc4_en_msp_03.jsonl'] [Errno 28] No space left on device
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_03.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_03.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	[failed] ['/groups/gcc50582/MSP/mc4_ja_msp_03.jsonl'] [Errno 28] No space left on device
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_02.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_02.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	[failed] ['/groups/gcc50582/MSP/mc4_en_msp_02.jsonl'] [Errno 28] No space left on device
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_02.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_02.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	[failed] ['/groups/gcc50582/MSP/mc4_ja_msp_02.jsonl'] [Errno 28] No space left on device
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_01.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_01.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	[failed] ['/groups/gcc50582/MSP/mc4_en_msp_01.jsonl'] [Errno 28] No space left on device
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_01.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_01.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	[failed] ['/groups/gcc50582/MSP/mc4_ja_msp_01.jsonl'] [Errno 28] No space left on device
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_00.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_00.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	[failed] ['/groups/gcc50582/MSP/mc4_en_msp_00.jsonl'] [Errno 28] No space left on device
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_00.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_00.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	[failed] ['/groups/gcc50582/MSP/mc4_ja_msp_00.jsonl'] [Errno 28] No space left on device