{ "best_metric": 0.09464961290359497, "best_model_checkpoint": "outputs/checkpoint-540", "epoch": 9.840546697038725, "eval_steps": 500, "global_step": 540, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.36446469248291574, "grad_norm": 3.396773099899292, "learning_rate": 1.8000000000000001e-06, "loss": 0.3727, "step": 20 }, { "epoch": 0.7289293849658315, "grad_norm": 1.4876775741577148, "learning_rate": 3.8000000000000005e-06, "loss": 0.2861, "step": 40 }, { "epoch": 0.9840546697038725, "eval_loss": 0.18749132752418518, "eval_runtime": 49.826, "eval_samples_per_second": 3.372, "eval_steps_per_second": 0.421, "step": 54 }, { "epoch": 1.0933940774487472, "grad_norm": 0.6102920770645142, "learning_rate": 5.8e-06, "loss": 0.1811, "step": 60 }, { "epoch": 1.4578587699316627, "grad_norm": 0.4133767783641815, "learning_rate": 7.800000000000002e-06, "loss": 0.1346, "step": 80 }, { "epoch": 1.8223234624145785, "grad_norm": 1.7365530729293823, "learning_rate": 9.800000000000001e-06, "loss": 0.1243, "step": 100 }, { "epoch": 1.9863325740318907, "eval_loss": 0.13598798215389252, "eval_runtime": 49.8253, "eval_samples_per_second": 3.372, "eval_steps_per_second": 0.421, "step": 109 }, { "epoch": 2.1867881548974943, "grad_norm": 0.4655854403972626, "learning_rate": 9.958763523679515e-06, "loss": 0.1038, "step": 120 }, { "epoch": 2.55125284738041, "grad_norm": 0.6029446125030518, "learning_rate": 9.817090706862895e-06, "loss": 0.0873, "step": 140 }, { "epoch": 2.9157175398633255, "grad_norm": 0.4066179394721985, "learning_rate": 9.577355814597031e-06, "loss": 0.0862, "step": 160 }, { "epoch": 2.988610478359909, "eval_loss": 0.11723620444536209, "eval_runtime": 49.7707, "eval_samples_per_second": 3.375, "eval_steps_per_second": 0.422, "step": 164 }, { "epoch": 3.2801822323462413, "grad_norm": 0.6009082198143005, "learning_rate": 9.244439157950114e-06, "loss": 0.0834, "step": 180 }, { "epoch": 3.644646924829157, "grad_norm": 0.6393954157829285, "learning_rate": 8.825117959999117e-06, "loss": 0.0756, "step": 200 }, { "epoch": 3.990888382687927, "eval_loss": 0.10594599694013596, "eval_runtime": 49.8676, "eval_samples_per_second": 3.369, "eval_steps_per_second": 0.421, "step": 219 }, { "epoch": 4.009111617312073, "grad_norm": 0.5410734415054321, "learning_rate": 8.327928391111841e-06, "loss": 0.0733, "step": 220 }, { "epoch": 4.373576309794989, "grad_norm": 0.544377326965332, "learning_rate": 7.762991797134513e-06, "loss": 0.0684, "step": 240 }, { "epoch": 4.738041002277904, "grad_norm": 0.5764002799987793, "learning_rate": 7.1418086579779075e-06, "loss": 0.0628, "step": 260 }, { "epoch": 4.993166287015946, "eval_loss": 0.0995541512966156, "eval_runtime": 49.7114, "eval_samples_per_second": 3.38, "eval_steps_per_second": 0.422, "step": 274 }, { "epoch": 5.10250569476082, "grad_norm": 0.48410555720329285, "learning_rate": 6.477024471011001e-06, "loss": 0.0628, "step": 280 }, { "epoch": 5.466970387243736, "grad_norm": 0.6120467185974121, "learning_rate": 5.782172325201155e-06, "loss": 0.0594, "step": 300 }, { "epoch": 5.831435079726651, "grad_norm": 0.7353035807609558, "learning_rate": 5.071397406448937e-06, "loss": 0.0593, "step": 320 }, { "epoch": 5.995444191343964, "eval_loss": 0.09746743738651276, "eval_runtime": 49.7379, "eval_samples_per_second": 3.378, "eval_steps_per_second": 0.422, "step": 329 }, { "epoch": 6.195899772209567, "grad_norm": 0.5597842931747437, "learning_rate": 4.359169042394537e-06, "loss": 0.0606, "step": 340 }, 
{ "epoch": 6.560364464692483, "grad_norm": 0.5548788905143738, "learning_rate": 3.6599861486331074e-06, "loss": 0.0548, "step": 360 }, { "epoch": 6.924829157175399, "grad_norm": 0.6110637784004211, "learning_rate": 2.9880820726046613e-06, "loss": 0.0498, "step": 380 }, { "epoch": 6.997722095671982, "eval_loss": 0.09508081525564194, "eval_runtime": 49.5881, "eval_samples_per_second": 3.388, "eval_steps_per_second": 0.423, "step": 384 }, { "epoch": 7.289293849658314, "grad_norm": 0.6793264150619507, "learning_rate": 2.3571348436857906e-06, "loss": 0.0485, "step": 400 }, { "epoch": 7.65375854214123, "grad_norm": 0.499203085899353, "learning_rate": 1.7799887279557238e-06, "loss": 0.0474, "step": 420 }, { "epoch": 8.0, "eval_loss": 0.09498950093984604, "eval_runtime": 49.5797, "eval_samples_per_second": 3.388, "eval_steps_per_second": 0.424, "step": 439 }, { "epoch": 8.018223234624147, "grad_norm": 0.86496502161026, "learning_rate": 1.2683927559787657e-06, "loss": 0.0523, "step": 440 }, { "epoch": 8.382687927107062, "grad_norm": 0.7341931462287903, "learning_rate": 8.327615464234129e-07, "loss": 0.0423, "step": 460 }, { "epoch": 8.747152619589977, "grad_norm": 0.5355525016784668, "learning_rate": 4.819632944595415e-07, "loss": 0.047, "step": 480 }, { "epoch": 8.984054669703873, "eval_loss": 0.09474514424800873, "eval_runtime": 49.5868, "eval_samples_per_second": 3.388, "eval_steps_per_second": 0.424, "step": 493 }, { "epoch": 9.111617312072893, "grad_norm": 0.5788146257400513, "learning_rate": 2.2313924087851657e-07, "loss": 0.0548, "step": 500 }, { "epoch": 9.476082004555808, "grad_norm": 0.4824052155017853, "learning_rate": 6.15582970243117e-08, "loss": 0.0451, "step": 520 }, { "epoch": 9.840546697038725, "grad_norm": 0.9220362901687622, "learning_rate": 5.09784952833492e-10, "loss": 0.0453, "step": 540 }, { "epoch": 9.840546697038725, "eval_loss": 0.09464961290359497, "eval_runtime": 49.5747, "eval_samples_per_second": 3.389, "eval_steps_per_second": 0.424, "step": 540 } ], "logging_steps": 20, "max_steps": 540, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 2.0191775689433088e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }