{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.17323516673884798, "eval_steps": 25, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0034647033347769596, "grad_norm": 71.73021697998047, "learning_rate": 6.666666666666667e-05, "loss": 56.7725, "step": 1 }, { "epoch": 0.0034647033347769596, "eval_loss": 5.7633891105651855, "eval_runtime": 54.7107, "eval_samples_per_second": 4.46, "eval_steps_per_second": 2.23, "step": 1 }, { "epoch": 0.006929406669553919, "grad_norm": 58.02839279174805, "learning_rate": 0.00013333333333333334, "loss": 45.1267, "step": 2 }, { "epoch": 0.01039411000433088, "grad_norm": 62.47406768798828, "learning_rate": 0.0002, "loss": 51.5264, "step": 3 }, { "epoch": 0.013858813339107838, "grad_norm": 50.805538177490234, "learning_rate": 0.0001999048221581858, "loss": 48.1345, "step": 4 }, { "epoch": 0.0173235166738848, "grad_norm": 53.90876388549805, "learning_rate": 0.00019961946980917456, "loss": 33.368, "step": 5 }, { "epoch": 0.02078822000866176, "grad_norm": 47.89884567260742, "learning_rate": 0.00019914448613738106, "loss": 20.7941, "step": 6 }, { "epoch": 0.024252923343438718, "grad_norm": 46.44947814941406, "learning_rate": 0.00019848077530122083, "loss": 19.2011, "step": 7 }, { "epoch": 0.027717626678215677, "grad_norm": 29.739330291748047, "learning_rate": 0.00019762960071199333, "loss": 11.6099, "step": 8 }, { "epoch": 0.031182330012992636, "grad_norm": 37.299049377441406, "learning_rate": 0.00019659258262890683, "loss": 10.8285, "step": 9 }, { "epoch": 0.0346470333477696, "grad_norm": 37.623783111572266, "learning_rate": 0.0001953716950748227, "loss": 8.8167, "step": 10 }, { "epoch": 0.038111736682546554, "grad_norm": 25.743181228637695, "learning_rate": 0.00019396926207859084, "loss": 7.6365, "step": 11 }, { "epoch": 0.04157644001732352, "grad_norm": 21.669662475585938, "learning_rate": 0.0001923879532511287, "loss": 6.3246, "step": 12 }, { "epoch": 0.04504114335210048, "grad_norm": 21.14327049255371, "learning_rate": 0.000190630778703665, "loss": 3.8779, "step": 13 }, { "epoch": 0.048505846686877435, "grad_norm": 32.12599563598633, "learning_rate": 0.00018870108331782217, "loss": 5.3825, "step": 14 }, { "epoch": 0.0519705500216544, "grad_norm": 56.678955078125, "learning_rate": 0.00018660254037844388, "loss": 5.0414, "step": 15 }, { "epoch": 0.055435253356431353, "grad_norm": 51.66940689086914, "learning_rate": 0.0001843391445812886, "loss": 8.305, "step": 16 }, { "epoch": 0.058899956691208316, "grad_norm": 27.637439727783203, "learning_rate": 0.0001819152044288992, "loss": 6.1433, "step": 17 }, { "epoch": 0.06236466002598527, "grad_norm": 28.653242111206055, "learning_rate": 0.00017933533402912354, "loss": 4.0016, "step": 18 }, { "epoch": 0.06582936336076224, "grad_norm": 23.004844665527344, "learning_rate": 0.0001766044443118978, "loss": 4.7562, "step": 19 }, { "epoch": 0.0692940666955392, "grad_norm": 25.145166397094727, "learning_rate": 0.0001737277336810124, "loss": 4.5456, "step": 20 }, { "epoch": 0.07275877003031615, "grad_norm": 16.525177001953125, "learning_rate": 0.00017071067811865476, "loss": 4.3994, "step": 21 }, { "epoch": 0.07622347336509311, "grad_norm": 21.258804321289062, "learning_rate": 0.00016755902076156604, "loss": 4.9417, "step": 22 }, { "epoch": 0.07968817669987008, "grad_norm": 18.620487213134766, "learning_rate": 0.00016427876096865394, "loss": 4.2276, "step": 23 }, { "epoch": 0.08315288003464703, "grad_norm": 
71.73140716552734, "learning_rate": 0.00016087614290087208, "loss": 5.1532, "step": 24 }, { "epoch": 0.08661758336942399, "grad_norm": 32.46612548828125, "learning_rate": 0.0001573576436351046, "loss": 3.328, "step": 25 }, { "epoch": 0.08661758336942399, "eval_loss": 0.6132519841194153, "eval_runtime": 55.2353, "eval_samples_per_second": 4.417, "eval_steps_per_second": 2.209, "step": 25 }, { "epoch": 0.09008228670420096, "grad_norm": 30.413021087646484, "learning_rate": 0.0001537299608346824, "loss": 4.3897, "step": 26 }, { "epoch": 0.09354699003897791, "grad_norm": 70.32632446289062, "learning_rate": 0.00015000000000000001, "loss": 6.1758, "step": 27 }, { "epoch": 0.09701169337375487, "grad_norm": 44.74322509765625, "learning_rate": 0.00014617486132350343, "loss": 5.2728, "step": 28 }, { "epoch": 0.10047639670853183, "grad_norm": 44.46076965332031, "learning_rate": 0.00014226182617406996, "loss": 7.1594, "step": 29 }, { "epoch": 0.1039411000433088, "grad_norm": 55.632720947265625, "learning_rate": 0.000138268343236509, "loss": 3.2506, "step": 30 }, { "epoch": 0.10740580337808575, "grad_norm": 64.20110321044922, "learning_rate": 0.00013420201433256689, "loss": 4.082, "step": 31 }, { "epoch": 0.11087050671286271, "grad_norm": 82.19364166259766, "learning_rate": 0.00013007057995042732, "loss": 5.7286, "step": 32 }, { "epoch": 0.11433521004763968, "grad_norm": 25.911882400512695, "learning_rate": 0.00012588190451025207, "loss": 3.3962, "step": 33 }, { "epoch": 0.11779991338241663, "grad_norm": 34.44158935546875, "learning_rate": 0.00012164396139381029, "loss": 4.11, "step": 34 }, { "epoch": 0.12126461671719359, "grad_norm": 29.16976547241211, "learning_rate": 0.00011736481776669306, "loss": 4.2445, "step": 35 }, { "epoch": 0.12472932005197054, "grad_norm": 18.717464447021484, "learning_rate": 0.00011305261922200519, "loss": 3.1105, "step": 36 }, { "epoch": 0.1281940233867475, "grad_norm": 12.39698314666748, "learning_rate": 0.00010871557427476583, "loss": 2.9836, "step": 37 }, { "epoch": 0.13165872672152448, "grad_norm": 20.110858917236328, "learning_rate": 0.00010436193873653361, "loss": 4.8843, "step": 38 }, { "epoch": 0.13512343005630142, "grad_norm": 22.39121437072754, "learning_rate": 0.0001, "loss": 3.7531, "step": 39 }, { "epoch": 0.1385881333910784, "grad_norm": 12.983887672424316, "learning_rate": 9.563806126346642e-05, "loss": 2.8063, "step": 40 }, { "epoch": 0.14205283672585534, "grad_norm": 12.021308898925781, "learning_rate": 9.128442572523417e-05, "loss": 1.8465, "step": 41 }, { "epoch": 0.1455175400606323, "grad_norm": 32.44965744018555, "learning_rate": 8.694738077799488e-05, "loss": 4.7937, "step": 42 }, { "epoch": 0.14898224339540928, "grad_norm": 16.291828155517578, "learning_rate": 8.263518223330697e-05, "loss": 4.0307, "step": 43 }, { "epoch": 0.15244694673018622, "grad_norm": 12.486902236938477, "learning_rate": 7.835603860618972e-05, "loss": 3.1398, "step": 44 }, { "epoch": 0.1559116500649632, "grad_norm": 20.737529754638672, "learning_rate": 7.411809548974792e-05, "loss": 2.7772, "step": 45 }, { "epoch": 0.15937635339974016, "grad_norm": 17.130455017089844, "learning_rate": 6.992942004957271e-05, "loss": 2.7659, "step": 46 }, { "epoch": 0.1628410567345171, "grad_norm": 33.27773666381836, "learning_rate": 6.579798566743314e-05, "loss": 4.4388, "step": 47 }, { "epoch": 0.16630576006929407, "grad_norm": 16.671640396118164, "learning_rate": 6.173165676349103e-05, "loss": 5.4897, "step": 48 }, { "epoch": 0.16977046340407104, "grad_norm": 32.55076599121094, "learning_rate": 
5.773817382593008e-05, "loss": 5.7507, "step": 49 }, { "epoch": 0.17323516673884798, "grad_norm": 24.630674362182617, "learning_rate": 5.382513867649663e-05, "loss": 4.7681, "step": 50 }, { "epoch": 0.17323516673884798, "eval_loss": 0.4547581374645233, "eval_runtime": 55.2618, "eval_samples_per_second": 4.415, "eval_steps_per_second": 2.208, "step": 50 } ], "logging_steps": 1, "max_steps": 75, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.2695579394048e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }