{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0119047619047619, "eval_steps": 17, "global_step": 85, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011904761904761904, "grad_norm": 0.9544092626224188, "learning_rate": 2e-05, "loss": 2.5697, "step": 1 }, { "epoch": 0.011904761904761904, "eval_loss": 2.4926321506500244, "eval_runtime": 84.5031, "eval_samples_per_second": 0.237, "eval_steps_per_second": 0.118, "step": 1 }, { "epoch": 0.023809523809523808, "grad_norm": 0.8699237107804378, "learning_rate": 4e-05, "loss": 2.5045, "step": 2 }, { "epoch": 0.03571428571428571, "grad_norm": 0.8471542581095772, "learning_rate": 6e-05, "loss": 2.444, "step": 3 }, { "epoch": 0.047619047619047616, "grad_norm": 0.6760804223199605, "learning_rate": 8e-05, "loss": 2.4135, "step": 4 }, { "epoch": 0.05952380952380952, "grad_norm": 0.3645239609928067, "learning_rate": 0.0001, "loss": 2.5502, "step": 5 }, { "epoch": 0.07142857142857142, "grad_norm": 1.0191533159070523, "learning_rate": 9.996530663083255e-05, "loss": 2.4001, "step": 6 }, { "epoch": 0.08333333333333333, "grad_norm": 0.7536562391615925, "learning_rate": 9.986128001799077e-05, "loss": 2.4241, "step": 7 }, { "epoch": 0.09523809523809523, "grad_norm": 0.47058800200722606, "learning_rate": 9.96880805629717e-05, "loss": 2.3783, "step": 8 }, { "epoch": 0.10714285714285714, "grad_norm": 0.42488376754431106, "learning_rate": 9.94459753267812e-05, "loss": 2.4496, "step": 9 }, { "epoch": 0.11904761904761904, "grad_norm": 0.2964058549391389, "learning_rate": 9.913533761814537e-05, "loss": 2.4123, "step": 10 }, { "epoch": 0.13095238095238096, "grad_norm": 0.274543387752409, "learning_rate": 9.875664641789545e-05, "loss": 2.3733, "step": 11 }, { "epoch": 0.14285714285714285, "grad_norm": 0.4783689100368163, "learning_rate": 9.831048564041413e-05, "loss": 2.3535, "step": 12 }, { "epoch": 0.15476190476190477, "grad_norm": 1.3750248629360309, "learning_rate": 9.779754323328192e-05, "loss": 2.2893, "step": 13 }, { "epoch": 0.16666666666666666, "grad_norm": 0.3128166343225632, "learning_rate": 9.72186101165118e-05, "loss": 2.3495, "step": 14 }, { "epoch": 0.17857142857142858, "grad_norm": 0.2978734846133746, "learning_rate": 9.657457896300791e-05, "loss": 2.3489, "step": 15 }, { "epoch": 0.19047619047619047, "grad_norm": 0.3296693493448657, "learning_rate": 9.586644282212866e-05, "loss": 2.2308, "step": 16 }, { "epoch": 0.20238095238095238, "grad_norm": 0.35894978658993554, "learning_rate": 9.509529358847655e-05, "loss": 2.2991, "step": 17 }, { "epoch": 0.20238095238095238, "eval_loss": 2.3356270790100098, "eval_runtime": 84.8559, "eval_samples_per_second": 0.236, "eval_steps_per_second": 0.118, "step": 17 }, { "epoch": 0.21428571428571427, "grad_norm": 0.556925508907772, "learning_rate": 9.426232031827588e-05, "loss": 2.3245, "step": 18 }, { "epoch": 0.2261904761904762, "grad_norm": 0.2596512913089154, "learning_rate": 9.336880739593416e-05, "loss": 2.2259, "step": 19 }, { "epoch": 0.23809523809523808, "grad_norm": 0.23810127330848288, "learning_rate": 9.241613255361455e-05, "loss": 2.3589, "step": 20 }, { "epoch": 0.25, "grad_norm": 0.2486358353351647, "learning_rate": 9.140576474687264e-05, "loss": 2.3491, "step": 21 }, { "epoch": 0.2619047619047619, "grad_norm": 0.2569987072732923, "learning_rate": 9.033926188963352e-05, "loss": 2.3966, "step": 22 }, { "epoch": 0.27380952380952384, "grad_norm": 0.2818819247779108, "learning_rate": 8.921826845200139e-05, "loss": 2.319, "step": 23 }, { "epoch": 0.2857142857142857, "grad_norm": 0.3051353561467919, "learning_rate": 8.804451292460585e-05, "loss": 2.3764, "step": 24 }, { "epoch": 0.2976190476190476, "grad_norm": 0.28378754273403445, "learning_rate": 8.681980515339464e-05, "loss": 2.2516, "step": 25 }, { "epoch": 0.30952380952380953, "grad_norm": 0.23671187368883853, "learning_rate": 8.554603354898238e-05, "loss": 2.213, "step": 26 }, { "epoch": 0.32142857142857145, "grad_norm": 0.2366234206615884, "learning_rate": 8.422516217485826e-05, "loss": 2.2728, "step": 27 }, { "epoch": 0.3333333333333333, "grad_norm": 0.29302002288238443, "learning_rate": 8.285922771894254e-05, "loss": 2.3657, "step": 28 }, { "epoch": 0.34523809523809523, "grad_norm": 0.27328082985574287, "learning_rate": 8.14503363531613e-05, "loss": 2.293, "step": 29 }, { "epoch": 0.35714285714285715, "grad_norm": 0.2629290179604872, "learning_rate": 8.000066048588211e-05, "loss": 2.3183, "step": 30 }, { "epoch": 0.36904761904761907, "grad_norm": 0.2769635403979301, "learning_rate": 7.85124354122177e-05, "loss": 2.2946, "step": 31 }, { "epoch": 0.38095238095238093, "grad_norm": 0.2322670372127043, "learning_rate": 7.698795586736298e-05, "loss": 2.1595, "step": 32 }, { "epoch": 0.39285714285714285, "grad_norm": 0.2733558129218707, "learning_rate": 7.542957248827961e-05, "loss": 2.3469, "step": 33 }, { "epoch": 0.40476190476190477, "grad_norm": 0.24834436859405049, "learning_rate": 7.383968818918426e-05, "loss": 2.199, "step": 34 }, { "epoch": 0.40476190476190477, "eval_loss": 2.299875259399414, "eval_runtime": 85.3272, "eval_samples_per_second": 0.234, "eval_steps_per_second": 0.117, "step": 34 }, { "epoch": 0.4166666666666667, "grad_norm": 0.27121410447461325, "learning_rate": 7.222075445642904e-05, "loss": 2.1842, "step": 35 }, { "epoch": 0.42857142857142855, "grad_norm": 0.31502528289581183, "learning_rate": 7.057526756848719e-05, "loss": 2.2298, "step": 36 }, { "epoch": 0.44047619047619047, "grad_norm": 0.24007323900338268, "learning_rate": 6.890576474687263e-05, "loss": 2.4688, "step": 37 }, { "epoch": 0.4523809523809524, "grad_norm": 0.2608935989516304, "learning_rate": 6.721482024392835e-05, "loss": 2.2838, "step": 38 }, { "epoch": 0.4642857142857143, "grad_norm": 0.29766343141422075, "learning_rate": 6.550504137351576e-05, "loss": 2.1777, "step": 39 }, { "epoch": 0.47619047619047616, "grad_norm": 0.29532812127434044, "learning_rate": 6.377906449072578e-05, "loss": 2.2005, "step": 40 }, { "epoch": 0.4880952380952381, "grad_norm": 0.25346420479015375, "learning_rate": 6.203955092681039e-05, "loss": 2.2493, "step": 41 }, { "epoch": 0.5, "grad_norm": 0.2733242470847782, "learning_rate": 6.0289182885602704e-05, "loss": 2.264, "step": 42 }, { "epoch": 0.5119047619047619, "grad_norm": 0.2832250530342488, "learning_rate": 5.8530659307753036e-05, "loss": 2.3321, "step": 43 }, { "epoch": 0.5238095238095238, "grad_norm": 0.23261141435639465, "learning_rate": 5.6766691709158096e-05, "loss": 2.2584, "step": 44 }, { "epoch": 0.5357142857142857, "grad_norm": 0.24734669584168956, "learning_rate": 5.500000000000001e-05, "loss": 2.3196, "step": 45 }, { "epoch": 0.5476190476190477, "grad_norm": 0.3610266226821954, "learning_rate": 5.3233308290841935e-05, "loss": 2.2857, "step": 46 }, { "epoch": 0.5595238095238095, "grad_norm": 0.33827980816046865, "learning_rate": 5.1469340692246995e-05, "loss": 2.099, "step": 47 }, { "epoch": 0.5714285714285714, "grad_norm": 0.25979841286218003, "learning_rate": 4.9710817114397314e-05, "loss": 2.2922, "step": 48 }, { "epoch": 0.5833333333333334, "grad_norm": 0.2966248351989265, "learning_rate": 4.7960449073189606e-05, "loss": 2.3001, "step": 49 }, { "epoch": 0.5952380952380952, "grad_norm": 0.2649319822020958, "learning_rate": 4.6220935509274235e-05, "loss": 2.0389, "step": 50 }, { "epoch": 0.6071428571428571, "grad_norm": 0.3040829342691192, "learning_rate": 4.4494958626484276e-05, "loss": 2.3336, "step": 51 }, { "epoch": 0.6071428571428571, "eval_loss": 2.2863776683807373, "eval_runtime": 85.1891, "eval_samples_per_second": 0.235, "eval_steps_per_second": 0.117, "step": 51 }, { "epoch": 0.6190476190476191, "grad_norm": 0.25961905620657355, "learning_rate": 4.278517975607167e-05, "loss": 2.257, "step": 52 }, { "epoch": 0.6309523809523809, "grad_norm": 0.2520680871081291, "learning_rate": 4.109423525312738e-05, "loss": 2.3646, "step": 53 }, { "epoch": 0.6428571428571429, "grad_norm": 0.2563922206690732, "learning_rate": 3.942473243151281e-05, "loss": 2.1605, "step": 54 }, { "epoch": 0.6547619047619048, "grad_norm": 0.23710847982590777, "learning_rate": 3.777924554357096e-05, "loss": 2.2253, "step": 55 }, { "epoch": 0.6666666666666666, "grad_norm": 0.2358700389619207, "learning_rate": 3.616031181081575e-05, "loss": 2.2685, "step": 56 }, { "epoch": 0.6785714285714286, "grad_norm": 0.25688706639252046, "learning_rate": 3.45704275117204e-05, "loss": 2.306, "step": 57 }, { "epoch": 0.6904761904761905, "grad_norm": 0.28553196847033946, "learning_rate": 3.301204413263704e-05, "loss": 2.394, "step": 58 }, { "epoch": 0.7023809523809523, "grad_norm": 0.24843175796925013, "learning_rate": 3.1487564587782306e-05, "loss": 2.2661, "step": 59 }, { "epoch": 0.7142857142857143, "grad_norm": 0.24662937375083732, "learning_rate": 2.9999339514117912e-05, "loss": 2.3021, "step": 60 }, { "epoch": 0.7261904761904762, "grad_norm": 0.3580921716501969, "learning_rate": 2.854966364683872e-05, "loss": 2.2545, "step": 61 }, { "epoch": 0.7380952380952381, "grad_norm": 0.2601716169771101, "learning_rate": 2.7140772281057468e-05, "loss": 2.4256, "step": 62 }, { "epoch": 0.75, "grad_norm": 0.2729633905977987, "learning_rate": 2.577483782514174e-05, "loss": 2.4064, "step": 63 }, { "epoch": 0.7619047619047619, "grad_norm": 0.35811707072589155, "learning_rate": 2.445396645101762e-05, "loss": 2.3352, "step": 64 }, { "epoch": 0.7738095238095238, "grad_norm": 0.27727284509496897, "learning_rate": 2.3180194846605367e-05, "loss": 2.5192, "step": 65 }, { "epoch": 0.7857142857142857, "grad_norm": 0.2803742538067055, "learning_rate": 2.195548707539416e-05, "loss": 2.3278, "step": 66 }, { "epoch": 0.7976190476190477, "grad_norm": 0.25468361529041794, "learning_rate": 2.0781731547998614e-05, "loss": 2.2444, "step": 67 }, { "epoch": 0.8095238095238095, "grad_norm": 0.4651703351900801, "learning_rate": 1.966073811036649e-05, "loss": 2.1637, "step": 68 }, { "epoch": 0.8095238095238095, "eval_loss": 2.2794995307922363, "eval_runtime": 84.9665, "eval_samples_per_second": 0.235, "eval_steps_per_second": 0.118, "step": 68 }, { "epoch": 0.8214285714285714, "grad_norm": 0.2419165423914357, "learning_rate": 1.8594235253127375e-05, "loss": 2.1522, "step": 69 }, { "epoch": 0.8333333333333334, "grad_norm": 0.5579840019563456, "learning_rate": 1.758386744638546e-05, "loss": 2.3066, "step": 70 }, { "epoch": 0.8452380952380952, "grad_norm": 0.33911931890037594, "learning_rate": 1.6631192604065855e-05, "loss": 2.1133, "step": 71 }, { "epoch": 0.8571428571428571, "grad_norm": 0.23476812251683926, "learning_rate": 1.573767968172413e-05, "loss": 2.368, "step": 72 }, { "epoch": 0.8690476190476191, "grad_norm": 0.2783320668382118, "learning_rate": 1.490470641152345e-05, "loss": 2.2878, "step": 73 }, { "epoch": 0.8809523809523809, "grad_norm": 0.28487408965300465, "learning_rate": 1.413355717787134e-05, "loss": 2.1781, "step": 74 }, { "epoch": 0.8928571428571429, "grad_norm": 0.2342499729484135, "learning_rate": 1.3425421036992098e-05, "loss": 2.2248, "step": 75 }, { "epoch": 0.9047619047619048, "grad_norm": 0.3276211876914205, "learning_rate": 1.2781389883488218e-05, "loss": 2.0437, "step": 76 }, { "epoch": 0.9166666666666666, "grad_norm": 0.23498872906974066, "learning_rate": 1.2202456766718093e-05, "loss": 2.1563, "step": 77 }, { "epoch": 0.9285714285714286, "grad_norm": 0.24216692635411755, "learning_rate": 1.168951435958588e-05, "loss": 2.3686, "step": 78 }, { "epoch": 0.9404761904761905, "grad_norm": 0.24267162761351987, "learning_rate": 1.1243353582104556e-05, "loss": 2.1837, "step": 79 }, { "epoch": 0.9523809523809523, "grad_norm": 0.25968272691520106, "learning_rate": 1.0864662381854632e-05, "loss": 2.2079, "step": 80 }, { "epoch": 0.9642857142857143, "grad_norm": 0.2437768972745406, "learning_rate": 1.0554024673218807e-05, "loss": 2.1451, "step": 81 }, { "epoch": 0.9761904761904762, "grad_norm": 0.315983035460693, "learning_rate": 1.0311919437028318e-05, "loss": 2.3015, "step": 82 }, { "epoch": 0.9880952380952381, "grad_norm": 0.2704189055909359, "learning_rate": 1.0138719982009242e-05, "loss": 2.2505, "step": 83 }, { "epoch": 1.0, "grad_norm": 0.2530162105544282, "learning_rate": 1.003469336916747e-05, "loss": 2.3508, "step": 84 }, { "epoch": 1.0119047619047619, "grad_norm": 0.2513361659364113, "learning_rate": 1e-05, "loss": 2.2057, "step": 85 }, { "epoch": 1.0119047619047619, "eval_loss": 2.2760229110717773, "eval_runtime": 85.0943, "eval_samples_per_second": 0.235, "eval_steps_per_second": 0.118, "step": 85 } ], "logging_steps": 1, "max_steps": 85, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 50739334348800.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }