{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.25985275010827197, "eval_steps": 25, "global_step": 75, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0034647033347769596, "grad_norm": 71.73021697998047, "learning_rate": 6.666666666666667e-05, "loss": 56.7725, "step": 1 }, { "epoch": 0.0034647033347769596, "eval_loss": 5.7633891105651855, "eval_runtime": 54.7107, "eval_samples_per_second": 4.46, "eval_steps_per_second": 2.23, "step": 1 }, { "epoch": 0.006929406669553919, "grad_norm": 58.02839279174805, "learning_rate": 0.00013333333333333334, "loss": 45.1267, "step": 2 }, { "epoch": 0.01039411000433088, "grad_norm": 62.47406768798828, "learning_rate": 0.0002, "loss": 51.5264, "step": 3 }, { "epoch": 0.013858813339107838, "grad_norm": 50.805538177490234, "learning_rate": 0.0001999048221581858, "loss": 48.1345, "step": 4 }, { "epoch": 0.0173235166738848, "grad_norm": 53.90876388549805, "learning_rate": 0.00019961946980917456, "loss": 33.368, "step": 5 }, { "epoch": 0.02078822000866176, "grad_norm": 47.89884567260742, "learning_rate": 0.00019914448613738106, "loss": 20.7941, "step": 6 }, { "epoch": 0.024252923343438718, "grad_norm": 46.44947814941406, "learning_rate": 0.00019848077530122083, "loss": 19.2011, "step": 7 }, { "epoch": 0.027717626678215677, "grad_norm": 29.739330291748047, "learning_rate": 0.00019762960071199333, "loss": 11.6099, "step": 8 }, { "epoch": 0.031182330012992636, "grad_norm": 37.299049377441406, "learning_rate": 0.00019659258262890683, "loss": 10.8285, "step": 9 }, { "epoch": 0.0346470333477696, "grad_norm": 37.623783111572266, "learning_rate": 0.0001953716950748227, "loss": 8.8167, "step": 10 }, { "epoch": 0.038111736682546554, "grad_norm": 25.743181228637695, "learning_rate": 0.00019396926207859084, "loss": 7.6365, "step": 11 }, { "epoch": 0.04157644001732352, "grad_norm": 21.669662475585938, "learning_rate": 0.0001923879532511287, "loss": 6.3246, "step": 12 }, { "epoch": 0.04504114335210048, "grad_norm": 21.14327049255371, "learning_rate": 0.000190630778703665, "loss": 3.8779, "step": 13 }, { "epoch": 0.048505846686877435, "grad_norm": 32.12599563598633, "learning_rate": 0.00018870108331782217, "loss": 5.3825, "step": 14 }, { "epoch": 0.0519705500216544, "grad_norm": 56.678955078125, "learning_rate": 0.00018660254037844388, "loss": 5.0414, "step": 15 }, { "epoch": 0.055435253356431353, "grad_norm": 51.66940689086914, "learning_rate": 0.0001843391445812886, "loss": 8.305, "step": 16 }, { "epoch": 0.058899956691208316, "grad_norm": 27.637439727783203, "learning_rate": 0.0001819152044288992, "loss": 6.1433, "step": 17 }, { "epoch": 0.06236466002598527, "grad_norm": 28.653242111206055, "learning_rate": 0.00017933533402912354, "loss": 4.0016, "step": 18 }, { "epoch": 0.06582936336076224, "grad_norm": 23.004844665527344, "learning_rate": 0.0001766044443118978, "loss": 4.7562, "step": 19 }, { "epoch": 0.0692940666955392, "grad_norm": 25.145166397094727, "learning_rate": 0.0001737277336810124, "loss": 4.5456, "step": 20 }, { "epoch": 0.07275877003031615, "grad_norm": 16.525177001953125, "learning_rate": 0.00017071067811865476, "loss": 4.3994, "step": 21 }, { "epoch": 0.07622347336509311, "grad_norm": 21.258804321289062, "learning_rate": 0.00016755902076156604, "loss": 4.9417, "step": 22 }, { "epoch": 0.07968817669987008, "grad_norm": 18.620487213134766, "learning_rate": 0.00016427876096865394, "loss": 4.2276, "step": 23 }, { "epoch": 0.08315288003464703, "grad_norm": 71.73140716552734, "learning_rate": 0.00016087614290087208, "loss": 5.1532, "step": 24 }, { "epoch": 0.08661758336942399, "grad_norm": 32.46612548828125, "learning_rate": 0.0001573576436351046, "loss": 3.328, "step": 25 }, { "epoch": 0.08661758336942399, "eval_loss": 0.6132519841194153, "eval_runtime": 55.2353, "eval_samples_per_second": 4.417, "eval_steps_per_second": 2.209, "step": 25 }, { "epoch": 0.09008228670420096, "grad_norm": 30.413021087646484, "learning_rate": 0.0001537299608346824, "loss": 4.3897, "step": 26 }, { "epoch": 0.09354699003897791, "grad_norm": 70.32632446289062, "learning_rate": 0.00015000000000000001, "loss": 6.1758, "step": 27 }, { "epoch": 0.09701169337375487, "grad_norm": 44.74322509765625, "learning_rate": 0.00014617486132350343, "loss": 5.2728, "step": 28 }, { "epoch": 0.10047639670853183, "grad_norm": 44.46076965332031, "learning_rate": 0.00014226182617406996, "loss": 7.1594, "step": 29 }, { "epoch": 0.1039411000433088, "grad_norm": 55.632720947265625, "learning_rate": 0.000138268343236509, "loss": 3.2506, "step": 30 }, { "epoch": 0.10740580337808575, "grad_norm": 64.20110321044922, "learning_rate": 0.00013420201433256689, "loss": 4.082, "step": 31 }, { "epoch": 0.11087050671286271, "grad_norm": 82.19364166259766, "learning_rate": 0.00013007057995042732, "loss": 5.7286, "step": 32 }, { "epoch": 0.11433521004763968, "grad_norm": 25.911882400512695, "learning_rate": 0.00012588190451025207, "loss": 3.3962, "step": 33 }, { "epoch": 0.11779991338241663, "grad_norm": 34.44158935546875, "learning_rate": 0.00012164396139381029, "loss": 4.11, "step": 34 }, { "epoch": 0.12126461671719359, "grad_norm": 29.16976547241211, "learning_rate": 0.00011736481776669306, "loss": 4.2445, "step": 35 }, { "epoch": 0.12472932005197054, "grad_norm": 18.717464447021484, "learning_rate": 0.00011305261922200519, "loss": 3.1105, "step": 36 }, { "epoch": 0.1281940233867475, "grad_norm": 12.39698314666748, "learning_rate": 0.00010871557427476583, "loss": 2.9836, "step": 37 }, { "epoch": 0.13165872672152448, "grad_norm": 20.110858917236328, "learning_rate": 0.00010436193873653361, "loss": 4.8843, "step": 38 }, { "epoch": 0.13512343005630142, "grad_norm": 22.39121437072754, "learning_rate": 0.0001, "loss": 3.7531, "step": 39 }, { "epoch": 0.1385881333910784, "grad_norm": 12.983887672424316, "learning_rate": 9.563806126346642e-05, "loss": 2.8063, "step": 40 }, { "epoch": 0.14205283672585534, "grad_norm": 12.021308898925781, "learning_rate": 9.128442572523417e-05, "loss": 1.8465, "step": 41 }, { "epoch": 0.1455175400606323, "grad_norm": 32.44965744018555, "learning_rate": 8.694738077799488e-05, "loss": 4.7937, "step": 42 }, { "epoch": 0.14898224339540928, "grad_norm": 16.291828155517578, "learning_rate": 8.263518223330697e-05, "loss": 4.0307, "step": 43 }, { "epoch": 0.15244694673018622, "grad_norm": 12.486902236938477, "learning_rate": 7.835603860618972e-05, "loss": 3.1398, "step": 44 }, { "epoch": 0.1559116500649632, "grad_norm": 20.737529754638672, "learning_rate": 7.411809548974792e-05, "loss": 2.7772, "step": 45 }, { "epoch": 0.15937635339974016, "grad_norm": 17.130455017089844, "learning_rate": 6.992942004957271e-05, "loss": 2.7659, "step": 46 }, { "epoch": 0.1628410567345171, "grad_norm": 33.27773666381836, "learning_rate": 6.579798566743314e-05, "loss": 4.4388, "step": 47 }, { "epoch": 0.16630576006929407, "grad_norm": 16.671640396118164, "learning_rate": 6.173165676349103e-05, "loss": 5.4897, "step": 48 }, { "epoch": 0.16977046340407104, "grad_norm": 32.55076599121094, "learning_rate": 5.773817382593008e-05, "loss": 5.7507, "step": 49 }, { "epoch": 0.17323516673884798, "grad_norm": 24.630674362182617, "learning_rate": 5.382513867649663e-05, "loss": 4.7681, "step": 50 }, { "epoch": 0.17323516673884798, "eval_loss": 0.4547581374645233, "eval_runtime": 55.2618, "eval_samples_per_second": 4.415, "eval_steps_per_second": 2.208, "step": 50 }, { "epoch": 0.17669987007362495, "grad_norm": 35.518733978271484, "learning_rate": 5.000000000000002e-05, "loss": 3.7489, "step": 51 }, { "epoch": 0.18016457340840192, "grad_norm": 13.69632625579834, "learning_rate": 4.6270039165317605e-05, "loss": 3.3506, "step": 52 }, { "epoch": 0.18362927674317886, "grad_norm": 8.78832721710205, "learning_rate": 4.264235636489542e-05, "loss": 1.1164, "step": 53 }, { "epoch": 0.18709398007795583, "grad_norm": 19.308809280395508, "learning_rate": 3.9123857099127936e-05, "loss": 2.4321, "step": 54 }, { "epoch": 0.19055868341273277, "grad_norm": 37.123531341552734, "learning_rate": 3.5721239031346066e-05, "loss": 2.8694, "step": 55 }, { "epoch": 0.19402338674750974, "grad_norm": 17.861862182617188, "learning_rate": 3.244097923843398e-05, "loss": 2.7996, "step": 56 }, { "epoch": 0.1974880900822867, "grad_norm": 25.85512351989746, "learning_rate": 2.9289321881345254e-05, "loss": 3.5098, "step": 57 }, { "epoch": 0.20095279341706365, "grad_norm": 15.257984161376953, "learning_rate": 2.6272266318987603e-05, "loss": 4.2756, "step": 58 }, { "epoch": 0.20441749675184062, "grad_norm": 9.036160469055176, "learning_rate": 2.339555568810221e-05, "loss": 2.1725, "step": 59 }, { "epoch": 0.2078822000866176, "grad_norm": 8.915935516357422, "learning_rate": 2.0664665970876496e-05, "loss": 2.2972, "step": 60 }, { "epoch": 0.21134690342139453, "grad_norm": 21.377731323242188, "learning_rate": 1.808479557110081e-05, "loss": 3.7501, "step": 61 }, { "epoch": 0.2148116067561715, "grad_norm": 25.860002517700195, "learning_rate": 1.566085541871145e-05, "loss": 4.4689, "step": 62 }, { "epoch": 0.21827631009094847, "grad_norm": 18.97432518005371, "learning_rate": 1.339745962155613e-05, "loss": 1.9634, "step": 63 }, { "epoch": 0.22174101342572541, "grad_norm": 19.91278839111328, "learning_rate": 1.129891668217783e-05, "loss": 2.4683, "step": 64 }, { "epoch": 0.22520571676050238, "grad_norm": 23.109867095947266, "learning_rate": 9.369221296335006e-06, "loss": 5.7752, "step": 65 }, { "epoch": 0.22867042009527935, "grad_norm": 15.709304809570312, "learning_rate": 7.612046748871327e-06, "loss": 3.541, "step": 66 }, { "epoch": 0.2321351234300563, "grad_norm": 20.902395248413086, "learning_rate": 6.030737921409169e-06, "loss": 3.8023, "step": 67 }, { "epoch": 0.23559982676483326, "grad_norm": 23.645151138305664, "learning_rate": 4.628304925177318e-06, "loss": 3.0177, "step": 68 }, { "epoch": 0.23906453009961023, "grad_norm": 17.126794815063477, "learning_rate": 3.40741737109318e-06, "loss": 2.0654, "step": 69 }, { "epoch": 0.24252923343438718, "grad_norm": 20.345962524414062, "learning_rate": 2.3703992880066638e-06, "loss": 3.7378, "step": 70 }, { "epoch": 0.24599393676916415, "grad_norm": 21.93646812438965, "learning_rate": 1.5192246987791981e-06, "loss": 2.4299, "step": 71 }, { "epoch": 0.2494586401039411, "grad_norm": 20.879924774169922, "learning_rate": 8.555138626189618e-07, "loss": 2.5581, "step": 72 }, { "epoch": 0.2529233434387181, "grad_norm": 31.036603927612305, "learning_rate": 3.805301908254455e-07, "loss": 4.7284, "step": 73 }, { "epoch": 0.256388046773495, "grad_norm": 24.942869186401367, "learning_rate": 9.517784181422019e-08, "loss": 2.1838, "step": 74 }, { "epoch": 0.25985275010827197, "grad_norm": 11.447266578674316, "learning_rate": 0.0, "loss": 2.1105, "step": 75 }, { "epoch": 0.25985275010827197, "eval_loss": 0.38869383931159973, "eval_runtime": 55.2151, "eval_samples_per_second": 4.419, "eval_steps_per_second": 2.21, "step": 75 } ], "logging_steps": 1, "max_steps": 75, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.9043369091072e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }