{ "best_metric": 0.840443960959029, "best_model_checkpoint": "/data/ephemeral/home/level2-nlp-datacentric-nlp-15/models/train_new_8564_7000fconcat+deepl_15000.csv_20241107_163124/checkpoint-600", "epoch": 2.0, "eval_steps": 100, "global_step": 692, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.028901734104046242, "grad_norm": 5.670835018157959, "learning_rate": 1.971098265895954e-05, "loss": 1.8852, "step": 10 }, { "epoch": 0.057803468208092484, "grad_norm": 5.40836763381958, "learning_rate": 1.9421965317919077e-05, "loss": 1.6768, "step": 20 }, { "epoch": 0.08670520231213873, "grad_norm": 6.977480888366699, "learning_rate": 1.9132947976878615e-05, "loss": 1.4293, "step": 30 }, { "epoch": 0.11560693641618497, "grad_norm": 6.8324995040893555, "learning_rate": 1.8843930635838153e-05, "loss": 1.2372, "step": 40 }, { "epoch": 0.14450867052023122, "grad_norm": 4.969491481781006, "learning_rate": 1.855491329479769e-05, "loss": 1.0231, "step": 50 }, { "epoch": 0.17341040462427745, "grad_norm": 7.948673725128174, "learning_rate": 1.8265895953757225e-05, "loss": 0.9941, "step": 60 }, { "epoch": 0.2023121387283237, "grad_norm": 7.314361572265625, "learning_rate": 1.7976878612716763e-05, "loss": 0.8929, "step": 70 }, { "epoch": 0.23121387283236994, "grad_norm": 7.860761642456055, "learning_rate": 1.76878612716763e-05, "loss": 0.7236, "step": 80 }, { "epoch": 0.26011560693641617, "grad_norm": 8.400164604187012, "learning_rate": 1.739884393063584e-05, "loss": 0.7342, "step": 90 }, { "epoch": 0.28901734104046245, "grad_norm": 5.281578063964844, "learning_rate": 1.7109826589595377e-05, "loss": 0.7285, "step": 100 }, { "epoch": 0.28901734104046245, "eval_f1": 0.7855931775121717, "eval_loss": 0.6838143467903137, "eval_runtime": 43.4732, "eval_samples_per_second": 108.895, "eval_steps_per_second": 3.404, "step": 100 }, { "epoch": 0.3179190751445087, "grad_norm": 6.559142589569092, "learning_rate": 1.6820809248554915e-05, "loss": 0.767, "step": 110 }, { "epoch": 0.3468208092485549, "grad_norm": 6.961650371551514, "learning_rate": 1.6531791907514452e-05, "loss": 0.72, "step": 120 }, { "epoch": 0.37572254335260113, "grad_norm": 7.83851432800293, "learning_rate": 1.624277456647399e-05, "loss": 0.6204, "step": 130 }, { "epoch": 0.4046242774566474, "grad_norm": 5.967935562133789, "learning_rate": 1.5953757225433528e-05, "loss": 0.6811, "step": 140 }, { "epoch": 0.43352601156069365, "grad_norm": 7.195727348327637, "learning_rate": 1.5664739884393066e-05, "loss": 0.7254, "step": 150 }, { "epoch": 0.4624277456647399, "grad_norm": 6.0163750648498535, "learning_rate": 1.5375722543352604e-05, "loss": 0.5248, "step": 160 }, { "epoch": 0.4913294797687861, "grad_norm": 7.599509239196777, "learning_rate": 1.508670520231214e-05, "loss": 0.6095, "step": 170 }, { "epoch": 0.5202312138728323, "grad_norm": 11.452792167663574, "learning_rate": 1.4797687861271676e-05, "loss": 0.6339, "step": 180 }, { "epoch": 0.5491329479768786, "grad_norm": 9.863639831542969, "learning_rate": 1.4508670520231216e-05, "loss": 0.5901, "step": 190 }, { "epoch": 0.5780346820809249, "grad_norm": 6.931671619415283, "learning_rate": 1.4219653179190754e-05, "loss": 0.5106, "step": 200 }, { "epoch": 0.5780346820809249, "eval_f1": 0.8163391427279263, "eval_loss": 0.604805588722229, "eval_runtime": 43.5017, "eval_samples_per_second": 108.823, "eval_steps_per_second": 3.402, "step": 200 }, { "epoch": 0.6069364161849711, "grad_norm": 7.751237869262695, "learning_rate": 
1.393063583815029e-05, "loss": 0.6615, "step": 210 }, { "epoch": 0.6358381502890174, "grad_norm": 7.981545925140381, "learning_rate": 1.3641618497109828e-05, "loss": 0.6194, "step": 220 }, { "epoch": 0.6647398843930635, "grad_norm": 8.75810432434082, "learning_rate": 1.3352601156069365e-05, "loss": 0.6756, "step": 230 }, { "epoch": 0.6936416184971098, "grad_norm": 7.4759521484375, "learning_rate": 1.3063583815028902e-05, "loss": 0.6079, "step": 240 }, { "epoch": 0.7225433526011561, "grad_norm": 9.445594787597656, "learning_rate": 1.2774566473988441e-05, "loss": 0.5329, "step": 250 }, { "epoch": 0.7514450867052023, "grad_norm": 7.227514266967773, "learning_rate": 1.2485549132947979e-05, "loss": 0.5342, "step": 260 }, { "epoch": 0.7803468208092486, "grad_norm": 6.781445503234863, "learning_rate": 1.2196531791907515e-05, "loss": 0.5455, "step": 270 }, { "epoch": 0.8092485549132948, "grad_norm": 7.4944353103637695, "learning_rate": 1.1907514450867053e-05, "loss": 0.5171, "step": 280 }, { "epoch": 0.838150289017341, "grad_norm": 12.261595726013184, "learning_rate": 1.161849710982659e-05, "loss": 0.5793, "step": 290 }, { "epoch": 0.8670520231213873, "grad_norm": 7.033445835113525, "learning_rate": 1.1329479768786129e-05, "loss": 0.6664, "step": 300 }, { "epoch": 0.8670520231213873, "eval_f1": 0.8202014204332396, "eval_loss": 0.5958309769630432, "eval_runtime": 43.4645, "eval_samples_per_second": 108.916, "eval_steps_per_second": 3.405, "step": 300 }, { "epoch": 0.8959537572254336, "grad_norm": 16.04691505432129, "learning_rate": 1.1040462427745667e-05, "loss": 0.5736, "step": 310 }, { "epoch": 0.9248554913294798, "grad_norm": 6.906232833862305, "learning_rate": 1.0751445086705203e-05, "loss": 0.5677, "step": 320 }, { "epoch": 0.953757225433526, "grad_norm": 7.995438098907471, "learning_rate": 1.046242774566474e-05, "loss": 0.5872, "step": 330 }, { "epoch": 0.9826589595375722, "grad_norm": 7.967160701751709, "learning_rate": 1.0173410404624278e-05, "loss": 0.5363, "step": 340 }, { "epoch": 1.0115606936416186, "grad_norm": 7.901902198791504, "learning_rate": 9.884393063583816e-06, "loss": 0.4558, "step": 350 }, { "epoch": 1.0404624277456647, "grad_norm": 3.8353588581085205, "learning_rate": 9.595375722543352e-06, "loss": 0.563, "step": 360 }, { "epoch": 1.069364161849711, "grad_norm": 10.367301940917969, "learning_rate": 9.306358381502892e-06, "loss": 0.4669, "step": 370 }, { "epoch": 1.0982658959537572, "grad_norm": 3.960563898086548, "learning_rate": 9.017341040462428e-06, "loss": 0.4186, "step": 380 }, { "epoch": 1.1271676300578035, "grad_norm": 5.368332862854004, "learning_rate": 8.728323699421966e-06, "loss": 0.4852, "step": 390 }, { "epoch": 1.1560693641618498, "grad_norm": 4.498876094818115, "learning_rate": 8.439306358381504e-06, "loss": 0.5515, "step": 400 }, { "epoch": 1.1560693641618498, "eval_f1": 0.832606512739663, "eval_loss": 0.5639938116073608, "eval_runtime": 43.4855, "eval_samples_per_second": 108.864, "eval_steps_per_second": 3.403, "step": 400 }, { "epoch": 1.1849710982658959, "grad_norm": 6.101068496704102, "learning_rate": 8.150289017341042e-06, "loss": 0.5084, "step": 410 }, { "epoch": 1.2138728323699421, "grad_norm": 8.67908000946045, "learning_rate": 7.86127167630058e-06, "loss": 0.4328, "step": 420 }, { "epoch": 1.2427745664739884, "grad_norm": 6.932382106781006, "learning_rate": 7.5722543352601166e-06, "loss": 0.3798, "step": 430 }, { "epoch": 1.2716763005780347, "grad_norm": 6.64376163482666, "learning_rate": 7.283236994219654e-06, "loss": 0.4763, "step": 440 }, { 
"epoch": 1.300578034682081, "grad_norm": 5.648944854736328, "learning_rate": 6.9942196531791914e-06, "loss": 0.4008, "step": 450 }, { "epoch": 1.3294797687861273, "grad_norm": 8.3818998336792, "learning_rate": 6.7052023121387284e-06, "loss": 0.4959, "step": 460 }, { "epoch": 1.3583815028901733, "grad_norm": 6.271963119506836, "learning_rate": 6.416184971098266e-06, "loss": 0.4123, "step": 470 }, { "epoch": 1.3872832369942196, "grad_norm": 10.868525505065918, "learning_rate": 6.127167630057804e-06, "loss": 0.4478, "step": 480 }, { "epoch": 1.416184971098266, "grad_norm": 4.8018012046813965, "learning_rate": 5.838150289017341e-06, "loss": 0.3475, "step": 490 }, { "epoch": 1.4450867052023122, "grad_norm": 7.195227146148682, "learning_rate": 5.549132947976878e-06, "loss": 0.5246, "step": 500 }, { "epoch": 1.4450867052023122, "eval_f1": 0.8394367921843663, "eval_loss": 0.5455829501152039, "eval_runtime": 43.4865, "eval_samples_per_second": 108.861, "eval_steps_per_second": 3.403, "step": 500 }, { "epoch": 1.4739884393063583, "grad_norm": 7.47812557220459, "learning_rate": 5.260115606936417e-06, "loss": 0.4008, "step": 510 }, { "epoch": 1.5028901734104045, "grad_norm": 3.8493812084198, "learning_rate": 4.971098265895954e-06, "loss": 0.4166, "step": 520 }, { "epoch": 1.5317919075144508, "grad_norm": 6.382811069488525, "learning_rate": 4.682080924855492e-06, "loss": 0.4287, "step": 530 }, { "epoch": 1.560693641618497, "grad_norm": 4.259267330169678, "learning_rate": 4.3930635838150296e-06, "loss": 0.4493, "step": 540 }, { "epoch": 1.5895953757225434, "grad_norm": 5.338929653167725, "learning_rate": 4.1040462427745666e-06, "loss": 0.4488, "step": 550 }, { "epoch": 1.6184971098265897, "grad_norm": 4.65845251083374, "learning_rate": 3.815028901734104e-06, "loss": 0.4649, "step": 560 }, { "epoch": 1.647398843930636, "grad_norm": 8.946381568908691, "learning_rate": 3.526011560693642e-06, "loss": 0.4826, "step": 570 }, { "epoch": 1.6763005780346822, "grad_norm": 10.277519226074219, "learning_rate": 3.2369942196531797e-06, "loss": 0.4061, "step": 580 }, { "epoch": 1.7052023121387283, "grad_norm": 9.004323959350586, "learning_rate": 2.947976878612717e-06, "loss": 0.463, "step": 590 }, { "epoch": 1.7341040462427746, "grad_norm": 9.768237113952637, "learning_rate": 2.658959537572254e-06, "loss": 0.407, "step": 600 }, { "epoch": 1.7341040462427746, "eval_f1": 0.840443960959029, "eval_loss": 0.5418282151222229, "eval_runtime": 43.4552, "eval_samples_per_second": 108.94, "eval_steps_per_second": 3.406, "step": 600 }, { "epoch": 1.7630057803468207, "grad_norm": 3.791731119155884, "learning_rate": 2.369942196531792e-06, "loss": 0.4195, "step": 610 }, { "epoch": 1.791907514450867, "grad_norm": 9.772268295288086, "learning_rate": 2.08092485549133e-06, "loss": 0.4644, "step": 620 }, { "epoch": 1.8208092485549132, "grad_norm": 5.360107421875, "learning_rate": 1.791907514450867e-06, "loss": 0.4315, "step": 630 }, { "epoch": 1.8497109826589595, "grad_norm": 9.654316902160645, "learning_rate": 1.502890173410405e-06, "loss": 0.435, "step": 640 }, { "epoch": 1.8786127167630058, "grad_norm": 6.9839768409729, "learning_rate": 1.2138728323699423e-06, "loss": 0.387, "step": 650 }, { "epoch": 1.907514450867052, "grad_norm": 8.008018493652344, "learning_rate": 9.248554913294798e-07, "loss": 0.4039, "step": 660 }, { "epoch": 1.9364161849710984, "grad_norm": 7.851306438446045, "learning_rate": 6.358381502890174e-07, "loss": 0.4879, "step": 670 }, { "epoch": 1.9653179190751446, "grad_norm": 10.8189115524292, "learning_rate": 
3.468208092485549e-07, "loss": 0.3706, "step": 680 }, { "epoch": 1.9942196531791907, "grad_norm": 8.877676963806152, "learning_rate": 5.7803468208092485e-08, "loss": 0.3861, "step": 690 } ], "logging_steps": 10, "max_steps": 692, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5812910382182400.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }
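The JSON above is the Trainer state for this run (linear LR decay from ~1.97e-5 over 692 steps, batch size 32, eval/save every 100 steps). Below is a minimal sketch, not part of the original artifact, assuming the state is saved under the standard filename trainer_state.json in the working directory; it loads the log, prints the eval F1/loss at each evaluation step, and confirms that the best step reported here (step 600, eval_f1 ≈ 0.8404) matches best_model_checkpoint.

# summarize_trainer_state.py -- illustrative sketch, hypothetical path
import json

with open("trainer_state.json") as f:  # assumed location of the state file above
    state = json.load(f)

# Keep only the evaluation entries (they carry "eval_f1"), not the per-10-step training logs.
evals = [e for e in state["log_history"] if "eval_f1" in e]
for e in evals:
    print(f"step {e['step']:>4}  eval_f1 {e['eval_f1']:.4f}  eval_loss {e['eval_loss']:.4f}")

# The best checkpoint should correspond to the highest eval_f1 seen during training.
best = max(evals, key=lambda e: e["eval_f1"])
print("best step:", best["step"], "eval_f1:", round(best["eval_f1"], 4))
print("best_model_checkpoint:", state["best_model_checkpoint"])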