|
{ |
|
"best_metric": 0.840443960959029, |
|
"best_model_checkpoint": "/data/ephemeral/home/level2-nlp-datacentric-nlp-15/models/train_new_8564_7000fconcat+deepl_15000.csv_20241107_163124/checkpoint-600", |
|
"epoch": 2.0, |
|
"eval_steps": 100, |
|
"global_step": 692, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.028901734104046242, |
|
"grad_norm": 5.670835018157959, |
|
"learning_rate": 1.971098265895954e-05, |
|
"loss": 1.8852, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.057803468208092484, |
|
"grad_norm": 5.40836763381958, |
|
"learning_rate": 1.9421965317919077e-05, |
|
"loss": 1.6768, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.08670520231213873, |
|
"grad_norm": 6.977480888366699, |
|
"learning_rate": 1.9132947976878615e-05, |
|
"loss": 1.4293, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.11560693641618497, |
|
"grad_norm": 6.8324995040893555, |
|
"learning_rate": 1.8843930635838153e-05, |
|
"loss": 1.2372, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.14450867052023122, |
|
"grad_norm": 4.969491481781006, |
|
"learning_rate": 1.855491329479769e-05, |
|
"loss": 1.0231, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.17341040462427745, |
|
"grad_norm": 7.948673725128174, |
|
"learning_rate": 1.8265895953757225e-05, |
|
"loss": 0.9941, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.2023121387283237, |
|
"grad_norm": 7.314361572265625, |
|
"learning_rate": 1.7976878612716763e-05, |
|
"loss": 0.8929, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.23121387283236994, |
|
"grad_norm": 7.860761642456055, |
|
"learning_rate": 1.76878612716763e-05, |
|
"loss": 0.7236, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.26011560693641617, |
|
"grad_norm": 8.400164604187012, |
|
"learning_rate": 1.739884393063584e-05, |
|
"loss": 0.7342, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.28901734104046245, |
|
"grad_norm": 5.281578063964844, |
|
"learning_rate": 1.7109826589595377e-05, |
|
"loss": 0.7285, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.28901734104046245, |
|
"eval_f1": 0.7855931775121717, |
|
"eval_loss": 0.6838143467903137, |
|
"eval_runtime": 43.4732, |
|
"eval_samples_per_second": 108.895, |
|
"eval_steps_per_second": 3.404, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3179190751445087, |
|
"grad_norm": 6.559142589569092, |
|
"learning_rate": 1.6820809248554915e-05, |
|
"loss": 0.767, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.3468208092485549, |
|
"grad_norm": 6.961650371551514, |
|
"learning_rate": 1.6531791907514452e-05, |
|
"loss": 0.72, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.37572254335260113, |
|
"grad_norm": 7.83851432800293, |
|
"learning_rate": 1.624277456647399e-05, |
|
"loss": 0.6204, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.4046242774566474, |
|
"grad_norm": 5.967935562133789, |
|
"learning_rate": 1.5953757225433528e-05, |
|
"loss": 0.6811, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.43352601156069365, |
|
"grad_norm": 7.195727348327637, |
|
"learning_rate": 1.5664739884393066e-05, |
|
"loss": 0.7254, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.4624277456647399, |
|
"grad_norm": 6.0163750648498535, |
|
"learning_rate": 1.5375722543352604e-05, |
|
"loss": 0.5248, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.4913294797687861, |
|
"grad_norm": 7.599509239196777, |
|
"learning_rate": 1.508670520231214e-05, |
|
"loss": 0.6095, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.5202312138728323, |
|
"grad_norm": 11.452792167663574, |
|
"learning_rate": 1.4797687861271676e-05, |
|
"loss": 0.6339, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.5491329479768786, |
|
"grad_norm": 9.863639831542969, |
|
"learning_rate": 1.4508670520231216e-05, |
|
"loss": 0.5901, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.5780346820809249, |
|
"grad_norm": 6.931671619415283, |
|
"learning_rate": 1.4219653179190754e-05, |
|
"loss": 0.5106, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5780346820809249, |
|
"eval_f1": 0.8163391427279263, |
|
"eval_loss": 0.604805588722229, |
|
"eval_runtime": 43.5017, |
|
"eval_samples_per_second": 108.823, |
|
"eval_steps_per_second": 3.402, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.6069364161849711, |
|
"grad_norm": 7.751237869262695, |
|
"learning_rate": 1.393063583815029e-05, |
|
"loss": 0.6615, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.6358381502890174, |
|
"grad_norm": 7.981545925140381, |
|
"learning_rate": 1.3641618497109828e-05, |
|
"loss": 0.6194, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.6647398843930635, |
|
"grad_norm": 8.75810432434082, |
|
"learning_rate": 1.3352601156069365e-05, |
|
"loss": 0.6756, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.6936416184971098, |
|
"grad_norm": 7.4759521484375, |
|
"learning_rate": 1.3063583815028902e-05, |
|
"loss": 0.6079, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.7225433526011561, |
|
"grad_norm": 9.445594787597656, |
|
"learning_rate": 1.2774566473988441e-05, |
|
"loss": 0.5329, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.7514450867052023, |
|
"grad_norm": 7.227514266967773, |
|
"learning_rate": 1.2485549132947979e-05, |
|
"loss": 0.5342, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.7803468208092486, |
|
"grad_norm": 6.781445503234863, |
|
"learning_rate": 1.2196531791907515e-05, |
|
"loss": 0.5455, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.8092485549132948, |
|
"grad_norm": 7.4944353103637695, |
|
"learning_rate": 1.1907514450867053e-05, |
|
"loss": 0.5171, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.838150289017341, |
|
"grad_norm": 12.261595726013184, |
|
"learning_rate": 1.161849710982659e-05, |
|
"loss": 0.5793, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.8670520231213873, |
|
"grad_norm": 7.033445835113525, |
|
"learning_rate": 1.1329479768786129e-05, |
|
"loss": 0.6664, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.8670520231213873, |
|
"eval_f1": 0.8202014204332396, |
|
"eval_loss": 0.5958309769630432, |
|
"eval_runtime": 43.4645, |
|
"eval_samples_per_second": 108.916, |
|
"eval_steps_per_second": 3.405, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.8959537572254336, |
|
"grad_norm": 16.04691505432129, |
|
"learning_rate": 1.1040462427745667e-05, |
|
"loss": 0.5736, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.9248554913294798, |
|
"grad_norm": 6.906232833862305, |
|
"learning_rate": 1.0751445086705203e-05, |
|
"loss": 0.5677, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.953757225433526, |
|
"grad_norm": 7.995438098907471, |
|
"learning_rate": 1.046242774566474e-05, |
|
"loss": 0.5872, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.9826589595375722, |
|
"grad_norm": 7.967160701751709, |
|
"learning_rate": 1.0173410404624278e-05, |
|
"loss": 0.5363, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.0115606936416186, |
|
"grad_norm": 7.901902198791504, |
|
"learning_rate": 9.884393063583816e-06, |
|
"loss": 0.4558, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.0404624277456647, |
|
"grad_norm": 3.8353588581085205, |
|
"learning_rate": 9.595375722543352e-06, |
|
"loss": 0.563, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.069364161849711, |
|
"grad_norm": 10.367301940917969, |
|
"learning_rate": 9.306358381502892e-06, |
|
"loss": 0.4669, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.0982658959537572, |
|
"grad_norm": 3.960563898086548, |
|
"learning_rate": 9.017341040462428e-06, |
|
"loss": 0.4186, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.1271676300578035, |
|
"grad_norm": 5.368332862854004, |
|
"learning_rate": 8.728323699421966e-06, |
|
"loss": 0.4852, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.1560693641618498, |
|
"grad_norm": 4.498876094818115, |
|
"learning_rate": 8.439306358381504e-06, |
|
"loss": 0.5515, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.1560693641618498, |
|
"eval_f1": 0.832606512739663, |
|
"eval_loss": 0.5639938116073608, |
|
"eval_runtime": 43.4855, |
|
"eval_samples_per_second": 108.864, |
|
"eval_steps_per_second": 3.403, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.1849710982658959, |
|
"grad_norm": 6.101068496704102, |
|
"learning_rate": 8.150289017341042e-06, |
|
"loss": 0.5084, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.2138728323699421, |
|
"grad_norm": 8.67908000946045, |
|
"learning_rate": 7.86127167630058e-06, |
|
"loss": 0.4328, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.2427745664739884, |
|
"grad_norm": 6.932382106781006, |
|
"learning_rate": 7.5722543352601166e-06, |
|
"loss": 0.3798, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.2716763005780347, |
|
"grad_norm": 6.64376163482666, |
|
"learning_rate": 7.283236994219654e-06, |
|
"loss": 0.4763, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.300578034682081, |
|
"grad_norm": 5.648944854736328, |
|
"learning_rate": 6.9942196531791914e-06, |
|
"loss": 0.4008, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.3294797687861273, |
|
"grad_norm": 8.3818998336792, |
|
"learning_rate": 6.7052023121387284e-06, |
|
"loss": 0.4959, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.3583815028901733, |
|
"grad_norm": 6.271963119506836, |
|
"learning_rate": 6.416184971098266e-06, |
|
"loss": 0.4123, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.3872832369942196, |
|
"grad_norm": 10.868525505065918, |
|
"learning_rate": 6.127167630057804e-06, |
|
"loss": 0.4478, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.416184971098266, |
|
"grad_norm": 4.8018012046813965, |
|
"learning_rate": 5.838150289017341e-06, |
|
"loss": 0.3475, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.4450867052023122, |
|
"grad_norm": 7.195227146148682, |
|
"learning_rate": 5.549132947976878e-06, |
|
"loss": 0.5246, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.4450867052023122, |
|
"eval_f1": 0.8394367921843663, |
|
"eval_loss": 0.5455829501152039, |
|
"eval_runtime": 43.4865, |
|
"eval_samples_per_second": 108.861, |
|
"eval_steps_per_second": 3.403, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.4739884393063583, |
|
"grad_norm": 7.47812557220459, |
|
"learning_rate": 5.260115606936417e-06, |
|
"loss": 0.4008, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.5028901734104045, |
|
"grad_norm": 3.8493812084198, |
|
"learning_rate": 4.971098265895954e-06, |
|
"loss": 0.4166, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.5317919075144508, |
|
"grad_norm": 6.382811069488525, |
|
"learning_rate": 4.682080924855492e-06, |
|
"loss": 0.4287, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.560693641618497, |
|
"grad_norm": 4.259267330169678, |
|
"learning_rate": 4.3930635838150296e-06, |
|
"loss": 0.4493, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.5895953757225434, |
|
"grad_norm": 5.338929653167725, |
|
"learning_rate": 4.1040462427745666e-06, |
|
"loss": 0.4488, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.6184971098265897, |
|
"grad_norm": 4.65845251083374, |
|
"learning_rate": 3.815028901734104e-06, |
|
"loss": 0.4649, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.647398843930636, |
|
"grad_norm": 8.946381568908691, |
|
"learning_rate": 3.526011560693642e-06, |
|
"loss": 0.4826, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.6763005780346822, |
|
"grad_norm": 10.277519226074219, |
|
"learning_rate": 3.2369942196531797e-06, |
|
"loss": 0.4061, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.7052023121387283, |
|
"grad_norm": 9.004323959350586, |
|
"learning_rate": 2.947976878612717e-06, |
|
"loss": 0.463, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.7341040462427746, |
|
"grad_norm": 9.768237113952637, |
|
"learning_rate": 2.658959537572254e-06, |
|
"loss": 0.407, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.7341040462427746, |
|
"eval_f1": 0.840443960959029, |
|
"eval_loss": 0.5418282151222229, |
|
"eval_runtime": 43.4552, |
|
"eval_samples_per_second": 108.94, |
|
"eval_steps_per_second": 3.406, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.7630057803468207, |
|
"grad_norm": 3.791731119155884, |
|
"learning_rate": 2.369942196531792e-06, |
|
"loss": 0.4195, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.791907514450867, |
|
"grad_norm": 9.772268295288086, |
|
"learning_rate": 2.08092485549133e-06, |
|
"loss": 0.4644, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.8208092485549132, |
|
"grad_norm": 5.360107421875, |
|
"learning_rate": 1.791907514450867e-06, |
|
"loss": 0.4315, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.8497109826589595, |
|
"grad_norm": 9.654316902160645, |
|
"learning_rate": 1.502890173410405e-06, |
|
"loss": 0.435, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.8786127167630058, |
|
"grad_norm": 6.9839768409729, |
|
"learning_rate": 1.2138728323699423e-06, |
|
"loss": 0.387, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.907514450867052, |
|
"grad_norm": 8.008018493652344, |
|
"learning_rate": 9.248554913294798e-07, |
|
"loss": 0.4039, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.9364161849710984, |
|
"grad_norm": 7.851306438446045, |
|
"learning_rate": 6.358381502890174e-07, |
|
"loss": 0.4879, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.9653179190751446, |
|
"grad_norm": 10.8189115524292, |
|
"learning_rate": 3.468208092485549e-07, |
|
"loss": 0.3706, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.9942196531791907, |
|
"grad_norm": 8.877676963806152, |
|
"learning_rate": 5.7803468208092485e-08, |
|
"loss": 0.3861, |
|
"step": 690 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 692, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5812910382182400.0, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|