|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.04222972972972973, |
|
"eval_steps": 25, |
|
"global_step": 50, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0008445945945945946, |
|
"grad_norm": 7.727255344390869, |
|
"learning_rate": 2e-05, |
|
"loss": 8.372, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0008445945945945946, |
|
"eval_loss": 8.063016891479492, |
|
"eval_runtime": 125.6054, |
|
"eval_samples_per_second": 3.973, |
|
"eval_steps_per_second": 1.99, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0016891891891891893, |
|
"grad_norm": 9.669200897216797, |
|
"learning_rate": 4e-05, |
|
"loss": 8.046, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.002533783783783784, |
|
"grad_norm": 9.18079948425293, |
|
"learning_rate": 6e-05, |
|
"loss": 8.0321, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0033783783783783786, |
|
"grad_norm": 9.363329887390137, |
|
"learning_rate": 8e-05, |
|
"loss": 7.0182, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.004222972972972973, |
|
"grad_norm": 9.816231727600098, |
|
"learning_rate": 0.0001, |
|
"loss": 7.7225, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.005067567567567568, |
|
"grad_norm": 9.099088668823242, |
|
"learning_rate": 0.00012, |
|
"loss": 7.875, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0059121621621621625, |
|
"grad_norm": 9.296707153320312, |
|
"learning_rate": 0.00014, |
|
"loss": 5.6422, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.006756756756756757, |
|
"grad_norm": 9.570911407470703, |
|
"learning_rate": 0.00016, |
|
"loss": 3.8105, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.007601351351351352, |
|
"grad_norm": 8.131881713867188, |
|
"learning_rate": 0.00018, |
|
"loss": 2.0885, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.008445945945945946, |
|
"grad_norm": 5.208433628082275, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0562, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.009290540540540541, |
|
"grad_norm": 10.91145133972168, |
|
"learning_rate": 0.0001999390827019096, |
|
"loss": 2.9782, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.010135135135135136, |
|
"grad_norm": 17.846271514892578, |
|
"learning_rate": 0.00019975640502598244, |
|
"loss": 3.6074, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.01097972972972973, |
|
"grad_norm": 8.735333442687988, |
|
"learning_rate": 0.00019945218953682734, |
|
"loss": 0.805, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.011824324324324325, |
|
"grad_norm": 6.351312637329102, |
|
"learning_rate": 0.00019902680687415705, |
|
"loss": 1.1463, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.01266891891891892, |
|
"grad_norm": 2.7280004024505615, |
|
"learning_rate": 0.00019848077530122083, |
|
"loss": 0.2077, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.013513513513513514, |
|
"grad_norm": 5.975780487060547, |
|
"learning_rate": 0.00019781476007338058, |
|
"loss": 1.1476, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.014358108108108109, |
|
"grad_norm": 5.841843605041504, |
|
"learning_rate": 0.00019702957262759965, |
|
"loss": 1.0156, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.015202702702702704, |
|
"grad_norm": 6.997043132781982, |
|
"learning_rate": 0.0001961261695938319, |
|
"loss": 0.6625, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.016047297297297296, |
|
"grad_norm": 7.254495620727539, |
|
"learning_rate": 0.00019510565162951537, |
|
"loss": 0.8055, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.016891891891891893, |
|
"grad_norm": 5.706070423126221, |
|
"learning_rate": 0.00019396926207859084, |
|
"loss": 0.3771, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.017736486486486486, |
|
"grad_norm": 3.621236801147461, |
|
"learning_rate": 0.00019271838545667876, |
|
"loss": 0.2341, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.018581081081081082, |
|
"grad_norm": 1.3116642236709595, |
|
"learning_rate": 0.0001913545457642601, |
|
"loss": 0.0388, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.019425675675675675, |
|
"grad_norm": 6.483520030975342, |
|
"learning_rate": 0.0001898794046299167, |
|
"loss": 0.7181, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.02027027027027027, |
|
"grad_norm": 16.79340171813965, |
|
"learning_rate": 0.00018829475928589271, |
|
"loss": 1.0507, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.021114864864864864, |
|
"grad_norm": 3.9302544593811035, |
|
"learning_rate": 0.00018660254037844388, |
|
"loss": 0.8626, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.021114864864864864, |
|
"eval_loss": 0.4980570375919342, |
|
"eval_runtime": 126.9443, |
|
"eval_samples_per_second": 3.931, |
|
"eval_steps_per_second": 1.969, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.02195945945945946, |
|
"grad_norm": 3.3856303691864014, |
|
"learning_rate": 0.0001848048096156426, |
|
"loss": 0.3799, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.022804054054054054, |
|
"grad_norm": 4.26168966293335, |
|
"learning_rate": 0.00018290375725550417, |
|
"loss": 0.5836, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.02364864864864865, |
|
"grad_norm": 4.655025959014893, |
|
"learning_rate": 0.00018090169943749476, |
|
"loss": 0.6228, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.024493243243243243, |
|
"grad_norm": 5.267085075378418, |
|
"learning_rate": 0.00017880107536067218, |
|
"loss": 0.354, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.02533783783783784, |
|
"grad_norm": 3.4175171852111816, |
|
"learning_rate": 0.0001766044443118978, |
|
"loss": 0.3025, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.026182432432432432, |
|
"grad_norm": 3.4233884811401367, |
|
"learning_rate": 0.00017431448254773944, |
|
"loss": 0.3893, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.02702702702702703, |
|
"grad_norm": 0.795799195766449, |
|
"learning_rate": 0.0001719339800338651, |
|
"loss": 0.0842, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.02787162162162162, |
|
"grad_norm": 2.771843671798706, |
|
"learning_rate": 0.00016946583704589973, |
|
"loss": 0.5852, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.028716216216216218, |
|
"grad_norm": 2.5016746520996094, |
|
"learning_rate": 0.00016691306063588583, |
|
"loss": 0.467, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.02956081081081081, |
|
"grad_norm": 3.512495994567871, |
|
"learning_rate": 0.00016427876096865394, |
|
"loss": 0.1078, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.030405405405405407, |
|
"grad_norm": 2.59957218170166, |
|
"learning_rate": 0.0001615661475325658, |
|
"loss": 0.2617, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.03125, |
|
"grad_norm": 1.0484542846679688, |
|
"learning_rate": 0.00015877852522924732, |
|
"loss": 0.0766, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.03209459459459459, |
|
"grad_norm": 12.746214866638184, |
|
"learning_rate": 0.0001559192903470747, |
|
"loss": 0.3286, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.032939189189189186, |
|
"grad_norm": 0.457368403673172, |
|
"learning_rate": 0.0001529919264233205, |
|
"loss": 0.0239, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.033783783783783786, |
|
"grad_norm": 4.603786945343018, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 0.2592, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.03462837837837838, |
|
"grad_norm": 4.357280731201172, |
|
"learning_rate": 0.00014694715627858908, |
|
"loss": 0.3277, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.03547297297297297, |
|
"grad_norm": 5.606065273284912, |
|
"learning_rate": 0.00014383711467890774, |
|
"loss": 0.9881, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.036317567567567564, |
|
"grad_norm": 0.958200216293335, |
|
"learning_rate": 0.00014067366430758004, |
|
"loss": 0.0339, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.037162162162162164, |
|
"grad_norm": 0.9110239744186401, |
|
"learning_rate": 0.00013746065934159123, |
|
"loss": 0.0388, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.03800675675675676, |
|
"grad_norm": 10.007410049438477, |
|
"learning_rate": 0.00013420201433256689, |
|
"loss": 0.1342, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.03885135135135135, |
|
"grad_norm": 3.0555267333984375, |
|
"learning_rate": 0.00013090169943749476, |
|
"loss": 0.1913, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.03969594594594594, |
|
"grad_norm": 4.296360969543457, |
|
"learning_rate": 0.0001275637355816999, |
|
"loss": 0.2431, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.04054054054054054, |
|
"grad_norm": 0.9275014400482178, |
|
"learning_rate": 0.00012419218955996676, |
|
"loss": 0.0389, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.041385135135135136, |
|
"grad_norm": 0.2515358626842499, |
|
"learning_rate": 0.00012079116908177593, |
|
"loss": 0.0064, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.04222972972972973, |
|
"grad_norm": 0.12618118524551392, |
|
"learning_rate": 0.00011736481776669306, |
|
"loss": 0.0036, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.04222972972972973, |
|
"eval_loss": 0.24848654866218567, |
|
"eval_runtime": 127.0152, |
|
"eval_samples_per_second": 3.929, |
|
"eval_steps_per_second": 1.968, |
|
"step": 50 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 100, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.6109411688448e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|