selmamalak committed
Commit
9d78380
1 Parent(s): 5339525

End of training

Files changed (5)
  1. README.md +5 -5
  2. all_results.json +16 -0
  3. eval_results.json +11 -0
  4. train_results.json +8 -0
  5. trainer_state.json +1459 -0
README.md CHANGED
@@ -23,11 +23,11 @@ should probably proofread and complete it, then remove this comment. -->
 
  This model is a fine-tuned version of [microsoft/beit-base-patch16-224-pt22k-ft22k](https://huggingface.co/microsoft/beit-base-patch16-224-pt22k-ft22k) on the medmnist-v2 dataset.
  It achieves the following results on the evaluation set:
- - Loss: 0.0785
- - Accuracy: 0.9708
- - Precision: 0.9668
- - Recall: 0.9737
- - F1: 0.9698
+ - Loss: 0.0847
+ - Accuracy: 0.9737
+ - Precision: 0.9726
+ - Recall: 0.9724
+ - F1: 0.9724
 
  ## Model description
 
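The updated README bullets are simply the raw values from this commit's eval_results.json rounded to four decimal places. A minimal stdlib sketch of that rounding (values copied from the files below):

```python
# Raw eval metrics as stored in eval_results.json in this commit.
metrics = {
    "eval_loss": 0.08470147103071213,
    "eval_accuracy": 0.9736919029523531,
    "eval_precision": 0.9726422315270469,
    "eval_recall": 0.9724099912288373,
    "eval_f1": 0.9724362036912011,
}

# Round to the four decimals shown in the README bullet list.
readme = {k: round(v, 4) for k, v in metrics.items()}
print(readme)
```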
all_results.json ADDED
@@ -0,0 +1,16 @@
+ {
+ "epoch": 10.0,
+ "eval_accuracy": 0.9736919029523531,
+ "eval_f1": 0.9724362036912011,
+ "eval_loss": 0.08470147103071213,
+ "eval_precision": 0.9726422315270469,
+ "eval_recall": 0.9724099912288373,
+ "eval_runtime": 20.7035,
+ "eval_samples_per_second": 165.237,
+ "eval_steps_per_second": 10.336,
+ "total_flos": 9.328175742872125e+18,
+ "train_loss": 0.3662890907277398,
+ "train_runtime": 1600.7009,
+ "train_samples_per_second": 74.711,
+ "train_steps_per_second": 1.168
+ }
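The runtime and throughput fields above are mutually consistent, which gives a quick sanity check on the evaluation set size implied by this run. A minimal sketch; the derived sample, batch, and batch-size counts are inferences from the stored figures, not values stated anywhere in the files:

```python
# Figures copied from all_results.json in this commit.
eval_runtime = 20.7035             # seconds
eval_samples_per_second = 165.237
eval_steps_per_second = 10.336

# Derived (approximate) totals -- inferred, not stored in the file.
approx_eval_samples = eval_runtime * eval_samples_per_second   # ~3421 images
approx_eval_batches = eval_runtime * eval_steps_per_second     # ~214 batches
approx_batch_size = approx_eval_samples / approx_eval_batches  # ~16 per batch
print(round(approx_eval_samples), round(approx_eval_batches), round(approx_batch_size))
```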
eval_results.json ADDED
@@ -0,0 +1,11 @@
+ {
+ "epoch": 10.0,
+ "eval_accuracy": 0.9736919029523531,
+ "eval_f1": 0.9724362036912011,
+ "eval_loss": 0.08470147103071213,
+ "eval_precision": 0.9726422315270469,
+ "eval_recall": 0.9724099912288373,
+ "eval_runtime": 20.7035,
+ "eval_samples_per_second": 165.237,
+ "eval_steps_per_second": 10.336
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "epoch": 10.0,
+ "total_flos": 9.328175742872125e+18,
+ "train_loss": 0.3662890907277398,
+ "train_runtime": 1600.7009,
+ "train_samples_per_second": 74.711,
+ "train_steps_per_second": 1.168
+ }
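The training throughput numbers can be cross-checked the same way, and they line up with the 1870 optimizer steps recorded in trainer_state.json. A small sketch; the per-epoch sample count and effective batch size are inferences, not stored values:

```python
# Figures copied from train_results.json in this commit.
train_runtime = 1600.7009           # seconds
train_samples_per_second = 74.711
train_steps_per_second = 1.168
epochs = 10.0

total_steps = train_runtime * train_steps_per_second      # ~1870, matches global_step
total_samples = train_runtime * train_samples_per_second  # ~119,590 over 10 epochs
samples_per_epoch = total_samples / epochs                # ~11,959 training images
batch_size = total_samples / total_steps                  # ~64 samples per step
print(round(total_steps), round(samples_per_epoch), round(batch_size))
```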
trainer_state.json ADDED
@@ -0,0 +1,1459 @@
+ {
+ "best_metric": 0.9707943925233645,
+ "best_model_checkpoint": "beit-base-patch16-224-pt22k-ft22k-finetuned-lora-medmnistv2/checkpoint-1870",
+ "epoch": 10.0,
+ "eval_steps": 500,
+ "global_step": 1870,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.053475935828877004,
+ "grad_norm": 4.650041580200195,
+ "learning_rate": 0.004973262032085562,
+ "loss": 1.5063,
+ "step": 10
+ },
+ {
+ "epoch": 0.10695187165775401,
+ "grad_norm": 3.0658373832702637,
+ "learning_rate": 0.004946524064171123,
+ "loss": 0.8711,
+ "step": 20
+ },
+ {
+ "epoch": 0.16042780748663102,
+ "grad_norm": 2.9676272869110107,
+ "learning_rate": 0.004919786096256685,
+ "loss": 0.8,
+ "step": 30
+ },
+ {
+ "epoch": 0.21390374331550802,
+ "grad_norm": 2.5159189701080322,
+ "learning_rate": 0.004893048128342246,
+ "loss": 0.7794,
+ "step": 40
+ },
+ {
+ "epoch": 0.26737967914438504,
+ "grad_norm": 2.4576735496520996,
+ "learning_rate": 0.004868983957219251,
+ "loss": 0.8748,
+ "step": 50
+ },
+ {
+ "epoch": 0.32085561497326204,
+ "grad_norm": 1.9533675909042358,
+ "learning_rate": 0.004842245989304813,
+ "loss": 0.6213,
+ "step": 60
+ },
+ {
+ "epoch": 0.37433155080213903,
+ "grad_norm": 3.91825795173645,
+ "learning_rate": 0.004815508021390374,
+ "loss": 0.6883,
+ "step": 70
+ },
+ {
+ "epoch": 0.42780748663101603,
+ "grad_norm": 3.228422164916992,
+ "learning_rate": 0.004788770053475936,
+ "loss": 0.7019,
+ "step": 80
+ },
+ {
+ "epoch": 0.48128342245989303,
+ "grad_norm": 4.45206356048584,
+ "learning_rate": 0.004762032085561497,
+ "loss": 0.5394,
+ "step": 90
+ },
+ {
+ "epoch": 0.5347593582887701,
+ "grad_norm": 2.184957504272461,
+ "learning_rate": 0.004735294117647059,
+ "loss": 0.5543,
+ "step": 100
+ },
+ {
+ "epoch": 0.5882352941176471,
+ "grad_norm": 2.246079206466675,
+ "learning_rate": 0.00470855614973262,
+ "loss": 0.5738,
+ "step": 110
+ },
+ {
+ "epoch": 0.6417112299465241,
+ "grad_norm": 2.6914820671081543,
+ "learning_rate": 0.004681818181818182,
+ "loss": 0.6209,
+ "step": 120
+ },
+ {
+ "epoch": 0.6951871657754011,
+ "grad_norm": 2.5458545684814453,
+ "learning_rate": 0.0046550802139037435,
+ "loss": 0.5597,
+ "step": 130
+ },
+ {
+ "epoch": 0.7486631016042781,
+ "grad_norm": 2.676391363143921,
+ "learning_rate": 0.004628342245989305,
+ "loss": 0.5273,
+ "step": 140
+ },
+ {
+ "epoch": 0.8021390374331551,
+ "grad_norm": 2.5059385299682617,
+ "learning_rate": 0.0046016042780748665,
+ "loss": 0.5199,
+ "step": 150
+ },
+ {
+ "epoch": 0.8556149732620321,
+ "grad_norm": 1.451249122619629,
+ "learning_rate": 0.004574866310160428,
+ "loss": 0.5509,
+ "step": 160
+ },
+ {
+ "epoch": 0.9090909090909091,
+ "grad_norm": 2.5957276821136475,
+ "learning_rate": 0.00454812834224599,
+ "loss": 0.5336,
+ "step": 170
+ },
+ {
+ "epoch": 0.9625668449197861,
+ "grad_norm": 2.4229955673217773,
+ "learning_rate": 0.004521390374331551,
+ "loss": 0.4657,
+ "step": 180
+ },
+ {
+ "epoch": 1.0,
+ "eval_accuracy": 0.9094626168224299,
+ "eval_f1": 0.8972949130385568,
+ "eval_loss": 0.2451503425836563,
+ "eval_precision": 0.8964084875867973,
+ "eval_recall": 0.9082806506629539,
+ "eval_runtime": 10.2386,
+ "eval_samples_per_second": 167.21,
+ "eval_steps_per_second": 10.451,
+ "step": 187
+ },
+ {
+ "epoch": 1.0160427807486632,
+ "grad_norm": 2.3994851112365723,
+ "learning_rate": 0.004494652406417113,
+ "loss": 0.4772,
+ "step": 190
+ },
+ {
+ "epoch": 1.0695187165775402,
+ "grad_norm": 1.985571265220642,
+ "learning_rate": 0.004467914438502674,
+ "loss": 0.5995,
+ "step": 200
+ },
+ {
+ "epoch": 1.1229946524064172,
+ "grad_norm": 2.3798632621765137,
+ "learning_rate": 0.004441176470588235,
+ "loss": 0.5686,
+ "step": 210
+ },
+ {
+ "epoch": 1.1764705882352942,
+ "grad_norm": 3.1128406524658203,
+ "learning_rate": 0.004414438502673797,
+ "loss": 0.4984,
+ "step": 220
+ },
+ {
+ "epoch": 1.2299465240641712,
+ "grad_norm": 2.8572049140930176,
+ "learning_rate": 0.004387700534759359,
+ "loss": 0.5027,
+ "step": 230
+ },
+ {
+ "epoch": 1.2834224598930482,
+ "grad_norm": 5.178213119506836,
+ "learning_rate": 0.00436096256684492,
+ "loss": 0.4864,
+ "step": 240
+ },
+ {
+ "epoch": 1.3368983957219251,
+ "grad_norm": 1.9515773057937622,
+ "learning_rate": 0.004334224598930481,
+ "loss": 0.4528,
+ "step": 250
+ },
+ {
+ "epoch": 1.3903743315508021,
+ "grad_norm": 3.023959159851074,
+ "learning_rate": 0.0043074866310160425,
+ "loss": 0.5513,
+ "step": 260
+ },
+ {
+ "epoch": 1.4438502673796791,
+ "grad_norm": 2.371218204498291,
+ "learning_rate": 0.004280748663101605,
+ "loss": 0.442,
+ "step": 270
+ },
+ {
+ "epoch": 1.4973262032085561,
+ "grad_norm": 2.111191987991333,
+ "learning_rate": 0.004254010695187166,
+ "loss": 0.6163,
+ "step": 280
+ },
+ {
+ "epoch": 1.5508021390374331,
+ "grad_norm": 2.123419761657715,
+ "learning_rate": 0.004227272727272727,
+ "loss": 0.5522,
+ "step": 290
+ },
+ {
+ "epoch": 1.6042780748663101,
+ "grad_norm": 1.6425999402999878,
+ "learning_rate": 0.004200534759358289,
+ "loss": 0.4601,
+ "step": 300
+ },
+ {
+ "epoch": 1.6577540106951871,
+ "grad_norm": 3.847395420074463,
+ "learning_rate": 0.00417379679144385,
+ "loss": 0.5434,
+ "step": 310
+ },
+ {
+ "epoch": 1.7112299465240641,
+ "grad_norm": 1.8732799291610718,
+ "learning_rate": 0.004147058823529412,
+ "loss": 0.4952,
+ "step": 320
+ },
+ {
+ "epoch": 1.7647058823529411,
+ "grad_norm": 1.4881893396377563,
+ "learning_rate": 0.004120320855614973,
+ "loss": 0.4926,
+ "step": 330
+ },
+ {
+ "epoch": 1.8181818181818183,
+ "grad_norm": 1.9936500787734985,
+ "learning_rate": 0.004093582887700535,
+ "loss": 0.4582,
+ "step": 340
+ },
+ {
+ "epoch": 1.8716577540106951,
+ "grad_norm": 4.784737586975098,
+ "learning_rate": 0.004066844919786096,
+ "loss": 0.4839,
+ "step": 350
+ },
+ {
+ "epoch": 1.9251336898395723,
+ "grad_norm": 2.403982162475586,
+ "learning_rate": 0.004040106951871658,
+ "loss": 0.5868,
+ "step": 360
+ },
+ {
+ "epoch": 1.9786096256684491,
+ "grad_norm": 1.7464922666549683,
+ "learning_rate": 0.004013368983957219,
+ "loss": 0.4327,
+ "step": 370
+ },
+ {
+ "epoch": 2.0,
+ "eval_accuracy": 0.9182242990654206,
+ "eval_f1": 0.9007413709436916,
+ "eval_loss": 0.21109923720359802,
+ "eval_precision": 0.9299210483133126,
+ "eval_recall": 0.8921235393972065,
+ "eval_runtime": 10.4332,
+ "eval_samples_per_second": 164.091,
+ "eval_steps_per_second": 10.256,
+ "step": 374
+ },
+ {
+ "epoch": 2.0320855614973263,
+ "grad_norm": 1.444707989692688,
+ "learning_rate": 0.003986631016042781,
+ "loss": 0.478,
+ "step": 380
+ },
+ {
+ "epoch": 2.085561497326203,
+ "grad_norm": 1.4123905897140503,
+ "learning_rate": 0.003959893048128342,
+ "loss": 0.5,
+ "step": 390
+ },
+ {
+ "epoch": 2.1390374331550803,
+ "grad_norm": 2.96335768699646,
+ "learning_rate": 0.003933155080213904,
+ "loss": 0.5348,
+ "step": 400
+ },
+ {
+ "epoch": 2.192513368983957,
+ "grad_norm": 1.4397529363632202,
+ "learning_rate": 0.0039064171122994654,
+ "loss": 0.4571,
+ "step": 410
+ },
+ {
+ "epoch": 2.2459893048128343,
+ "grad_norm": 1.821366548538208,
+ "learning_rate": 0.0038796791443850265,
+ "loss": 0.4982,
+ "step": 420
+ },
+ {
+ "epoch": 2.299465240641711,
+ "grad_norm": 2.112130641937256,
+ "learning_rate": 0.0038529411764705885,
+ "loss": 0.4343,
+ "step": 430
+ },
+ {
+ "epoch": 2.3529411764705883,
+ "grad_norm": 1.942734956741333,
+ "learning_rate": 0.00382620320855615,
+ "loss": 0.5078,
+ "step": 440
+ },
+ {
+ "epoch": 2.406417112299465,
+ "grad_norm": 2.774502754211426,
+ "learning_rate": 0.003799465240641711,
+ "loss": 0.4016,
+ "step": 450
+ },
+ {
+ "epoch": 2.4598930481283423,
+ "grad_norm": 2.139463424682617,
+ "learning_rate": 0.0037727272727272726,
+ "loss": 0.5415,
+ "step": 460
+ },
+ {
+ "epoch": 2.5133689839572195,
+ "grad_norm": 1.9148341417312622,
+ "learning_rate": 0.003745989304812834,
+ "loss": 0.4417,
+ "step": 470
+ },
+ {
+ "epoch": 2.5668449197860963,
+ "grad_norm": 1.9109567403793335,
+ "learning_rate": 0.003719251336898396,
+ "loss": 0.4273,
+ "step": 480
+ },
+ {
+ "epoch": 2.620320855614973,
+ "grad_norm": 2.2219059467315674,
+ "learning_rate": 0.0036925133689839572,
+ "loss": 0.5218,
+ "step": 490
+ },
+ {
+ "epoch": 2.6737967914438503,
+ "grad_norm": 3.378606081008911,
+ "learning_rate": 0.0036657754010695188,
+ "loss": 0.4318,
+ "step": 500
+ },
+ {
+ "epoch": 2.7272727272727275,
+ "grad_norm": 1.668760061264038,
+ "learning_rate": 0.0036390374331550803,
+ "loss": 0.4447,
+ "step": 510
+ },
+ {
+ "epoch": 2.7807486631016043,
+ "grad_norm": 1.830342411994934,
+ "learning_rate": 0.0036122994652406414,
+ "loss": 0.4507,
+ "step": 520
+ },
+ {
+ "epoch": 2.834224598930481,
+ "grad_norm": 2.2146425247192383,
+ "learning_rate": 0.0035855614973262034,
+ "loss": 0.4127,
+ "step": 530
+ },
+ {
+ "epoch": 2.8877005347593583,
+ "grad_norm": 1.3959295749664307,
+ "learning_rate": 0.003558823529411765,
+ "loss": 0.4353,
+ "step": 540
+ },
+ {
+ "epoch": 2.9411764705882355,
+ "grad_norm": 1.844604253768921,
+ "learning_rate": 0.0035320855614973264,
+ "loss": 0.3488,
+ "step": 550
+ },
+ {
+ "epoch": 2.9946524064171123,
+ "grad_norm": 1.421885371208191,
+ "learning_rate": 0.0035053475935828875,
+ "loss": 0.3977,
+ "step": 560
+ },
+ {
+ "epoch": 3.0,
+ "eval_accuracy": 0.9339953271028038,
+ "eval_f1": 0.924420495312186,
+ "eval_loss": 0.17427141964435577,
+ "eval_precision": 0.9228598461246502,
+ "eval_recall": 0.928247943129569,
+ "eval_runtime": 9.981,
+ "eval_samples_per_second": 171.527,
+ "eval_steps_per_second": 10.72,
+ "step": 561
+ },
+ {
+ "epoch": 3.0481283422459895,
+ "grad_norm": 2.2883894443511963,
+ "learning_rate": 0.003478609625668449,
+ "loss": 0.3909,
+ "step": 570
+ },
+ {
+ "epoch": 3.1016042780748663,
+ "grad_norm": 2.4753079414367676,
+ "learning_rate": 0.003451871657754011,
+ "loss": 0.4352,
+ "step": 580
+ },
+ {
+ "epoch": 3.1550802139037435,
+ "grad_norm": 2.298736572265625,
+ "learning_rate": 0.0034251336898395725,
+ "loss": 0.4641,
+ "step": 590
+ },
+ {
+ "epoch": 3.2085561497326203,
+ "grad_norm": 1.4368634223937988,
+ "learning_rate": 0.0033983957219251336,
+ "loss": 0.4225,
+ "step": 600
+ },
+ {
+ "epoch": 3.2620320855614975,
+ "grad_norm": 1.462842583656311,
+ "learning_rate": 0.003371657754010695,
+ "loss": 0.3958,
+ "step": 610
+ },
+ {
+ "epoch": 3.3155080213903743,
+ "grad_norm": 2.449066638946533,
+ "learning_rate": 0.0033449197860962567,
+ "loss": 0.3784,
+ "step": 620
+ },
+ {
+ "epoch": 3.3689839572192515,
+ "grad_norm": 1.5616710186004639,
+ "learning_rate": 0.0033181818181818186,
+ "loss": 0.4476,
+ "step": 630
+ },
+ {
+ "epoch": 3.4224598930481283,
+ "grad_norm": 2.284454345703125,
+ "learning_rate": 0.0032914438502673797,
+ "loss": 0.3725,
+ "step": 640
+ },
+ {
+ "epoch": 3.4759358288770055,
+ "grad_norm": 1.5143663883209229,
+ "learning_rate": 0.0032647058823529413,
+ "loss": 0.4597,
+ "step": 650
+ },
+ {
+ "epoch": 3.5294117647058822,
+ "grad_norm": 1.6112128496170044,
+ "learning_rate": 0.003237967914438503,
+ "loss": 0.4198,
+ "step": 660
+ },
+ {
+ "epoch": 3.5828877005347595,
+ "grad_norm": 1.2612804174423218,
+ "learning_rate": 0.003211229946524064,
+ "loss": 0.4785,
+ "step": 670
+ },
+ {
+ "epoch": 3.6363636363636362,
+ "grad_norm": 2.0233500003814697,
+ "learning_rate": 0.0031844919786096254,
+ "loss": 0.4276,
+ "step": 680
+ },
+ {
+ "epoch": 3.6898395721925135,
+ "grad_norm": 1.2161093950271606,
+ "learning_rate": 0.0031577540106951874,
+ "loss": 0.3865,
+ "step": 690
+ },
+ {
+ "epoch": 3.7433155080213902,
+ "grad_norm": 1.835656762123108,
+ "learning_rate": 0.003131016042780749,
+ "loss": 0.3202,
+ "step": 700
+ },
+ {
+ "epoch": 3.7967914438502675,
+ "grad_norm": 2.9908785820007324,
+ "learning_rate": 0.00310427807486631,
+ "loss": 0.3879,
+ "step": 710
+ },
+ {
+ "epoch": 3.8502673796791442,
+ "grad_norm": 1.587223768234253,
+ "learning_rate": 0.0030775401069518715,
+ "loss": 0.3682,
+ "step": 720
+ },
+ {
+ "epoch": 3.9037433155080214,
+ "grad_norm": 2.0039021968841553,
+ "learning_rate": 0.003050802139037433,
+ "loss": 0.4148,
+ "step": 730
+ },
+ {
+ "epoch": 3.9572192513368982,
+ "grad_norm": 1.8037409782409668,
+ "learning_rate": 0.003024064171122995,
+ "loss": 0.3318,
+ "step": 740
+ },
+ {
+ "epoch": 4.0,
+ "eval_accuracy": 0.9351635514018691,
+ "eval_f1": 0.928485806906975,
+ "eval_loss": 0.17756415903568268,
+ "eval_precision": 0.9248343621199285,
+ "eval_recall": 0.9352570988138212,
+ "eval_runtime": 10.1719,
+ "eval_samples_per_second": 168.307,
+ "eval_steps_per_second": 10.519,
+ "step": 748
+ },
+ {
+ "epoch": 4.010695187165775,
+ "grad_norm": 2.230004072189331,
+ "learning_rate": 0.002997326203208556,
+ "loss": 0.4071,
+ "step": 750
+ },
+ {
+ "epoch": 4.064171122994653,
+ "grad_norm": 2.1018853187561035,
+ "learning_rate": 0.0029705882352941177,
+ "loss": 0.3498,
+ "step": 760
+ },
+ {
+ "epoch": 4.117647058823529,
+ "grad_norm": 1.6814857721328735,
+ "learning_rate": 0.002943850267379679,
+ "loss": 0.4085,
+ "step": 770
+ },
+ {
+ "epoch": 4.171122994652406,
+ "grad_norm": 2.0869903564453125,
+ "learning_rate": 0.0029171122994652403,
+ "loss": 0.4481,
+ "step": 780
+ },
+ {
+ "epoch": 4.224598930481283,
+ "grad_norm": 1.4043067693710327,
+ "learning_rate": 0.0028903743315508022,
+ "loss": 0.3234,
+ "step": 790
+ },
+ {
+ "epoch": 4.278074866310161,
+ "grad_norm": 2.0766959190368652,
+ "learning_rate": 0.0028636363636363638,
+ "loss": 0.3719,
+ "step": 800
+ },
+ {
+ "epoch": 4.331550802139038,
+ "grad_norm": 1.85934317111969,
+ "learning_rate": 0.0028368983957219253,
+ "loss": 0.4784,
+ "step": 810
+ },
+ {
+ "epoch": 4.385026737967914,
+ "grad_norm": 2.3728232383728027,
+ "learning_rate": 0.0028101604278074864,
+ "loss": 0.3704,
+ "step": 820
+ },
+ {
+ "epoch": 4.438502673796791,
+ "grad_norm": 1.2759883403778076,
+ "learning_rate": 0.002783422459893048,
+ "loss": 0.3283,
+ "step": 830
+ },
+ {
+ "epoch": 4.491978609625669,
+ "grad_norm": 1.2006633281707764,
+ "learning_rate": 0.00275668449197861,
+ "loss": 0.3792,
+ "step": 840
+ },
+ {
+ "epoch": 4.545454545454545,
+ "grad_norm": 2.0884652137756348,
+ "learning_rate": 0.0027299465240641714,
+ "loss": 0.4041,
+ "step": 850
+ },
+ {
+ "epoch": 4.598930481283422,
+ "grad_norm": 1.281827688217163,
+ "learning_rate": 0.0027032085561497325,
+ "loss": 0.352,
+ "step": 860
+ },
+ {
+ "epoch": 4.652406417112299,
+ "grad_norm": 1.7143138647079468,
+ "learning_rate": 0.002676470588235294,
+ "loss": 0.3896,
+ "step": 870
+ },
+ {
+ "epoch": 4.705882352941177,
+ "grad_norm": 2.069678544998169,
+ "learning_rate": 0.0026497326203208556,
+ "loss": 0.335,
+ "step": 880
+ },
+ {
+ "epoch": 4.759358288770054,
+ "grad_norm": 1.6988319158554077,
+ "learning_rate": 0.0026229946524064175,
+ "loss": 0.3693,
+ "step": 890
+ },
+ {
+ "epoch": 4.81283422459893,
+ "grad_norm": 1.6188457012176514,
+ "learning_rate": 0.0025962566844919786,
+ "loss": 0.337,
+ "step": 900
+ },
+ {
+ "epoch": 4.866310160427807,
+ "grad_norm": 2.0478222370147705,
+ "learning_rate": 0.00256951871657754,
+ "loss": 0.3156,
+ "step": 910
+ },
+ {
+ "epoch": 4.919786096256685,
+ "grad_norm": 1.7088401317596436,
+ "learning_rate": 0.0025427807486631017,
+ "loss": 0.3414,
+ "step": 920
+ },
+ {
+ "epoch": 4.973262032085562,
+ "grad_norm": 1.161230444908142,
+ "learning_rate": 0.002516042780748663,
+ "loss": 0.3461,
+ "step": 930
+ },
+ {
+ "epoch": 5.0,
+ "eval_accuracy": 0.9380841121495327,
+ "eval_f1": 0.9304948103477649,
+ "eval_loss": 0.17028363049030304,
+ "eval_precision": 0.9311071354745837,
+ "eval_recall": 0.9344001562456381,
+ "eval_runtime": 10.2604,
+ "eval_samples_per_second": 166.855,
+ "eval_steps_per_second": 10.428,
+ "step": 935
+ },
+ {
+ "epoch": 5.026737967914438,
+ "grad_norm": 1.723848819732666,
+ "learning_rate": 0.0024893048128342248,
+ "loss": 0.3622,
+ "step": 940
+ },
+ {
+ "epoch": 5.080213903743315,
+ "grad_norm": 2.0140602588653564,
+ "learning_rate": 0.002462566844919786,
+ "loss": 0.3973,
+ "step": 950
+ },
+ {
+ "epoch": 5.133689839572193,
+ "grad_norm": 1.5653032064437866,
+ "learning_rate": 0.002435828877005348,
+ "loss": 0.3106,
+ "step": 960
+ },
+ {
+ "epoch": 5.18716577540107,
+ "grad_norm": 1.7829616069793701,
+ "learning_rate": 0.002409090909090909,
+ "loss": 0.3723,
+ "step": 970
+ },
+ {
+ "epoch": 5.240641711229946,
+ "grad_norm": 0.9940521717071533,
+ "learning_rate": 0.0023823529411764704,
+ "loss": 0.3453,
+ "step": 980
+ },
+ {
+ "epoch": 5.294117647058823,
+ "grad_norm": 1.1114059686660767,
+ "learning_rate": 0.002355614973262032,
+ "loss": 0.3769,
+ "step": 990
+ },
+ {
+ "epoch": 5.347593582887701,
+ "grad_norm": 0.9444433450698853,
+ "learning_rate": 0.0023288770053475935,
+ "loss": 0.3489,
+ "step": 1000
+ },
+ {
+ "epoch": 5.401069518716578,
+ "grad_norm": 2.0856947898864746,
+ "learning_rate": 0.002302139037433155,
+ "loss": 0.374,
+ "step": 1010
+ },
+ {
+ "epoch": 5.454545454545454,
+ "grad_norm": 1.679477572441101,
+ "learning_rate": 0.0022754010695187166,
+ "loss": 0.3738,
+ "step": 1020
+ },
+ {
+ "epoch": 5.508021390374331,
+ "grad_norm": 1.3019518852233887,
+ "learning_rate": 0.002248663101604278,
+ "loss": 0.3634,
+ "step": 1030
+ },
+ {
+ "epoch": 5.561497326203209,
+ "grad_norm": 1.467846155166626,
+ "learning_rate": 0.0022219251336898396,
+ "loss": 0.3457,
+ "step": 1040
+ },
+ {
+ "epoch": 5.614973262032086,
+ "grad_norm": 1.6348631381988525,
+ "learning_rate": 0.002195187165775401,
+ "loss": 0.3216,
+ "step": 1050
+ },
+ {
+ "epoch": 5.668449197860962,
+ "grad_norm": 1.158215880393982,
+ "learning_rate": 0.0021684491978609627,
+ "loss": 0.3033,
+ "step": 1060
+ },
+ {
+ "epoch": 5.721925133689839,
+ "grad_norm": 0.8872423768043518,
+ "learning_rate": 0.002141711229946524,
+ "loss": 0.2919,
+ "step": 1070
+ },
+ {
+ "epoch": 5.775401069518717,
+ "grad_norm": 1.9146243333816528,
+ "learning_rate": 0.0021149732620320857,
+ "loss": 0.3228,
+ "step": 1080
+ },
+ {
+ "epoch": 5.828877005347594,
+ "grad_norm": 1.7084169387817383,
+ "learning_rate": 0.0020882352941176473,
+ "loss": 0.2754,
+ "step": 1090
+ },
+ {
+ "epoch": 5.882352941176471,
+ "grad_norm": 1.0626111030578613,
+ "learning_rate": 0.0020614973262032084,
+ "loss": 0.3165,
+ "step": 1100
+ },
+ {
+ "epoch": 5.935828877005347,
+ "grad_norm": 1.8155293464660645,
+ "learning_rate": 0.00203475935828877,
+ "loss": 0.2815,
+ "step": 1110
+ },
+ {
+ "epoch": 5.989304812834225,
+ "grad_norm": 1.8623782396316528,
+ "learning_rate": 0.0020080213903743314,
+ "loss": 0.3309,
+ "step": 1120
+ },
+ {
+ "epoch": 6.0,
+ "eval_accuracy": 0.9369158878504673,
+ "eval_f1": 0.9334719219156348,
+ "eval_loss": 0.19556888937950134,
+ "eval_precision": 0.9335706750233659,
+ "eval_recall": 0.9396740716392903,
+ "eval_runtime": 10.2767,
+ "eval_samples_per_second": 166.591,
+ "eval_steps_per_second": 10.412,
+ "step": 1122
+ },
+ {
+ "epoch": 6.042780748663102,
+ "grad_norm": 1.1055293083190918,
+ "learning_rate": 0.001981283422459893,
+ "loss": 0.3202,
+ "step": 1130
+ },
+ {
+ "epoch": 6.096256684491979,
+ "grad_norm": 1.7265422344207764,
+ "learning_rate": 0.0019545454545454545,
+ "loss": 0.2973,
+ "step": 1140
+ },
+ {
+ "epoch": 6.149732620320855,
+ "grad_norm": 2.0242912769317627,
+ "learning_rate": 0.001927807486631016,
+ "loss": 0.302,
+ "step": 1150
+ },
+ {
+ "epoch": 6.2032085561497325,
+ "grad_norm": 1.0210644006729126,
+ "learning_rate": 0.0019010695187165775,
+ "loss": 0.2785,
+ "step": 1160
+ },
+ {
+ "epoch": 6.25668449197861,
+ "grad_norm": 1.5111178159713745,
+ "learning_rate": 0.001874331550802139,
+ "loss": 0.2873,
+ "step": 1170
+ },
+ {
+ "epoch": 6.310160427807487,
+ "grad_norm": 1.060488224029541,
+ "learning_rate": 0.0018475935828877006,
+ "loss": 0.321,
+ "step": 1180
+ },
+ {
+ "epoch": 6.363636363636363,
+ "grad_norm": 1.0627189874649048,
+ "learning_rate": 0.0018208556149732621,
+ "loss": 0.2682,
+ "step": 1190
+ },
+ {
+ "epoch": 6.4171122994652405,
+ "grad_norm": 1.1237576007843018,
+ "learning_rate": 0.0017941176470588236,
+ "loss": 0.2383,
+ "step": 1200
+ },
+ {
+ "epoch": 6.470588235294118,
+ "grad_norm": 1.6101592779159546,
+ "learning_rate": 0.001767379679144385,
+ "loss": 0.3197,
+ "step": 1210
+ },
+ {
+ "epoch": 6.524064171122995,
+ "grad_norm": 0.6864691972732544,
+ "learning_rate": 0.0017406417112299467,
+ "loss": 0.2307,
+ "step": 1220
+ },
+ {
+ "epoch": 6.577540106951871,
+ "grad_norm": 1.339308500289917,
+ "learning_rate": 0.001713903743315508,
+ "loss": 0.2534,
+ "step": 1230
+ },
+ {
+ "epoch": 6.6310160427807485,
+ "grad_norm": 1.3319642543792725,
+ "learning_rate": 0.0016871657754010698,
+ "loss": 0.32,
+ "step": 1240
+ },
+ {
+ "epoch": 6.684491978609626,
+ "grad_norm": 1.4089816808700562,
+ "learning_rate": 0.001660427807486631,
+ "loss": 0.285,
+ "step": 1250
+ },
+ {
+ "epoch": 6.737967914438503,
+ "grad_norm": 1.212084174156189,
+ "learning_rate": 0.0016336898395721924,
+ "loss": 0.2217,
+ "step": 1260
+ },
+ {
+ "epoch": 6.791443850267379,
+ "grad_norm": 1.6609482765197754,
+ "learning_rate": 0.0016069518716577541,
+ "loss": 0.2952,
+ "step": 1270
+ },
+ {
+ "epoch": 6.8449197860962565,
+ "grad_norm": 1.060892105102539,
+ "learning_rate": 0.0015802139037433154,
+ "loss": 0.2524,
+ "step": 1280
+ },
+ {
+ "epoch": 6.898395721925134,
+ "grad_norm": 1.3365124464035034,
+ "learning_rate": 0.001553475935828877,
+ "loss": 0.2694,
+ "step": 1290
+ },
+ {
+ "epoch": 6.951871657754011,
+ "grad_norm": 1.1521918773651123,
+ "learning_rate": 0.0015267379679144385,
+ "loss": 0.3088,
+ "step": 1300
+ },
+ {
+ "epoch": 7.0,
+ "eval_accuracy": 0.9532710280373832,
+ "eval_f1": 0.9461125894090557,
+ "eval_loss": 0.11792106181383133,
+ "eval_precision": 0.9426583892398479,
+ "eval_recall": 0.952515495389921,
+ "eval_runtime": 10.3853,
+ "eval_samples_per_second": 164.849,
+ "eval_steps_per_second": 10.303,
+ "step": 1309
+ },
+ {
+ "epoch": 7.005347593582887,
+ "grad_norm": 0.8682220578193665,
+ "learning_rate": 0.0015,
+ "loss": 0.2627,
+ "step": 1310
+ },
+ {
+ "epoch": 7.0588235294117645,
+ "grad_norm": 2.279827356338501,
+ "learning_rate": 0.0014732620320855616,
+ "loss": 0.2796,
+ "step": 1320
+ },
+ {
+ "epoch": 7.112299465240642,
+ "grad_norm": 1.3697049617767334,
+ "learning_rate": 0.001446524064171123,
+ "loss": 0.2369,
+ "step": 1330
+ },
+ {
+ "epoch": 7.165775401069519,
+ "grad_norm": 0.8857790231704712,
+ "learning_rate": 0.0014197860962566844,
+ "loss": 0.2648,
+ "step": 1340
+ },
+ {
+ "epoch": 7.219251336898395,
+ "grad_norm": 2.053224802017212,
+ "learning_rate": 0.0013930481283422461,
+ "loss": 0.212,
+ "step": 1350
+ },
+ {
+ "epoch": 7.2727272727272725,
+ "grad_norm": 1.619578242301941,
+ "learning_rate": 0.0013663101604278075,
+ "loss": 0.2229,
+ "step": 1360
+ },
+ {
+ "epoch": 7.32620320855615,
+ "grad_norm": 1.3765966892242432,
+ "learning_rate": 0.0013395721925133692,
+ "loss": 0.2311,
+ "step": 1370
+ },
+ {
+ "epoch": 7.379679144385027,
+ "grad_norm": 1.2967066764831543,
+ "learning_rate": 0.0013128342245989305,
+ "loss": 0.2402,
+ "step": 1380
+ },
+ {
+ "epoch": 7.433155080213904,
+ "grad_norm": 1.2961163520812988,
+ "learning_rate": 0.0012860962566844918,
+ "loss": 0.2318,
+ "step": 1390
+ },
+ {
+ "epoch": 7.4866310160427805,
+ "grad_norm": 1.6240290403366089,
+ "learning_rate": 0.0012593582887700536,
+ "loss": 0.2669,
+ "step": 1400
+ },
+ {
+ "epoch": 7.540106951871658,
+ "grad_norm": 1.1457808017730713,
+ "learning_rate": 0.0012326203208556149,
+ "loss": 0.2887,
+ "step": 1410
+ },
+ {
+ "epoch": 7.593582887700535,
+ "grad_norm": 1.303931474685669,
+ "learning_rate": 0.0012058823529411764,
+ "loss": 0.2862,
+ "step": 1420
+ },
+ {
+ "epoch": 7.647058823529412,
+ "grad_norm": 0.9429693222045898,
+ "learning_rate": 0.001179144385026738,
+ "loss": 0.2282,
+ "step": 1430
+ },
+ {
+ "epoch": 7.7005347593582885,
+ "grad_norm": 1.349269986152649,
+ "learning_rate": 0.0011524064171122995,
+ "loss": 0.2414,
+ "step": 1440
+ },
+ {
+ "epoch": 7.754010695187166,
+ "grad_norm": 1.185160517692566,
+ "learning_rate": 0.001125668449197861,
+ "loss": 0.219,
+ "step": 1450
+ },
+ {
+ "epoch": 7.807486631016043,
+ "grad_norm": 1.5935460329055786,
+ "learning_rate": 0.0010989304812834225,
+ "loss": 0.2109,
+ "step": 1460
1116
+ },
1117
+ {
1118
+ "epoch": 7.86096256684492,
1119
+ "grad_norm": 1.4563795328140259,
1120
+ "learning_rate": 0.001072192513368984,
1121
+ "loss": 0.2943,
1122
+ "step": 1470
1123
+ },
1124
+ {
1125
+ "epoch": 7.9144385026737964,
1126
+ "grad_norm": 1.2570650577545166,
1127
+ "learning_rate": 0.0010454545454545454,
1128
+ "loss": 0.2275,
1129
+ "step": 1480
1130
+ },
1131
+ {
1132
+ "epoch": 7.967914438502674,
1133
+ "grad_norm": 0.6930679082870483,
1134
+ "learning_rate": 0.001018716577540107,
1135
+ "loss": 0.2129,
1136
+ "step": 1490
1137
+ },
1138
+ {
1139
+ "epoch": 8.0,
1140
+ "eval_accuracy": 0.9637850467289719,
1141
+ "eval_f1": 0.9610548371575116,
1142
+ "eval_loss": 0.09920904040336609,
1143
+ "eval_precision": 0.9569323583080014,
1144
+ "eval_recall": 0.9673920345290172,
1145
+ "eval_runtime": 10.543,
1146
+ "eval_samples_per_second": 162.382,
1147
+ "eval_steps_per_second": 10.149,
1148
+ "step": 1496
1149
+ },
1150
+ {
1151
+ "epoch": 8.02139037433155,
1152
+ "grad_norm": 1.4018137454986572,
1153
+ "learning_rate": 0.0009919786096256684,
1154
+ "loss": 0.2638,
1155
+ "step": 1500
1156
+ },
1157
+ {
1158
+ "epoch": 8.074866310160427,
1159
+ "grad_norm": 1.2713522911071777,
1160
+ "learning_rate": 0.00096524064171123,
1161
+ "loss": 0.2099,
1162
+ "step": 1510
1163
+ },
1164
+ {
1165
+ "epoch": 8.128342245989305,
1166
+ "grad_norm": 1.004296064376831,
1167
+ "learning_rate": 0.0009385026737967915,
1168
+ "loss": 0.1801,
1169
+ "step": 1520
1170
+ },
1171
+ {
1172
+ "epoch": 8.181818181818182,
1173
+ "grad_norm": 0.7041844129562378,
1174
+ "learning_rate": 0.0009117647058823529,
1175
+ "loss": 0.1829,
1176
+ "step": 1530
1177
+ },
1178
+ {
1179
+ "epoch": 8.235294117647058,
1180
+ "grad_norm": 1.3204301595687866,
1181
+ "learning_rate": 0.0008850267379679144,
1182
+ "loss": 0.2444,
1183
+ "step": 1540
1184
+ },
1185
+ {
1186
+ "epoch": 8.288770053475936,
1187
+ "grad_norm": 1.261974573135376,
1188
+ "learning_rate": 0.000858288770053476,
1189
+ "loss": 0.2431,
1190
+ "step": 1550
1191
+ },
1192
+ {
1193
+ "epoch": 8.342245989304812,
1194
+ "grad_norm": 0.9899649024009705,
1195
+ "learning_rate": 0.0008315508021390375,
1196
+ "loss": 0.1808,
1197
+ "step": 1560
1198
+ },
1199
+ {
1200
+ "epoch": 8.39572192513369,
1201
+ "grad_norm": 1.150225281715393,
1202
+ "learning_rate": 0.0008048128342245989,
1203
+ "loss": 0.2048,
1204
+ "step": 1570
1205
+ },
1206
+ {
1207
+ "epoch": 8.449197860962567,
1208
+ "grad_norm": 0.9454184770584106,
1209
+ "learning_rate": 0.0007780748663101605,
1210
+ "loss": 0.1919,
1211
+ "step": 1580
1212
+ },
1213
+ {
1214
+ "epoch": 8.502673796791443,
1215
+ "grad_norm": 1.26669442653656,
1216
+ "learning_rate": 0.000751336898395722,
1217
+ "loss": 0.1837,
1218
+ "step": 1590
1219
+ },
1220
+ {
1221
+ "epoch": 8.556149732620321,
1222
+ "grad_norm": 0.8547130823135376,
1223
+ "learning_rate": 0.0007245989304812835,
1224
+ "loss": 0.1774,
1225
+ "step": 1600
1226
+ },
1227
+ {
1228
+ "epoch": 8.609625668449198,
1229
+ "grad_norm": 1.8781049251556396,
1230
+ "learning_rate": 0.000697860962566845,
1231
+ "loss": 0.2202,
1232
+ "step": 1610
1233
+ },
1234
+ {
1235
+ "epoch": 8.663101604278076,
1236
+ "grad_norm": 0.7876987457275391,
1237
+ "learning_rate": 0.0006711229946524064,
1238
+ "loss": 0.1781,
1239
+ "step": 1620
1240
+ },
1241
+ {
1242
+ "epoch": 8.716577540106952,
1243
+ "grad_norm": 1.2137806415557861,
1244
+ "learning_rate": 0.0006443850267379679,
1245
+ "loss": 0.1722,
1246
+ "step": 1630
1247
+ },
1248
+ {
1249
+ "epoch": 8.770053475935828,
1250
+ "grad_norm": 1.6328903436660767,
1251
+ "learning_rate": 0.0006176470588235294,
1252
+ "loss": 0.2085,
1253
+ "step": 1640
1254
+ },
1255
+ {
1256
+ "epoch": 8.823529411764707,
1257
+ "grad_norm": 0.9435901641845703,
1258
+ "learning_rate": 0.0005909090909090909,
1259
+ "loss": 0.2335,
1260
+ "step": 1650
1261
+ },
1262
+ {
1263
+ "epoch": 8.877005347593583,
1264
+ "grad_norm": 1.1905876398086548,
1265
+ "learning_rate": 0.0005641711229946525,
1266
+ "loss": 0.2387,
1267
+ "step": 1660
1268
+ },
1269
+ {
1270
+ "epoch": 8.93048128342246,
1271
+ "grad_norm": 0.8758776783943176,
1272
+ "learning_rate": 0.0005374331550802139,
1273
+ "loss": 0.2265,
1274
+ "step": 1670
1275
+ },
1276
+ {
1277
+ "epoch": 8.983957219251337,
1278
+ "grad_norm": 1.3745719194412231,
1279
+ "learning_rate": 0.0005106951871657754,
1280
+ "loss": 0.2049,
1281
+ "step": 1680
1282
+ },
1283
+ {
1284
+ "epoch": 9.0,
1285
+ "eval_accuracy": 0.967873831775701,
1286
+ "eval_f1": 0.9651132770824573,
1287
+ "eval_loss": 0.08469934016466141,
1288
+ "eval_precision": 0.9626628225985181,
1289
+ "eval_recall": 0.9683070024371949,
1290
+ "eval_runtime": 10.3829,
1291
+ "eval_samples_per_second": 164.887,
1292
+ "eval_steps_per_second": 10.305,
1293
+ "step": 1683
1294
+ },
1295
+ {
1296
+ "epoch": 9.037433155080214,
1297
+ "grad_norm": 0.9230683445930481,
1298
+ "learning_rate": 0.0004839572192513369,
1299
+ "loss": 0.1654,
1300
+ "step": 1690
1301
+ },
1302
+ {
1303
+ "epoch": 9.090909090909092,
1304
+ "grad_norm": 0.8362302184104919,
1305
+ "learning_rate": 0.0004572192513368984,
1306
+ "loss": 0.1918,
1307
+ "step": 1700
1308
+ },
1309
+ {
1310
+ "epoch": 9.144385026737968,
1311
+ "grad_norm": 1.3025470972061157,
1312
+ "learning_rate": 0.0004304812834224599,
1313
+ "loss": 0.1497,
1314
+ "step": 1710
1315
+ },
1316
+ {
1317
+ "epoch": 9.197860962566844,
1318
+ "grad_norm": 0.8339858055114746,
1319
+ "learning_rate": 0.00040374331550802143,
1320
+ "loss": 0.196,
1321
+ "step": 1720
1322
+ },
1323
+ {
1324
+ "epoch": 9.251336898395722,
1325
+ "grad_norm": 1.3273382186889648,
1326
+ "learning_rate": 0.00037700534759358285,
1327
+ "loss": 0.1912,
1328
+ "step": 1730
1329
+ },
1330
+ {
1331
+ "epoch": 9.304812834224599,
1332
+ "grad_norm": 0.5822441577911377,
1333
+ "learning_rate": 0.0003502673796791444,
1334
+ "loss": 0.1452,
1335
+ "step": 1740
1336
+ },
1337
+ {
1338
+ "epoch": 9.358288770053475,
1339
+ "grad_norm": 0.8451639413833618,
1340
+ "learning_rate": 0.0003235294117647059,
1341
+ "loss": 0.1877,
1342
+ "step": 1750
1343
+ },
1344
+ {
1345
+ "epoch": 9.411764705882353,
1346
+ "grad_norm": 1.0270066261291504,
1347
+ "learning_rate": 0.0002967914438502674,
1348
+ "loss": 0.1964,
1349
+ "step": 1760
1350
+ },
1351
+ {
1352
+ "epoch": 9.46524064171123,
1353
+ "grad_norm": 1.0621460676193237,
1354
+ "learning_rate": 0.00027005347593582886,
1355
+ "loss": 0.2015,
1356
+ "step": 1770
1357
+ },
1358
+ {
1359
+ "epoch": 9.518716577540108,
1360
+ "grad_norm": 0.9587564468383789,
1361
+ "learning_rate": 0.00024331550802139036,
1362
+ "loss": 0.1962,
1363
+ "step": 1780
1364
+ },
1365
+ {
1366
+ "epoch": 9.572192513368984,
1367
+ "grad_norm": 0.719536304473877,
1368
+ "learning_rate": 0.00021657754010695186,
1369
+ "loss": 0.1389,
1370
+ "step": 1790
1371
+ },
1372
+ {
1373
+ "epoch": 9.62566844919786,
1374
+ "grad_norm": 0.89113450050354,
1375
+ "learning_rate": 0.0001898395721925134,
1376
+ "loss": 0.1783,
1377
+ "step": 1800
1378
+ },
1379
+ {
1380
+ "epoch": 9.679144385026738,
1381
+ "grad_norm": 0.8831282258033752,
1382
+ "learning_rate": 0.0001631016042780749,
1383
+ "loss": 0.1871,
1384
+ "step": 1810
1385
+ },
1386
+ {
1387
+ "epoch": 9.732620320855615,
1388
+ "grad_norm": 0.6015557646751404,
1389
+ "learning_rate": 0.00013636363636363637,
1390
+ "loss": 0.1414,
1391
+ "step": 1820
1392
+ },
1393
+ {
1394
+ "epoch": 9.786096256684491,
1395
+ "grad_norm": 1.1582796573638916,
1396
+ "learning_rate": 0.00010962566844919787,
1397
+ "loss": 0.2408,
1398
+ "step": 1830
1399
+ },
1400
+ {
1401
+ "epoch": 9.83957219251337,
1402
+ "grad_norm": 0.7856789231300354,
1403
+ "learning_rate": 8.288770053475936e-05,
1404
+ "loss": 0.145,
1405
+ "step": 1840
1406
+ },
1407
+ {
1408
+ "epoch": 9.893048128342246,
1409
+ "grad_norm": 1.1010181903839111,
1410
+ "learning_rate": 5.614973262032086e-05,
1411
+ "loss": 0.1758,
1412
+ "step": 1850
1413
+ },
1414
+ {
1415
+ "epoch": 9.946524064171124,
1416
+ "grad_norm": 0.7676904797554016,
1417
+ "learning_rate": 2.9411764705882354e-05,
1418
+ "loss": 0.1683,
1419
+ "step": 1860
1420
+ },
1421
+ {
1422
+ "epoch": 10.0,
1423
+ "grad_norm": 1.4464507102966309,
1424
+ "learning_rate": 2.6737967914438504e-06,
1425
+ "loss": 0.2007,
1426
+ "step": 1870
1427
+ },
1428
+ {
1429
+ "epoch": 10.0,
1430
+ "eval_accuracy": 0.9707943925233645,
1431
+ "eval_f1": 0.9697517307733657,
1432
+ "eval_loss": 0.07853860408067703,
1433
+ "eval_precision": 0.9668363312878312,
1434
+ "eval_recall": 0.9737482240908748,
1435
+ "eval_runtime": 10.3924,
1436
+ "eval_samples_per_second": 164.735,
1437
+ "eval_steps_per_second": 10.296,
1438
+ "step": 1870
1439
+ },
1440
+ {
1441
+ "epoch": 10.0,
1442
+ "step": 1870,
1443
+ "total_flos": 9.328175742872125e+18,
1444
+ "train_loss": 0.3662890907277398,
1445
+ "train_runtime": 1600.7009,
1446
+ "train_samples_per_second": 74.711,
1447
+ "train_steps_per_second": 1.168
1448
+ }
1449
+ ],
1450
+ "logging_steps": 10,
1451
+ "max_steps": 1870,
1452
+ "num_input_tokens_seen": 0,
1453
+ "num_train_epochs": 10,
1454
+ "save_steps": 500,
1455
+ "total_flos": 9.328175742872125e+18,
1456
+ "train_batch_size": 16,
1457
+ "trial_name": null,
1458
+ "trial_params": null
1459
+ }
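
The `log_history` array added above interleaves per-step training entries (which carry a `"loss"` key) with per-epoch evaluation entries (which carry `"eval_*"` keys). A minimal sketch of separating the two, shown on a trimmed inline copy of the structure rather than the full file (standard library only; the variable names are illustrative, not part of the file format):

```python
import json

# A trimmed, inline example of the trainer_state.json structure above
state = json.loads("""
{
  "log_history": [
    {"epoch": 9.946524064171124, "loss": 0.1683, "step": 1860},
    {"epoch": 10.0,
     "eval_accuracy": 0.9707943925233645,
     "eval_f1": 0.9697517307733657,
     "eval_loss": 0.07853860408067703,
     "step": 1870}
  ]
}
""")

# Training entries have a "loss" key; evaluation entries have "eval_*" keys
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_accuracy" in e]

# Pick the evaluation entry with the lowest eval loss
best = min(eval_logs, key=lambda e: e["eval_loss"])
print(round(best["eval_accuracy"], 4))  # → 0.9708
```

Against the real file, replace the inline string with `json.load(open("trainer_state.json"))`.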