diff --git "a/checkpoint-1274/trainer_state.json" "b/checkpoint-1274/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1274/trainer_state.json" @@ -0,0 +1,9071 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.4606934431857193, + "eval_steps": 91, + "global_step": 1274, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0027463096464126332, + "grad_norm": 0.056396484375, + "learning_rate": 3e-06, + "loss": 0.6232, + "step": 1 + }, + { + "epoch": 0.0027463096464126332, + "eval_loss": 0.6296440362930298, + "eval_runtime": 599.6568, + "eval_samples_per_second": 15.285, + "eval_steps_per_second": 15.285, + "step": 1 + }, + { + "epoch": 0.0054926192928252664, + "grad_norm": 0.060546875, + "learning_rate": 6e-06, + "loss": 0.5596, + "step": 2 + }, + { + "epoch": 0.008238928939237899, + "grad_norm": 0.054443359375, + "learning_rate": 9e-06, + "loss": 0.7123, + "step": 3 + }, + { + "epoch": 0.010985238585650533, + "grad_norm": 0.05224609375, + "learning_rate": 1.2e-05, + "loss": 0.6337, + "step": 4 + }, + { + "epoch": 0.013731548232063165, + "grad_norm": 0.051025390625, + "learning_rate": 1.5e-05, + "loss": 0.5764, + "step": 5 + }, + { + "epoch": 0.016477857878475798, + "grad_norm": 0.0546875, + "learning_rate": 1.8e-05, + "loss": 0.7453, + "step": 6 + }, + { + "epoch": 0.01922416752488843, + "grad_norm": 0.057373046875, + "learning_rate": 2.1e-05, + "loss": 0.7076, + "step": 7 + }, + { + "epoch": 0.021970477171301066, + "grad_norm": 0.06298828125, + "learning_rate": 2.4e-05, + "loss": 0.5094, + "step": 8 + }, + { + "epoch": 0.024716786817713696, + "grad_norm": 0.05859375, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.6062, + "step": 9 + }, + { + "epoch": 0.02746309646412633, + "grad_norm": 0.056884765625, + "learning_rate": 3e-05, + "loss": 0.6501, + "step": 10 + }, + { + "epoch": 0.030209406110538965, + "grad_norm": 0.060546875, + "learning_rate": 2.9999964598289033e-05, + "loss": 0.6403, + "step": 11 + }, + { + "epoch": 0.032955715756951595, + "grad_norm": 0.061279296875, + "learning_rate": 2.999985839332323e-05, + "loss": 0.6464, + "step": 12 + }, + { + "epoch": 0.03570202540336423, + "grad_norm": 0.05859375, + "learning_rate": 2.9999681385603907e-05, + "loss": 0.6528, + "step": 13 + }, + { + "epoch": 0.03844833504977686, + "grad_norm": 0.062255859375, + "learning_rate": 2.9999433575966585e-05, + "loss": 0.7109, + "step": 14 + }, + { + "epoch": 0.0411946446961895, + "grad_norm": 0.06689453125, + "learning_rate": 2.999911496558097e-05, + "loss": 0.4795, + "step": 15 + }, + { + "epoch": 0.04394095434260213, + "grad_norm": 0.061767578125, + "learning_rate": 2.9998725555950983e-05, + "loss": 0.6743, + "step": 16 + }, + { + "epoch": 0.04668726398901476, + "grad_norm": 0.0654296875, + "learning_rate": 2.9998265348914726e-05, + "loss": 0.6143, + "step": 17 + }, + { + "epoch": 0.04943357363542739, + "grad_norm": 0.06494140625, + "learning_rate": 2.9997734346644482e-05, + "loss": 0.5521, + "step": 18 + }, + { + "epoch": 0.05217988328184003, + "grad_norm": 0.06396484375, + "learning_rate": 2.99971325516467e-05, + "loss": 0.6291, + "step": 19 + }, + { + "epoch": 0.05492619292825266, + "grad_norm": 0.06591796875, + "learning_rate": 2.9996459966761994e-05, + "loss": 0.5828, + "step": 20 + }, + { + "epoch": 0.057672502574665295, + "grad_norm": 0.068359375, + "learning_rate": 2.9995716595165114e-05, + "loss": 0.5432, + "step": 21 + }, + { + "epoch": 0.06041881222107793, + "grad_norm": 0.076171875, + "learning_rate": 2.9994902440364943e-05, + "loss": 0.5769, + "step": 22 + }, + { + "epoch": 0.06316512186749056, + "grad_norm": 0.07568359375, + "learning_rate": 2.999401750620448e-05, + "loss": 0.469, + "step": 23 + }, + { + "epoch": 0.06591143151390319, + "grad_norm": 0.06884765625, + "learning_rate": 2.999306179686082e-05, + "loss": 0.4414, + "step": 24 + }, + { + "epoch": 0.06865774116031582, + "grad_norm": 0.05419921875, + "learning_rate": 2.9992035316845125e-05, + "loss": 0.7219, + "step": 25 + }, + { + "epoch": 0.07140405080672846, + "grad_norm": 0.0654296875, + "learning_rate": 2.9990938071002606e-05, + "loss": 0.6742, + "step": 26 + }, + { + "epoch": 0.07415036045314109, + "grad_norm": 0.04541015625, + "learning_rate": 2.998977006451253e-05, + "loss": 0.5633, + "step": 27 + }, + { + "epoch": 0.07689667009955373, + "grad_norm": 0.043212890625, + "learning_rate": 2.998853130288814e-05, + "loss": 0.5486, + "step": 28 + }, + { + "epoch": 0.07964297974596636, + "grad_norm": 0.04345703125, + "learning_rate": 2.9987221791976687e-05, + "loss": 0.4064, + "step": 29 + }, + { + "epoch": 0.082389289392379, + "grad_norm": 0.04541015625, + "learning_rate": 2.9985841537959345e-05, + "loss": 0.5184, + "step": 30 + }, + { + "epoch": 0.08513559903879163, + "grad_norm": 0.044921875, + "learning_rate": 2.9984390547351244e-05, + "loss": 0.5407, + "step": 31 + }, + { + "epoch": 0.08788190868520426, + "grad_norm": 0.044677734375, + "learning_rate": 2.998286882700138e-05, + "loss": 0.5532, + "step": 32 + }, + { + "epoch": 0.09062821833161688, + "grad_norm": 0.14453125, + "learning_rate": 2.9981276384092628e-05, + "loss": 1.2318, + "step": 33 + }, + { + "epoch": 0.09337452797802952, + "grad_norm": 0.04052734375, + "learning_rate": 2.9979613226141672e-05, + "loss": 0.5457, + "step": 34 + }, + { + "epoch": 0.09612083762444215, + "grad_norm": 0.03857421875, + "learning_rate": 2.9977879360999007e-05, + "loss": 0.5391, + "step": 35 + }, + { + "epoch": 0.09886714727085479, + "grad_norm": 0.0400390625, + "learning_rate": 2.9976074796848866e-05, + "loss": 0.5919, + "step": 36 + }, + { + "epoch": 0.10161345691726742, + "grad_norm": 0.037353515625, + "learning_rate": 2.99741995422092e-05, + "loss": 0.6211, + "step": 37 + }, + { + "epoch": 0.10435976656368005, + "grad_norm": 0.036865234375, + "learning_rate": 2.997225360593165e-05, + "loss": 0.5296, + "step": 38 + }, + { + "epoch": 0.10710607621009269, + "grad_norm": 0.037109375, + "learning_rate": 2.997023699720147e-05, + "loss": 0.5469, + "step": 39 + }, + { + "epoch": 0.10985238585650532, + "grad_norm": 0.038818359375, + "learning_rate": 2.9968149725537515e-05, + "loss": 0.6447, + "step": 40 + }, + { + "epoch": 0.11259869550291796, + "grad_norm": 0.03564453125, + "learning_rate": 2.9965991800792185e-05, + "loss": 0.5941, + "step": 41 + }, + { + "epoch": 0.11534500514933059, + "grad_norm": 0.0341796875, + "learning_rate": 2.9963763233151377e-05, + "loss": 0.5228, + "step": 42 + }, + { + "epoch": 0.11809131479574322, + "grad_norm": 0.035888671875, + "learning_rate": 2.9961464033134444e-05, + "loss": 0.6166, + "step": 43 + }, + { + "epoch": 0.12083762444215586, + "grad_norm": 0.0380859375, + "learning_rate": 2.9959094211594122e-05, + "loss": 0.6156, + "step": 44 + }, + { + "epoch": 0.12358393408856849, + "grad_norm": 0.034912109375, + "learning_rate": 2.9956653779716517e-05, + "loss": 0.5492, + "step": 45 + }, + { + "epoch": 0.1263302437349811, + "grad_norm": 0.03564453125, + "learning_rate": 2.9954142749021023e-05, + "loss": 0.5405, + "step": 46 + }, + { + "epoch": 0.12907655338139376, + "grad_norm": 0.035400390625, + "learning_rate": 2.9951561131360278e-05, + "loss": 0.584, + "step": 47 + }, + { + "epoch": 0.13182286302780638, + "grad_norm": 0.0380859375, + "learning_rate": 2.994890893892011e-05, + "loss": 0.62, + "step": 48 + }, + { + "epoch": 0.13456917267421903, + "grad_norm": 0.031982421875, + "learning_rate": 2.994618618421946e-05, + "loss": 0.4026, + "step": 49 + }, + { + "epoch": 0.13731548232063165, + "grad_norm": 0.0341796875, + "learning_rate": 2.994339288011037e-05, + "loss": 0.6126, + "step": 50 + }, + { + "epoch": 0.1400617919670443, + "grad_norm": 0.037109375, + "learning_rate": 2.9940529039777855e-05, + "loss": 0.6316, + "step": 51 + }, + { + "epoch": 0.14280810161345692, + "grad_norm": 0.033935546875, + "learning_rate": 2.9937594676739907e-05, + "loss": 0.3887, + "step": 52 + }, + { + "epoch": 0.14555441125986954, + "grad_norm": 0.03564453125, + "learning_rate": 2.9934589804847382e-05, + "loss": 0.5214, + "step": 53 + }, + { + "epoch": 0.14830072090628219, + "grad_norm": 0.034423828125, + "learning_rate": 2.9931514438283966e-05, + "loss": 0.5914, + "step": 54 + }, + { + "epoch": 0.1510470305526948, + "grad_norm": 0.036376953125, + "learning_rate": 2.9928368591566085e-05, + "loss": 0.5443, + "step": 55 + }, + { + "epoch": 0.15379334019910745, + "grad_norm": 0.03466796875, + "learning_rate": 2.9925152279542856e-05, + "loss": 0.5522, + "step": 56 + }, + { + "epoch": 0.15653964984552007, + "grad_norm": 0.031494140625, + "learning_rate": 2.9921865517396008e-05, + "loss": 0.5146, + "step": 57 + }, + { + "epoch": 0.15928595949193272, + "grad_norm": 0.033447265625, + "learning_rate": 2.9918508320639803e-05, + "loss": 0.5396, + "step": 58 + }, + { + "epoch": 0.16203226913834534, + "grad_norm": 0.035888671875, + "learning_rate": 2.9915080705120976e-05, + "loss": 0.5118, + "step": 59 + }, + { + "epoch": 0.164778578784758, + "grad_norm": 0.03662109375, + "learning_rate": 2.991158268701866e-05, + "loss": 0.6652, + "step": 60 + }, + { + "epoch": 0.1675248884311706, + "grad_norm": 0.0341796875, + "learning_rate": 2.9908014282844295e-05, + "loss": 0.4211, + "step": 61 + }, + { + "epoch": 0.17027119807758326, + "grad_norm": 0.0361328125, + "learning_rate": 2.9904375509441562e-05, + "loss": 0.4445, + "step": 62 + }, + { + "epoch": 0.17301750772399588, + "grad_norm": 0.03369140625, + "learning_rate": 2.9900666383986303e-05, + "loss": 0.588, + "step": 63 + }, + { + "epoch": 0.17576381737040853, + "grad_norm": 0.033203125, + "learning_rate": 2.9896886923986433e-05, + "loss": 0.5705, + "step": 64 + }, + { + "epoch": 0.17851012701682115, + "grad_norm": 0.0341796875, + "learning_rate": 2.989303714728187e-05, + "loss": 0.5068, + "step": 65 + }, + { + "epoch": 0.18125643666323377, + "grad_norm": 0.033203125, + "learning_rate": 2.9889117072044436e-05, + "loss": 0.5196, + "step": 66 + }, + { + "epoch": 0.18400274630964641, + "grad_norm": 0.03515625, + "learning_rate": 2.9885126716777776e-05, + "loss": 0.5952, + "step": 67 + }, + { + "epoch": 0.18674905595605903, + "grad_norm": 0.0322265625, + "learning_rate": 2.9881066100317288e-05, + "loss": 0.6194, + "step": 68 + }, + { + "epoch": 0.18949536560247168, + "grad_norm": 0.0341796875, + "learning_rate": 2.987693524183e-05, + "loss": 0.4453, + "step": 69 + }, + { + "epoch": 0.1922416752488843, + "grad_norm": 0.03369140625, + "learning_rate": 2.987273416081451e-05, + "loss": 0.524, + "step": 70 + }, + { + "epoch": 0.19498798489529695, + "grad_norm": 0.03857421875, + "learning_rate": 2.9868462877100875e-05, + "loss": 0.5899, + "step": 71 + }, + { + "epoch": 0.19773429454170957, + "grad_norm": 0.036376953125, + "learning_rate": 2.9864121410850527e-05, + "loss": 0.4603, + "step": 72 + }, + { + "epoch": 0.20048060418812222, + "grad_norm": 0.03759765625, + "learning_rate": 2.9859709782556185e-05, + "loss": 0.4829, + "step": 73 + }, + { + "epoch": 0.20322691383453484, + "grad_norm": 0.034912109375, + "learning_rate": 2.9855228013041737e-05, + "loss": 0.5735, + "step": 74 + }, + { + "epoch": 0.2059732234809475, + "grad_norm": 0.03369140625, + "learning_rate": 2.9850676123462157e-05, + "loss": 0.5104, + "step": 75 + }, + { + "epoch": 0.2087195331273601, + "grad_norm": 0.03369140625, + "learning_rate": 2.98460541353034e-05, + "loss": 0.5501, + "step": 76 + }, + { + "epoch": 0.21146584277377276, + "grad_norm": 0.032958984375, + "learning_rate": 2.9841362070382307e-05, + "loss": 0.5119, + "step": 77 + }, + { + "epoch": 0.21421215242018538, + "grad_norm": 0.035400390625, + "learning_rate": 2.9836599950846493e-05, + "loss": 0.589, + "step": 78 + }, + { + "epoch": 0.216958462066598, + "grad_norm": 0.03173828125, + "learning_rate": 2.9831767799174255e-05, + "loss": 0.4544, + "step": 79 + }, + { + "epoch": 0.21970477171301064, + "grad_norm": 0.03564453125, + "learning_rate": 2.9826865638174445e-05, + "loss": 0.4294, + "step": 80 + }, + { + "epoch": 0.22245108135942326, + "grad_norm": 0.03466796875, + "learning_rate": 2.9821893490986382e-05, + "loss": 0.5649, + "step": 81 + }, + { + "epoch": 0.2251973910058359, + "grad_norm": 0.03759765625, + "learning_rate": 2.981685138107974e-05, + "loss": 0.532, + "step": 82 + }, + { + "epoch": 0.22794370065224853, + "grad_norm": 0.03369140625, + "learning_rate": 2.9811739332254418e-05, + "loss": 0.6026, + "step": 83 + }, + { + "epoch": 0.23069001029866118, + "grad_norm": 0.035400390625, + "learning_rate": 2.9806557368640457e-05, + "loss": 0.5516, + "step": 84 + }, + { + "epoch": 0.2334363199450738, + "grad_norm": 0.033203125, + "learning_rate": 2.9801305514697913e-05, + "loss": 0.4544, + "step": 85 + }, + { + "epoch": 0.23618262959148645, + "grad_norm": 0.03173828125, + "learning_rate": 2.9795983795216727e-05, + "loss": 0.5327, + "step": 86 + }, + { + "epoch": 0.23892893923789907, + "grad_norm": 0.0380859375, + "learning_rate": 2.979059223531664e-05, + "loss": 0.5217, + "step": 87 + }, + { + "epoch": 0.24167524888431172, + "grad_norm": 0.0361328125, + "learning_rate": 2.978513086044703e-05, + "loss": 0.4562, + "step": 88 + }, + { + "epoch": 0.24442155853072434, + "grad_norm": 0.0380859375, + "learning_rate": 2.9779599696386846e-05, + "loss": 0.763, + "step": 89 + }, + { + "epoch": 0.24716786817713698, + "grad_norm": 0.03369140625, + "learning_rate": 2.9773998769244434e-05, + "loss": 0.4698, + "step": 90 + }, + { + "epoch": 0.2499141778235496, + "grad_norm": 0.03662109375, + "learning_rate": 2.976832810545745e-05, + "loss": 0.5602, + "step": 91 + }, + { + "epoch": 0.2499141778235496, + "eval_loss": 0.5245938897132874, + "eval_runtime": 620.8292, + "eval_samples_per_second": 14.764, + "eval_steps_per_second": 14.764, + "step": 91 + }, + { + "epoch": 0.2526604874699622, + "grad_norm": 0.030517578125, + "learning_rate": 2.9762587731792725e-05, + "loss": 0.477, + "step": 92 + }, + { + "epoch": 0.2554067971163749, + "grad_norm": 0.03271484375, + "learning_rate": 2.9756777675346128e-05, + "loss": 0.5536, + "step": 93 + }, + { + "epoch": 0.2581531067627875, + "grad_norm": 0.036376953125, + "learning_rate": 2.9750897963542453e-05, + "loss": 0.5581, + "step": 94 + }, + { + "epoch": 0.2608994164092001, + "grad_norm": 0.04345703125, + "learning_rate": 2.974494862413528e-05, + "loss": 0.5737, + "step": 95 + }, + { + "epoch": 0.26364572605561276, + "grad_norm": 0.03662109375, + "learning_rate": 2.973892968520685e-05, + "loss": 0.5191, + "step": 96 + }, + { + "epoch": 0.2663920357020254, + "grad_norm": 0.035888671875, + "learning_rate": 2.9732841175167924e-05, + "loss": 0.6794, + "step": 97 + }, + { + "epoch": 0.26913834534843806, + "grad_norm": 0.036376953125, + "learning_rate": 2.9726683122757664e-05, + "loss": 0.5615, + "step": 98 + }, + { + "epoch": 0.27188465499485065, + "grad_norm": 0.036376953125, + "learning_rate": 2.972045555704348e-05, + "loss": 0.521, + "step": 99 + }, + { + "epoch": 0.2746309646412633, + "grad_norm": 0.036376953125, + "learning_rate": 2.97141585074209e-05, + "loss": 0.4473, + "step": 100 + }, + { + "epoch": 0.27737727428767595, + "grad_norm": 0.033203125, + "learning_rate": 2.9707792003613434e-05, + "loss": 0.6017, + "step": 101 + }, + { + "epoch": 0.2801235839340886, + "grad_norm": 0.042236328125, + "learning_rate": 2.9701356075672442e-05, + "loss": 0.5079, + "step": 102 + }, + { + "epoch": 0.2828698935805012, + "grad_norm": 0.038818359375, + "learning_rate": 2.969485075397696e-05, + "loss": 0.5738, + "step": 103 + }, + { + "epoch": 0.28561620322691383, + "grad_norm": 0.034423828125, + "learning_rate": 2.9688276069233596e-05, + "loss": 0.4251, + "step": 104 + }, + { + "epoch": 0.2883625128733265, + "grad_norm": 0.03759765625, + "learning_rate": 2.968163205247636e-05, + "loss": 0.5902, + "step": 105 + }, + { + "epoch": 0.2911088225197391, + "grad_norm": 0.043701171875, + "learning_rate": 2.9674918735066534e-05, + "loss": 0.4307, + "step": 106 + }, + { + "epoch": 0.2938551321661517, + "grad_norm": 0.039306640625, + "learning_rate": 2.9668136148692497e-05, + "loss": 0.4018, + "step": 107 + }, + { + "epoch": 0.29660144181256437, + "grad_norm": 0.039306640625, + "learning_rate": 2.966128432536961e-05, + "loss": 0.5109, + "step": 108 + }, + { + "epoch": 0.299347751458977, + "grad_norm": 0.0361328125, + "learning_rate": 2.9654363297440045e-05, + "loss": 0.6136, + "step": 109 + }, + { + "epoch": 0.3020940611053896, + "grad_norm": 0.038818359375, + "learning_rate": 2.964737309757262e-05, + "loss": 0.4161, + "step": 110 + }, + { + "epoch": 0.30484037075180226, + "grad_norm": 0.0361328125, + "learning_rate": 2.9640313758762692e-05, + "loss": 0.4268, + "step": 111 + }, + { + "epoch": 0.3075866803982149, + "grad_norm": 0.03466796875, + "learning_rate": 2.9633185314331933e-05, + "loss": 0.4809, + "step": 112 + }, + { + "epoch": 0.31033299004462755, + "grad_norm": 0.03662109375, + "learning_rate": 2.9625987797928237e-05, + "loss": 0.4976, + "step": 113 + }, + { + "epoch": 0.31307929969104015, + "grad_norm": 0.03955078125, + "learning_rate": 2.9618721243525522e-05, + "loss": 0.5508, + "step": 114 + }, + { + "epoch": 0.3158256093374528, + "grad_norm": 0.045166015625, + "learning_rate": 2.9611385685423582e-05, + "loss": 0.4852, + "step": 115 + }, + { + "epoch": 0.31857191898386544, + "grad_norm": 0.040771484375, + "learning_rate": 2.9603981158247918e-05, + "loss": 0.4301, + "step": 116 + }, + { + "epoch": 0.3213182286302781, + "grad_norm": 0.0419921875, + "learning_rate": 2.9596507696949598e-05, + "loss": 0.4456, + "step": 117 + }, + { + "epoch": 0.3240645382766907, + "grad_norm": 0.038330078125, + "learning_rate": 2.9588965336805065e-05, + "loss": 0.6092, + "step": 118 + }, + { + "epoch": 0.32681084792310333, + "grad_norm": 0.033935546875, + "learning_rate": 2.958135411341597e-05, + "loss": 0.4823, + "step": 119 + }, + { + "epoch": 0.329557157569516, + "grad_norm": 0.03662109375, + "learning_rate": 2.9573674062709024e-05, + "loss": 0.4666, + "step": 120 + }, + { + "epoch": 0.33230346721592857, + "grad_norm": 0.035888671875, + "learning_rate": 2.9565925220935828e-05, + "loss": 0.4868, + "step": 121 + }, + { + "epoch": 0.3350497768623412, + "grad_norm": 0.034423828125, + "learning_rate": 2.9558107624672673e-05, + "loss": 0.529, + "step": 122 + }, + { + "epoch": 0.33779608650875387, + "grad_norm": 0.0341796875, + "learning_rate": 2.9550221310820405e-05, + "loss": 0.4308, + "step": 123 + }, + { + "epoch": 0.3405423961551665, + "grad_norm": 0.0341796875, + "learning_rate": 2.9542266316604213e-05, + "loss": 0.4058, + "step": 124 + }, + { + "epoch": 0.3432887058015791, + "grad_norm": 0.03466796875, + "learning_rate": 2.95342426795735e-05, + "loss": 0.4788, + "step": 125 + }, + { + "epoch": 0.34603501544799176, + "grad_norm": 0.20703125, + "learning_rate": 2.952615043760165e-05, + "loss": 1.2963, + "step": 126 + }, + { + "epoch": 0.3487813250944044, + "grad_norm": 0.0361328125, + "learning_rate": 2.95179896288859e-05, + "loss": 0.5734, + "step": 127 + }, + { + "epoch": 0.35152763474081705, + "grad_norm": 0.03515625, + "learning_rate": 2.9509760291947128e-05, + "loss": 0.4352, + "step": 128 + }, + { + "epoch": 0.35427394438722964, + "grad_norm": 0.03564453125, + "learning_rate": 2.9501462465629672e-05, + "loss": 0.6082, + "step": 129 + }, + { + "epoch": 0.3570202540336423, + "grad_norm": 0.035400390625, + "learning_rate": 2.949309618910118e-05, + "loss": 0.4699, + "step": 130 + }, + { + "epoch": 0.35976656368005494, + "grad_norm": 0.03759765625, + "learning_rate": 2.9484661501852373e-05, + "loss": 0.5504, + "step": 131 + }, + { + "epoch": 0.36251287332646753, + "grad_norm": 0.037353515625, + "learning_rate": 2.94761584436969e-05, + "loss": 0.545, + "step": 132 + }, + { + "epoch": 0.3652591829728802, + "grad_norm": 0.03515625, + "learning_rate": 2.9467587054771146e-05, + "loss": 0.445, + "step": 133 + }, + { + "epoch": 0.36800549261929283, + "grad_norm": 0.1806640625, + "learning_rate": 2.945894737553401e-05, + "loss": 1.1891, + "step": 134 + }, + { + "epoch": 0.3707518022657055, + "grad_norm": 0.037841796875, + "learning_rate": 2.945023944676676e-05, + "loss": 0.565, + "step": 135 + }, + { + "epoch": 0.37349811191211807, + "grad_norm": 0.042236328125, + "learning_rate": 2.9441463309572797e-05, + "loss": 0.6599, + "step": 136 + }, + { + "epoch": 0.3762444215585307, + "grad_norm": 0.0380859375, + "learning_rate": 2.9432619005377496e-05, + "loss": 0.4754, + "step": 137 + }, + { + "epoch": 0.37899073120494337, + "grad_norm": 0.036376953125, + "learning_rate": 2.9423706575927985e-05, + "loss": 0.4966, + "step": 138 + }, + { + "epoch": 0.381737040851356, + "grad_norm": 0.0390625, + "learning_rate": 2.9414726063292974e-05, + "loss": 0.4269, + "step": 139 + }, + { + "epoch": 0.3844833504977686, + "grad_norm": 0.038818359375, + "learning_rate": 2.940567750986252e-05, + "loss": 0.5516, + "step": 140 + }, + { + "epoch": 0.38722966014418125, + "grad_norm": 0.0361328125, + "learning_rate": 2.9396560958347865e-05, + "loss": 0.486, + "step": 141 + }, + { + "epoch": 0.3899759697905939, + "grad_norm": 0.034912109375, + "learning_rate": 2.9387376451781215e-05, + "loss": 0.4506, + "step": 142 + }, + { + "epoch": 0.39272227943700655, + "grad_norm": 0.039306640625, + "learning_rate": 2.9378124033515533e-05, + "loss": 0.6122, + "step": 143 + }, + { + "epoch": 0.39546858908341914, + "grad_norm": 0.03466796875, + "learning_rate": 2.936880374722434e-05, + "loss": 0.4776, + "step": 144 + }, + { + "epoch": 0.3982148987298318, + "grad_norm": 0.041015625, + "learning_rate": 2.9359415636901522e-05, + "loss": 0.5574, + "step": 145 + }, + { + "epoch": 0.40096120837624444, + "grad_norm": 0.035888671875, + "learning_rate": 2.9349959746861093e-05, + "loss": 0.5289, + "step": 146 + }, + { + "epoch": 0.40370751802265703, + "grad_norm": 0.037109375, + "learning_rate": 2.9340436121737018e-05, + "loss": 0.4664, + "step": 147 + }, + { + "epoch": 0.4064538276690697, + "grad_norm": 0.04345703125, + "learning_rate": 2.9330844806482974e-05, + "loss": 0.5322, + "step": 148 + }, + { + "epoch": 0.4092001373154823, + "grad_norm": 0.037109375, + "learning_rate": 2.9321185846372162e-05, + "loss": 0.4143, + "step": 149 + }, + { + "epoch": 0.411946446961895, + "grad_norm": 0.0400390625, + "learning_rate": 2.9311459286997073e-05, + "loss": 0.5038, + "step": 150 + }, + { + "epoch": 0.41469275660830757, + "grad_norm": 0.036376953125, + "learning_rate": 2.930166517426929e-05, + "loss": 0.4905, + "step": 151 + }, + { + "epoch": 0.4174390662547202, + "grad_norm": 0.03955078125, + "learning_rate": 2.929180355441926e-05, + "loss": 0.5357, + "step": 152 + }, + { + "epoch": 0.42018537590113286, + "grad_norm": 0.038818359375, + "learning_rate": 2.9281874473996077e-05, + "loss": 0.5449, + "step": 153 + }, + { + "epoch": 0.4229316855475455, + "grad_norm": 0.1689453125, + "learning_rate": 2.9271877979867263e-05, + "loss": 1.3347, + "step": 154 + }, + { + "epoch": 0.4256779951939581, + "grad_norm": 0.03759765625, + "learning_rate": 2.926181411921855e-05, + "loss": 0.4532, + "step": 155 + }, + { + "epoch": 0.42842430484037075, + "grad_norm": 0.03857421875, + "learning_rate": 2.9251682939553662e-05, + "loss": 0.5425, + "step": 156 + }, + { + "epoch": 0.4311706144867834, + "grad_norm": 0.047607421875, + "learning_rate": 2.9241484488694074e-05, + "loss": 0.4875, + "step": 157 + }, + { + "epoch": 0.433916924133196, + "grad_norm": 0.040283203125, + "learning_rate": 2.92312188147788e-05, + "loss": 0.4574, + "step": 158 + }, + { + "epoch": 0.43666323377960864, + "grad_norm": 0.035888671875, + "learning_rate": 2.9220885966264174e-05, + "loss": 0.5003, + "step": 159 + }, + { + "epoch": 0.4394095434260213, + "grad_norm": 0.037841796875, + "learning_rate": 2.9210485991923577e-05, + "loss": 0.4766, + "step": 160 + }, + { + "epoch": 0.44215585307243394, + "grad_norm": 0.03515625, + "learning_rate": 2.9200018940847278e-05, + "loss": 0.3866, + "step": 161 + }, + { + "epoch": 0.44490216271884653, + "grad_norm": 0.035888671875, + "learning_rate": 2.918948486244214e-05, + "loss": 0.4401, + "step": 162 + }, + { + "epoch": 0.4476484723652592, + "grad_norm": 0.037841796875, + "learning_rate": 2.917888380643142e-05, + "loss": 0.5193, + "step": 163 + }, + { + "epoch": 0.4503947820116718, + "grad_norm": 0.0380859375, + "learning_rate": 2.916821582285451e-05, + "loss": 0.4802, + "step": 164 + }, + { + "epoch": 0.45314109165808447, + "grad_norm": 0.037353515625, + "learning_rate": 2.915748096206674e-05, + "loss": 0.4693, + "step": 165 + }, + { + "epoch": 0.45588740130449706, + "grad_norm": 0.04052734375, + "learning_rate": 2.914667927473909e-05, + "loss": 0.4949, + "step": 166 + }, + { + "epoch": 0.4586337109509097, + "grad_norm": 0.036376953125, + "learning_rate": 2.9135810811857994e-05, + "loss": 0.5453, + "step": 167 + }, + { + "epoch": 0.46138002059732236, + "grad_norm": 0.0498046875, + "learning_rate": 2.912487562472508e-05, + "loss": 0.4653, + "step": 168 + }, + { + "epoch": 0.464126330243735, + "grad_norm": 0.0380859375, + "learning_rate": 2.9113873764956917e-05, + "loss": 0.5032, + "step": 169 + }, + { + "epoch": 0.4668726398901476, + "grad_norm": 0.037109375, + "learning_rate": 2.91028052844848e-05, + "loss": 0.4736, + "step": 170 + }, + { + "epoch": 0.46961894953656025, + "grad_norm": 0.038818359375, + "learning_rate": 2.9091670235554478e-05, + "loss": 0.4773, + "step": 171 + }, + { + "epoch": 0.4723652591829729, + "grad_norm": 0.041748046875, + "learning_rate": 2.9080468670725922e-05, + "loss": 0.5689, + "step": 172 + }, + { + "epoch": 0.4751115688293855, + "grad_norm": 0.04296875, + "learning_rate": 2.906920064287308e-05, + "loss": 0.51, + "step": 173 + }, + { + "epoch": 0.47785787847579814, + "grad_norm": 0.03564453125, + "learning_rate": 2.9057866205183606e-05, + "loss": 0.446, + "step": 174 + }, + { + "epoch": 0.4806041881222108, + "grad_norm": 0.036376953125, + "learning_rate": 2.9046465411158634e-05, + "loss": 0.4956, + "step": 175 + }, + { + "epoch": 0.48335049776862343, + "grad_norm": 0.038818359375, + "learning_rate": 2.9034998314612516e-05, + "loss": 0.4963, + "step": 176 + }, + { + "epoch": 0.486096807415036, + "grad_norm": 0.0419921875, + "learning_rate": 2.902346496967256e-05, + "loss": 0.4928, + "step": 177 + }, + { + "epoch": 0.4888431170614487, + "grad_norm": 0.0439453125, + "learning_rate": 2.9011865430778782e-05, + "loss": 0.4731, + "step": 178 + }, + { + "epoch": 0.4915894267078613, + "grad_norm": 0.03955078125, + "learning_rate": 2.9000199752683663e-05, + "loss": 0.5374, + "step": 179 + }, + { + "epoch": 0.49433573635427397, + "grad_norm": 0.040771484375, + "learning_rate": 2.8988467990451853e-05, + "loss": 0.6108, + "step": 180 + }, + { + "epoch": 0.49708204600068656, + "grad_norm": 0.037109375, + "learning_rate": 2.8976670199459953e-05, + "loss": 0.4189, + "step": 181 + }, + { + "epoch": 0.4998283556470992, + "grad_norm": 0.0380859375, + "learning_rate": 2.8964806435396227e-05, + "loss": 0.4773, + "step": 182 + }, + { + "epoch": 0.4998283556470992, + "eval_loss": 0.5154861807823181, + "eval_runtime": 620.9342, + "eval_samples_per_second": 14.762, + "eval_steps_per_second": 14.762, + "step": 182 + }, + { + "epoch": 0.5025746652935118, + "grad_norm": 0.043701171875, + "learning_rate": 2.8952876754260342e-05, + "loss": 0.5624, + "step": 183 + }, + { + "epoch": 0.5053209749399244, + "grad_norm": 0.1904296875, + "learning_rate": 2.8940881212363124e-05, + "loss": 1.2595, + "step": 184 + }, + { + "epoch": 0.5080672845863371, + "grad_norm": 0.0390625, + "learning_rate": 2.8928819866326262e-05, + "loss": 0.6287, + "step": 185 + }, + { + "epoch": 0.5108135942327497, + "grad_norm": 0.035400390625, + "learning_rate": 2.891669277308206e-05, + "loss": 0.4508, + "step": 186 + }, + { + "epoch": 0.5135599038791624, + "grad_norm": 0.03662109375, + "learning_rate": 2.8904499989873166e-05, + "loss": 0.5141, + "step": 187 + }, + { + "epoch": 0.516306213525575, + "grad_norm": 0.0390625, + "learning_rate": 2.88922415742523e-05, + "loss": 0.4496, + "step": 188 + }, + { + "epoch": 0.5190525231719877, + "grad_norm": 0.0361328125, + "learning_rate": 2.8879917584081975e-05, + "loss": 0.5467, + "step": 189 + }, + { + "epoch": 0.5217988328184002, + "grad_norm": 0.046875, + "learning_rate": 2.886752807753424e-05, + "loss": 0.4188, + "step": 190 + }, + { + "epoch": 0.5245451424648129, + "grad_norm": 0.039794921875, + "learning_rate": 2.8855073113090395e-05, + "loss": 0.5347, + "step": 191 + }, + { + "epoch": 0.5272914521112255, + "grad_norm": 0.0400390625, + "learning_rate": 2.8842552749540708e-05, + "loss": 0.4117, + "step": 192 + }, + { + "epoch": 0.5300377617576382, + "grad_norm": 0.037841796875, + "learning_rate": 2.8829967045984155e-05, + "loss": 0.5413, + "step": 193 + }, + { + "epoch": 0.5327840714040508, + "grad_norm": 0.036865234375, + "learning_rate": 2.8817316061828126e-05, + "loss": 0.5683, + "step": 194 + }, + { + "epoch": 0.5355303810504635, + "grad_norm": 0.03955078125, + "learning_rate": 2.8804599856788154e-05, + "loss": 0.3851, + "step": 195 + }, + { + "epoch": 0.5382766906968761, + "grad_norm": 0.03662109375, + "learning_rate": 2.8791818490887628e-05, + "loss": 0.42, + "step": 196 + }, + { + "epoch": 0.5410230003432887, + "grad_norm": 0.039306640625, + "learning_rate": 2.8778972024457504e-05, + "loss": 0.5491, + "step": 197 + }, + { + "epoch": 0.5437693099897013, + "grad_norm": 0.043701171875, + "learning_rate": 2.876606051813604e-05, + "loss": 0.5299, + "step": 198 + }, + { + "epoch": 0.546515619636114, + "grad_norm": 0.039794921875, + "learning_rate": 2.8753084032868494e-05, + "loss": 0.4881, + "step": 199 + }, + { + "epoch": 0.5492619292825266, + "grad_norm": 0.03857421875, + "learning_rate": 2.8740042629906833e-05, + "loss": 0.4698, + "step": 200 + }, + { + "epoch": 0.5520082389289392, + "grad_norm": 0.042724609375, + "learning_rate": 2.8726936370809455e-05, + "loss": 0.5685, + "step": 201 + }, + { + "epoch": 0.5547545485753519, + "grad_norm": 0.040771484375, + "learning_rate": 2.8713765317440895e-05, + "loss": 0.5536, + "step": 202 + }, + { + "epoch": 0.5575008582217645, + "grad_norm": 0.040283203125, + "learning_rate": 2.870052953197152e-05, + "loss": 0.4891, + "step": 203 + }, + { + "epoch": 0.5602471678681772, + "grad_norm": 0.048828125, + "learning_rate": 2.8687229076877274e-05, + "loss": 0.4182, + "step": 204 + }, + { + "epoch": 0.5629934775145897, + "grad_norm": 0.0400390625, + "learning_rate": 2.867386401493932e-05, + "loss": 0.507, + "step": 205 + }, + { + "epoch": 0.5657397871610024, + "grad_norm": 0.03466796875, + "learning_rate": 2.8660434409243817e-05, + "loss": 0.4052, + "step": 206 + }, + { + "epoch": 0.568486096807415, + "grad_norm": 0.044677734375, + "learning_rate": 2.8646940323181553e-05, + "loss": 0.4503, + "step": 207 + }, + { + "epoch": 0.5712324064538277, + "grad_norm": 0.037841796875, + "learning_rate": 2.86333818204477e-05, + "loss": 0.4234, + "step": 208 + }, + { + "epoch": 0.5739787161002403, + "grad_norm": 0.03857421875, + "learning_rate": 2.8619758965041488e-05, + "loss": 0.5319, + "step": 209 + }, + { + "epoch": 0.576725025746653, + "grad_norm": 0.03662109375, + "learning_rate": 2.8606071821265888e-05, + "loss": 0.5282, + "step": 210 + }, + { + "epoch": 0.5794713353930656, + "grad_norm": 0.04248046875, + "learning_rate": 2.8592320453727356e-05, + "loss": 0.4596, + "step": 211 + }, + { + "epoch": 0.5822176450394781, + "grad_norm": 0.037841796875, + "learning_rate": 2.857850492733548e-05, + "loss": 0.5258, + "step": 212 + }, + { + "epoch": 0.5849639546858908, + "grad_norm": 0.0380859375, + "learning_rate": 2.856462530730269e-05, + "loss": 0.4836, + "step": 213 + }, + { + "epoch": 0.5877102643323034, + "grad_norm": 0.040771484375, + "learning_rate": 2.855068165914397e-05, + "loss": 0.4973, + "step": 214 + }, + { + "epoch": 0.5904565739787161, + "grad_norm": 0.03857421875, + "learning_rate": 2.8536674048676506e-05, + "loss": 0.5643, + "step": 215 + }, + { + "epoch": 0.5932028836251287, + "grad_norm": 0.04150390625, + "learning_rate": 2.8522602542019425e-05, + "loss": 0.476, + "step": 216 + }, + { + "epoch": 0.5959491932715414, + "grad_norm": 0.0390625, + "learning_rate": 2.850846720559345e-05, + "loss": 0.4767, + "step": 217 + }, + { + "epoch": 0.598695502917954, + "grad_norm": 0.044189453125, + "learning_rate": 2.8494268106120586e-05, + "loss": 0.5567, + "step": 218 + }, + { + "epoch": 0.6014418125643667, + "grad_norm": 0.0439453125, + "learning_rate": 2.8480005310623823e-05, + "loss": 0.536, + "step": 219 + }, + { + "epoch": 0.6041881222107792, + "grad_norm": 0.04248046875, + "learning_rate": 2.8465678886426814e-05, + "loss": 0.4813, + "step": 220 + }, + { + "epoch": 0.6069344318571919, + "grad_norm": 0.038818359375, + "learning_rate": 2.845128890115355e-05, + "loss": 0.4215, + "step": 221 + }, + { + "epoch": 0.6096807415036045, + "grad_norm": 0.03857421875, + "learning_rate": 2.8436835422728036e-05, + "loss": 0.547, + "step": 222 + }, + { + "epoch": 0.6124270511500172, + "grad_norm": 0.0380859375, + "learning_rate": 2.8422318519373996e-05, + "loss": 0.4629, + "step": 223 + }, + { + "epoch": 0.6151733607964298, + "grad_norm": 0.052490234375, + "learning_rate": 2.8407738259614524e-05, + "loss": 0.4823, + "step": 224 + }, + { + "epoch": 0.6179196704428425, + "grad_norm": 0.04052734375, + "learning_rate": 2.8393094712271772e-05, + "loss": 0.5568, + "step": 225 + }, + { + "epoch": 0.6206659800892551, + "grad_norm": 0.03857421875, + "learning_rate": 2.8378387946466623e-05, + "loss": 0.4709, + "step": 226 + }, + { + "epoch": 0.6234122897356676, + "grad_norm": 0.0439453125, + "learning_rate": 2.8363618031618364e-05, + "loss": 0.4205, + "step": 227 + }, + { + "epoch": 0.6261585993820803, + "grad_norm": 0.0380859375, + "learning_rate": 2.8348785037444366e-05, + "loss": 0.4985, + "step": 228 + }, + { + "epoch": 0.6289049090284929, + "grad_norm": 0.039794921875, + "learning_rate": 2.8333889033959746e-05, + "loss": 0.4527, + "step": 229 + }, + { + "epoch": 0.6316512186749056, + "grad_norm": 0.03857421875, + "learning_rate": 2.8318930091477037e-05, + "loss": 0.582, + "step": 230 + }, + { + "epoch": 0.6343975283213182, + "grad_norm": 0.04296875, + "learning_rate": 2.8303908280605854e-05, + "loss": 0.5028, + "step": 231 + }, + { + "epoch": 0.6371438379677309, + "grad_norm": 0.03857421875, + "learning_rate": 2.8288823672252586e-05, + "loss": 0.5349, + "step": 232 + }, + { + "epoch": 0.6398901476141435, + "grad_norm": 0.038818359375, + "learning_rate": 2.827367633762001e-05, + "loss": 0.4251, + "step": 233 + }, + { + "epoch": 0.6426364572605562, + "grad_norm": 0.041259765625, + "learning_rate": 2.825846634820701e-05, + "loss": 0.5079, + "step": 234 + }, + { + "epoch": 0.6453827669069687, + "grad_norm": 0.353515625, + "learning_rate": 2.824319377580821e-05, + "loss": 1.2174, + "step": 235 + }, + { + "epoch": 0.6481290765533814, + "grad_norm": 0.038330078125, + "learning_rate": 2.8227858692513626e-05, + "loss": 0.4188, + "step": 236 + }, + { + "epoch": 0.650875386199794, + "grad_norm": 0.042236328125, + "learning_rate": 2.821246117070835e-05, + "loss": 0.4767, + "step": 237 + }, + { + "epoch": 0.6536216958462067, + "grad_norm": 0.03662109375, + "learning_rate": 2.8197001283072205e-05, + "loss": 0.4736, + "step": 238 + }, + { + "epoch": 0.6563680054926193, + "grad_norm": 0.035400390625, + "learning_rate": 2.8181479102579383e-05, + "loss": 0.388, + "step": 239 + }, + { + "epoch": 0.659114315139032, + "grad_norm": 0.0439453125, + "learning_rate": 2.8165894702498116e-05, + "loss": 0.6023, + "step": 240 + }, + { + "epoch": 0.6618606247854446, + "grad_norm": 0.03955078125, + "learning_rate": 2.8150248156390327e-05, + "loss": 0.5319, + "step": 241 + }, + { + "epoch": 0.6646069344318571, + "grad_norm": 0.037109375, + "learning_rate": 2.8134539538111286e-05, + "loss": 0.5133, + "step": 242 + }, + { + "epoch": 0.6673532440782698, + "grad_norm": 0.039794921875, + "learning_rate": 2.8118768921809258e-05, + "loss": 0.4813, + "step": 243 + }, + { + "epoch": 0.6700995537246824, + "grad_norm": 0.03857421875, + "learning_rate": 2.8102936381925143e-05, + "loss": 0.5085, + "step": 244 + }, + { + "epoch": 0.6728458633710951, + "grad_norm": 0.04248046875, + "learning_rate": 2.8087041993192148e-05, + "loss": 0.4245, + "step": 245 + }, + { + "epoch": 0.6755921730175077, + "grad_norm": 0.045166015625, + "learning_rate": 2.8071085830635404e-05, + "loss": 0.5026, + "step": 246 + }, + { + "epoch": 0.6783384826639204, + "grad_norm": 0.0439453125, + "learning_rate": 2.8055067969571647e-05, + "loss": 0.5615, + "step": 247 + }, + { + "epoch": 0.681084792310333, + "grad_norm": 0.037841796875, + "learning_rate": 2.803898848560883e-05, + "loss": 0.4929, + "step": 248 + }, + { + "epoch": 0.6838311019567456, + "grad_norm": 0.042724609375, + "learning_rate": 2.802284745464579e-05, + "loss": 0.5747, + "step": 249 + }, + { + "epoch": 0.6865774116031582, + "grad_norm": 0.041015625, + "learning_rate": 2.800664495287187e-05, + "loss": 0.4181, + "step": 250 + }, + { + "epoch": 0.6893237212495709, + "grad_norm": 0.040283203125, + "learning_rate": 2.7990381056766583e-05, + "loss": 0.548, + "step": 251 + }, + { + "epoch": 0.6920700308959835, + "grad_norm": 0.043212890625, + "learning_rate": 2.797405584309922e-05, + "loss": 0.5344, + "step": 252 + }, + { + "epoch": 0.6948163405423962, + "grad_norm": 0.03955078125, + "learning_rate": 2.7957669388928517e-05, + "loss": 0.4484, + "step": 253 + }, + { + "epoch": 0.6975626501888088, + "grad_norm": 0.03857421875, + "learning_rate": 2.7941221771602278e-05, + "loss": 0.5194, + "step": 254 + }, + { + "epoch": 0.7003089598352215, + "grad_norm": 0.0390625, + "learning_rate": 2.7924713068757004e-05, + "loss": 0.4297, + "step": 255 + }, + { + "epoch": 0.7030552694816341, + "grad_norm": 0.0400390625, + "learning_rate": 2.7908143358317545e-05, + "loss": 0.4723, + "step": 256 + }, + { + "epoch": 0.7058015791280466, + "grad_norm": 0.03662109375, + "learning_rate": 2.7891512718496712e-05, + "loss": 0.4401, + "step": 257 + }, + { + "epoch": 0.7085478887744593, + "grad_norm": 0.041015625, + "learning_rate": 2.7874821227794915e-05, + "loss": 0.5961, + "step": 258 + }, + { + "epoch": 0.7112941984208719, + "grad_norm": 0.039794921875, + "learning_rate": 2.78580689649998e-05, + "loss": 0.5483, + "step": 259 + }, + { + "epoch": 0.7140405080672846, + "grad_norm": 0.05126953125, + "learning_rate": 2.7841256009185876e-05, + "loss": 0.493, + "step": 260 + }, + { + "epoch": 0.7167868177136972, + "grad_norm": 0.042236328125, + "learning_rate": 2.782438243971412e-05, + "loss": 0.5366, + "step": 261 + }, + { + "epoch": 0.7195331273601099, + "grad_norm": 0.0380859375, + "learning_rate": 2.7807448336231635e-05, + "loss": 0.3991, + "step": 262 + }, + { + "epoch": 0.7222794370065225, + "grad_norm": 0.048095703125, + "learning_rate": 2.7790453778671248e-05, + "loss": 0.528, + "step": 263 + }, + { + "epoch": 0.7250257466529351, + "grad_norm": 0.0458984375, + "learning_rate": 2.7773398847251152e-05, + "loss": 0.4221, + "step": 264 + }, + { + "epoch": 0.7277720562993477, + "grad_norm": 0.03955078125, + "learning_rate": 2.7756283622474515e-05, + "loss": 0.4483, + "step": 265 + }, + { + "epoch": 0.7305183659457604, + "grad_norm": 0.0400390625, + "learning_rate": 2.77391081851291e-05, + "loss": 0.4633, + "step": 266 + }, + { + "epoch": 0.733264675592173, + "grad_norm": 0.04248046875, + "learning_rate": 2.7721872616286888e-05, + "loss": 0.5595, + "step": 267 + }, + { + "epoch": 0.7360109852385857, + "grad_norm": 0.0380859375, + "learning_rate": 2.7704576997303694e-05, + "loss": 0.5091, + "step": 268 + }, + { + "epoch": 0.7387572948849983, + "grad_norm": 0.0361328125, + "learning_rate": 2.768722140981879e-05, + "loss": 0.4357, + "step": 269 + }, + { + "epoch": 0.741503604531411, + "grad_norm": 0.03955078125, + "learning_rate": 2.766980593575451e-05, + "loss": 0.4608, + "step": 270 + }, + { + "epoch": 0.7442499141778236, + "grad_norm": 0.03662109375, + "learning_rate": 2.765233065731586e-05, + "loss": 0.4593, + "step": 271 + }, + { + "epoch": 0.7469962238242361, + "grad_norm": 0.04150390625, + "learning_rate": 2.7634795656990143e-05, + "loss": 0.5097, + "step": 272 + }, + { + "epoch": 0.7497425334706488, + "grad_norm": 0.039794921875, + "learning_rate": 2.761720101754656e-05, + "loss": 0.4375, + "step": 273 + }, + { + "epoch": 0.7497425334706488, + "eval_loss": 0.5116191506385803, + "eval_runtime": 620.1922, + "eval_samples_per_second": 14.779, + "eval_steps_per_second": 14.779, + "step": 273 + }, + { + "epoch": 0.7524888431170614, + "grad_norm": 0.039306640625, + "learning_rate": 2.7599546822035817e-05, + "loss": 0.5089, + "step": 274 + }, + { + "epoch": 0.7552351527634741, + "grad_norm": 0.04150390625, + "learning_rate": 2.758183315378976e-05, + "loss": 0.5961, + "step": 275 + }, + { + "epoch": 0.7579814624098867, + "grad_norm": 0.041259765625, + "learning_rate": 2.7564060096420925e-05, + "loss": 0.4763, + "step": 276 + }, + { + "epoch": 0.7607277720562994, + "grad_norm": 0.0419921875, + "learning_rate": 2.754622773382221e-05, + "loss": 0.5076, + "step": 277 + }, + { + "epoch": 0.763474081702712, + "grad_norm": 0.0439453125, + "learning_rate": 2.7528336150166436e-05, + "loss": 0.4411, + "step": 278 + }, + { + "epoch": 0.7662203913491246, + "grad_norm": 0.04736328125, + "learning_rate": 2.751038542990595e-05, + "loss": 0.5316, + "step": 279 + }, + { + "epoch": 0.7689667009955372, + "grad_norm": 0.0439453125, + "learning_rate": 2.7492375657772254e-05, + "loss": 0.4153, + "step": 280 + }, + { + "epoch": 0.7717130106419499, + "grad_norm": 0.039794921875, + "learning_rate": 2.7474306918775576e-05, + "loss": 0.5106, + "step": 281 + }, + { + "epoch": 0.7744593202883625, + "grad_norm": 0.04638671875, + "learning_rate": 2.745617929820449e-05, + "loss": 0.474, + "step": 282 + }, + { + "epoch": 0.7772056299347752, + "grad_norm": 0.142578125, + "learning_rate": 2.74379928816255e-05, + "loss": 1.2147, + "step": 283 + }, + { + "epoch": 0.7799519395811878, + "grad_norm": 0.044677734375, + "learning_rate": 2.7419747754882637e-05, + "loss": 0.5727, + "step": 284 + }, + { + "epoch": 0.7826982492276005, + "grad_norm": 0.046142578125, + "learning_rate": 2.740144400409707e-05, + "loss": 0.5203, + "step": 285 + }, + { + "epoch": 0.7854445588740131, + "grad_norm": 0.041259765625, + "learning_rate": 2.738308171566667e-05, + "loss": 0.5998, + "step": 286 + }, + { + "epoch": 0.7881908685204256, + "grad_norm": 0.043212890625, + "learning_rate": 2.7364660976265624e-05, + "loss": 0.5133, + "step": 287 + }, + { + "epoch": 0.7909371781668383, + "grad_norm": 0.038818359375, + "learning_rate": 2.7346181872844037e-05, + "loss": 0.4711, + "step": 288 + }, + { + "epoch": 0.7936834878132509, + "grad_norm": 0.041748046875, + "learning_rate": 2.7327644492627487e-05, + "loss": 0.5563, + "step": 289 + }, + { + "epoch": 0.7964297974596636, + "grad_norm": 0.042236328125, + "learning_rate": 2.7309048923116635e-05, + "loss": 0.4684, + "step": 290 + }, + { + "epoch": 0.7991761071060762, + "grad_norm": 0.037841796875, + "learning_rate": 2.729039525208682e-05, + "loss": 0.4581, + "step": 291 + }, + { + "epoch": 0.8019224167524889, + "grad_norm": 0.03759765625, + "learning_rate": 2.7271683567587608e-05, + "loss": 0.4502, + "step": 292 + }, + { + "epoch": 0.8046687263989015, + "grad_norm": 0.0390625, + "learning_rate": 2.7252913957942435e-05, + "loss": 0.564, + "step": 293 + }, + { + "epoch": 0.8074150360453141, + "grad_norm": 0.041259765625, + "learning_rate": 2.723408651174813e-05, + "loss": 0.4386, + "step": 294 + }, + { + "epoch": 0.8101613456917267, + "grad_norm": 0.039794921875, + "learning_rate": 2.7215201317874537e-05, + "loss": 0.5623, + "step": 295 + }, + { + "epoch": 0.8129076553381394, + "grad_norm": 0.043701171875, + "learning_rate": 2.7196258465464087e-05, + "loss": 0.5303, + "step": 296 + }, + { + "epoch": 0.815653964984552, + "grad_norm": 0.04248046875, + "learning_rate": 2.7177258043931354e-05, + "loss": 0.5094, + "step": 297 + }, + { + "epoch": 0.8184002746309647, + "grad_norm": 0.038818359375, + "learning_rate": 2.7158200142962665e-05, + "loss": 0.502, + "step": 298 + }, + { + "epoch": 0.8211465842773773, + "grad_norm": 0.044921875, + "learning_rate": 2.7139084852515665e-05, + "loss": 0.4744, + "step": 299 + }, + { + "epoch": 0.82389289392379, + "grad_norm": 0.039306640625, + "learning_rate": 2.7119912262818878e-05, + "loss": 0.5895, + "step": 300 + }, + { + "epoch": 0.8266392035702025, + "grad_norm": 0.040771484375, + "learning_rate": 2.7100682464371306e-05, + "loss": 0.3948, + "step": 301 + }, + { + "epoch": 0.8293855132166151, + "grad_norm": 0.038330078125, + "learning_rate": 2.7081395547941986e-05, + "loss": 0.4514, + "step": 302 + }, + { + "epoch": 0.8321318228630278, + "grad_norm": 0.04443359375, + "learning_rate": 2.7062051604569562e-05, + "loss": 0.4525, + "step": 303 + }, + { + "epoch": 0.8348781325094404, + "grad_norm": 0.038818359375, + "learning_rate": 2.7042650725561854e-05, + "loss": 0.4161, + "step": 304 + }, + { + "epoch": 0.8376244421558531, + "grad_norm": 0.042724609375, + "learning_rate": 2.7023193002495447e-05, + "loss": 0.5065, + "step": 305 + }, + { + "epoch": 0.8403707518022657, + "grad_norm": 0.16796875, + "learning_rate": 2.7003678527215224e-05, + "loss": 1.3831, + "step": 306 + }, + { + "epoch": 0.8431170614486784, + "grad_norm": 0.0419921875, + "learning_rate": 2.6984107391833972e-05, + "loss": 0.5368, + "step": 307 + }, + { + "epoch": 0.845863371095091, + "grad_norm": 0.037353515625, + "learning_rate": 2.6964479688731897e-05, + "loss": 0.4434, + "step": 308 + }, + { + "epoch": 0.8486096807415036, + "grad_norm": 0.04541015625, + "learning_rate": 2.694479551055625e-05, + "loss": 0.5286, + "step": 309 + }, + { + "epoch": 0.8513559903879162, + "grad_norm": 0.03759765625, + "learning_rate": 2.6925054950220834e-05, + "loss": 0.4054, + "step": 310 + }, + { + "epoch": 0.8541023000343289, + "grad_norm": 0.05419921875, + "learning_rate": 2.69052581009056e-05, + "loss": 0.3735, + "step": 311 + }, + { + "epoch": 0.8568486096807415, + "grad_norm": 0.039794921875, + "learning_rate": 2.68854050560562e-05, + "loss": 0.5696, + "step": 312 + }, + { + "epoch": 0.8595949193271541, + "grad_norm": 0.041259765625, + "learning_rate": 2.6865495909383525e-05, + "loss": 0.4851, + "step": 313 + }, + { + "epoch": 0.8623412289735668, + "grad_norm": 0.042724609375, + "learning_rate": 2.684553075486329e-05, + "loss": 0.5755, + "step": 314 + }, + { + "epoch": 0.8650875386199794, + "grad_norm": 0.04150390625, + "learning_rate": 2.682550968673558e-05, + "loss": 0.5376, + "step": 315 + }, + { + "epoch": 0.867833848266392, + "grad_norm": 0.040283203125, + "learning_rate": 2.6805432799504407e-05, + "loss": 0.5374, + "step": 316 + }, + { + "epoch": 0.8705801579128046, + "grad_norm": 0.037841796875, + "learning_rate": 2.6785300187937264e-05, + "loss": 0.421, + "step": 317 + }, + { + "epoch": 0.8733264675592173, + "grad_norm": 0.035888671875, + "learning_rate": 2.6765111947064654e-05, + "loss": 0.4206, + "step": 318 + }, + { + "epoch": 0.8760727772056299, + "grad_norm": 0.046630859375, + "learning_rate": 2.6744868172179692e-05, + "loss": 0.5895, + "step": 319 + }, + { + "epoch": 0.8788190868520426, + "grad_norm": 0.041259765625, + "learning_rate": 2.672456895883761e-05, + "loss": 0.4784, + "step": 320 + }, + { + "epoch": 0.8815653964984552, + "grad_norm": 0.039794921875, + "learning_rate": 2.670421440285533e-05, + "loss": 0.4898, + "step": 321 + }, + { + "epoch": 0.8843117061448679, + "grad_norm": 0.045654296875, + "learning_rate": 2.6683804600310997e-05, + "loss": 0.6258, + "step": 322 + }, + { + "epoch": 0.8870580157912805, + "grad_norm": 0.044677734375, + "learning_rate": 2.6663339647543528e-05, + "loss": 0.5587, + "step": 323 + }, + { + "epoch": 0.8898043254376931, + "grad_norm": 0.038330078125, + "learning_rate": 2.664281964115218e-05, + "loss": 0.4539, + "step": 324 + }, + { + "epoch": 0.8925506350841057, + "grad_norm": 0.037353515625, + "learning_rate": 2.6622244677996058e-05, + "loss": 0.4652, + "step": 325 + }, + { + "epoch": 0.8952969447305184, + "grad_norm": 0.037841796875, + "learning_rate": 2.660161485519368e-05, + "loss": 0.4624, + "step": 326 + }, + { + "epoch": 0.898043254376931, + "grad_norm": 0.039306640625, + "learning_rate": 2.6580930270122524e-05, + "loss": 0.5089, + "step": 327 + }, + { + "epoch": 0.9007895640233436, + "grad_norm": 0.044921875, + "learning_rate": 2.6560191020418545e-05, + "loss": 0.4246, + "step": 328 + }, + { + "epoch": 0.9035358736697563, + "grad_norm": 0.044189453125, + "learning_rate": 2.6539397203975732e-05, + "loss": 0.516, + "step": 329 + }, + { + "epoch": 0.9062821833161689, + "grad_norm": 0.041259765625, + "learning_rate": 2.6518548918945646e-05, + "loss": 0.5008, + "step": 330 + }, + { + "epoch": 0.9090284929625815, + "grad_norm": 0.04052734375, + "learning_rate": 2.6497646263736943e-05, + "loss": 0.5195, + "step": 331 + }, + { + "epoch": 0.9117748026089941, + "grad_norm": 0.044189453125, + "learning_rate": 2.6476689337014925e-05, + "loss": 0.5701, + "step": 332 + }, + { + "epoch": 0.9145211122554068, + "grad_norm": 0.043212890625, + "learning_rate": 2.6455678237701072e-05, + "loss": 0.5766, + "step": 333 + }, + { + "epoch": 0.9172674219018194, + "grad_norm": 0.03955078125, + "learning_rate": 2.643461306497256e-05, + "loss": 0.4613, + "step": 334 + }, + { + "epoch": 0.9200137315482321, + "grad_norm": 0.041748046875, + "learning_rate": 2.641349391826182e-05, + "loss": 0.4347, + "step": 335 + }, + { + "epoch": 0.9227600411946447, + "grad_norm": 0.045166015625, + "learning_rate": 2.6392320897256034e-05, + "loss": 0.4371, + "step": 336 + }, + { + "epoch": 0.9255063508410574, + "grad_norm": 0.04052734375, + "learning_rate": 2.637109410189669e-05, + "loss": 0.5219, + "step": 337 + }, + { + "epoch": 0.92825266048747, + "grad_norm": 0.040283203125, + "learning_rate": 2.6349813632379103e-05, + "loss": 0.5435, + "step": 338 + }, + { + "epoch": 0.9309989701338826, + "grad_norm": 0.04248046875, + "learning_rate": 2.6328479589151953e-05, + "loss": 0.4764, + "step": 339 + }, + { + "epoch": 0.9337452797802952, + "grad_norm": 0.042236328125, + "learning_rate": 2.6307092072916786e-05, + "loss": 0.4664, + "step": 340 + }, + { + "epoch": 0.9364915894267078, + "grad_norm": 0.045166015625, + "learning_rate": 2.628565118462756e-05, + "loss": 0.4723, + "step": 341 + }, + { + "epoch": 0.9392378990731205, + "grad_norm": 0.041259765625, + "learning_rate": 2.626415702549015e-05, + "loss": 0.5179, + "step": 342 + }, + { + "epoch": 0.9419842087195331, + "grad_norm": 0.0419921875, + "learning_rate": 2.62426096969619e-05, + "loss": 0.5736, + "step": 343 + }, + { + "epoch": 0.9447305183659458, + "grad_norm": 0.04541015625, + "learning_rate": 2.6221009300751113e-05, + "loss": 0.5238, + "step": 344 + }, + { + "epoch": 0.9474768280123584, + "grad_norm": 0.04248046875, + "learning_rate": 2.6199355938816586e-05, + "loss": 0.4591, + "step": 345 + }, + { + "epoch": 0.950223137658771, + "grad_norm": 0.040771484375, + "learning_rate": 2.6177649713367136e-05, + "loss": 0.5288, + "step": 346 + }, + { + "epoch": 0.9529694473051836, + "grad_norm": 0.044677734375, + "learning_rate": 2.6155890726861084e-05, + "loss": 0.5066, + "step": 347 + }, + { + "epoch": 0.9557157569515963, + "grad_norm": 0.0673828125, + "learning_rate": 2.613407908200582e-05, + "loss": 0.4485, + "step": 348 + }, + { + "epoch": 0.9584620665980089, + "grad_norm": 0.04736328125, + "learning_rate": 2.6112214881757285e-05, + "loss": 0.5076, + "step": 349 + }, + { + "epoch": 0.9612083762444216, + "grad_norm": 0.044189453125, + "learning_rate": 2.6090298229319477e-05, + "loss": 0.5024, + "step": 350 + }, + { + "epoch": 0.9639546858908342, + "grad_norm": 0.043212890625, + "learning_rate": 2.6068329228144016e-05, + "loss": 0.4839, + "step": 351 + }, + { + "epoch": 0.9667009955372469, + "grad_norm": 0.04150390625, + "learning_rate": 2.604630798192959e-05, + "loss": 0.5425, + "step": 352 + }, + { + "epoch": 0.9694473051836594, + "grad_norm": 0.04150390625, + "learning_rate": 2.60242345946215e-05, + "loss": 0.4468, + "step": 353 + }, + { + "epoch": 0.972193614830072, + "grad_norm": 0.045166015625, + "learning_rate": 2.6002109170411178e-05, + "loss": 0.5624, + "step": 354 + }, + { + "epoch": 0.9749399244764847, + "grad_norm": 0.03759765625, + "learning_rate": 2.597993181373567e-05, + "loss": 0.3949, + "step": 355 + }, + { + "epoch": 0.9776862341228973, + "grad_norm": 0.0390625, + "learning_rate": 2.5957702629277154e-05, + "loss": 0.5243, + "step": 356 + }, + { + "epoch": 0.98043254376931, + "grad_norm": 0.04296875, + "learning_rate": 2.593542172196246e-05, + "loss": 0.574, + "step": 357 + }, + { + "epoch": 0.9831788534157226, + "grad_norm": 0.050048828125, + "learning_rate": 2.5913089196962547e-05, + "loss": 0.4708, + "step": 358 + }, + { + "epoch": 0.9859251630621353, + "grad_norm": 0.044189453125, + "learning_rate": 2.5890705159692036e-05, + "loss": 0.4344, + "step": 359 + }, + { + "epoch": 0.9886714727085479, + "grad_norm": 0.041748046875, + "learning_rate": 2.5868269715808685e-05, + "loss": 0.4977, + "step": 360 + }, + { + "epoch": 0.9914177823549605, + "grad_norm": 0.04248046875, + "learning_rate": 2.58457829712129e-05, + "loss": 0.551, + "step": 361 + }, + { + "epoch": 0.9941640920013731, + "grad_norm": 0.039794921875, + "learning_rate": 2.5823245032047255e-05, + "loss": 0.5069, + "step": 362 + }, + { + "epoch": 0.9969104016477858, + "grad_norm": 0.0419921875, + "learning_rate": 2.5800656004695962e-05, + "loss": 0.5246, + "step": 363 + }, + { + "epoch": 0.9996567112941984, + "grad_norm": 0.048583984375, + "learning_rate": 2.5778015995784385e-05, + "loss": 0.6325, + "step": 364 + }, + { + "epoch": 0.9996567112941984, + "eval_loss": 0.509181559085846, + "eval_runtime": 618.8303, + "eval_samples_per_second": 14.812, + "eval_steps_per_second": 14.812, + "step": 364 + }, + { + "epoch": 1.002403020940611, + "grad_norm": 0.04248046875, + "learning_rate": 2.575532511217852e-05, + "loss": 0.607, + "step": 365 + }, + { + "epoch": 1.0051493305870236, + "grad_norm": 0.043701171875, + "learning_rate": 2.5732583460984527e-05, + "loss": 0.5572, + "step": 366 + }, + { + "epoch": 1.0078956402334363, + "grad_norm": 0.201171875, + "learning_rate": 2.5709791149548184e-05, + "loss": 1.256, + "step": 367 + }, + { + "epoch": 1.010641949879849, + "grad_norm": 0.04248046875, + "learning_rate": 2.56869482854544e-05, + "loss": 0.4604, + "step": 368 + }, + { + "epoch": 1.0020597322348095, + "grad_norm": 0.04345703125, + "learning_rate": 2.5664054976526702e-05, + "loss": 0.5396, + "step": 369 + }, + { + "epoch": 1.0048060418812221, + "grad_norm": 0.04541015625, + "learning_rate": 2.564111133082674e-05, + "loss": 0.4803, + "step": 370 + }, + { + "epoch": 1.0075523515276348, + "grad_norm": 0.05712890625, + "learning_rate": 2.561811745665374e-05, + "loss": 0.3781, + "step": 371 + }, + { + "epoch": 1.0102986611740474, + "grad_norm": 0.041015625, + "learning_rate": 2.5595073462544046e-05, + "loss": 0.4143, + "step": 372 + }, + { + "epoch": 1.01304497082046, + "grad_norm": 0.043212890625, + "learning_rate": 2.5571979457270565e-05, + "loss": 0.4698, + "step": 373 + }, + { + "epoch": 1.0157912804668727, + "grad_norm": 0.0400390625, + "learning_rate": 2.5548835549842274e-05, + "loss": 0.5101, + "step": 374 + }, + { + "epoch": 1.0185375901132854, + "grad_norm": 0.039794921875, + "learning_rate": 2.5525641849503685e-05, + "loss": 0.4252, + "step": 375 + }, + { + "epoch": 1.0212838997596978, + "grad_norm": 0.0458984375, + "learning_rate": 2.5502398465734357e-05, + "loss": 0.5116, + "step": 376 + }, + { + "epoch": 1.0240302094061104, + "grad_norm": 0.040283203125, + "learning_rate": 2.5479105508248373e-05, + "loss": 0.4816, + "step": 377 + }, + { + "epoch": 1.026776519052523, + "grad_norm": 0.044921875, + "learning_rate": 2.54557630869938e-05, + "loss": 0.4521, + "step": 378 + }, + { + "epoch": 1.0295228286989357, + "grad_norm": 0.041748046875, + "learning_rate": 2.543237131215219e-05, + "loss": 0.4769, + "step": 379 + }, + { + "epoch": 1.0322691383453484, + "grad_norm": 0.044189453125, + "learning_rate": 2.5408930294138065e-05, + "loss": 0.5011, + "step": 380 + }, + { + "epoch": 1.035015447991761, + "grad_norm": 0.0390625, + "learning_rate": 2.538544014359837e-05, + "loss": 0.407, + "step": 381 + }, + { + "epoch": 1.0377617576381737, + "grad_norm": 0.038330078125, + "learning_rate": 2.536190097141197e-05, + "loss": 0.4991, + "step": 382 + }, + { + "epoch": 1.0405080672845863, + "grad_norm": 0.04248046875, + "learning_rate": 2.5338312888689137e-05, + "loss": 0.5129, + "step": 383 + }, + { + "epoch": 1.043254376930999, + "grad_norm": 0.043212890625, + "learning_rate": 2.5314676006771e-05, + "loss": 0.4409, + "step": 384 + }, + { + "epoch": 1.0460006865774116, + "grad_norm": 0.038818359375, + "learning_rate": 2.529099043722903e-05, + "loss": 0.542, + "step": 385 + }, + { + "epoch": 1.0487469962238243, + "grad_norm": 0.041748046875, + "learning_rate": 2.526725629186452e-05, + "loss": 0.5767, + "step": 386 + }, + { + "epoch": 1.051493305870237, + "grad_norm": 0.04345703125, + "learning_rate": 2.5243473682708057e-05, + "loss": 0.5457, + "step": 387 + }, + { + "epoch": 1.0542396155166496, + "grad_norm": 0.0380859375, + "learning_rate": 2.5219642722018975e-05, + "loss": 0.4768, + "step": 388 + }, + { + "epoch": 1.0569859251630622, + "grad_norm": 0.04345703125, + "learning_rate": 2.5195763522284848e-05, + "loss": 0.58, + "step": 389 + }, + { + "epoch": 1.0597322348094749, + "grad_norm": 0.041259765625, + "learning_rate": 2.5171836196220946e-05, + "loss": 0.5176, + "step": 390 + }, + { + "epoch": 1.0624785444558873, + "grad_norm": 0.047607421875, + "learning_rate": 2.51478608567697e-05, + "loss": 0.4992, + "step": 391 + }, + { + "epoch": 1.0652248541023, + "grad_norm": 0.036865234375, + "learning_rate": 2.512383761710019e-05, + "loss": 0.5167, + "step": 392 + }, + { + "epoch": 1.0679711637487126, + "grad_norm": 0.162109375, + "learning_rate": 2.5099766590607587e-05, + "loss": 1.119, + "step": 393 + }, + { + "epoch": 1.0707174733951252, + "grad_norm": 0.048828125, + "learning_rate": 2.5075647890912628e-05, + "loss": 0.4643, + "step": 394 + }, + { + "epoch": 1.0734637830415379, + "grad_norm": 0.04052734375, + "learning_rate": 2.505148163186107e-05, + "loss": 0.5572, + "step": 395 + }, + { + "epoch": 1.0762100926879505, + "grad_norm": 0.041748046875, + "learning_rate": 2.5027267927523178e-05, + "loss": 0.4685, + "step": 396 + }, + { + "epoch": 1.0789564023343632, + "grad_norm": 0.040771484375, + "learning_rate": 2.500300689219315e-05, + "loss": 0.5597, + "step": 397 + }, + { + "epoch": 1.0817027119807758, + "grad_norm": 0.04052734375, + "learning_rate": 2.4978698640388617e-05, + "loss": 0.47, + "step": 398 + }, + { + "epoch": 1.0844490216271885, + "grad_norm": 0.04833984375, + "learning_rate": 2.495434328685007e-05, + "loss": 0.5364, + "step": 399 + }, + { + "epoch": 1.0871953312736011, + "grad_norm": 0.041748046875, + "learning_rate": 2.492994094654033e-05, + "loss": 0.4303, + "step": 400 + }, + { + "epoch": 1.0899416409200138, + "grad_norm": 0.1435546875, + "learning_rate": 2.490549173464402e-05, + "loss": 1.1982, + "step": 401 + }, + { + "epoch": 1.0926879505664264, + "grad_norm": 0.0390625, + "learning_rate": 2.4880995766566986e-05, + "loss": 0.5137, + "step": 402 + }, + { + "epoch": 1.095434260212839, + "grad_norm": 0.04248046875, + "learning_rate": 2.4856453157935795e-05, + "loss": 0.4997, + "step": 403 + }, + { + "epoch": 1.0981805698592517, + "grad_norm": 0.040771484375, + "learning_rate": 2.483186402459715e-05, + "loss": 0.5209, + "step": 404 + }, + { + "epoch": 1.1009268795056641, + "grad_norm": 0.04541015625, + "learning_rate": 2.4807228482617376e-05, + "loss": 0.483, + "step": 405 + }, + { + "epoch": 1.1036731891520768, + "grad_norm": 0.04345703125, + "learning_rate": 2.4782546648281848e-05, + "loss": 0.5055, + "step": 406 + }, + { + "epoch": 1.1064194987984894, + "grad_norm": 0.039306640625, + "learning_rate": 2.4757818638094457e-05, + "loss": 0.462, + "step": 407 + }, + { + "epoch": 1.109165808444902, + "grad_norm": 0.04150390625, + "learning_rate": 2.473304456877705e-05, + "loss": 0.4663, + "step": 408 + }, + { + "epoch": 1.1119121180913147, + "grad_norm": 0.04541015625, + "learning_rate": 2.470822455726889e-05, + "loss": 0.5343, + "step": 409 + }, + { + "epoch": 1.1146584277377274, + "grad_norm": 0.039306640625, + "learning_rate": 2.468335872072609e-05, + "loss": 0.4854, + "step": 410 + }, + { + "epoch": 1.11740473738414, + "grad_norm": 0.046875, + "learning_rate": 2.4658447176521076e-05, + "loss": 0.5206, + "step": 411 + }, + { + "epoch": 1.1201510470305527, + "grad_norm": 0.04150390625, + "learning_rate": 2.463349004224201e-05, + "loss": 0.4738, + "step": 412 + }, + { + "epoch": 1.1228973566769653, + "grad_norm": 0.04248046875, + "learning_rate": 2.460848743569227e-05, + "loss": 0.5632, + "step": 413 + }, + { + "epoch": 1.125643666323378, + "grad_norm": 0.043212890625, + "learning_rate": 2.458343947488985e-05, + "loss": 0.6056, + "step": 414 + }, + { + "epoch": 1.1283899759697906, + "grad_norm": 0.146484375, + "learning_rate": 2.4558346278066853e-05, + "loss": 1.1007, + "step": 415 + }, + { + "epoch": 1.1311362856162033, + "grad_norm": 0.041259765625, + "learning_rate": 2.4533207963668883e-05, + "loss": 0.4747, + "step": 416 + }, + { + "epoch": 1.133882595262616, + "grad_norm": 0.0419921875, + "learning_rate": 2.4508024650354525e-05, + "loss": 0.439, + "step": 417 + }, + { + "epoch": 1.1366289049090286, + "grad_norm": 0.041748046875, + "learning_rate": 2.4482796456994757e-05, + "loss": 0.4913, + "step": 418 + }, + { + "epoch": 1.1393752145554412, + "grad_norm": 0.049072265625, + "learning_rate": 2.4457523502672415e-05, + "loss": 0.5722, + "step": 419 + }, + { + "epoch": 1.1421215242018539, + "grad_norm": 0.040283203125, + "learning_rate": 2.44322059066816e-05, + "loss": 0.3971, + "step": 420 + }, + { + "epoch": 1.1448678338482665, + "grad_norm": 0.0419921875, + "learning_rate": 2.440684378852714e-05, + "loss": 0.4724, + "step": 421 + }, + { + "epoch": 1.147614143494679, + "grad_norm": 0.052734375, + "learning_rate": 2.438143726792403e-05, + "loss": 0.5305, + "step": 422 + }, + { + "epoch": 1.1503604531410916, + "grad_norm": 0.056396484375, + "learning_rate": 2.435598646479683e-05, + "loss": 0.4924, + "step": 423 + }, + { + "epoch": 1.1531067627875042, + "grad_norm": 0.0390625, + "learning_rate": 2.4330491499279148e-05, + "loss": 0.4927, + "step": 424 + }, + { + "epoch": 1.1558530724339169, + "grad_norm": 0.0390625, + "learning_rate": 2.4304952491713035e-05, + "loss": 0.45, + "step": 425 + }, + { + "epoch": 1.1585993820803295, + "grad_norm": 0.048095703125, + "learning_rate": 2.4279369562648424e-05, + "loss": 0.5892, + "step": 426 + }, + { + "epoch": 1.1613456917267422, + "grad_norm": 0.045654296875, + "learning_rate": 2.4253742832842583e-05, + "loss": 0.4727, + "step": 427 + }, + { + "epoch": 1.1640920013731548, + "grad_norm": 0.04443359375, + "learning_rate": 2.4228072423259527e-05, + "loss": 0.5063, + "step": 428 + }, + { + "epoch": 1.1668383110195675, + "grad_norm": 0.06201171875, + "learning_rate": 2.420235845506944e-05, + "loss": 0.4872, + "step": 429 + }, + { + "epoch": 1.1695846206659801, + "grad_norm": 0.0390625, + "learning_rate": 2.4176601049648116e-05, + "loss": 0.3843, + "step": 430 + }, + { + "epoch": 1.1723309303123928, + "grad_norm": 0.05224609375, + "learning_rate": 2.415080032857639e-05, + "loss": 0.4478, + "step": 431 + }, + { + "epoch": 1.1750772399588054, + "grad_norm": 0.0419921875, + "learning_rate": 2.4124956413639548e-05, + "loss": 0.4964, + "step": 432 + }, + { + "epoch": 1.177823549605218, + "grad_norm": 0.04248046875, + "learning_rate": 2.4099069426826766e-05, + "loss": 0.5176, + "step": 433 + }, + { + "epoch": 1.1805698592516307, + "grad_norm": 0.04296875, + "learning_rate": 2.4073139490330526e-05, + "loss": 0.5596, + "step": 434 + }, + { + "epoch": 1.1833161688980431, + "grad_norm": 0.04638671875, + "learning_rate": 2.4047166726546047e-05, + "loss": 0.485, + "step": 435 + }, + { + "epoch": 1.1860624785444558, + "grad_norm": 0.04248046875, + "learning_rate": 2.4021151258070694e-05, + "loss": 0.4768, + "step": 436 + }, + { + "epoch": 1.1888087881908684, + "grad_norm": 0.05810546875, + "learning_rate": 2.3995093207703413e-05, + "loss": 0.5097, + "step": 437 + }, + { + "epoch": 1.191555097837281, + "grad_norm": 0.048095703125, + "learning_rate": 2.3968992698444153e-05, + "loss": 0.5401, + "step": 438 + }, + { + "epoch": 1.1943014074836937, + "grad_norm": 0.04248046875, + "learning_rate": 2.394284985349327e-05, + "loss": 0.425, + "step": 439 + }, + { + "epoch": 1.1970477171301064, + "grad_norm": 0.044677734375, + "learning_rate": 2.3916664796250946e-05, + "loss": 0.3752, + "step": 440 + }, + { + "epoch": 1.199794026776519, + "grad_norm": 0.042236328125, + "learning_rate": 2.389043765031664e-05, + "loss": 0.4724, + "step": 441 + }, + { + "epoch": 1.2025403364229317, + "grad_norm": 0.046142578125, + "learning_rate": 2.386416853948845e-05, + "loss": 0.5598, + "step": 442 + }, + { + "epoch": 1.2052866460693443, + "grad_norm": 0.0458984375, + "learning_rate": 2.3837857587762583e-05, + "loss": 0.3885, + "step": 443 + }, + { + "epoch": 1.208032955715757, + "grad_norm": 0.04931640625, + "learning_rate": 2.3811504919332727e-05, + "loss": 0.4608, + "step": 444 + }, + { + "epoch": 1.2107792653621696, + "grad_norm": 0.0390625, + "learning_rate": 2.378511065858949e-05, + "loss": 0.4457, + "step": 445 + }, + { + "epoch": 1.2135255750085823, + "grad_norm": 0.039306640625, + "learning_rate": 2.3758674930119807e-05, + "loss": 0.4162, + "step": 446 + }, + { + "epoch": 1.216271884654995, + "grad_norm": 0.054443359375, + "learning_rate": 2.3732197858706343e-05, + "loss": 0.4656, + "step": 447 + }, + { + "epoch": 1.2190181943014076, + "grad_norm": 0.0478515625, + "learning_rate": 2.370567956932692e-05, + "loss": 0.4525, + "step": 448 + }, + { + "epoch": 1.2217645039478202, + "grad_norm": 0.044921875, + "learning_rate": 2.367912018715391e-05, + "loss": 0.498, + "step": 449 + }, + { + "epoch": 1.2245108135942329, + "grad_norm": 0.047119140625, + "learning_rate": 2.3652519837553655e-05, + "loss": 0.3724, + "step": 450 + }, + { + "epoch": 1.2272571232406453, + "grad_norm": 0.051025390625, + "learning_rate": 2.3625878646085873e-05, + "loss": 0.3611, + "step": 451 + }, + { + "epoch": 1.230003432887058, + "grad_norm": 0.044189453125, + "learning_rate": 2.3599196738503068e-05, + "loss": 0.4002, + "step": 452 + }, + { + "epoch": 1.2327497425334706, + "grad_norm": 0.045654296875, + "learning_rate": 2.3572474240749932e-05, + "loss": 0.5691, + "step": 453 + }, + { + "epoch": 1.2354960521798832, + "grad_norm": 0.042236328125, + "learning_rate": 2.354571127896275e-05, + "loss": 0.536, + "step": 454 + }, + { + "epoch": 1.2382423618262959, + "grad_norm": 0.045654296875, + "learning_rate": 2.3518907979468807e-05, + "loss": 0.4385, + "step": 455 + }, + { + "epoch": 1.2382423618262959, + "eval_loss": 0.5073373913764954, + "eval_runtime": 627.5271, + "eval_samples_per_second": 14.607, + "eval_steps_per_second": 14.607, + "step": 455 + }, + { + "epoch": 1.2409886714727085, + "grad_norm": 0.04296875, + "learning_rate": 2.349206446878578e-05, + "loss": 0.5131, + "step": 456 + }, + { + "epoch": 1.2437349811191212, + "grad_norm": 0.042236328125, + "learning_rate": 2.346518087362118e-05, + "loss": 0.4821, + "step": 457 + }, + { + "epoch": 1.2464812907655338, + "grad_norm": 0.04541015625, + "learning_rate": 2.3438257320871704e-05, + "loss": 0.5344, + "step": 458 + }, + { + "epoch": 1.2492276004119465, + "grad_norm": 0.0419921875, + "learning_rate": 2.3411293937622658e-05, + "loss": 0.4752, + "step": 459 + }, + { + "epoch": 1.2519739100583591, + "grad_norm": 0.0400390625, + "learning_rate": 2.338429085114737e-05, + "loss": 0.4887, + "step": 460 + }, + { + "epoch": 1.2547202197047718, + "grad_norm": 0.04248046875, + "learning_rate": 2.335724818890656e-05, + "loss": 0.4445, + "step": 461 + }, + { + "epoch": 1.2574665293511844, + "grad_norm": 0.047119140625, + "learning_rate": 2.3330166078547763e-05, + "loss": 0.5841, + "step": 462 + }, + { + "epoch": 1.2602128389975968, + "grad_norm": 0.045654296875, + "learning_rate": 2.3303044647904725e-05, + "loss": 0.519, + "step": 463 + }, + { + "epoch": 1.2629591486440095, + "grad_norm": 0.044189453125, + "learning_rate": 2.3275884024996784e-05, + "loss": 0.5149, + "step": 464 + }, + { + "epoch": 1.2657054582904221, + "grad_norm": 0.04248046875, + "learning_rate": 2.324868433802827e-05, + "loss": 0.4681, + "step": 465 + }, + { + "epoch": 1.2684517679368348, + "grad_norm": 0.044921875, + "learning_rate": 2.3221445715387917e-05, + "loss": 0.5058, + "step": 466 + }, + { + "epoch": 1.2711980775832474, + "grad_norm": 0.05126953125, + "learning_rate": 2.319416828564824e-05, + "loss": 0.5142, + "step": 467 + }, + { + "epoch": 1.27394438722966, + "grad_norm": 0.042236328125, + "learning_rate": 2.3166852177564925e-05, + "loss": 0.4682, + "step": 468 + }, + { + "epoch": 1.2766906968760727, + "grad_norm": 0.044677734375, + "learning_rate": 2.3139497520076233e-05, + "loss": 0.4361, + "step": 469 + }, + { + "epoch": 1.2794370065224854, + "grad_norm": 0.043212890625, + "learning_rate": 2.3112104442302393e-05, + "loss": 0.5738, + "step": 470 + }, + { + "epoch": 1.282183316168898, + "grad_norm": 0.0478515625, + "learning_rate": 2.3084673073544976e-05, + "loss": 0.4828, + "step": 471 + }, + { + "epoch": 1.2849296258153107, + "grad_norm": 0.04248046875, + "learning_rate": 2.3057203543286297e-05, + "loss": 0.503, + "step": 472 + }, + { + "epoch": 1.2876759354617233, + "grad_norm": 0.04541015625, + "learning_rate": 2.3029695981188818e-05, + "loss": 0.5526, + "step": 473 + }, + { + "epoch": 1.290422245108136, + "grad_norm": 0.042236328125, + "learning_rate": 2.3002150517094496e-05, + "loss": 0.4757, + "step": 474 + }, + { + "epoch": 1.2931685547545486, + "grad_norm": 0.044189453125, + "learning_rate": 2.297456728102421e-05, + "loss": 0.5773, + "step": 475 + }, + { + "epoch": 1.2959148644009613, + "grad_norm": 0.041748046875, + "learning_rate": 2.294694640317713e-05, + "loss": 0.5248, + "step": 476 + }, + { + "epoch": 1.298661174047374, + "grad_norm": 0.045166015625, + "learning_rate": 2.2919288013930094e-05, + "loss": 0.4915, + "step": 477 + }, + { + "epoch": 1.3014074836937866, + "grad_norm": 0.0400390625, + "learning_rate": 2.2891592243837015e-05, + "loss": 0.5389, + "step": 478 + }, + { + "epoch": 1.3041537933401992, + "grad_norm": 0.04248046875, + "learning_rate": 2.286385922362824e-05, + "loss": 0.4232, + "step": 479 + }, + { + "epoch": 1.3069001029866119, + "grad_norm": 0.0439453125, + "learning_rate": 2.2836089084209955e-05, + "loss": 0.5072, + "step": 480 + }, + { + "epoch": 1.3096464126330245, + "grad_norm": 0.044677734375, + "learning_rate": 2.280828195666355e-05, + "loss": 0.54, + "step": 481 + }, + { + "epoch": 1.312392722279437, + "grad_norm": 0.048828125, + "learning_rate": 2.2780437972245014e-05, + "loss": 0.5446, + "step": 482 + }, + { + "epoch": 1.3151390319258496, + "grad_norm": 0.046142578125, + "learning_rate": 2.2752557262384307e-05, + "loss": 0.4725, + "step": 483 + }, + { + "epoch": 1.3178853415722622, + "grad_norm": 0.1650390625, + "learning_rate": 2.2724639958684733e-05, + "loss": 1.2587, + "step": 484 + }, + { + "epoch": 1.3206316512186749, + "grad_norm": 0.043212890625, + "learning_rate": 2.2696686192922342e-05, + "loss": 0.4965, + "step": 485 + }, + { + "epoch": 1.3233779608650875, + "grad_norm": 0.043701171875, + "learning_rate": 2.2668696097045284e-05, + "loss": 0.5382, + "step": 486 + }, + { + "epoch": 1.3261242705115002, + "grad_norm": 0.048583984375, + "learning_rate": 2.2640669803173195e-05, + "loss": 0.4305, + "step": 487 + }, + { + "epoch": 1.3288705801579128, + "grad_norm": 0.04150390625, + "learning_rate": 2.2612607443596572e-05, + "loss": 0.4622, + "step": 488 + }, + { + "epoch": 1.3316168898043255, + "grad_norm": 0.05908203125, + "learning_rate": 2.258450915077616e-05, + "loss": 0.4975, + "step": 489 + }, + { + "epoch": 1.3343631994507381, + "grad_norm": 0.04541015625, + "learning_rate": 2.2556375057342306e-05, + "loss": 0.6356, + "step": 490 + }, + { + "epoch": 1.3371095090971508, + "grad_norm": 0.042724609375, + "learning_rate": 2.2528205296094356e-05, + "loss": 0.4422, + "step": 491 + }, + { + "epoch": 1.3398558187435634, + "grad_norm": 0.04296875, + "learning_rate": 2.25e-05, + "loss": 0.446, + "step": 492 + }, + { + "epoch": 1.3426021283899758, + "grad_norm": 0.051025390625, + "learning_rate": 2.247175930219468e-05, + "loss": 0.5996, + "step": 493 + }, + { + "epoch": 1.3453484380363885, + "grad_norm": 0.04833984375, + "learning_rate": 2.2443483335980924e-05, + "loss": 0.5905, + "step": 494 + }, + { + "epoch": 1.3480947476828011, + "grad_norm": 0.047607421875, + "learning_rate": 2.2415172234827754e-05, + "loss": 0.5824, + "step": 495 + }, + { + "epoch": 1.3508410573292138, + "grad_norm": 0.041748046875, + "learning_rate": 2.238682613237001e-05, + "loss": 0.4885, + "step": 496 + }, + { + "epoch": 1.3535873669756264, + "grad_norm": 0.048095703125, + "learning_rate": 2.2358445162407775e-05, + "loss": 0.587, + "step": 497 + }, + { + "epoch": 1.356333676622039, + "grad_norm": 0.042724609375, + "learning_rate": 2.2330029458905697e-05, + "loss": 0.5453, + "step": 498 + }, + { + "epoch": 1.3590799862684517, + "grad_norm": 0.04296875, + "learning_rate": 2.230157915599238e-05, + "loss": 0.4596, + "step": 499 + }, + { + "epoch": 1.3618262959148644, + "grad_norm": 0.04736328125, + "learning_rate": 2.2273094387959747e-05, + "loss": 0.4349, + "step": 500 + }, + { + "epoch": 1.364572605561277, + "grad_norm": 0.0458984375, + "learning_rate": 2.2244575289262394e-05, + "loss": 0.4613, + "step": 501 + }, + { + "epoch": 1.3673189152076897, + "grad_norm": 0.0419921875, + "learning_rate": 2.221602199451698e-05, + "loss": 0.4176, + "step": 502 + }, + { + "epoch": 1.3700652248541023, + "grad_norm": 0.049560546875, + "learning_rate": 2.2187434638501564e-05, + "loss": 0.4799, + "step": 503 + }, + { + "epoch": 1.372811534500515, + "grad_norm": 0.03955078125, + "learning_rate": 2.215881335615499e-05, + "loss": 0.4335, + "step": 504 + }, + { + "epoch": 1.3755578441469276, + "grad_norm": 0.0478515625, + "learning_rate": 2.2130158282576245e-05, + "loss": 0.5999, + "step": 505 + }, + { + "epoch": 1.3783041537933403, + "grad_norm": 0.0478515625, + "learning_rate": 2.2101469553023807e-05, + "loss": 0.4654, + "step": 506 + }, + { + "epoch": 1.381050463439753, + "grad_norm": 0.04150390625, + "learning_rate": 2.2072747302915026e-05, + "loss": 0.4423, + "step": 507 + }, + { + "epoch": 1.3837967730861656, + "grad_norm": 0.04541015625, + "learning_rate": 2.2043991667825478e-05, + "loss": 0.5145, + "step": 508 + }, + { + "epoch": 1.3865430827325782, + "grad_norm": 0.0400390625, + "learning_rate": 2.2015202783488316e-05, + "loss": 0.5894, + "step": 509 + }, + { + "epoch": 1.3892893923789909, + "grad_norm": 0.042236328125, + "learning_rate": 2.1986380785793646e-05, + "loss": 0.5228, + "step": 510 + }, + { + "epoch": 1.3920357020254035, + "grad_norm": 0.048828125, + "learning_rate": 2.195752581078787e-05, + "loss": 0.5529, + "step": 511 + }, + { + "epoch": 1.394782011671816, + "grad_norm": 0.043701171875, + "learning_rate": 2.1928637994673053e-05, + "loss": 0.5783, + "step": 512 + }, + { + "epoch": 1.3975283213182286, + "grad_norm": 0.04248046875, + "learning_rate": 2.1899717473806273e-05, + "loss": 0.418, + "step": 513 + }, + { + "epoch": 1.4002746309646412, + "grad_norm": 0.04541015625, + "learning_rate": 2.1870764384698992e-05, + "loss": 0.4945, + "step": 514 + }, + { + "epoch": 1.4030209406110539, + "grad_norm": 0.050048828125, + "learning_rate": 2.1841778864016396e-05, + "loss": 0.496, + "step": 515 + }, + { + "epoch": 1.4057672502574665, + "grad_norm": 0.042724609375, + "learning_rate": 2.1812761048576752e-05, + "loss": 0.5087, + "step": 516 + }, + { + "epoch": 1.4085135599038792, + "grad_norm": 0.04833984375, + "learning_rate": 2.1783711075350766e-05, + "loss": 0.4898, + "step": 517 + }, + { + "epoch": 1.4112598695502918, + "grad_norm": 0.0419921875, + "learning_rate": 2.1754629081460947e-05, + "loss": 0.4379, + "step": 518 + }, + { + "epoch": 1.4140061791967045, + "grad_norm": 0.04541015625, + "learning_rate": 2.172551520418093e-05, + "loss": 0.4827, + "step": 519 + }, + { + "epoch": 1.416752488843117, + "grad_norm": 0.044677734375, + "learning_rate": 2.169636958093487e-05, + "loss": 0.5007, + "step": 520 + }, + { + "epoch": 1.4194987984895298, + "grad_norm": 0.043212890625, + "learning_rate": 2.1667192349296746e-05, + "loss": 0.4651, + "step": 521 + }, + { + "epoch": 1.4222451081359424, + "grad_norm": 0.041748046875, + "learning_rate": 2.1637983646989758e-05, + "loss": 0.4674, + "step": 522 + }, + { + "epoch": 1.4249914177823548, + "grad_norm": 0.045166015625, + "learning_rate": 2.1608743611885633e-05, + "loss": 0.4794, + "step": 523 + }, + { + "epoch": 1.4277377274287675, + "grad_norm": 0.045166015625, + "learning_rate": 2.1579472382004015e-05, + "loss": 0.5292, + "step": 524 + }, + { + "epoch": 1.4304840370751801, + "grad_norm": 0.04443359375, + "learning_rate": 2.1550170095511784e-05, + "loss": 0.4964, + "step": 525 + }, + { + "epoch": 1.4332303467215928, + "grad_norm": 0.0537109375, + "learning_rate": 2.1520836890722416e-05, + "loss": 0.4236, + "step": 526 + }, + { + "epoch": 1.4359766563680054, + "grad_norm": 0.044921875, + "learning_rate": 2.149147290609533e-05, + "loss": 0.4859, + "step": 527 + }, + { + "epoch": 1.438722966014418, + "grad_norm": 0.04638671875, + "learning_rate": 2.146207828023524e-05, + "loss": 0.4659, + "step": 528 + }, + { + "epoch": 1.4414692756608307, + "grad_norm": 0.048828125, + "learning_rate": 2.1432653151891473e-05, + "loss": 0.4424, + "step": 529 + }, + { + "epoch": 1.4442155853072434, + "grad_norm": 0.04345703125, + "learning_rate": 2.1403197659957356e-05, + "loss": 0.4515, + "step": 530 + }, + { + "epoch": 1.446961894953656, + "grad_norm": 0.041015625, + "learning_rate": 2.137371194346953e-05, + "loss": 0.4618, + "step": 531 + }, + { + "epoch": 1.4497082046000687, + "grad_norm": 0.042236328125, + "learning_rate": 2.1344196141607297e-05, + "loss": 0.3928, + "step": 532 + }, + { + "epoch": 1.4524545142464813, + "grad_norm": 0.044189453125, + "learning_rate": 2.1314650393691984e-05, + "loss": 0.4598, + "step": 533 + }, + { + "epoch": 1.455200823892894, + "grad_norm": 0.046630859375, + "learning_rate": 2.1285074839186257e-05, + "loss": 0.5646, + "step": 534 + }, + { + "epoch": 1.4579471335393066, + "grad_norm": 0.0517578125, + "learning_rate": 2.1255469617693476e-05, + "loss": 0.5984, + "step": 535 + }, + { + "epoch": 1.4606934431857193, + "grad_norm": 0.04296875, + "learning_rate": 2.122583486895705e-05, + "loss": 0.5419, + "step": 536 + }, + { + "epoch": 1.463439752832132, + "grad_norm": 0.043212890625, + "learning_rate": 2.119617073285974e-05, + "loss": 0.5481, + "step": 537 + }, + { + "epoch": 1.4661860624785445, + "grad_norm": 0.042236328125, + "learning_rate": 2.116647734942305e-05, + "loss": 0.5588, + "step": 538 + }, + { + "epoch": 1.4689323721249572, + "grad_norm": 0.04736328125, + "learning_rate": 2.113675485880652e-05, + "loss": 0.5621, + "step": 539 + }, + { + "epoch": 1.4716786817713698, + "grad_norm": 0.044189453125, + "learning_rate": 2.110700340130708e-05, + "loss": 0.5056, + "step": 540 + }, + { + "epoch": 1.4744249914177825, + "grad_norm": 0.044921875, + "learning_rate": 2.1077223117358395e-05, + "loss": 0.5526, + "step": 541 + }, + { + "epoch": 1.477171301064195, + "grad_norm": 0.048828125, + "learning_rate": 2.104741414753021e-05, + "loss": 0.5414, + "step": 542 + }, + { + "epoch": 1.4799176107106076, + "grad_norm": 0.056640625, + "learning_rate": 2.1017576632527662e-05, + "loss": 0.5472, + "step": 543 + }, + { + "epoch": 1.4826639203570202, + "grad_norm": 0.0517578125, + "learning_rate": 2.098771071319062e-05, + "loss": 0.4568, + "step": 544 + }, + { + "epoch": 1.4854102300034329, + "grad_norm": 0.046142578125, + "learning_rate": 2.0957816530493037e-05, + "loss": 0.4277, + "step": 545 + }, + { + "epoch": 1.4881565396498455, + "grad_norm": 0.048583984375, + "learning_rate": 2.0927894225542282e-05, + "loss": 0.4949, + "step": 546 + }, + { + "epoch": 1.4881565396498455, + "eval_loss": 0.5060501098632812, + "eval_runtime": 630.5882, + "eval_samples_per_second": 14.536, + "eval_steps_per_second": 14.536, + "step": 546 + }, + { + "epoch": 1.4909028492962582, + "grad_norm": 0.04052734375, + "learning_rate": 2.089794393957846e-05, + "loss": 0.3558, + "step": 547 + }, + { + "epoch": 1.4936491589426708, + "grad_norm": 0.043212890625, + "learning_rate": 2.086796581397374e-05, + "loss": 0.4622, + "step": 548 + }, + { + "epoch": 1.4963954685890835, + "grad_norm": 0.04833984375, + "learning_rate": 2.083795999023173e-05, + "loss": 0.5402, + "step": 549 + }, + { + "epoch": 1.499141778235496, + "grad_norm": 0.048583984375, + "learning_rate": 2.080792660998676e-05, + "loss": 0.5271, + "step": 550 + }, + { + "epoch": 1.5018880878819085, + "grad_norm": 0.0439453125, + "learning_rate": 2.0777865815003234e-05, + "loss": 0.5152, + "step": 551 + }, + { + "epoch": 1.5046343975283212, + "grad_norm": 0.04541015625, + "learning_rate": 2.074777774717496e-05, + "loss": 0.5099, + "step": 552 + }, + { + "epoch": 1.5073807071747338, + "grad_norm": 0.041015625, + "learning_rate": 2.0717662548524482e-05, + "loss": 0.4075, + "step": 553 + }, + { + "epoch": 1.5101270168211465, + "grad_norm": 0.048828125, + "learning_rate": 2.068752036120241e-05, + "loss": 0.5205, + "step": 554 + }, + { + "epoch": 1.5128733264675591, + "grad_norm": 0.045166015625, + "learning_rate": 2.0657351327486745e-05, + "loss": 0.5127, + "step": 555 + }, + { + "epoch": 1.5156196361139718, + "grad_norm": 0.044921875, + "learning_rate": 2.0627155589782212e-05, + "loss": 0.5399, + "step": 556 + }, + { + "epoch": 1.5183659457603844, + "grad_norm": 0.04150390625, + "learning_rate": 2.0596933290619572e-05, + "loss": 0.4869, + "step": 557 + }, + { + "epoch": 1.521112255406797, + "grad_norm": 0.0498046875, + "learning_rate": 2.0566684572654978e-05, + "loss": 0.6318, + "step": 558 + }, + { + "epoch": 1.5238585650532097, + "grad_norm": 0.0576171875, + "learning_rate": 2.0536409578669277e-05, + "loss": 0.4729, + "step": 559 + }, + { + "epoch": 1.5266048746996224, + "grad_norm": 0.046630859375, + "learning_rate": 2.0506108451567347e-05, + "loss": 0.5059, + "step": 560 + }, + { + "epoch": 1.529351184346035, + "grad_norm": 0.043701171875, + "learning_rate": 2.0475781334377426e-05, + "loss": 0.3829, + "step": 561 + }, + { + "epoch": 1.5320974939924477, + "grad_norm": 0.0478515625, + "learning_rate": 2.044542837025042e-05, + "loss": 0.4582, + "step": 562 + }, + { + "epoch": 1.5348438036388603, + "grad_norm": 0.046630859375, + "learning_rate": 2.0415049702459244e-05, + "loss": 0.5344, + "step": 563 + }, + { + "epoch": 1.537590113285273, + "grad_norm": 0.042236328125, + "learning_rate": 2.0384645474398137e-05, + "loss": 0.4508, + "step": 564 + }, + { + "epoch": 1.5403364229316856, + "grad_norm": 0.040283203125, + "learning_rate": 2.0354215829582005e-05, + "loss": 0.4973, + "step": 565 + }, + { + "epoch": 1.5430827325780982, + "grad_norm": 0.046630859375, + "learning_rate": 2.03237609116457e-05, + "loss": 0.5406, + "step": 566 + }, + { + "epoch": 1.545829042224511, + "grad_norm": 0.041259765625, + "learning_rate": 2.029328086434339e-05, + "loss": 0.4956, + "step": 567 + }, + { + "epoch": 1.5485753518709235, + "grad_norm": 0.06396484375, + "learning_rate": 2.0262775831547847e-05, + "loss": 0.5642, + "step": 568 + }, + { + "epoch": 1.5513216615173362, + "grad_norm": 0.043701171875, + "learning_rate": 2.0232245957249788e-05, + "loss": 0.5424, + "step": 569 + }, + { + "epoch": 1.5540679711637488, + "grad_norm": 0.044677734375, + "learning_rate": 2.020169138555718e-05, + "loss": 0.4972, + "step": 570 + }, + { + "epoch": 1.5568142808101615, + "grad_norm": 0.05126953125, + "learning_rate": 2.0171112260694576e-05, + "loss": 0.4511, + "step": 571 + }, + { + "epoch": 1.5595605904565741, + "grad_norm": 0.045654296875, + "learning_rate": 2.0140508727002422e-05, + "loss": 0.4669, + "step": 572 + }, + { + "epoch": 1.5623069001029866, + "grad_norm": 0.046875, + "learning_rate": 2.0109880928936375e-05, + "loss": 0.5472, + "step": 573 + }, + { + "epoch": 1.5650532097493992, + "grad_norm": 0.042236328125, + "learning_rate": 2.007922901106663e-05, + "loss": 0.5493, + "step": 574 + }, + { + "epoch": 1.5677995193958119, + "grad_norm": 0.0439453125, + "learning_rate": 2.0048553118077238e-05, + "loss": 0.46, + "step": 575 + }, + { + "epoch": 1.5705458290422245, + "grad_norm": 0.04931640625, + "learning_rate": 2.0017853394765402e-05, + "loss": 0.6062, + "step": 576 + }, + { + "epoch": 1.5732921386886372, + "grad_norm": 0.044189453125, + "learning_rate": 1.9987129986040825e-05, + "loss": 0.5053, + "step": 577 + }, + { + "epoch": 1.5760384483350498, + "grad_norm": 0.05029296875, + "learning_rate": 1.9956383036925006e-05, + "loss": 0.5205, + "step": 578 + }, + { + "epoch": 1.5787847579814624, + "grad_norm": 0.045654296875, + "learning_rate": 1.9925612692550554e-05, + "loss": 0.5296, + "step": 579 + }, + { + "epoch": 1.581531067627875, + "grad_norm": 0.042236328125, + "learning_rate": 1.989481909816052e-05, + "loss": 0.577, + "step": 580 + }, + { + "epoch": 1.5842773772742875, + "grad_norm": 0.04833984375, + "learning_rate": 1.986400239910769e-05, + "loss": 0.5867, + "step": 581 + }, + { + "epoch": 1.5870236869207002, + "grad_norm": 0.044189453125, + "learning_rate": 1.9833162740853916e-05, + "loss": 0.5371, + "step": 582 + }, + { + "epoch": 1.5897699965671128, + "grad_norm": 0.044921875, + "learning_rate": 1.980230026896942e-05, + "loss": 0.4848, + "step": 583 + }, + { + "epoch": 1.5925163062135255, + "grad_norm": 0.040283203125, + "learning_rate": 1.977141512913211e-05, + "loss": 0.4747, + "step": 584 + }, + { + "epoch": 1.5952626158599381, + "grad_norm": 0.041748046875, + "learning_rate": 1.974050746712689e-05, + "loss": 0.4296, + "step": 585 + }, + { + "epoch": 1.5980089255063508, + "grad_norm": 0.04296875, + "learning_rate": 1.9709577428844984e-05, + "loss": 0.4943, + "step": 586 + }, + { + "epoch": 1.6007552351527634, + "grad_norm": 0.04296875, + "learning_rate": 1.967862516028321e-05, + "loss": 0.487, + "step": 587 + }, + { + "epoch": 1.603501544799176, + "grad_norm": 0.046875, + "learning_rate": 1.9647650807543358e-05, + "loss": 0.5275, + "step": 588 + }, + { + "epoch": 1.6062478544455887, + "grad_norm": 0.046630859375, + "learning_rate": 1.961665451683143e-05, + "loss": 0.557, + "step": 589 + }, + { + "epoch": 1.6089941640920014, + "grad_norm": 0.0419921875, + "learning_rate": 1.9585636434456988e-05, + "loss": 0.4689, + "step": 590 + }, + { + "epoch": 1.611740473738414, + "grad_norm": 0.0439453125, + "learning_rate": 1.9554596706832457e-05, + "loss": 0.5351, + "step": 591 + }, + { + "epoch": 1.6144867833848267, + "grad_norm": 0.04931640625, + "learning_rate": 1.952353548047243e-05, + "loss": 0.5714, + "step": 592 + }, + { + "epoch": 1.6172330930312393, + "grad_norm": 0.045654296875, + "learning_rate": 1.9492452901992987e-05, + "loss": 0.5468, + "step": 593 + }, + { + "epoch": 1.619979402677652, + "grad_norm": 0.04931640625, + "learning_rate": 1.946134911811099e-05, + "loss": 0.5812, + "step": 594 + }, + { + "epoch": 1.6227257123240646, + "grad_norm": 0.044189453125, + "learning_rate": 1.9430224275643388e-05, + "loss": 0.5367, + "step": 595 + }, + { + "epoch": 1.6254720219704772, + "grad_norm": 0.046630859375, + "learning_rate": 1.9399078521506546e-05, + "loss": 0.5746, + "step": 596 + }, + { + "epoch": 1.62821833161689, + "grad_norm": 0.051513671875, + "learning_rate": 1.9367912002715524e-05, + "loss": 0.4458, + "step": 597 + }, + { + "epoch": 1.6309646412633025, + "grad_norm": 0.043212890625, + "learning_rate": 1.93367248663834e-05, + "loss": 0.4413, + "step": 598 + }, + { + "epoch": 1.6337109509097152, + "grad_norm": 0.0498046875, + "learning_rate": 1.9305517259720573e-05, + "loss": 0.5666, + "step": 599 + }, + { + "epoch": 1.6364572605561278, + "grad_norm": 0.042236328125, + "learning_rate": 1.9274289330034068e-05, + "loss": 0.5282, + "step": 600 + }, + { + "epoch": 1.6392035702025405, + "grad_norm": 0.050537109375, + "learning_rate": 1.924304122472683e-05, + "loss": 0.5065, + "step": 601 + }, + { + "epoch": 1.6419498798489531, + "grad_norm": 0.042724609375, + "learning_rate": 1.9211773091297057e-05, + "loss": 0.5519, + "step": 602 + }, + { + "epoch": 1.6446961894953656, + "grad_norm": 0.044189453125, + "learning_rate": 1.9180485077337462e-05, + "loss": 0.5044, + "step": 603 + }, + { + "epoch": 1.6474424991417782, + "grad_norm": 0.040283203125, + "learning_rate": 1.9149177330534614e-05, + "loss": 0.4895, + "step": 604 + }, + { + "epoch": 1.6501888087881909, + "grad_norm": 0.044677734375, + "learning_rate": 1.9117849998668212e-05, + "loss": 0.4553, + "step": 605 + }, + { + "epoch": 1.6529351184346035, + "grad_norm": 0.04638671875, + "learning_rate": 1.9086503229610418e-05, + "loss": 0.5583, + "step": 606 + }, + { + "epoch": 1.6556814280810161, + "grad_norm": 0.03955078125, + "learning_rate": 1.905513717132513e-05, + "loss": 0.3757, + "step": 607 + }, + { + "epoch": 1.6584277377274288, + "grad_norm": 0.04541015625, + "learning_rate": 1.90237519718673e-05, + "loss": 0.5956, + "step": 608 + }, + { + "epoch": 1.6611740473738414, + "grad_norm": 0.14453125, + "learning_rate": 1.899234777938222e-05, + "loss": 1.1236, + "step": 609 + }, + { + "epoch": 1.6639203570202539, + "grad_norm": 0.044677734375, + "learning_rate": 1.8960924742104856e-05, + "loss": 0.5466, + "step": 610 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.042724609375, + "learning_rate": 1.892948300835911e-05, + "loss": 0.4874, + "step": 611 + }, + { + "epoch": 1.6694129763130792, + "grad_norm": 0.0478515625, + "learning_rate": 1.889802272655713e-05, + "loss": 0.5116, + "step": 612 + }, + { + "epoch": 1.6721592859594918, + "grad_norm": 0.043701171875, + "learning_rate": 1.8866544045198634e-05, + "loss": 0.587, + "step": 613 + }, + { + "epoch": 1.6749055956059045, + "grad_norm": 0.04296875, + "learning_rate": 1.8835047112870163e-05, + "loss": 0.4174, + "step": 614 + }, + { + "epoch": 1.677651905252317, + "grad_norm": 0.04150390625, + "learning_rate": 1.880353207824444e-05, + "loss": 0.4023, + "step": 615 + }, + { + "epoch": 1.6803982148987298, + "grad_norm": 0.042724609375, + "learning_rate": 1.8771999090079613e-05, + "loss": 0.5134, + "step": 616 + }, + { + "epoch": 1.6831445245451424, + "grad_norm": 0.044677734375, + "learning_rate": 1.8740448297218575e-05, + "loss": 0.4694, + "step": 617 + }, + { + "epoch": 1.685890834191555, + "grad_norm": 0.0458984375, + "learning_rate": 1.8708879848588268e-05, + "loss": 0.5185, + "step": 618 + }, + { + "epoch": 1.6886371438379677, + "grad_norm": 0.0458984375, + "learning_rate": 1.8677293893198976e-05, + "loss": 0.5077, + "step": 619 + }, + { + "epoch": 1.6913834534843804, + "grad_norm": 0.045166015625, + "learning_rate": 1.864569058014361e-05, + "loss": 0.4517, + "step": 620 + }, + { + "epoch": 1.694129763130793, + "grad_norm": 0.040771484375, + "learning_rate": 1.8614070058597014e-05, + "loss": 0.4703, + "step": 621 + }, + { + "epoch": 1.6968760727772056, + "grad_norm": 0.04541015625, + "learning_rate": 1.8582432477815268e-05, + "loss": 0.5061, + "step": 622 + }, + { + "epoch": 1.6996223824236183, + "grad_norm": 0.04248046875, + "learning_rate": 1.855077798713497e-05, + "loss": 0.5413, + "step": 623 + }, + { + "epoch": 1.702368692070031, + "grad_norm": 0.05615234375, + "learning_rate": 1.8519106735972535e-05, + "loss": 0.4586, + "step": 624 + }, + { + "epoch": 1.7051150017164436, + "grad_norm": 0.042236328125, + "learning_rate": 1.84874188738235e-05, + "loss": 0.5022, + "step": 625 + }, + { + "epoch": 1.7078613113628562, + "grad_norm": 0.05224609375, + "learning_rate": 1.8455714550261793e-05, + "loss": 0.4945, + "step": 626 + }, + { + "epoch": 1.7106076210092689, + "grad_norm": 0.0478515625, + "learning_rate": 1.8423993914939063e-05, + "loss": 0.5806, + "step": 627 + }, + { + "epoch": 1.7133539306556815, + "grad_norm": 0.04296875, + "learning_rate": 1.8392257117583944e-05, + "loss": 0.462, + "step": 628 + }, + { + "epoch": 1.7161002403020942, + "grad_norm": 0.042236328125, + "learning_rate": 1.836050430800135e-05, + "loss": 0.4944, + "step": 629 + }, + { + "epoch": 1.7188465499485068, + "grad_norm": 0.1708984375, + "learning_rate": 1.83287356360718e-05, + "loss": 1.1722, + "step": 630 + }, + { + "epoch": 1.7215928595949195, + "grad_norm": 0.04248046875, + "learning_rate": 1.8296951251750667e-05, + "loss": 0.3718, + "step": 631 + }, + { + "epoch": 1.7243391692413321, + "grad_norm": 0.04345703125, + "learning_rate": 1.8265151305067486e-05, + "loss": 0.484, + "step": 632 + }, + { + "epoch": 1.7270854788877446, + "grad_norm": 0.04443359375, + "learning_rate": 1.8233335946125275e-05, + "loss": 0.4783, + "step": 633 + }, + { + "epoch": 1.7298317885341572, + "grad_norm": 0.045654296875, + "learning_rate": 1.8201505325099782e-05, + "loss": 0.5684, + "step": 634 + }, + { + "epoch": 1.7325780981805698, + "grad_norm": 0.043212890625, + "learning_rate": 1.8169659592238797e-05, + "loss": 0.4518, + "step": 635 + }, + { + "epoch": 1.7353244078269825, + "grad_norm": 0.0458984375, + "learning_rate": 1.813779889786144e-05, + "loss": 0.4535, + "step": 636 + }, + { + "epoch": 1.7380707174733951, + "grad_norm": 0.0458984375, + "learning_rate": 1.8105923392357464e-05, + "loss": 0.503, + "step": 637 + }, + { + "epoch": 1.7380707174733951, + "eval_loss": 0.5051947832107544, + "eval_runtime": 630.1537, + "eval_samples_per_second": 14.546, + "eval_steps_per_second": 14.546, + "step": 637 + }, + { + "epoch": 1.7408170271198078, + "grad_norm": 0.045166015625, + "learning_rate": 1.807403322618653e-05, + "loss": 0.4961, + "step": 638 + }, + { + "epoch": 1.7435633367662204, + "grad_norm": 0.05224609375, + "learning_rate": 1.8042128549877483e-05, + "loss": 0.519, + "step": 639 + }, + { + "epoch": 1.7463096464126329, + "grad_norm": 0.04541015625, + "learning_rate": 1.8010209514027687e-05, + "loss": 0.4011, + "step": 640 + }, + { + "epoch": 1.7490559560590455, + "grad_norm": 0.045166015625, + "learning_rate": 1.7978276269302275e-05, + "loss": 0.3935, + "step": 641 + }, + { + "epoch": 1.7518022657054582, + "grad_norm": 0.047607421875, + "learning_rate": 1.794632896643343e-05, + "loss": 0.6534, + "step": 642 + }, + { + "epoch": 1.7545485753518708, + "grad_norm": 0.04638671875, + "learning_rate": 1.7914367756219725e-05, + "loss": 0.5715, + "step": 643 + }, + { + "epoch": 1.7572948849982835, + "grad_norm": 0.04931640625, + "learning_rate": 1.7882392789525358e-05, + "loss": 0.5439, + "step": 644 + }, + { + "epoch": 1.760041194644696, + "grad_norm": 0.047607421875, + "learning_rate": 1.7850404217279467e-05, + "loss": 0.5277, + "step": 645 + }, + { + "epoch": 1.7627875042911088, + "grad_norm": 0.04736328125, + "learning_rate": 1.781840219047541e-05, + "loss": 0.586, + "step": 646 + }, + { + "epoch": 1.7655338139375214, + "grad_norm": 0.045166015625, + "learning_rate": 1.7786386860170054e-05, + "loss": 0.5291, + "step": 647 + }, + { + "epoch": 1.768280123583934, + "grad_norm": 0.04296875, + "learning_rate": 1.775435837748306e-05, + "loss": 0.3863, + "step": 648 + }, + { + "epoch": 1.7710264332303467, + "grad_norm": 0.044677734375, + "learning_rate": 1.7722316893596176e-05, + "loss": 0.5247, + "step": 649 + }, + { + "epoch": 1.7737727428767593, + "grad_norm": 0.04345703125, + "learning_rate": 1.7690262559752516e-05, + "loss": 0.4046, + "step": 650 + }, + { + "epoch": 1.776519052523172, + "grad_norm": 0.04345703125, + "learning_rate": 1.7658195527255847e-05, + "loss": 0.4744, + "step": 651 + }, + { + "epoch": 1.7792653621695846, + "grad_norm": 0.0625, + "learning_rate": 1.7626115947469877e-05, + "loss": 0.424, + "step": 652 + }, + { + "epoch": 1.7820116718159973, + "grad_norm": 0.045166015625, + "learning_rate": 1.759402397181754e-05, + "loss": 0.4644, + "step": 653 + }, + { + "epoch": 1.78475798146241, + "grad_norm": 0.04833984375, + "learning_rate": 1.7561919751780278e-05, + "loss": 0.6509, + "step": 654 + }, + { + "epoch": 1.7875042911088226, + "grad_norm": 0.04345703125, + "learning_rate": 1.7529803438897346e-05, + "loss": 0.4544, + "step": 655 + }, + { + "epoch": 1.7902506007552352, + "grad_norm": 0.04541015625, + "learning_rate": 1.7497675184765064e-05, + "loss": 0.4991, + "step": 656 + }, + { + "epoch": 1.7929969104016479, + "grad_norm": 0.04296875, + "learning_rate": 1.746553514103611e-05, + "loss": 0.5494, + "step": 657 + }, + { + "epoch": 1.7957432200480605, + "grad_norm": 0.04345703125, + "learning_rate": 1.743338345941883e-05, + "loss": 0.4772, + "step": 658 + }, + { + "epoch": 1.7984895296944732, + "grad_norm": 0.04638671875, + "learning_rate": 1.74012202916765e-05, + "loss": 0.5995, + "step": 659 + }, + { + "epoch": 1.8012358393408858, + "grad_norm": 0.0439453125, + "learning_rate": 1.7369045789626603e-05, + "loss": 0.5156, + "step": 660 + }, + { + "epoch": 1.8039821489872985, + "grad_norm": 0.0498046875, + "learning_rate": 1.7336860105140134e-05, + "loss": 0.3329, + "step": 661 + }, + { + "epoch": 1.806728458633711, + "grad_norm": 0.05078125, + "learning_rate": 1.730466339014086e-05, + "loss": 0.4797, + "step": 662 + }, + { + "epoch": 1.8094747682801235, + "grad_norm": 0.043212890625, + "learning_rate": 1.7272455796604622e-05, + "loss": 0.4494, + "step": 663 + }, + { + "epoch": 1.8122210779265362, + "grad_norm": 0.04345703125, + "learning_rate": 1.7240237476558615e-05, + "loss": 0.5881, + "step": 664 + }, + { + "epoch": 1.8149673875729488, + "grad_norm": 0.046142578125, + "learning_rate": 1.7208008582080652e-05, + "loss": 0.451, + "step": 665 + }, + { + "epoch": 1.8177136972193615, + "grad_norm": 0.05419921875, + "learning_rate": 1.7175769265298472e-05, + "loss": 0.3846, + "step": 666 + }, + { + "epoch": 1.8204600068657741, + "grad_norm": 0.04443359375, + "learning_rate": 1.7143519678389004e-05, + "loss": 0.4766, + "step": 667 + }, + { + "epoch": 1.8232063165121868, + "grad_norm": 0.04638671875, + "learning_rate": 1.7111259973577655e-05, + "loss": 0.4932, + "step": 668 + }, + { + "epoch": 1.8259526261585994, + "grad_norm": 0.045654296875, + "learning_rate": 1.7078990303137584e-05, + "loss": 0.4978, + "step": 669 + }, + { + "epoch": 1.8286989358050119, + "grad_norm": 0.043701171875, + "learning_rate": 1.7046710819389012e-05, + "loss": 0.5164, + "step": 670 + }, + { + "epoch": 1.8314452454514245, + "grad_norm": 0.04248046875, + "learning_rate": 1.7014421674698458e-05, + "loss": 0.5542, + "step": 671 + }, + { + "epoch": 1.8341915550978372, + "grad_norm": 0.041259765625, + "learning_rate": 1.6982123021478046e-05, + "loss": 0.3729, + "step": 672 + }, + { + "epoch": 1.8369378647442498, + "grad_norm": 0.046142578125, + "learning_rate": 1.6949815012184795e-05, + "loss": 0.4723, + "step": 673 + }, + { + "epoch": 1.8396841743906625, + "grad_norm": 0.044921875, + "learning_rate": 1.6917497799319876e-05, + "loss": 0.5643, + "step": 674 + }, + { + "epoch": 1.842430484037075, + "grad_norm": 0.052978515625, + "learning_rate": 1.6885171535427913e-05, + "loss": 0.4695, + "step": 675 + }, + { + "epoch": 1.8451767936834877, + "grad_norm": 0.048095703125, + "learning_rate": 1.685283637309623e-05, + "loss": 0.4316, + "step": 676 + }, + { + "epoch": 1.8479231033299004, + "grad_norm": 0.041748046875, + "learning_rate": 1.6820492464954187e-05, + "loss": 0.4624, + "step": 677 + }, + { + "epoch": 1.850669412976313, + "grad_norm": 0.0439453125, + "learning_rate": 1.67881399636724e-05, + "loss": 0.4515, + "step": 678 + }, + { + "epoch": 1.8534157226227257, + "grad_norm": 0.046630859375, + "learning_rate": 1.6755779021962056e-05, + "loss": 0.5498, + "step": 679 + }, + { + "epoch": 1.8561620322691383, + "grad_norm": 0.0419921875, + "learning_rate": 1.6723409792574185e-05, + "loss": 0.4184, + "step": 680 + }, + { + "epoch": 1.858908341915551, + "grad_norm": 0.042236328125, + "learning_rate": 1.6691032428298934e-05, + "loss": 0.437, + "step": 681 + }, + { + "epoch": 1.8616546515619636, + "grad_norm": 0.046142578125, + "learning_rate": 1.665864708196485e-05, + "loss": 0.5498, + "step": 682 + }, + { + "epoch": 1.8644009612083763, + "grad_norm": 0.046630859375, + "learning_rate": 1.6626253906438148e-05, + "loss": 0.4403, + "step": 683 + }, + { + "epoch": 1.867147270854789, + "grad_norm": 0.04736328125, + "learning_rate": 1.6593853054622016e-05, + "loss": 0.5116, + "step": 684 + }, + { + "epoch": 1.8698935805012016, + "grad_norm": 0.04345703125, + "learning_rate": 1.6561444679455858e-05, + "loss": 0.4179, + "step": 685 + }, + { + "epoch": 1.8726398901476142, + "grad_norm": 0.04638671875, + "learning_rate": 1.6529028933914604e-05, + "loss": 0.4291, + "step": 686 + }, + { + "epoch": 1.8753861997940269, + "grad_norm": 0.047119140625, + "learning_rate": 1.649660597100797e-05, + "loss": 0.4856, + "step": 687 + }, + { + "epoch": 1.8781325094404395, + "grad_norm": 0.042236328125, + "learning_rate": 1.646417594377973e-05, + "loss": 0.5419, + "step": 688 + }, + { + "epoch": 1.8808788190868522, + "grad_norm": 0.045166015625, + "learning_rate": 1.6431739005307014e-05, + "loss": 0.4287, + "step": 689 + }, + { + "epoch": 1.8836251287332648, + "grad_norm": 0.05078125, + "learning_rate": 1.6399295308699572e-05, + "loss": 0.4848, + "step": 690 + }, + { + "epoch": 1.8863714383796775, + "grad_norm": 0.044677734375, + "learning_rate": 1.636684500709905e-05, + "loss": 0.3635, + "step": 691 + }, + { + "epoch": 1.88911774802609, + "grad_norm": 0.04541015625, + "learning_rate": 1.6334388253678285e-05, + "loss": 0.5319, + "step": 692 + }, + { + "epoch": 1.8918640576725025, + "grad_norm": 0.047607421875, + "learning_rate": 1.6301925201640542e-05, + "loss": 0.5852, + "step": 693 + }, + { + "epoch": 1.8946103673189152, + "grad_norm": 0.0419921875, + "learning_rate": 1.6269456004218844e-05, + "loss": 0.5184, + "step": 694 + }, + { + "epoch": 1.8973566769653278, + "grad_norm": 0.04638671875, + "learning_rate": 1.6236980814675204e-05, + "loss": 0.4528, + "step": 695 + }, + { + "epoch": 1.9001029866117405, + "grad_norm": 0.04541015625, + "learning_rate": 1.620449978629993e-05, + "loss": 0.4608, + "step": 696 + }, + { + "epoch": 1.9028492962581531, + "grad_norm": 0.048095703125, + "learning_rate": 1.617201307241088e-05, + "loss": 0.5007, + "step": 697 + }, + { + "epoch": 1.9055956059045658, + "grad_norm": 0.0458984375, + "learning_rate": 1.6139520826352765e-05, + "loss": 0.5226, + "step": 698 + }, + { + "epoch": 1.9083419155509782, + "grad_norm": 0.045166015625, + "learning_rate": 1.6107023201496378e-05, + "loss": 0.4345, + "step": 699 + }, + { + "epoch": 1.9110882251973909, + "grad_norm": 0.048583984375, + "learning_rate": 1.6074520351237947e-05, + "loss": 0.4386, + "step": 700 + }, + { + "epoch": 1.9138345348438035, + "grad_norm": 0.042724609375, + "learning_rate": 1.6042012428998325e-05, + "loss": 0.4791, + "step": 701 + }, + { + "epoch": 1.9165808444902162, + "grad_norm": 0.04443359375, + "learning_rate": 1.6009499588222325e-05, + "loss": 0.3982, + "step": 702 + }, + { + "epoch": 1.9193271541366288, + "grad_norm": 0.044189453125, + "learning_rate": 1.597698198237797e-05, + "loss": 0.3487, + "step": 703 + }, + { + "epoch": 1.9220734637830414, + "grad_norm": 0.046630859375, + "learning_rate": 1.5944459764955784e-05, + "loss": 0.3082, + "step": 704 + }, + { + "epoch": 1.924819773429454, + "grad_norm": 0.04443359375, + "learning_rate": 1.5911933089468048e-05, + "loss": 0.4835, + "step": 705 + }, + { + "epoch": 1.9275660830758667, + "grad_norm": 0.047607421875, + "learning_rate": 1.5879402109448093e-05, + "loss": 0.503, + "step": 706 + }, + { + "epoch": 1.9303123927222794, + "grad_norm": 0.04638671875, + "learning_rate": 1.584686697844956e-05, + "loss": 0.5597, + "step": 707 + }, + { + "epoch": 1.933058702368692, + "grad_norm": 0.044189453125, + "learning_rate": 1.5814327850045697e-05, + "loss": 0.5074, + "step": 708 + }, + { + "epoch": 1.9358050120151047, + "grad_norm": 0.05029296875, + "learning_rate": 1.5781784877828607e-05, + "loss": 0.6022, + "step": 709 + }, + { + "epoch": 1.9385513216615173, + "grad_norm": 0.043701171875, + "learning_rate": 1.5749238215408548e-05, + "loss": 0.5197, + "step": 710 + }, + { + "epoch": 1.94129763130793, + "grad_norm": 0.044677734375, + "learning_rate": 1.571668801641319e-05, + "loss": 0.5147, + "step": 711 + }, + { + "epoch": 1.9440439409543426, + "grad_norm": 0.042724609375, + "learning_rate": 1.5684134434486893e-05, + "loss": 0.5506, + "step": 712 + }, + { + "epoch": 1.9467902506007553, + "grad_norm": 0.044921875, + "learning_rate": 1.565157762329e-05, + "loss": 0.4598, + "step": 713 + }, + { + "epoch": 1.949536560247168, + "grad_norm": 0.048828125, + "learning_rate": 1.5619017736498076e-05, + "loss": 0.5802, + "step": 714 + }, + { + "epoch": 1.9522828698935806, + "grad_norm": 0.046630859375, + "learning_rate": 1.5586454927801223e-05, + "loss": 0.567, + "step": 715 + }, + { + "epoch": 1.9550291795399932, + "grad_norm": 0.04931640625, + "learning_rate": 1.555388935090332e-05, + "loss": 0.5956, + "step": 716 + }, + { + "epoch": 1.9577754891864059, + "grad_norm": 0.0400390625, + "learning_rate": 1.5521321159521326e-05, + "loss": 0.4019, + "step": 717 + }, + { + "epoch": 1.9605217988328185, + "grad_norm": 0.045654296875, + "learning_rate": 1.548875050738453e-05, + "loss": 0.4996, + "step": 718 + }, + { + "epoch": 1.9632681084792312, + "grad_norm": 0.0498046875, + "learning_rate": 1.545617754823384e-05, + "loss": 0.4999, + "step": 719 + }, + { + "epoch": 1.9660144181256438, + "grad_norm": 0.047607421875, + "learning_rate": 1.5423602435821055e-05, + "loss": 0.6049, + "step": 720 + }, + { + "epoch": 1.9687607277720565, + "grad_norm": 0.049560546875, + "learning_rate": 1.5391025323908134e-05, + "loss": 0.4799, + "step": 721 + }, + { + "epoch": 1.9715070374184689, + "grad_norm": 0.043212890625, + "learning_rate": 1.5358446366266483e-05, + "loss": 0.4836, + "step": 722 + }, + { + "epoch": 1.9742533470648815, + "grad_norm": 0.1767578125, + "learning_rate": 1.532586571667621e-05, + "loss": 1.1204, + "step": 723 + }, + { + "epoch": 1.9769996567112942, + "grad_norm": 0.043701171875, + "learning_rate": 1.5293283528925412e-05, + "loss": 0.4647, + "step": 724 + }, + { + "epoch": 1.9797459663577068, + "grad_norm": 0.042236328125, + "learning_rate": 1.5260699956809456e-05, + "loss": 0.4984, + "step": 725 + }, + { + "epoch": 1.9824922760041195, + "grad_norm": 0.04052734375, + "learning_rate": 1.522811515413023e-05, + "loss": 0.4019, + "step": 726 + }, + { + "epoch": 1.9852385856505321, + "grad_norm": 0.041748046875, + "learning_rate": 1.5195529274695436e-05, + "loss": 0.4028, + "step": 727 + }, + { + "epoch": 1.9879848952969448, + "grad_norm": 0.05029296875, + "learning_rate": 1.5162942472317858e-05, + "loss": 0.5023, + "step": 728 + }, + { + "epoch": 1.9879848952969448, + "eval_loss": 0.5045637488365173, + "eval_runtime": 616.8097, + "eval_samples_per_second": 14.86, + "eval_steps_per_second": 14.86, + "step": 728 + }, + { + "epoch": 1.9907312049433572, + "grad_norm": 0.056884765625, + "learning_rate": 1.5130354900814643e-05, + "loss": 0.4917, + "step": 729 + }, + { + "epoch": 1.9934775145897699, + "grad_norm": 0.0498046875, + "learning_rate": 1.5097766714006553e-05, + "loss": 0.4892, + "step": 730 + }, + { + "epoch": 1.9962238242361825, + "grad_norm": 0.04736328125, + "learning_rate": 1.5065178065717274e-05, + "loss": 0.4261, + "step": 731 + }, + { + "epoch": 1.9989701338825951, + "grad_norm": 0.044921875, + "learning_rate": 1.5032589109772655e-05, + "loss": 0.5345, + "step": 732 + }, + { + "epoch": 2.001716443529008, + "grad_norm": 0.048095703125, + "learning_rate": 1.5e-05, + "loss": 0.5285, + "step": 733 + }, + { + "epoch": 2.0044627531754204, + "grad_norm": 0.044189453125, + "learning_rate": 1.4967410890227347e-05, + "loss": 0.4216, + "step": 734 + }, + { + "epoch": 2.007209062821833, + "grad_norm": 0.044921875, + "learning_rate": 1.4934821934282728e-05, + "loss": 0.5005, + "step": 735 + }, + { + "epoch": 2.0099553724682457, + "grad_norm": 0.0458984375, + "learning_rate": 1.4902233285993447e-05, + "loss": 0.495, + "step": 736 + }, + { + "epoch": 2.0127016821146584, + "grad_norm": 0.04638671875, + "learning_rate": 1.4869645099185361e-05, + "loss": 0.4652, + "step": 737 + }, + { + "epoch": 2.0013731548232063, + "grad_norm": 0.051025390625, + "learning_rate": 1.4837057527682142e-05, + "loss": 0.621, + "step": 738 + }, + { + "epoch": 2.004119464469619, + "grad_norm": 0.0419921875, + "learning_rate": 1.4804470725304567e-05, + "loss": 0.353, + "step": 739 + }, + { + "epoch": 2.0068657741160316, + "grad_norm": 0.042724609375, + "learning_rate": 1.4771884845869772e-05, + "loss": 0.431, + "step": 740 + }, + { + "epoch": 2.0096120837624443, + "grad_norm": 0.04638671875, + "learning_rate": 1.4739300043190547e-05, + "loss": 0.5136, + "step": 741 + }, + { + "epoch": 2.012358393408857, + "grad_norm": 0.040771484375, + "learning_rate": 1.470671647107459e-05, + "loss": 0.3657, + "step": 742 + }, + { + "epoch": 2.0151047030552696, + "grad_norm": 0.043701171875, + "learning_rate": 1.4674134283323792e-05, + "loss": 0.5771, + "step": 743 + }, + { + "epoch": 2.017851012701682, + "grad_norm": 0.039794921875, + "learning_rate": 1.4641553633733519e-05, + "loss": 0.3684, + "step": 744 + }, + { + "epoch": 2.020597322348095, + "grad_norm": 0.047119140625, + "learning_rate": 1.460897467609187e-05, + "loss": 0.5238, + "step": 745 + }, + { + "epoch": 2.0233436319945075, + "grad_norm": 0.0654296875, + "learning_rate": 1.4576397564178951e-05, + "loss": 0.451, + "step": 746 + }, + { + "epoch": 2.02608994164092, + "grad_norm": 0.043701171875, + "learning_rate": 1.4543822451766166e-05, + "loss": 0.5708, + "step": 747 + }, + { + "epoch": 2.028836251287333, + "grad_norm": 0.050537109375, + "learning_rate": 1.4511249492615477e-05, + "loss": 0.6172, + "step": 748 + }, + { + "epoch": 2.0315825609337455, + "grad_norm": 0.056884765625, + "learning_rate": 1.447867884047868e-05, + "loss": 0.6446, + "step": 749 + }, + { + "epoch": 2.034328870580158, + "grad_norm": 0.044677734375, + "learning_rate": 1.4446110649096683e-05, + "loss": 0.5011, + "step": 750 + }, + { + "epoch": 2.0370751802265707, + "grad_norm": 0.054443359375, + "learning_rate": 1.4413545072198783e-05, + "loss": 0.4862, + "step": 751 + }, + { + "epoch": 2.0398214898729834, + "grad_norm": 0.046875, + "learning_rate": 1.438098226350193e-05, + "loss": 0.4955, + "step": 752 + }, + { + "epoch": 2.0425677995193956, + "grad_norm": 0.048828125, + "learning_rate": 1.4348422376710009e-05, + "loss": 0.3889, + "step": 753 + }, + { + "epoch": 2.0453141091658082, + "grad_norm": 0.166015625, + "learning_rate": 1.4315865565513111e-05, + "loss": 1.1553, + "step": 754 + }, + { + "epoch": 2.048060418812221, + "grad_norm": 0.044921875, + "learning_rate": 1.4283311983586818e-05, + "loss": 0.5192, + "step": 755 + }, + { + "epoch": 2.0508067284586335, + "grad_norm": 0.048828125, + "learning_rate": 1.4250761784591451e-05, + "loss": 0.4926, + "step": 756 + }, + { + "epoch": 2.053553038105046, + "grad_norm": 0.044921875, + "learning_rate": 1.4218215122171392e-05, + "loss": 0.5362, + "step": 757 + }, + { + "epoch": 2.056299347751459, + "grad_norm": 0.0517578125, + "learning_rate": 1.4185672149954304e-05, + "loss": 0.4926, + "step": 758 + }, + { + "epoch": 2.0590456573978715, + "grad_norm": 0.0478515625, + "learning_rate": 1.4153133021550438e-05, + "loss": 0.6137, + "step": 759 + }, + { + "epoch": 2.061791967044284, + "grad_norm": 0.04296875, + "learning_rate": 1.4120597890551908e-05, + "loss": 0.4648, + "step": 760 + }, + { + "epoch": 2.064538276690697, + "grad_norm": 0.045166015625, + "learning_rate": 1.4088066910531951e-05, + "loss": 0.6486, + "step": 761 + }, + { + "epoch": 2.0672845863371094, + "grad_norm": 0.042236328125, + "learning_rate": 1.4055540235044213e-05, + "loss": 0.4291, + "step": 762 + }, + { + "epoch": 2.070030895983522, + "grad_norm": 0.04736328125, + "learning_rate": 1.402301801762203e-05, + "loss": 0.515, + "step": 763 + }, + { + "epoch": 2.0727772056299347, + "grad_norm": 0.05224609375, + "learning_rate": 1.3990500411777677e-05, + "loss": 0.6079, + "step": 764 + }, + { + "epoch": 2.0755235152763474, + "grad_norm": 0.046875, + "learning_rate": 1.3957987571001676e-05, + "loss": 0.5589, + "step": 765 + }, + { + "epoch": 2.07826982492276, + "grad_norm": 0.049560546875, + "learning_rate": 1.3925479648762055e-05, + "loss": 0.6439, + "step": 766 + }, + { + "epoch": 2.0810161345691727, + "grad_norm": 0.053955078125, + "learning_rate": 1.3892976798503621e-05, + "loss": 0.4723, + "step": 767 + }, + { + "epoch": 2.0837624442155853, + "grad_norm": 0.048828125, + "learning_rate": 1.3860479173647241e-05, + "loss": 0.5328, + "step": 768 + }, + { + "epoch": 2.086508753861998, + "grad_norm": 0.046142578125, + "learning_rate": 1.3827986927589118e-05, + "loss": 0.5182, + "step": 769 + }, + { + "epoch": 2.0892550635084106, + "grad_norm": 0.043701171875, + "learning_rate": 1.3795500213700072e-05, + "loss": 0.4433, + "step": 770 + }, + { + "epoch": 2.0920013731548233, + "grad_norm": 0.044921875, + "learning_rate": 1.3763019185324797e-05, + "loss": 0.483, + "step": 771 + }, + { + "epoch": 2.094747682801236, + "grad_norm": 0.046630859375, + "learning_rate": 1.3730543995781158e-05, + "loss": 0.4826, + "step": 772 + }, + { + "epoch": 2.0974939924476486, + "grad_norm": 0.054931640625, + "learning_rate": 1.3698074798359458e-05, + "loss": 0.5313, + "step": 773 + }, + { + "epoch": 2.100240302094061, + "grad_norm": 0.04833984375, + "learning_rate": 1.3665611746321718e-05, + "loss": 0.4303, + "step": 774 + }, + { + "epoch": 2.102986611740474, + "grad_norm": 0.0546875, + "learning_rate": 1.363315499290095e-05, + "loss": 0.5252, + "step": 775 + }, + { + "epoch": 2.1057329213868865, + "grad_norm": 0.044921875, + "learning_rate": 1.360070469130043e-05, + "loss": 0.4501, + "step": 776 + }, + { + "epoch": 2.108479231033299, + "grad_norm": 0.044921875, + "learning_rate": 1.3568260994692988e-05, + "loss": 0.4423, + "step": 777 + }, + { + "epoch": 2.111225540679712, + "grad_norm": 0.04638671875, + "learning_rate": 1.3535824056220273e-05, + "loss": 0.5341, + "step": 778 + }, + { + "epoch": 2.1139718503261244, + "grad_norm": 0.042724609375, + "learning_rate": 1.3503394028992032e-05, + "loss": 0.4019, + "step": 779 + }, + { + "epoch": 2.116718159972537, + "grad_norm": 0.051513671875, + "learning_rate": 1.3470971066085395e-05, + "loss": 0.5329, + "step": 780 + }, + { + "epoch": 2.1194644696189497, + "grad_norm": 0.041748046875, + "learning_rate": 1.3438555320544143e-05, + "loss": 0.5412, + "step": 781 + }, + { + "epoch": 2.1222107792653624, + "grad_norm": 0.0439453125, + "learning_rate": 1.3406146945377987e-05, + "loss": 0.4902, + "step": 782 + }, + { + "epoch": 2.1249570889117746, + "grad_norm": 0.04736328125, + "learning_rate": 1.3373746093561855e-05, + "loss": 0.6356, + "step": 783 + }, + { + "epoch": 2.1277033985581872, + "grad_norm": 0.047119140625, + "learning_rate": 1.3341352918035156e-05, + "loss": 0.4674, + "step": 784 + }, + { + "epoch": 2.1304497082046, + "grad_norm": 0.05029296875, + "learning_rate": 1.330896757170107e-05, + "loss": 0.5155, + "step": 785 + }, + { + "epoch": 2.1331960178510125, + "grad_norm": 0.046875, + "learning_rate": 1.327659020742582e-05, + "loss": 0.5342, + "step": 786 + }, + { + "epoch": 2.135942327497425, + "grad_norm": 0.047607421875, + "learning_rate": 1.3244220978037945e-05, + "loss": 0.5219, + "step": 787 + }, + { + "epoch": 2.138688637143838, + "grad_norm": 0.042724609375, + "learning_rate": 1.3211860036327604e-05, + "loss": 0.5404, + "step": 788 + }, + { + "epoch": 2.1414349467902505, + "grad_norm": 0.05078125, + "learning_rate": 1.3179507535045819e-05, + "loss": 0.4683, + "step": 789 + }, + { + "epoch": 2.144181256436663, + "grad_norm": 0.049560546875, + "learning_rate": 1.3147163626903774e-05, + "loss": 0.4784, + "step": 790 + }, + { + "epoch": 2.1469275660830758, + "grad_norm": 0.0478515625, + "learning_rate": 1.3114828464572096e-05, + "loss": 0.5399, + "step": 791 + }, + { + "epoch": 2.1496738757294884, + "grad_norm": 0.041259765625, + "learning_rate": 1.3082502200680128e-05, + "loss": 0.425, + "step": 792 + }, + { + "epoch": 2.152420185375901, + "grad_norm": 0.044189453125, + "learning_rate": 1.305018498781521e-05, + "loss": 0.4779, + "step": 793 + }, + { + "epoch": 2.1551664950223137, + "grad_norm": 0.04443359375, + "learning_rate": 1.301787697852196e-05, + "loss": 0.5025, + "step": 794 + }, + { + "epoch": 2.1579128046687264, + "grad_norm": 0.043701171875, + "learning_rate": 1.298557832530155e-05, + "loss": 0.5144, + "step": 795 + }, + { + "epoch": 2.160659114315139, + "grad_norm": 0.042724609375, + "learning_rate": 1.2953289180610994e-05, + "loss": 0.4723, + "step": 796 + }, + { + "epoch": 2.1634054239615517, + "grad_norm": 0.04248046875, + "learning_rate": 1.2921009696862419e-05, + "loss": 0.5309, + "step": 797 + }, + { + "epoch": 2.1661517336079643, + "grad_norm": 0.048828125, + "learning_rate": 1.2888740026422354e-05, + "loss": 0.3767, + "step": 798 + }, + { + "epoch": 2.168898043254377, + "grad_norm": 0.044189453125, + "learning_rate": 1.2856480321611004e-05, + "loss": 0.559, + "step": 799 + }, + { + "epoch": 2.1716443529007896, + "grad_norm": 0.048095703125, + "learning_rate": 1.2824230734701535e-05, + "loss": 0.565, + "step": 800 + }, + { + "epoch": 2.1743906625472023, + "grad_norm": 0.0458984375, + "learning_rate": 1.2791991417919347e-05, + "loss": 0.5005, + "step": 801 + }, + { + "epoch": 2.177136972193615, + "grad_norm": 0.04248046875, + "learning_rate": 1.2759762523441386e-05, + "loss": 0.4312, + "step": 802 + }, + { + "epoch": 2.1798832818400276, + "grad_norm": 0.044189453125, + "learning_rate": 1.2727544203395377e-05, + "loss": 0.5526, + "step": 803 + }, + { + "epoch": 2.18262959148644, + "grad_norm": 0.047119140625, + "learning_rate": 1.269533660985914e-05, + "loss": 0.4463, + "step": 804 + }, + { + "epoch": 2.185375901132853, + "grad_norm": 0.043701171875, + "learning_rate": 1.2663139894859867e-05, + "loss": 0.4219, + "step": 805 + }, + { + "epoch": 2.1881222107792655, + "grad_norm": 0.045166015625, + "learning_rate": 1.2630954210373396e-05, + "loss": 0.3865, + "step": 806 + }, + { + "epoch": 2.190868520425678, + "grad_norm": 0.045654296875, + "learning_rate": 1.2598779708323499e-05, + "loss": 0.5792, + "step": 807 + }, + { + "epoch": 2.193614830072091, + "grad_norm": 0.043701171875, + "learning_rate": 1.2566616540581168e-05, + "loss": 0.462, + "step": 808 + }, + { + "epoch": 2.1963611397185034, + "grad_norm": 0.046142578125, + "learning_rate": 1.2534464858963892e-05, + "loss": 0.4869, + "step": 809 + }, + { + "epoch": 2.199107449364916, + "grad_norm": 0.0458984375, + "learning_rate": 1.2502324815234942e-05, + "loss": 0.5559, + "step": 810 + }, + { + "epoch": 2.2018537590113283, + "grad_norm": 0.041015625, + "learning_rate": 1.2470196561102655e-05, + "loss": 0.3752, + "step": 811 + }, + { + "epoch": 2.204600068657741, + "grad_norm": 0.04443359375, + "learning_rate": 1.2438080248219723e-05, + "loss": 0.5054, + "step": 812 + }, + { + "epoch": 2.2073463783041536, + "grad_norm": 0.041259765625, + "learning_rate": 1.2405976028182464e-05, + "loss": 0.3525, + "step": 813 + }, + { + "epoch": 2.2100926879505662, + "grad_norm": 0.043701171875, + "learning_rate": 1.2373884052530127e-05, + "loss": 0.4951, + "step": 814 + }, + { + "epoch": 2.212838997596979, + "grad_norm": 0.047119140625, + "learning_rate": 1.2341804472744157e-05, + "loss": 0.4484, + "step": 815 + }, + { + "epoch": 2.2155853072433915, + "grad_norm": 0.046630859375, + "learning_rate": 1.2309737440247486e-05, + "loss": 0.5412, + "step": 816 + }, + { + "epoch": 2.218331616889804, + "grad_norm": 0.050048828125, + "learning_rate": 1.2277683106403826e-05, + "loss": 0.6162, + "step": 817 + }, + { + "epoch": 2.221077926536217, + "grad_norm": 0.04443359375, + "learning_rate": 1.2245641622516943e-05, + "loss": 0.4606, + "step": 818 + }, + { + "epoch": 2.2238242361826295, + "grad_norm": 0.055908203125, + "learning_rate": 1.2213613139829949e-05, + "loss": 0.3737, + "step": 819 + }, + { + "epoch": 2.2238242361826295, + "eval_loss": 0.504136323928833, + "eval_runtime": 615.1173, + "eval_samples_per_second": 14.901, + "eval_steps_per_second": 14.901, + "step": 819 + }, + { + "epoch": 2.226570545829042, + "grad_norm": 0.044921875, + "learning_rate": 1.2181597809524594e-05, + "loss": 0.3953, + "step": 820 + }, + { + "epoch": 2.2293168554754548, + "grad_norm": 0.044189453125, + "learning_rate": 1.2149595782720537e-05, + "loss": 0.4174, + "step": 821 + }, + { + "epoch": 2.2320631651218674, + "grad_norm": 0.04638671875, + "learning_rate": 1.2117607210474645e-05, + "loss": 0.5269, + "step": 822 + }, + { + "epoch": 2.23480947476828, + "grad_norm": 0.042724609375, + "learning_rate": 1.2085632243780278e-05, + "loss": 0.4668, + "step": 823 + }, + { + "epoch": 2.2375557844146927, + "grad_norm": 0.046630859375, + "learning_rate": 1.205367103356657e-05, + "loss": 0.4565, + "step": 824 + }, + { + "epoch": 2.2403020940611054, + "grad_norm": 0.043701171875, + "learning_rate": 1.202172373069773e-05, + "loss": 0.4427, + "step": 825 + }, + { + "epoch": 2.243048403707518, + "grad_norm": 0.06103515625, + "learning_rate": 1.1989790485972312e-05, + "loss": 0.4414, + "step": 826 + }, + { + "epoch": 2.2457947133539307, + "grad_norm": 0.046142578125, + "learning_rate": 1.1957871450122516e-05, + "loss": 0.5547, + "step": 827 + }, + { + "epoch": 2.2485410230003433, + "grad_norm": 0.044677734375, + "learning_rate": 1.1925966773813476e-05, + "loss": 0.5273, + "step": 828 + }, + { + "epoch": 2.251287332646756, + "grad_norm": 0.0439453125, + "learning_rate": 1.1894076607642537e-05, + "loss": 0.5066, + "step": 829 + }, + { + "epoch": 2.2540336422931686, + "grad_norm": 0.047119140625, + "learning_rate": 1.1862201102138562e-05, + "loss": 0.5397, + "step": 830 + }, + { + "epoch": 2.2567799519395813, + "grad_norm": 0.0478515625, + "learning_rate": 1.1830340407761207e-05, + "loss": 0.4944, + "step": 831 + }, + { + "epoch": 2.259526261585994, + "grad_norm": 0.042724609375, + "learning_rate": 1.1798494674900222e-05, + "loss": 0.4056, + "step": 832 + }, + { + "epoch": 2.2622725712324065, + "grad_norm": 0.050537109375, + "learning_rate": 1.1766664053874726e-05, + "loss": 0.5453, + "step": 833 + }, + { + "epoch": 2.265018880878819, + "grad_norm": 0.04638671875, + "learning_rate": 1.1734848694932514e-05, + "loss": 0.456, + "step": 834 + }, + { + "epoch": 2.267765190525232, + "grad_norm": 0.050537109375, + "learning_rate": 1.170304874824934e-05, + "loss": 0.5696, + "step": 835 + }, + { + "epoch": 2.2705115001716445, + "grad_norm": 0.04248046875, + "learning_rate": 1.1671264363928205e-05, + "loss": 0.4873, + "step": 836 + }, + { + "epoch": 2.273257809818057, + "grad_norm": 0.046142578125, + "learning_rate": 1.1639495691998653e-05, + "loss": 0.5142, + "step": 837 + }, + { + "epoch": 2.27600411946447, + "grad_norm": 0.044189453125, + "learning_rate": 1.1607742882416064e-05, + "loss": 0.4905, + "step": 838 + }, + { + "epoch": 2.2787504291108824, + "grad_norm": 0.044677734375, + "learning_rate": 1.1576006085060941e-05, + "loss": 0.4352, + "step": 839 + }, + { + "epoch": 2.281496738757295, + "grad_norm": 0.05078125, + "learning_rate": 1.1544285449738211e-05, + "loss": 0.5675, + "step": 840 + }, + { + "epoch": 2.2842430484037077, + "grad_norm": 0.047119140625, + "learning_rate": 1.1512581126176508e-05, + "loss": 0.4553, + "step": 841 + }, + { + "epoch": 2.2869893580501204, + "grad_norm": 0.052001953125, + "learning_rate": 1.1480893264027469e-05, + "loss": 0.5391, + "step": 842 + }, + { + "epoch": 2.289735667696533, + "grad_norm": 0.04736328125, + "learning_rate": 1.1449222012865037e-05, + "loss": 0.5003, + "step": 843 + }, + { + "epoch": 2.2924819773429452, + "grad_norm": 0.051513671875, + "learning_rate": 1.1417567522184738e-05, + "loss": 0.5302, + "step": 844 + }, + { + "epoch": 2.295228286989358, + "grad_norm": 0.04931640625, + "learning_rate": 1.1385929941402993e-05, + "loss": 0.571, + "step": 845 + }, + { + "epoch": 2.2979745966357705, + "grad_norm": 0.04638671875, + "learning_rate": 1.1354309419856392e-05, + "loss": 0.5726, + "step": 846 + }, + { + "epoch": 2.300720906282183, + "grad_norm": 0.045166015625, + "learning_rate": 1.1322706106801025e-05, + "loss": 0.5884, + "step": 847 + }, + { + "epoch": 2.303467215928596, + "grad_norm": 0.046142578125, + "learning_rate": 1.1291120151411731e-05, + "loss": 0.5926, + "step": 848 + }, + { + "epoch": 2.3062135255750085, + "grad_norm": 0.0439453125, + "learning_rate": 1.1259551702781426e-05, + "loss": 0.4487, + "step": 849 + }, + { + "epoch": 2.308959835221421, + "grad_norm": 0.040771484375, + "learning_rate": 1.1228000909920388e-05, + "loss": 0.3924, + "step": 850 + }, + { + "epoch": 2.3117061448678338, + "grad_norm": 0.044189453125, + "learning_rate": 1.119646792175556e-05, + "loss": 0.4218, + "step": 851 + }, + { + "epoch": 2.3144524545142464, + "grad_norm": 0.236328125, + "learning_rate": 1.1164952887129836e-05, + "loss": 1.1613, + "step": 852 + }, + { + "epoch": 2.317198764160659, + "grad_norm": 0.05029296875, + "learning_rate": 1.1133455954801372e-05, + "loss": 0.4224, + "step": 853 + }, + { + "epoch": 2.3199450738070717, + "grad_norm": 0.045166015625, + "learning_rate": 1.1101977273442873e-05, + "loss": 0.4405, + "step": 854 + }, + { + "epoch": 2.3226913834534844, + "grad_norm": 0.046142578125, + "learning_rate": 1.1070516991640894e-05, + "loss": 0.4972, + "step": 855 + }, + { + "epoch": 2.325437693099897, + "grad_norm": 0.044921875, + "learning_rate": 1.1039075257895146e-05, + "loss": 0.5403, + "step": 856 + }, + { + "epoch": 2.3281840027463097, + "grad_norm": 0.046630859375, + "learning_rate": 1.1007652220617778e-05, + "loss": 0.5295, + "step": 857 + }, + { + "epoch": 2.3309303123927223, + "grad_norm": 0.046630859375, + "learning_rate": 1.0976248028132705e-05, + "loss": 0.5899, + "step": 858 + }, + { + "epoch": 2.333676622039135, + "grad_norm": 0.04541015625, + "learning_rate": 1.0944862828674872e-05, + "loss": 0.4907, + "step": 859 + }, + { + "epoch": 2.3364229316855476, + "grad_norm": 0.047119140625, + "learning_rate": 1.0913496770389585e-05, + "loss": 0.5142, + "step": 860 + }, + { + "epoch": 2.3391692413319602, + "grad_norm": 0.04150390625, + "learning_rate": 1.088215000133179e-05, + "loss": 0.5103, + "step": 861 + }, + { + "epoch": 2.341915550978373, + "grad_norm": 0.04833984375, + "learning_rate": 1.0850822669465392e-05, + "loss": 0.5814, + "step": 862 + }, + { + "epoch": 2.3446618606247855, + "grad_norm": 0.0517578125, + "learning_rate": 1.081951492266254e-05, + "loss": 0.5544, + "step": 863 + }, + { + "epoch": 2.347408170271198, + "grad_norm": 0.1416015625, + "learning_rate": 1.0788226908702945e-05, + "loss": 1.1435, + "step": 864 + }, + { + "epoch": 2.350154479917611, + "grad_norm": 0.043701171875, + "learning_rate": 1.0756958775273169e-05, + "loss": 0.4895, + "step": 865 + }, + { + "epoch": 2.3529007895640235, + "grad_norm": 0.045654296875, + "learning_rate": 1.0725710669965936e-05, + "loss": 0.5886, + "step": 866 + }, + { + "epoch": 2.355647099210436, + "grad_norm": 0.04248046875, + "learning_rate": 1.0694482740279428e-05, + "loss": 0.4469, + "step": 867 + }, + { + "epoch": 2.358393408856849, + "grad_norm": 0.043212890625, + "learning_rate": 1.0663275133616603e-05, + "loss": 0.4049, + "step": 868 + }, + { + "epoch": 2.3611397185032614, + "grad_norm": 0.04052734375, + "learning_rate": 1.063208799728448e-05, + "loss": 0.3659, + "step": 869 + }, + { + "epoch": 2.3638860281496736, + "grad_norm": 0.0458984375, + "learning_rate": 1.0600921478493455e-05, + "loss": 0.5023, + "step": 870 + }, + { + "epoch": 2.3666323377960863, + "grad_norm": 0.04296875, + "learning_rate": 1.0569775724356611e-05, + "loss": 0.4065, + "step": 871 + }, + { + "epoch": 2.369378647442499, + "grad_norm": 0.043701171875, + "learning_rate": 1.0538650881889013e-05, + "loss": 0.4033, + "step": 872 + }, + { + "epoch": 2.3721249570889116, + "grad_norm": 0.039794921875, + "learning_rate": 1.0507547098007015e-05, + "loss": 0.4139, + "step": 873 + }, + { + "epoch": 2.3748712667353242, + "grad_norm": 0.048095703125, + "learning_rate": 1.0476464519527574e-05, + "loss": 0.5499, + "step": 874 + }, + { + "epoch": 2.377617576381737, + "grad_norm": 0.047119140625, + "learning_rate": 1.0445403293167547e-05, + "loss": 0.4546, + "step": 875 + }, + { + "epoch": 2.3803638860281495, + "grad_norm": 0.04150390625, + "learning_rate": 1.0414363565543016e-05, + "loss": 0.4263, + "step": 876 + }, + { + "epoch": 2.383110195674562, + "grad_norm": 0.049072265625, + "learning_rate": 1.0383345483168573e-05, + "loss": 0.446, + "step": 877 + }, + { + "epoch": 2.385856505320975, + "grad_norm": 0.044921875, + "learning_rate": 1.0352349192456643e-05, + "loss": 0.445, + "step": 878 + }, + { + "epoch": 2.3886028149673875, + "grad_norm": 0.04833984375, + "learning_rate": 1.032137483971679e-05, + "loss": 0.5788, + "step": 879 + }, + { + "epoch": 2.3913491246138, + "grad_norm": 0.04638671875, + "learning_rate": 1.0290422571155024e-05, + "loss": 0.4657, + "step": 880 + }, + { + "epoch": 2.3940954342602128, + "grad_norm": 0.04345703125, + "learning_rate": 1.0259492532873113e-05, + "loss": 0.5917, + "step": 881 + }, + { + "epoch": 2.3968417439066254, + "grad_norm": 0.05126953125, + "learning_rate": 1.0228584870867896e-05, + "loss": 0.4403, + "step": 882 + }, + { + "epoch": 2.399588053553038, + "grad_norm": 0.041259765625, + "learning_rate": 1.0197699731030584e-05, + "loss": 0.4274, + "step": 883 + }, + { + "epoch": 2.4023343631994507, + "grad_norm": 0.048095703125, + "learning_rate": 1.016683725914609e-05, + "loss": 0.5997, + "step": 884 + }, + { + "epoch": 2.4050806728458634, + "grad_norm": 0.046630859375, + "learning_rate": 1.0135997600892316e-05, + "loss": 0.4612, + "step": 885 + }, + { + "epoch": 2.407826982492276, + "grad_norm": 0.044677734375, + "learning_rate": 1.0105180901839487e-05, + "loss": 0.4969, + "step": 886 + }, + { + "epoch": 2.4105732921386887, + "grad_norm": 0.045166015625, + "learning_rate": 1.0074387307449452e-05, + "loss": 0.4989, + "step": 887 + }, + { + "epoch": 2.4133196017851013, + "grad_norm": 0.05029296875, + "learning_rate": 1.0043616963075001e-05, + "loss": 0.4879, + "step": 888 + }, + { + "epoch": 2.416065911431514, + "grad_norm": 0.046630859375, + "learning_rate": 1.0012870013959182e-05, + "loss": 0.4521, + "step": 889 + }, + { + "epoch": 2.4188122210779266, + "grad_norm": 0.0498046875, + "learning_rate": 9.982146605234604e-06, + "loss": 0.6295, + "step": 890 + }, + { + "epoch": 2.4215585307243392, + "grad_norm": 0.044921875, + "learning_rate": 9.95144688192277e-06, + "loss": 0.4266, + "step": 891 + }, + { + "epoch": 2.424304840370752, + "grad_norm": 0.044677734375, + "learning_rate": 9.920770988933366e-06, + "loss": 0.3733, + "step": 892 + }, + { + "epoch": 2.4270511500171645, + "grad_norm": 0.046630859375, + "learning_rate": 9.890119071063624e-06, + "loss": 0.5311, + "step": 893 + }, + { + "epoch": 2.429797459663577, + "grad_norm": 0.040283203125, + "learning_rate": 9.859491272997579e-06, + "loss": 0.3879, + "step": 894 + }, + { + "epoch": 2.43254376930999, + "grad_norm": 0.048095703125, + "learning_rate": 9.828887739305423e-06, + "loss": 0.4734, + "step": 895 + }, + { + "epoch": 2.4352900789564025, + "grad_norm": 0.05224609375, + "learning_rate": 9.798308614442822e-06, + "loss": 0.4965, + "step": 896 + }, + { + "epoch": 2.438036388602815, + "grad_norm": 0.05224609375, + "learning_rate": 9.767754042750214e-06, + "loss": 0.502, + "step": 897 + }, + { + "epoch": 2.4407826982492278, + "grad_norm": 0.046142578125, + "learning_rate": 9.737224168452154e-06, + "loss": 0.4924, + "step": 898 + }, + { + "epoch": 2.4435290078956404, + "grad_norm": 0.045166015625, + "learning_rate": 9.706719135656613e-06, + "loss": 0.5116, + "step": 899 + }, + { + "epoch": 2.446275317542053, + "grad_norm": 0.048095703125, + "learning_rate": 9.676239088354302e-06, + "loss": 0.5165, + "step": 900 + }, + { + "epoch": 2.4490216271884657, + "grad_norm": 0.04296875, + "learning_rate": 9.645784170417996e-06, + "loss": 0.4293, + "step": 901 + }, + { + "epoch": 2.4517679368348784, + "grad_norm": 0.04443359375, + "learning_rate": 9.615354525601859e-06, + "loss": 0.5352, + "step": 902 + }, + { + "epoch": 2.4545142464812906, + "grad_norm": 0.042236328125, + "learning_rate": 9.584950297540759e-06, + "loss": 0.474, + "step": 903 + }, + { + "epoch": 2.457260556127703, + "grad_norm": 0.047607421875, + "learning_rate": 9.554571629749585e-06, + "loss": 0.495, + "step": 904 + }, + { + "epoch": 2.460006865774116, + "grad_norm": 0.049072265625, + "learning_rate": 9.524218665622578e-06, + "loss": 0.4432, + "step": 905 + }, + { + "epoch": 2.4627531754205285, + "grad_norm": 0.050048828125, + "learning_rate": 9.493891548432654e-06, + "loss": 0.4222, + "step": 906 + }, + { + "epoch": 2.465499485066941, + "grad_norm": 0.045166015625, + "learning_rate": 9.463590421330727e-06, + "loss": 0.5396, + "step": 907 + }, + { + "epoch": 2.468245794713354, + "grad_norm": 0.048095703125, + "learning_rate": 9.433315427345028e-06, + "loss": 0.4899, + "step": 908 + }, + { + "epoch": 2.4709921043597665, + "grad_norm": 0.047119140625, + "learning_rate": 9.403066709380432e-06, + "loss": 0.6021, + "step": 909 + }, + { + "epoch": 2.473738414006179, + "grad_norm": 0.048095703125, + "learning_rate": 9.372844410217792e-06, + "loss": 0.505, + "step": 910 + }, + { + "epoch": 2.473738414006179, + "eval_loss": 0.5039077997207642, + "eval_runtime": 617.6957, + "eval_samples_per_second": 14.839, + "eval_steps_per_second": 14.839, + "step": 910 + }, + { + "epoch": 2.4764847236525918, + "grad_norm": 0.051513671875, + "learning_rate": 9.342648672513254e-06, + "loss": 0.5927, + "step": 911 + }, + { + "epoch": 2.4792310332990044, + "grad_norm": 0.05126953125, + "learning_rate": 9.31247963879759e-06, + "loss": 0.4284, + "step": 912 + }, + { + "epoch": 2.481977342945417, + "grad_norm": 0.0458984375, + "learning_rate": 9.28233745147552e-06, + "loss": 0.4401, + "step": 913 + }, + { + "epoch": 2.4847236525918297, + "grad_norm": 0.045166015625, + "learning_rate": 9.252222252825043e-06, + "loss": 0.5268, + "step": 914 + }, + { + "epoch": 2.4874699622382423, + "grad_norm": 0.045654296875, + "learning_rate": 9.222134184996769e-06, + "loss": 0.5029, + "step": 915 + }, + { + "epoch": 2.490216271884655, + "grad_norm": 0.044677734375, + "learning_rate": 9.19207339001324e-06, + "loss": 0.4947, + "step": 916 + }, + { + "epoch": 2.4929625815310676, + "grad_norm": 0.048828125, + "learning_rate": 9.16204000976827e-06, + "loss": 0.5041, + "step": 917 + }, + { + "epoch": 2.4957088911774803, + "grad_norm": 0.0439453125, + "learning_rate": 9.13203418602626e-06, + "loss": 0.4975, + "step": 918 + }, + { + "epoch": 2.498455200823893, + "grad_norm": 0.045166015625, + "learning_rate": 9.102056060421545e-06, + "loss": 0.4445, + "step": 919 + }, + { + "epoch": 2.5012015104703056, + "grad_norm": 0.049072265625, + "learning_rate": 9.07210577445772e-06, + "loss": 0.4441, + "step": 920 + }, + { + "epoch": 2.5039478201167182, + "grad_norm": 0.045654296875, + "learning_rate": 9.042183469506964e-06, + "loss": 0.5294, + "step": 921 + }, + { + "epoch": 2.506694129763131, + "grad_norm": 0.049072265625, + "learning_rate": 9.012289286809384e-06, + "loss": 0.5546, + "step": 922 + }, + { + "epoch": 2.5094404394095435, + "grad_norm": 0.1748046875, + "learning_rate": 8.982423367472344e-06, + "loss": 1.2097, + "step": 923 + }, + { + "epoch": 2.512186749055956, + "grad_norm": 0.046630859375, + "learning_rate": 8.95258585246979e-06, + "loss": 0.5349, + "step": 924 + }, + { + "epoch": 2.514933058702369, + "grad_norm": 0.05078125, + "learning_rate": 8.922776882641604e-06, + "loss": 0.531, + "step": 925 + }, + { + "epoch": 2.5176793683487815, + "grad_norm": 0.043701171875, + "learning_rate": 8.892996598692928e-06, + "loss": 0.4151, + "step": 926 + }, + { + "epoch": 2.5204256779951937, + "grad_norm": 0.0478515625, + "learning_rate": 8.863245141193487e-06, + "loss": 0.4963, + "step": 927 + }, + { + "epoch": 2.5231719876416063, + "grad_norm": 0.045654296875, + "learning_rate": 8.833522650576955e-06, + "loss": 0.5466, + "step": 928 + }, + { + "epoch": 2.525918297288019, + "grad_norm": 0.043212890625, + "learning_rate": 8.803829267140263e-06, + "loss": 0.4034, + "step": 929 + }, + { + "epoch": 2.5286646069344316, + "grad_norm": 0.048095703125, + "learning_rate": 8.774165131042957e-06, + "loss": 0.4094, + "step": 930 + }, + { + "epoch": 2.5314109165808443, + "grad_norm": 0.0537109375, + "learning_rate": 8.744530382306528e-06, + "loss": 0.47, + "step": 931 + }, + { + "epoch": 2.534157226227257, + "grad_norm": 0.04345703125, + "learning_rate": 8.714925160813752e-06, + "loss": 0.4783, + "step": 932 + }, + { + "epoch": 2.5369035358736696, + "grad_norm": 0.0390625, + "learning_rate": 8.68534960630802e-06, + "loss": 0.4795, + "step": 933 + }, + { + "epoch": 2.539649845520082, + "grad_norm": 0.04833984375, + "learning_rate": 8.655803858392707e-06, + "loss": 0.5637, + "step": 934 + }, + { + "epoch": 2.542396155166495, + "grad_norm": 0.047607421875, + "learning_rate": 8.626288056530474e-06, + "loss": 0.5958, + "step": 935 + }, + { + "epoch": 2.5451424648129075, + "grad_norm": 0.0546875, + "learning_rate": 8.596802340042648e-06, + "loss": 0.5443, + "step": 936 + }, + { + "epoch": 2.54788877445932, + "grad_norm": 0.04443359375, + "learning_rate": 8.567346848108523e-06, + "loss": 0.5042, + "step": 937 + }, + { + "epoch": 2.550635084105733, + "grad_norm": 0.04052734375, + "learning_rate": 8.53792171976476e-06, + "loss": 0.3745, + "step": 938 + }, + { + "epoch": 2.5533813937521455, + "grad_norm": 0.051025390625, + "learning_rate": 8.508527093904663e-06, + "loss": 0.4595, + "step": 939 + }, + { + "epoch": 2.556127703398558, + "grad_norm": 0.048583984375, + "learning_rate": 8.479163109277583e-06, + "loss": 0.5502, + "step": 940 + }, + { + "epoch": 2.5588740130449708, + "grad_norm": 0.044189453125, + "learning_rate": 8.449829904488216e-06, + "loss": 0.4784, + "step": 941 + }, + { + "epoch": 2.5616203226913834, + "grad_norm": 0.043212890625, + "learning_rate": 8.42052761799599e-06, + "loss": 0.5084, + "step": 942 + }, + { + "epoch": 2.564366632337796, + "grad_norm": 0.04736328125, + "learning_rate": 8.391256388114367e-06, + "loss": 0.4844, + "step": 943 + }, + { + "epoch": 2.5671129419842087, + "grad_norm": 0.047607421875, + "learning_rate": 8.362016353010248e-06, + "loss": 0.5863, + "step": 944 + }, + { + "epoch": 2.5698592516306213, + "grad_norm": 0.0517578125, + "learning_rate": 8.332807650703255e-06, + "loss": 0.453, + "step": 945 + }, + { + "epoch": 2.572605561277034, + "grad_norm": 0.051513671875, + "learning_rate": 8.303630419065136e-06, + "loss": 0.6364, + "step": 946 + }, + { + "epoch": 2.5753518709234466, + "grad_norm": 0.047119140625, + "learning_rate": 8.274484795819068e-06, + "loss": 0.521, + "step": 947 + }, + { + "epoch": 2.5780981805698593, + "grad_norm": 0.0478515625, + "learning_rate": 8.245370918539057e-06, + "loss": 0.47, + "step": 948 + }, + { + "epoch": 2.580844490216272, + "grad_norm": 0.04931640625, + "learning_rate": 8.216288924649233e-06, + "loss": 0.516, + "step": 949 + }, + { + "epoch": 2.5835907998626846, + "grad_norm": 0.043701171875, + "learning_rate": 8.187238951423254e-06, + "loss": 0.4951, + "step": 950 + }, + { + "epoch": 2.5863371095090972, + "grad_norm": 0.048583984375, + "learning_rate": 8.158221135983606e-06, + "loss": 0.4366, + "step": 951 + }, + { + "epoch": 2.58908341915551, + "grad_norm": 0.047607421875, + "learning_rate": 8.129235615301012e-06, + "loss": 0.5727, + "step": 952 + }, + { + "epoch": 2.5918297288019225, + "grad_norm": 0.05078125, + "learning_rate": 8.10028252619373e-06, + "loss": 0.601, + "step": 953 + }, + { + "epoch": 2.594576038448335, + "grad_norm": 0.0439453125, + "learning_rate": 8.07136200532695e-06, + "loss": 0.474, + "step": 954 + }, + { + "epoch": 2.597322348094748, + "grad_norm": 0.04052734375, + "learning_rate": 8.042474189212133e-06, + "loss": 0.3888, + "step": 955 + }, + { + "epoch": 2.6000686577411605, + "grad_norm": 0.043701171875, + "learning_rate": 8.013619214206353e-06, + "loss": 0.4508, + "step": 956 + }, + { + "epoch": 2.602814967387573, + "grad_norm": 0.04541015625, + "learning_rate": 7.984797216511686e-06, + "loss": 0.45, + "step": 957 + }, + { + "epoch": 2.6055612770339858, + "grad_norm": 0.048583984375, + "learning_rate": 7.956008332174523e-06, + "loss": 0.4348, + "step": 958 + }, + { + "epoch": 2.6083075866803984, + "grad_norm": 0.043212890625, + "learning_rate": 7.927252697084976e-06, + "loss": 0.4279, + "step": 959 + }, + { + "epoch": 2.611053896326811, + "grad_norm": 0.045654296875, + "learning_rate": 7.898530446976194e-06, + "loss": 0.5555, + "step": 960 + }, + { + "epoch": 2.6138002059732237, + "grad_norm": 0.043701171875, + "learning_rate": 7.86984171742376e-06, + "loss": 0.5695, + "step": 961 + }, + { + "epoch": 2.6165465156196364, + "grad_norm": 0.04541015625, + "learning_rate": 7.841186643845009e-06, + "loss": 0.4705, + "step": 962 + }, + { + "epoch": 2.619292825266049, + "grad_norm": 0.048095703125, + "learning_rate": 7.81256536149844e-06, + "loss": 0.486, + "step": 963 + }, + { + "epoch": 2.6220391349124617, + "grad_norm": 0.0546875, + "learning_rate": 7.783978005483024e-06, + "loss": 0.5018, + "step": 964 + }, + { + "epoch": 2.624785444558874, + "grad_norm": 0.04736328125, + "learning_rate": 7.75542471073761e-06, + "loss": 0.4491, + "step": 965 + }, + { + "epoch": 2.6275317542052865, + "grad_norm": 0.04345703125, + "learning_rate": 7.726905612040257e-06, + "loss": 0.4566, + "step": 966 + }, + { + "epoch": 2.630278063851699, + "grad_norm": 0.04931640625, + "learning_rate": 7.698420844007624e-06, + "loss": 0.5227, + "step": 967 + }, + { + "epoch": 2.633024373498112, + "grad_norm": 0.049072265625, + "learning_rate": 7.669970541094304e-06, + "loss": 0.4866, + "step": 968 + }, + { + "epoch": 2.6357706831445245, + "grad_norm": 0.047607421875, + "learning_rate": 7.64155483759223e-06, + "loss": 0.4499, + "step": 969 + }, + { + "epoch": 2.638516992790937, + "grad_norm": 0.1337890625, + "learning_rate": 7.613173867629991e-06, + "loss": 0.9577, + "step": 970 + }, + { + "epoch": 2.6412633024373497, + "grad_norm": 0.0498046875, + "learning_rate": 7.584827765172254e-06, + "loss": 0.51, + "step": 971 + }, + { + "epoch": 2.6440096120837624, + "grad_norm": 0.046142578125, + "learning_rate": 7.5565166640190784e-06, + "loss": 0.4697, + "step": 972 + }, + { + "epoch": 2.646755921730175, + "grad_norm": 0.046875, + "learning_rate": 7.528240697805321e-06, + "loss": 0.4789, + "step": 973 + }, + { + "epoch": 2.6495022313765877, + "grad_norm": 0.045654296875, + "learning_rate": 7.500000000000004e-06, + "loss": 0.5087, + "step": 974 + }, + { + "epoch": 2.6522485410230003, + "grad_norm": 0.045654296875, + "learning_rate": 7.471794703905647e-06, + "loss": 0.5238, + "step": 975 + }, + { + "epoch": 2.654994850669413, + "grad_norm": 0.045654296875, + "learning_rate": 7.443624942657698e-06, + "loss": 0.5521, + "step": 976 + }, + { + "epoch": 2.6577411603158256, + "grad_norm": 0.04833984375, + "learning_rate": 7.415490849223844e-06, + "loss": 0.4471, + "step": 977 + }, + { + "epoch": 2.6604874699622383, + "grad_norm": 0.045654296875, + "learning_rate": 7.387392556403433e-06, + "loss": 0.4795, + "step": 978 + }, + { + "epoch": 2.663233779608651, + "grad_norm": 0.044921875, + "learning_rate": 7.359330196826808e-06, + "loss": 0.4769, + "step": 979 + }, + { + "epoch": 2.6659800892550636, + "grad_norm": 0.0400390625, + "learning_rate": 7.33130390295472e-06, + "loss": 0.3953, + "step": 980 + }, + { + "epoch": 2.6687263989014762, + "grad_norm": 0.0498046875, + "learning_rate": 7.303313807077658e-06, + "loss": 0.5334, + "step": 981 + }, + { + "epoch": 2.671472708547889, + "grad_norm": 0.046630859375, + "learning_rate": 7.275360041315263e-06, + "loss": 0.512, + "step": 982 + }, + { + "epoch": 2.6742190181943015, + "grad_norm": 0.042236328125, + "learning_rate": 7.24744273761569e-06, + "loss": 0.4317, + "step": 983 + }, + { + "epoch": 2.676965327840714, + "grad_norm": 0.0419921875, + "learning_rate": 7.219562027754985e-06, + "loss": 0.5105, + "step": 984 + }, + { + "epoch": 2.679711637487127, + "grad_norm": 0.0458984375, + "learning_rate": 7.191718043336447e-06, + "loss": 0.4319, + "step": 985 + }, + { + "epoch": 2.6824579471335395, + "grad_norm": 0.04443359375, + "learning_rate": 7.163910915790047e-06, + "loss": 0.4596, + "step": 986 + }, + { + "epoch": 2.6852042567799517, + "grad_norm": 0.0537109375, + "learning_rate": 7.13614077637176e-06, + "loss": 0.4915, + "step": 987 + }, + { + "epoch": 2.6879505664263643, + "grad_norm": 0.04296875, + "learning_rate": 7.108407756162988e-06, + "loss": 0.4317, + "step": 988 + }, + { + "epoch": 2.690696876072777, + "grad_norm": 0.04296875, + "learning_rate": 7.080711986069905e-06, + "loss": 0.5411, + "step": 989 + }, + { + "epoch": 2.6934431857191896, + "grad_norm": 0.040283203125, + "learning_rate": 7.053053596822872e-06, + "loss": 0.3315, + "step": 990 + }, + { + "epoch": 2.6961894953656023, + "grad_norm": 0.052978515625, + "learning_rate": 7.025432718975787e-06, + "loss": 0.417, + "step": 991 + }, + { + "epoch": 2.698935805012015, + "grad_norm": 0.048828125, + "learning_rate": 6.997849482905506e-06, + "loss": 0.5751, + "step": 992 + }, + { + "epoch": 2.7016821146584276, + "grad_norm": 0.04736328125, + "learning_rate": 6.970304018811183e-06, + "loss": 0.5515, + "step": 993 + }, + { + "epoch": 2.70442842430484, + "grad_norm": 0.04638671875, + "learning_rate": 6.942796456713706e-06, + "loss": 0.553, + "step": 994 + }, + { + "epoch": 2.707174733951253, + "grad_norm": 0.043212890625, + "learning_rate": 6.915326926455029e-06, + "loss": 0.4753, + "step": 995 + }, + { + "epoch": 2.7099210435976655, + "grad_norm": 0.05322265625, + "learning_rate": 6.887895557697614e-06, + "loss": 0.4289, + "step": 996 + }, + { + "epoch": 2.712667353244078, + "grad_norm": 0.044189453125, + "learning_rate": 6.860502479923769e-06, + "loss": 0.4171, + "step": 997 + }, + { + "epoch": 2.715413662890491, + "grad_norm": 0.05029296875, + "learning_rate": 6.833147822435075e-06, + "loss": 0.4769, + "step": 998 + }, + { + "epoch": 2.7181599725369034, + "grad_norm": 0.04248046875, + "learning_rate": 6.8058317143517615e-06, + "loss": 0.4042, + "step": 999 + }, + { + "epoch": 2.720906282183316, + "grad_norm": 0.0439453125, + "learning_rate": 6.778554284612078e-06, + "loss": 0.5019, + "step": 1000 + }, + { + "epoch": 2.7236525918297287, + "grad_norm": 0.0458984375, + "learning_rate": 6.751315661971731e-06, + "loss": 0.4833, + "step": 1001 + }, + { + "epoch": 2.7236525918297287, + "eval_loss": 0.5037957429885864, + "eval_runtime": 619.9243, + "eval_samples_per_second": 14.786, + "eval_steps_per_second": 14.786, + "step": 1001 + }, + { + "epoch": 2.7263989014761414, + "grad_norm": 0.0419921875, + "learning_rate": 6.724115975003217e-06, + "loss": 0.4036, + "step": 1002 + }, + { + "epoch": 2.729145211122554, + "grad_norm": 0.053955078125, + "learning_rate": 6.696955352095277e-06, + "loss": 0.4995, + "step": 1003 + }, + { + "epoch": 2.7318915207689667, + "grad_norm": 0.04052734375, + "learning_rate": 6.6698339214522374e-06, + "loss": 0.39, + "step": 1004 + }, + { + "epoch": 2.7346378304153793, + "grad_norm": 0.04541015625, + "learning_rate": 6.642751811093446e-06, + "loss": 0.4771, + "step": 1005 + }, + { + "epoch": 2.737384140061792, + "grad_norm": 0.0439453125, + "learning_rate": 6.6157091488526324e-06, + "loss": 0.4343, + "step": 1006 + }, + { + "epoch": 2.7401304497082046, + "grad_norm": 0.044677734375, + "learning_rate": 6.588706062377344e-06, + "loss": 0.4141, + "step": 1007 + }, + { + "epoch": 2.7428767593546173, + "grad_norm": 0.043701171875, + "learning_rate": 6.561742679128296e-06, + "loss": 0.4756, + "step": 1008 + }, + { + "epoch": 2.74562306900103, + "grad_norm": 0.0458984375, + "learning_rate": 6.534819126378821e-06, + "loss": 0.6022, + "step": 1009 + }, + { + "epoch": 2.7483693786474426, + "grad_norm": 0.049560546875, + "learning_rate": 6.507935531214218e-06, + "loss": 0.5495, + "step": 1010 + }, + { + "epoch": 2.751115688293855, + "grad_norm": 0.04833984375, + "learning_rate": 6.4810920205312006e-06, + "loss": 0.4997, + "step": 1011 + }, + { + "epoch": 2.753861997940268, + "grad_norm": 0.044189453125, + "learning_rate": 6.454288721037252e-06, + "loss": 0.438, + "step": 1012 + }, + { + "epoch": 2.7566083075866805, + "grad_norm": 0.047607421875, + "learning_rate": 6.427525759250071e-06, + "loss": 0.5343, + "step": 1013 + }, + { + "epoch": 2.759354617233093, + "grad_norm": 0.16796875, + "learning_rate": 6.400803261496933e-06, + "loss": 1.0934, + "step": 1014 + }, + { + "epoch": 2.762100926879506, + "grad_norm": 0.04541015625, + "learning_rate": 6.374121353914132e-06, + "loss": 0.4902, + "step": 1015 + }, + { + "epoch": 2.7648472365259185, + "grad_norm": 0.047607421875, + "learning_rate": 6.347480162446349e-06, + "loss": 0.6164, + "step": 1016 + }, + { + "epoch": 2.767593546172331, + "grad_norm": 0.047119140625, + "learning_rate": 6.320879812846093e-06, + "loss": 0.3764, + "step": 1017 + }, + { + "epoch": 2.7703398558187438, + "grad_norm": 0.048095703125, + "learning_rate": 6.294320430673085e-06, + "loss": 0.5365, + "step": 1018 + }, + { + "epoch": 2.7730861654651564, + "grad_norm": 0.0458984375, + "learning_rate": 6.267802141293657e-06, + "loss": 0.4324, + "step": 1019 + }, + { + "epoch": 2.775832475111569, + "grad_norm": 0.04248046875, + "learning_rate": 6.241325069880198e-06, + "loss": 0.367, + "step": 1020 + }, + { + "epoch": 2.7785787847579817, + "grad_norm": 0.0498046875, + "learning_rate": 6.214889341410512e-06, + "loss": 0.4586, + "step": 1021 + }, + { + "epoch": 2.7813250944043943, + "grad_norm": 0.050048828125, + "learning_rate": 6.188495080667278e-06, + "loss": 0.5402, + "step": 1022 + }, + { + "epoch": 2.784071404050807, + "grad_norm": 0.04736328125, + "learning_rate": 6.162142412237421e-06, + "loss": 0.5498, + "step": 1023 + }, + { + "epoch": 2.786817713697219, + "grad_norm": 0.05029296875, + "learning_rate": 6.135831460511555e-06, + "loss": 0.4409, + "step": 1024 + }, + { + "epoch": 2.789564023343632, + "grad_norm": 0.043212890625, + "learning_rate": 6.109562349683366e-06, + "loss": 0.4341, + "step": 1025 + }, + { + "epoch": 2.7923103329900445, + "grad_norm": 0.053466796875, + "learning_rate": 6.083335203749059e-06, + "loss": 0.6233, + "step": 1026 + }, + { + "epoch": 2.795056642636457, + "grad_norm": 0.04541015625, + "learning_rate": 6.057150146506732e-06, + "loss": 0.5764, + "step": 1027 + }, + { + "epoch": 2.79780295228287, + "grad_norm": 0.055908203125, + "learning_rate": 6.031007301555849e-06, + "loss": 0.4758, + "step": 1028 + }, + { + "epoch": 2.8005492619292824, + "grad_norm": 0.05224609375, + "learning_rate": 6.004906792296584e-06, + "loss": 0.4903, + "step": 1029 + }, + { + "epoch": 2.803295571575695, + "grad_norm": 0.05029296875, + "learning_rate": 5.978848741929308e-06, + "loss": 0.5788, + "step": 1030 + }, + { + "epoch": 2.8060418812221077, + "grad_norm": 0.0439453125, + "learning_rate": 5.952833273453953e-06, + "loss": 0.4795, + "step": 1031 + }, + { + "epoch": 2.8087881908685204, + "grad_norm": 0.044677734375, + "learning_rate": 5.926860509669474e-06, + "loss": 0.4128, + "step": 1032 + }, + { + "epoch": 2.811534500514933, + "grad_norm": 0.04638671875, + "learning_rate": 5.900930573173232e-06, + "loss": 0.5129, + "step": 1033 + }, + { + "epoch": 2.8142808101613457, + "grad_norm": 0.046630859375, + "learning_rate": 5.8750435863604515e-06, + "loss": 0.5751, + "step": 1034 + }, + { + "epoch": 2.8170271198077583, + "grad_norm": 0.045166015625, + "learning_rate": 5.849199671423609e-06, + "loss": 0.4868, + "step": 1035 + }, + { + "epoch": 2.819773429454171, + "grad_norm": 0.047607421875, + "learning_rate": 5.823398950351886e-06, + "loss": 0.5558, + "step": 1036 + }, + { + "epoch": 2.8225197391005836, + "grad_norm": 0.04638671875, + "learning_rate": 5.797641544930561e-06, + "loss": 0.497, + "step": 1037 + }, + { + "epoch": 2.8252660487469963, + "grad_norm": 0.045654296875, + "learning_rate": 5.771927576740476e-06, + "loss": 0.4415, + "step": 1038 + }, + { + "epoch": 2.828012358393409, + "grad_norm": 0.04736328125, + "learning_rate": 5.746257167157416e-06, + "loss": 0.5724, + "step": 1039 + }, + { + "epoch": 2.8307586680398216, + "grad_norm": 0.048583984375, + "learning_rate": 5.72063043735158e-06, + "loss": 0.5275, + "step": 1040 + }, + { + "epoch": 2.833504977686234, + "grad_norm": 0.049560546875, + "learning_rate": 5.6950475082869685e-06, + "loss": 0.4577, + "step": 1041 + }, + { + "epoch": 2.836251287332647, + "grad_norm": 0.045166015625, + "learning_rate": 5.669508500720849e-06, + "loss": 0.5401, + "step": 1042 + }, + { + "epoch": 2.8389975969790595, + "grad_norm": 0.0419921875, + "learning_rate": 5.6440135352031695e-06, + "loss": 0.4133, + "step": 1043 + }, + { + "epoch": 2.841743906625472, + "grad_norm": 0.051025390625, + "learning_rate": 5.618562732075969e-06, + "loss": 0.4756, + "step": 1044 + }, + { + "epoch": 2.844490216271885, + "grad_norm": 0.04833984375, + "learning_rate": 5.593156211472861e-06, + "loss": 0.5736, + "step": 1045 + }, + { + "epoch": 2.847236525918297, + "grad_norm": 0.044677734375, + "learning_rate": 5.567794093318403e-06, + "loss": 0.5078, + "step": 1046 + }, + { + "epoch": 2.8499828355647097, + "grad_norm": 0.050048828125, + "learning_rate": 5.542476497327591e-06, + "loss": 0.5637, + "step": 1047 + }, + { + "epoch": 2.8527291452111223, + "grad_norm": 0.054931640625, + "learning_rate": 5.517203543005242e-06, + "loss": 0.4383, + "step": 1048 + }, + { + "epoch": 2.855475454857535, + "grad_norm": 0.04541015625, + "learning_rate": 5.491975349645479e-06, + "loss": 0.5174, + "step": 1049 + }, + { + "epoch": 2.8582217645039476, + "grad_norm": 0.04248046875, + "learning_rate": 5.466792036331117e-06, + "loss": 0.4554, + "step": 1050 + }, + { + "epoch": 2.8609680741503603, + "grad_norm": 0.04736328125, + "learning_rate": 5.44165372193315e-06, + "loss": 0.537, + "step": 1051 + }, + { + "epoch": 2.863714383796773, + "grad_norm": 0.042724609375, + "learning_rate": 5.416560525110149e-06, + "loss": 0.4111, + "step": 1052 + }, + { + "epoch": 2.8664606934431855, + "grad_norm": 0.048828125, + "learning_rate": 5.391512564307737e-06, + "loss": 0.5282, + "step": 1053 + }, + { + "epoch": 2.869207003089598, + "grad_norm": 0.049072265625, + "learning_rate": 5.36650995775799e-06, + "loss": 0.5688, + "step": 1054 + }, + { + "epoch": 2.871953312736011, + "grad_norm": 0.04248046875, + "learning_rate": 5.341552823478929e-06, + "loss": 0.3545, + "step": 1055 + }, + { + "epoch": 2.8746996223824235, + "grad_norm": 0.04150390625, + "learning_rate": 5.316641279273909e-06, + "loss": 0.3866, + "step": 1056 + }, + { + "epoch": 2.877445932028836, + "grad_norm": 0.044189453125, + "learning_rate": 5.291775442731112e-06, + "loss": 0.4777, + "step": 1057 + }, + { + "epoch": 2.880192241675249, + "grad_norm": 0.0458984375, + "learning_rate": 5.266955431222949e-06, + "loss": 0.498, + "step": 1058 + }, + { + "epoch": 2.8829385513216614, + "grad_norm": 0.047119140625, + "learning_rate": 5.242181361905548e-06, + "loss": 0.4791, + "step": 1059 + }, + { + "epoch": 2.885684860968074, + "grad_norm": 0.044677734375, + "learning_rate": 5.217453351718155e-06, + "loss": 0.435, + "step": 1060 + }, + { + "epoch": 2.8884311706144867, + "grad_norm": 0.04150390625, + "learning_rate": 5.192771517382627e-06, + "loss": 0.4513, + "step": 1061 + }, + { + "epoch": 2.8911774802608994, + "grad_norm": 0.049560546875, + "learning_rate": 5.168135975402854e-06, + "loss": 0.5548, + "step": 1062 + }, + { + "epoch": 2.893923789907312, + "grad_norm": 0.04296875, + "learning_rate": 5.143546842064209e-06, + "loss": 0.4624, + "step": 1063 + }, + { + "epoch": 2.8966700995537247, + "grad_norm": 0.046875, + "learning_rate": 5.1190042334330185e-06, + "loss": 0.5901, + "step": 1064 + }, + { + "epoch": 2.8994164092001373, + "grad_norm": 0.054931640625, + "learning_rate": 5.094508265355983e-06, + "loss": 0.5007, + "step": 1065 + }, + { + "epoch": 2.90216271884655, + "grad_norm": 0.056396484375, + "learning_rate": 5.070059053459672e-06, + "loss": 0.3924, + "step": 1066 + }, + { + "epoch": 2.9049090284929626, + "grad_norm": 0.04638671875, + "learning_rate": 5.045656713149932e-06, + "loss": 0.5346, + "step": 1067 + }, + { + "epoch": 2.9076553381393753, + "grad_norm": 0.046630859375, + "learning_rate": 5.021301359611387e-06, + "loss": 0.4761, + "step": 1068 + }, + { + "epoch": 2.910401647785788, + "grad_norm": 0.04150390625, + "learning_rate": 4.996993107806853e-06, + "loss": 0.4432, + "step": 1069 + }, + { + "epoch": 2.9131479574322006, + "grad_norm": 0.04736328125, + "learning_rate": 4.972732072476831e-06, + "loss": 0.4404, + "step": 1070 + }, + { + "epoch": 2.915894267078613, + "grad_norm": 0.046875, + "learning_rate": 4.948518368138933e-06, + "loss": 0.5556, + "step": 1071 + }, + { + "epoch": 2.918640576725026, + "grad_norm": 0.048583984375, + "learning_rate": 4.9243521090873745e-06, + "loss": 0.523, + "step": 1072 + }, + { + "epoch": 2.9213868863714385, + "grad_norm": 0.046875, + "learning_rate": 4.900233409392409e-06, + "loss": 0.5381, + "step": 1073 + }, + { + "epoch": 2.924133196017851, + "grad_norm": 0.046630859375, + "learning_rate": 4.876162382899809e-06, + "loss": 0.5505, + "step": 1074 + }, + { + "epoch": 2.926879505664264, + "grad_norm": 0.197265625, + "learning_rate": 4.852139143230296e-06, + "loss": 1.176, + "step": 1075 + }, + { + "epoch": 2.9296258153106765, + "grad_norm": 0.042724609375, + "learning_rate": 4.828163803779057e-06, + "loss": 0.4169, + "step": 1076 + }, + { + "epoch": 2.932372124957089, + "grad_norm": 0.04638671875, + "learning_rate": 4.804236477715152e-06, + "loss": 0.6101, + "step": 1077 + }, + { + "epoch": 2.9351184346035017, + "grad_norm": 0.044921875, + "learning_rate": 4.780357277981027e-06, + "loss": 0.4059, + "step": 1078 + }, + { + "epoch": 2.9378647442499144, + "grad_norm": 0.0556640625, + "learning_rate": 4.7565263172919415e-06, + "loss": 0.5825, + "step": 1079 + }, + { + "epoch": 2.940611053896327, + "grad_norm": 0.0458984375, + "learning_rate": 4.732743708135479e-06, + "loss": 0.45, + "step": 1080 + }, + { + "epoch": 2.9433573635427397, + "grad_norm": 0.047119140625, + "learning_rate": 4.709009562770971e-06, + "loss": 0.4906, + "step": 1081 + }, + { + "epoch": 2.9461036731891523, + "grad_norm": 0.047119140625, + "learning_rate": 4.685323993229005e-06, + "loss": 0.5843, + "step": 1082 + }, + { + "epoch": 2.948849982835565, + "grad_norm": 0.047607421875, + "learning_rate": 4.661687111310865e-06, + "loss": 0.4679, + "step": 1083 + }, + { + "epoch": 2.951596292481977, + "grad_norm": 0.047119140625, + "learning_rate": 4.638099028588034e-06, + "loss": 0.5253, + "step": 1084 + }, + { + "epoch": 2.95434260212839, + "grad_norm": 0.0439453125, + "learning_rate": 4.614559856401635e-06, + "loss": 0.4255, + "step": 1085 + }, + { + "epoch": 2.9570889117748025, + "grad_norm": 0.044189453125, + "learning_rate": 4.591069705861935e-06, + "loss": 0.4591, + "step": 1086 + }, + { + "epoch": 2.959835221421215, + "grad_norm": 0.047119140625, + "learning_rate": 4.567628687847808e-06, + "loss": 0.4433, + "step": 1087 + }, + { + "epoch": 2.962581531067628, + "grad_norm": 0.045166015625, + "learning_rate": 4.544236913006199e-06, + "loss": 0.4516, + "step": 1088 + }, + { + "epoch": 2.9653278407140404, + "grad_norm": 0.047607421875, + "learning_rate": 4.520894491751629e-06, + "loss": 0.5292, + "step": 1089 + }, + { + "epoch": 2.968074150360453, + "grad_norm": 0.04638671875, + "learning_rate": 4.497601534265641e-06, + "loss": 0.5397, + "step": 1090 + }, + { + "epoch": 2.9708204600068657, + "grad_norm": 0.045166015625, + "learning_rate": 4.4743581504963206e-06, + "loss": 0.5584, + "step": 1091 + }, + { + "epoch": 2.9735667696532784, + "grad_norm": 0.04541015625, + "learning_rate": 4.451164450157729e-06, + "loss": 0.4986, + "step": 1092 + }, + { + "epoch": 2.9735667696532784, + "eval_loss": 0.5037021636962891, + "eval_runtime": 614.9978, + "eval_samples_per_second": 14.904, + "eval_steps_per_second": 14.904, + "step": 1092 + }, + { + "epoch": 2.976313079299691, + "grad_norm": 0.049560546875, + "learning_rate": 4.428020542729436e-06, + "loss": 0.5396, + "step": 1093 + }, + { + "epoch": 2.9790593889461037, + "grad_norm": 0.043701171875, + "learning_rate": 4.4049265374559536e-06, + "loss": 0.4538, + "step": 1094 + }, + { + "epoch": 2.9818056985925163, + "grad_norm": 0.046142578125, + "learning_rate": 4.381882543346262e-06, + "loss": 0.3633, + "step": 1095 + }, + { + "epoch": 2.984552008238929, + "grad_norm": 0.046875, + "learning_rate": 4.358888669173264e-06, + "loss": 0.5483, + "step": 1096 + }, + { + "epoch": 2.9872983178853416, + "grad_norm": 0.05029296875, + "learning_rate": 4.3359450234733e-06, + "loss": 0.3848, + "step": 1097 + }, + { + "epoch": 2.9900446275317543, + "grad_norm": 0.049560546875, + "learning_rate": 4.3130517145456e-06, + "loss": 0.6011, + "step": 1098 + }, + { + "epoch": 2.992790937178167, + "grad_norm": 0.04443359375, + "learning_rate": 4.29020885045182e-06, + "loss": 0.4609, + "step": 1099 + }, + { + "epoch": 2.9955372468245796, + "grad_norm": 0.052001953125, + "learning_rate": 4.267416539015474e-06, + "loss": 0.3615, + "step": 1100 + }, + { + "epoch": 2.998283556470992, + "grad_norm": 0.049072265625, + "learning_rate": 4.244674887821483e-06, + "loss": 0.4688, + "step": 1101 + }, + { + "epoch": 3.0010298661174044, + "grad_norm": 0.040771484375, + "learning_rate": 4.221984004215623e-06, + "loss": 0.3021, + "step": 1102 + }, + { + "epoch": 3.003776175763817, + "grad_norm": 0.05322265625, + "learning_rate": 4.199343995304044e-06, + "loss": 0.3841, + "step": 1103 + }, + { + "epoch": 3.0065224854102297, + "grad_norm": 0.055908203125, + "learning_rate": 4.176754967952749e-06, + "loss": 0.4316, + "step": 1104 + }, + { + "epoch": 3.0092687950566424, + "grad_norm": 0.046142578125, + "learning_rate": 4.154217028787101e-06, + "loss": 0.5092, + "step": 1105 + }, + { + "epoch": 3.012015104703055, + "grad_norm": 0.050537109375, + "learning_rate": 4.131730284191321e-06, + "loss": 0.4633, + "step": 1106 + }, + { + "epoch": 3.0020597322348093, + "grad_norm": 0.04541015625, + "learning_rate": 4.109294840307966e-06, + "loss": 0.4454, + "step": 1107 + }, + { + "epoch": 3.004806041881222, + "grad_norm": 0.046142578125, + "learning_rate": 4.086910803037453e-06, + "loss": 0.4654, + "step": 1108 + }, + { + "epoch": 3.0075523515276346, + "grad_norm": 0.045654296875, + "learning_rate": 4.064578278037542e-06, + "loss": 0.4323, + "step": 1109 + }, + { + "epoch": 3.010298661174047, + "grad_norm": 0.04248046875, + "learning_rate": 4.042297370722851e-06, + "loss": 0.4796, + "step": 1110 + }, + { + "epoch": 3.01304497082046, + "grad_norm": 0.04345703125, + "learning_rate": 4.0200681862643355e-06, + "loss": 0.4253, + "step": 1111 + }, + { + "epoch": 3.0157912804668725, + "grad_norm": 0.04345703125, + "learning_rate": 3.9978908295888285e-06, + "loss": 0.4095, + "step": 1112 + }, + { + "epoch": 3.018537590113285, + "grad_norm": 0.044921875, + "learning_rate": 3.975765405378502e-06, + "loss": 0.4575, + "step": 1113 + }, + { + "epoch": 3.021283899759698, + "grad_norm": 0.04248046875, + "learning_rate": 3.953692018070417e-06, + "loss": 0.4556, + "step": 1114 + }, + { + "epoch": 3.0240302094061104, + "grad_norm": 0.047119140625, + "learning_rate": 3.931670771855986e-06, + "loss": 0.5403, + "step": 1115 + }, + { + "epoch": 3.026776519052523, + "grad_norm": 0.045166015625, + "learning_rate": 3.909701770680524e-06, + "loss": 0.4718, + "step": 1116 + }, + { + "epoch": 3.0295228286989357, + "grad_norm": 0.0419921875, + "learning_rate": 3.887785118242722e-06, + "loss": 0.4633, + "step": 1117 + }, + { + "epoch": 3.0322691383453484, + "grad_norm": 0.045654296875, + "learning_rate": 3.8659209179941804e-06, + "loss": 0.5703, + "step": 1118 + }, + { + "epoch": 3.035015447991761, + "grad_norm": 0.046630859375, + "learning_rate": 3.844109273138914e-06, + "loss": 0.5709, + "step": 1119 + }, + { + "epoch": 3.0377617576381737, + "grad_norm": 0.046875, + "learning_rate": 3.822350286632867e-06, + "loss": 0.4592, + "step": 1120 + }, + { + "epoch": 3.0405080672845863, + "grad_norm": 0.044677734375, + "learning_rate": 3.8006440611834103e-06, + "loss": 0.4843, + "step": 1121 + }, + { + "epoch": 3.043254376930999, + "grad_norm": 0.04833984375, + "learning_rate": 3.7789906992488875e-06, + "loss": 0.4962, + "step": 1122 + }, + { + "epoch": 3.0460006865774116, + "grad_norm": 0.0478515625, + "learning_rate": 3.7573903030381003e-06, + "loss": 0.5629, + "step": 1123 + }, + { + "epoch": 3.0487469962238243, + "grad_norm": 0.04833984375, + "learning_rate": 3.7358429745098525e-06, + "loss": 0.5103, + "step": 1124 + }, + { + "epoch": 3.051493305870237, + "grad_norm": 0.044189453125, + "learning_rate": 3.7143488153724454e-06, + "loss": 0.4677, + "step": 1125 + }, + { + "epoch": 3.0542396155166496, + "grad_norm": 0.042724609375, + "learning_rate": 3.692907927083217e-06, + "loss": 0.423, + "step": 1126 + }, + { + "epoch": 3.0569859251630622, + "grad_norm": 0.048095703125, + "learning_rate": 3.6715204108480473e-06, + "loss": 0.4903, + "step": 1127 + }, + { + "epoch": 3.059732234809475, + "grad_norm": 0.1494140625, + "learning_rate": 3.6501863676208984e-06, + "loss": 0.9496, + "step": 1128 + }, + { + "epoch": 3.0624785444558875, + "grad_norm": 0.044677734375, + "learning_rate": 3.6289058981033136e-06, + "loss": 0.4253, + "step": 1129 + }, + { + "epoch": 3.0652248541023, + "grad_norm": 0.04736328125, + "learning_rate": 3.607679102743968e-06, + "loss": 0.5686, + "step": 1130 + }, + { + "epoch": 3.067971163748713, + "grad_norm": 0.04638671875, + "learning_rate": 3.586506081738181e-06, + "loss": 0.5278, + "step": 1131 + }, + { + "epoch": 3.0707174733951255, + "grad_norm": 0.043212890625, + "learning_rate": 3.5653869350274357e-06, + "loss": 0.4348, + "step": 1132 + }, + { + "epoch": 3.073463783041538, + "grad_norm": 0.050537109375, + "learning_rate": 3.5443217622989294e-06, + "loss": 0.5263, + "step": 1133 + }, + { + "epoch": 3.0762100926879508, + "grad_norm": 0.046875, + "learning_rate": 3.5233106629850736e-06, + "loss": 0.5263, + "step": 1134 + }, + { + "epoch": 3.0789564023343634, + "grad_norm": 0.04443359375, + "learning_rate": 3.5023537362630605e-06, + "loss": 0.4807, + "step": 1135 + }, + { + "epoch": 3.0817027119807756, + "grad_norm": 0.04638671875, + "learning_rate": 3.4814510810543553e-06, + "loss": 0.6053, + "step": 1136 + }, + { + "epoch": 3.0844490216271883, + "grad_norm": 0.05126953125, + "learning_rate": 3.46060279602427e-06, + "loss": 0.391, + "step": 1137 + }, + { + "epoch": 3.087195331273601, + "grad_norm": 0.0439453125, + "learning_rate": 3.439808979581455e-06, + "loss": 0.4525, + "step": 1138 + }, + { + "epoch": 3.0899416409200136, + "grad_norm": 0.04833984375, + "learning_rate": 3.4190697298774772e-06, + "loss": 0.532, + "step": 1139 + }, + { + "epoch": 3.092687950566426, + "grad_norm": 0.045166015625, + "learning_rate": 3.398385144806318e-06, + "loss": 0.5811, + "step": 1140 + }, + { + "epoch": 3.095434260212839, + "grad_norm": 0.04638671875, + "learning_rate": 3.3777553220039455e-06, + "loss": 0.5059, + "step": 1141 + }, + { + "epoch": 3.0981805698592515, + "grad_norm": 0.04638671875, + "learning_rate": 3.357180358847822e-06, + "loss": 0.4974, + "step": 1142 + }, + { + "epoch": 3.100926879505664, + "grad_norm": 0.047607421875, + "learning_rate": 3.3366603524564736e-06, + "loss": 0.52, + "step": 1143 + }, + { + "epoch": 3.103673189152077, + "grad_norm": 0.04296875, + "learning_rate": 3.316195399689007e-06, + "loss": 0.4295, + "step": 1144 + }, + { + "epoch": 3.1064194987984894, + "grad_norm": 0.0498046875, + "learning_rate": 3.2957855971446737e-06, + "loss": 0.4381, + "step": 1145 + }, + { + "epoch": 3.109165808444902, + "grad_norm": 0.048095703125, + "learning_rate": 3.2754310411623888e-06, + "loss": 0.4879, + "step": 1146 + }, + { + "epoch": 3.1119121180913147, + "grad_norm": 0.04296875, + "learning_rate": 3.255131827820311e-06, + "loss": 0.4444, + "step": 1147 + }, + { + "epoch": 3.1146584277377274, + "grad_norm": 0.043212890625, + "learning_rate": 3.2348880529353484e-06, + "loss": 0.4969, + "step": 1148 + }, + { + "epoch": 3.11740473738414, + "grad_norm": 0.047607421875, + "learning_rate": 3.21469981206274e-06, + "loss": 0.5399, + "step": 1149 + }, + { + "epoch": 3.1201510470305527, + "grad_norm": 0.04638671875, + "learning_rate": 3.194567200495593e-06, + "loss": 0.3839, + "step": 1150 + }, + { + "epoch": 3.1228973566769653, + "grad_norm": 0.046875, + "learning_rate": 3.1744903132644197e-06, + "loss": 0.5803, + "step": 1151 + }, + { + "epoch": 3.125643666323378, + "grad_norm": 0.048095703125, + "learning_rate": 3.1544692451367147e-06, + "loss": 0.5422, + "step": 1152 + }, + { + "epoch": 3.1283899759697906, + "grad_norm": 0.04443359375, + "learning_rate": 3.1345040906164787e-06, + "loss": 0.4212, + "step": 1153 + }, + { + "epoch": 3.1311362856162033, + "grad_norm": 0.04638671875, + "learning_rate": 3.1145949439438054e-06, + "loss": 0.4019, + "step": 1154 + }, + { + "epoch": 3.133882595262616, + "grad_norm": 0.04443359375, + "learning_rate": 3.094741899094399e-06, + "loss": 0.3445, + "step": 1155 + }, + { + "epoch": 3.1366289049090286, + "grad_norm": 0.04248046875, + "learning_rate": 3.0749450497791693e-06, + "loss": 0.518, + "step": 1156 + }, + { + "epoch": 3.139375214555441, + "grad_norm": 0.04833984375, + "learning_rate": 3.055204489443753e-06, + "loss": 0.4594, + "step": 1157 + }, + { + "epoch": 3.142121524201854, + "grad_norm": 0.0458984375, + "learning_rate": 3.0355203112681063e-06, + "loss": 0.5042, + "step": 1158 + }, + { + "epoch": 3.1448678338482665, + "grad_norm": 0.041015625, + "learning_rate": 3.0158926081660338e-06, + "loss": 0.4187, + "step": 1159 + }, + { + "epoch": 3.147614143494679, + "grad_norm": 0.047119140625, + "learning_rate": 2.9963214727847773e-06, + "loss": 0.5166, + "step": 1160 + }, + { + "epoch": 3.150360453141092, + "grad_norm": 0.04443359375, + "learning_rate": 2.976806997504555e-06, + "loss": 0.4656, + "step": 1161 + }, + { + "epoch": 3.1531067627875045, + "grad_norm": 0.041748046875, + "learning_rate": 2.9573492744381475e-06, + "loss": 0.4555, + "step": 1162 + }, + { + "epoch": 3.155853072433917, + "grad_norm": 0.051513671875, + "learning_rate": 2.9379483954304386e-06, + "loss": 0.5357, + "step": 1163 + }, + { + "epoch": 3.1585993820803298, + "grad_norm": 0.1708984375, + "learning_rate": 2.9186044520580145e-06, + "loss": 1.2069, + "step": 1164 + }, + { + "epoch": 3.1613456917267424, + "grad_norm": 0.04736328125, + "learning_rate": 2.8993175356286934e-06, + "loss": 0.5469, + "step": 1165 + }, + { + "epoch": 3.164092001373155, + "grad_norm": 0.045654296875, + "learning_rate": 2.8800877371811245e-06, + "loss": 0.5138, + "step": 1166 + }, + { + "epoch": 3.1668383110195673, + "grad_norm": 0.042724609375, + "learning_rate": 2.8609151474843377e-06, + "loss": 0.486, + "step": 1167 + }, + { + "epoch": 3.16958462066598, + "grad_norm": 0.048095703125, + "learning_rate": 2.841799857037337e-06, + "loss": 0.5253, + "step": 1168 + }, + { + "epoch": 3.1723309303123925, + "grad_norm": 0.04443359375, + "learning_rate": 2.822741956068648e-06, + "loss": 0.4386, + "step": 1169 + }, + { + "epoch": 3.175077239958805, + "grad_norm": 0.04541015625, + "learning_rate": 2.803741534535916e-06, + "loss": 0.4587, + "step": 1170 + }, + { + "epoch": 3.177823549605218, + "grad_norm": 0.047119140625, + "learning_rate": 2.7847986821254605e-06, + "loss": 0.445, + "step": 1171 + }, + { + "epoch": 3.1805698592516305, + "grad_norm": 0.047119140625, + "learning_rate": 2.7659134882518715e-06, + "loss": 0.532, + "step": 1172 + }, + { + "epoch": 3.183316168898043, + "grad_norm": 0.045654296875, + "learning_rate": 2.747086042057566e-06, + "loss": 0.5697, + "step": 1173 + }, + { + "epoch": 3.186062478544456, + "grad_norm": 0.047119140625, + "learning_rate": 2.7283164324123904e-06, + "loss": 0.5351, + "step": 1174 + }, + { + "epoch": 3.1888087881908684, + "grad_norm": 0.04833984375, + "learning_rate": 2.7096047479131848e-06, + "loss": 0.5825, + "step": 1175 + }, + { + "epoch": 3.191555097837281, + "grad_norm": 0.044189453125, + "learning_rate": 2.6909510768833606e-06, + "loss": 0.4346, + "step": 1176 + }, + { + "epoch": 3.1943014074836937, + "grad_norm": 0.04931640625, + "learning_rate": 2.6723555073725125e-06, + "loss": 0.4932, + "step": 1177 + }, + { + "epoch": 3.1970477171301064, + "grad_norm": 0.0458984375, + "learning_rate": 2.653818127155959e-06, + "loss": 0.5453, + "step": 1178 + }, + { + "epoch": 3.199794026776519, + "grad_norm": 0.049072265625, + "learning_rate": 2.635339023734374e-06, + "loss": 0.5351, + "step": 1179 + }, + { + "epoch": 3.2025403364229317, + "grad_norm": 0.04833984375, + "learning_rate": 2.6169182843333334e-06, + "loss": 0.4668, + "step": 1180 + }, + { + "epoch": 3.2052866460693443, + "grad_norm": 0.04833984375, + "learning_rate": 2.5985559959029347e-06, + "loss": 0.4176, + "step": 1181 + }, + { + "epoch": 3.208032955715757, + "grad_norm": 0.04638671875, + "learning_rate": 2.5802522451173627e-06, + "loss": 0.4717, + "step": 1182 + }, + { + "epoch": 3.2107792653621696, + "grad_norm": 0.047119140625, + "learning_rate": 2.562007118374504e-06, + "loss": 0.5227, + "step": 1183 + }, + { + "epoch": 3.2107792653621696, + "eval_loss": 0.5037118196487427, + "eval_runtime": 639.2982, + "eval_samples_per_second": 14.338, + "eval_steps_per_second": 14.338, + "step": 1183 + }, + { + "epoch": 3.2135255750085823, + "grad_norm": 0.04248046875, + "learning_rate": 2.543820701795511e-06, + "loss": 0.4845, + "step": 1184 + }, + { + "epoch": 3.216271884654995, + "grad_norm": 0.1611328125, + "learning_rate": 2.5256930812244273e-06, + "loss": 1.1749, + "step": 1185 + }, + { + "epoch": 3.2190181943014076, + "grad_norm": 0.1923828125, + "learning_rate": 2.507624342227748e-06, + "loss": 1.1636, + "step": 1186 + }, + { + "epoch": 3.22176450394782, + "grad_norm": 0.047607421875, + "learning_rate": 2.4896145700940524e-06, + "loss": 0.508, + "step": 1187 + }, + { + "epoch": 3.224510813594233, + "grad_norm": 0.19140625, + "learning_rate": 2.471663849833567e-06, + "loss": 1.1039, + "step": 1188 + }, + { + "epoch": 3.2272571232406455, + "grad_norm": 0.044921875, + "learning_rate": 2.453772266177791e-06, + "loss": 0.5217, + "step": 1189 + }, + { + "epoch": 3.230003432887058, + "grad_norm": 0.04833984375, + "learning_rate": 2.435939903579075e-06, + "loss": 0.5168, + "step": 1190 + }, + { + "epoch": 3.232749742533471, + "grad_norm": 0.05615234375, + "learning_rate": 2.4181668462102478e-06, + "loss": 0.6517, + "step": 1191 + }, + { + "epoch": 3.2354960521798835, + "grad_norm": 0.043212890625, + "learning_rate": 2.4004531779641835e-06, + "loss": 0.439, + "step": 1192 + }, + { + "epoch": 3.238242361826296, + "grad_norm": 0.047119140625, + "learning_rate": 2.382798982453444e-06, + "loss": 0.4339, + "step": 1193 + }, + { + "epoch": 3.2409886714727087, + "grad_norm": 0.044677734375, + "learning_rate": 2.3652043430098624e-06, + "loss": 0.493, + "step": 1194 + }, + { + "epoch": 3.243734981119121, + "grad_norm": 0.046142578125, + "learning_rate": 2.3476693426841417e-06, + "loss": 0.389, + "step": 1195 + }, + { + "epoch": 3.2464812907655336, + "grad_norm": 0.0439453125, + "learning_rate": 2.3301940642454932e-06, + "loss": 0.5022, + "step": 1196 + }, + { + "epoch": 3.2492276004119462, + "grad_norm": 0.047119140625, + "learning_rate": 2.3127785901812093e-06, + "loss": 0.5275, + "step": 1197 + }, + { + "epoch": 3.251973910058359, + "grad_norm": 0.049560546875, + "learning_rate": 2.2954230026963092e-06, + "loss": 0.5681, + "step": 1198 + }, + { + "epoch": 3.2547202197047715, + "grad_norm": 0.04150390625, + "learning_rate": 2.278127383713117e-06, + "loss": 0.4153, + "step": 1199 + }, + { + "epoch": 3.257466529351184, + "grad_norm": 0.046142578125, + "learning_rate": 2.2608918148709057e-06, + "loss": 0.4188, + "step": 1200 + }, + { + "epoch": 3.260212838997597, + "grad_norm": 0.0458984375, + "learning_rate": 2.2437163775254863e-06, + "loss": 0.4721, + "step": 1201 + }, + { + "epoch": 3.2629591486440095, + "grad_norm": 0.048828125, + "learning_rate": 2.22660115274885e-06, + "loss": 0.443, + "step": 1202 + }, + { + "epoch": 3.265705458290422, + "grad_norm": 0.0478515625, + "learning_rate": 2.2095462213287526e-06, + "loss": 0.4396, + "step": 1203 + }, + { + "epoch": 3.268451767936835, + "grad_norm": 0.042724609375, + "learning_rate": 2.19255166376837e-06, + "loss": 0.4431, + "step": 1204 + }, + { + "epoch": 3.2711980775832474, + "grad_norm": 0.0517578125, + "learning_rate": 2.175617560285883e-06, + "loss": 0.5235, + "step": 1205 + }, + { + "epoch": 3.27394438722966, + "grad_norm": 0.03955078125, + "learning_rate": 2.158743990814128e-06, + "loss": 0.357, + "step": 1206 + }, + { + "epoch": 3.2766906968760727, + "grad_norm": 0.1318359375, + "learning_rate": 2.1419310350001997e-06, + "loss": 1.1825, + "step": 1207 + }, + { + "epoch": 3.2794370065224854, + "grad_norm": 0.049072265625, + "learning_rate": 2.1251787722050854e-06, + "loss": 0.5559, + "step": 1208 + }, + { + "epoch": 3.282183316168898, + "grad_norm": 0.045654296875, + "learning_rate": 2.1084872815032885e-06, + "loss": 0.529, + "step": 1209 + }, + { + "epoch": 3.2849296258153107, + "grad_norm": 0.047119140625, + "learning_rate": 2.0918566416824557e-06, + "loss": 0.5814, + "step": 1210 + }, + { + "epoch": 3.2876759354617233, + "grad_norm": 0.04736328125, + "learning_rate": 2.075286931242995e-06, + "loss": 0.4434, + "step": 1211 + }, + { + "epoch": 3.290422245108136, + "grad_norm": 0.044677734375, + "learning_rate": 2.058778228397726e-06, + "loss": 0.4634, + "step": 1212 + }, + { + "epoch": 3.2931685547545486, + "grad_norm": 0.0478515625, + "learning_rate": 2.0423306110714846e-06, + "loss": 0.4865, + "step": 1213 + }, + { + "epoch": 3.2959148644009613, + "grad_norm": 0.052978515625, + "learning_rate": 2.0259441569007836e-06, + "loss": 0.4826, + "step": 1214 + }, + { + "epoch": 3.298661174047374, + "grad_norm": 0.054931640625, + "learning_rate": 2.0096189432334194e-06, + "loss": 0.399, + "step": 1215 + }, + { + "epoch": 3.3014074836937866, + "grad_norm": 0.053466796875, + "learning_rate": 1.9933550471281315e-06, + "loss": 0.5276, + "step": 1216 + }, + { + "epoch": 3.304153793340199, + "grad_norm": 0.04345703125, + "learning_rate": 1.9771525453542123e-06, + "loss": 0.4359, + "step": 1217 + }, + { + "epoch": 3.306900102986612, + "grad_norm": 0.043212890625, + "learning_rate": 1.9610115143911696e-06, + "loss": 0.4383, + "step": 1218 + }, + { + "epoch": 3.3096464126330245, + "grad_norm": 0.044189453125, + "learning_rate": 1.9449320304283545e-06, + "loss": 0.4844, + "step": 1219 + }, + { + "epoch": 3.312392722279437, + "grad_norm": 0.0517578125, + "learning_rate": 1.928914169364595e-06, + "loss": 0.4703, + "step": 1220 + }, + { + "epoch": 3.31513903192585, + "grad_norm": 0.044189453125, + "learning_rate": 1.9129580068078556e-06, + "loss": 0.5235, + "step": 1221 + }, + { + "epoch": 3.3178853415722624, + "grad_norm": 0.046630859375, + "learning_rate": 1.8970636180748542e-06, + "loss": 0.4168, + "step": 1222 + }, + { + "epoch": 3.320631651218675, + "grad_norm": 0.048095703125, + "learning_rate": 1.8812310781907416e-06, + "loss": 0.6157, + "step": 1223 + }, + { + "epoch": 3.3233779608650877, + "grad_norm": 0.046630859375, + "learning_rate": 1.8654604618887095e-06, + "loss": 0.5189, + "step": 1224 + }, + { + "epoch": 3.3261242705115004, + "grad_norm": 0.0439453125, + "learning_rate": 1.8497518436096727e-06, + "loss": 0.5767, + "step": 1225 + }, + { + "epoch": 3.328870580157913, + "grad_norm": 0.04638671875, + "learning_rate": 1.8341052975018856e-06, + "loss": 0.4845, + "step": 1226 + }, + { + "epoch": 3.3316168898043257, + "grad_norm": 0.047119140625, + "learning_rate": 1.8185208974206202e-06, + "loss": 0.5266, + "step": 1227 + }, + { + "epoch": 3.334363199450738, + "grad_norm": 0.0439453125, + "learning_rate": 1.8029987169277962e-06, + "loss": 0.4485, + "step": 1228 + }, + { + "epoch": 3.3371095090971505, + "grad_norm": 0.059814453125, + "learning_rate": 1.7875388292916516e-06, + "loss": 0.5584, + "step": 1229 + }, + { + "epoch": 3.339855818743563, + "grad_norm": 0.0400390625, + "learning_rate": 1.7721413074863769e-06, + "loss": 0.4716, + "step": 1230 + }, + { + "epoch": 3.342602128389976, + "grad_norm": 0.04736328125, + "learning_rate": 1.7568062241917937e-06, + "loss": 0.5874, + "step": 1231 + }, + { + "epoch": 3.3453484380363885, + "grad_norm": 0.043701171875, + "learning_rate": 1.7415336517929886e-06, + "loss": 0.384, + "step": 1232 + }, + { + "epoch": 3.348094747682801, + "grad_norm": 0.04541015625, + "learning_rate": 1.726323662379992e-06, + "loss": 0.5479, + "step": 1233 + }, + { + "epoch": 3.350841057329214, + "grad_norm": 0.045654296875, + "learning_rate": 1.7111763277474179e-06, + "loss": 0.4631, + "step": 1234 + }, + { + "epoch": 3.3535873669756264, + "grad_norm": 0.049072265625, + "learning_rate": 1.6960917193941478e-06, + "loss": 0.5318, + "step": 1235 + }, + { + "epoch": 3.356333676622039, + "grad_norm": 0.04345703125, + "learning_rate": 1.6810699085229685e-06, + "loss": 0.4914, + "step": 1236 + }, + { + "epoch": 3.3590799862684517, + "grad_norm": 0.049072265625, + "learning_rate": 1.6661109660402563e-06, + "loss": 0.5059, + "step": 1237 + }, + { + "epoch": 3.3618262959148644, + "grad_norm": 0.039794921875, + "learning_rate": 1.6512149625556371e-06, + "loss": 0.3797, + "step": 1238 + }, + { + "epoch": 3.364572605561277, + "grad_norm": 0.045166015625, + "learning_rate": 1.6363819683816372e-06, + "loss": 0.5895, + "step": 1239 + }, + { + "epoch": 3.3673189152076897, + "grad_norm": 0.045654296875, + "learning_rate": 1.6216120535333818e-06, + "loss": 0.5115, + "step": 1240 + }, + { + "epoch": 3.3700652248541023, + "grad_norm": 0.049560546875, + "learning_rate": 1.6069052877282292e-06, + "loss": 0.414, + "step": 1241 + }, + { + "epoch": 3.372811534500515, + "grad_norm": 0.048583984375, + "learning_rate": 1.5922617403854768e-06, + "loss": 0.5862, + "step": 1242 + }, + { + "epoch": 3.3755578441469276, + "grad_norm": 0.05078125, + "learning_rate": 1.5776814806260032e-06, + "loss": 0.5591, + "step": 1243 + }, + { + "epoch": 3.3783041537933403, + "grad_norm": 0.048583984375, + "learning_rate": 1.563164577271965e-06, + "loss": 0.5291, + "step": 1244 + }, + { + "epoch": 3.381050463439753, + "grad_norm": 0.047119140625, + "learning_rate": 1.5487110988464526e-06, + "loss": 0.5359, + "step": 1245 + }, + { + "epoch": 3.3837967730861656, + "grad_norm": 0.04345703125, + "learning_rate": 1.5343211135731894e-06, + "loss": 0.4184, + "step": 1246 + }, + { + "epoch": 3.386543082732578, + "grad_norm": 0.04736328125, + "learning_rate": 1.5199946893761785e-06, + "loss": 0.4923, + "step": 1247 + }, + { + "epoch": 3.389289392378991, + "grad_norm": 0.046630859375, + "learning_rate": 1.5057318938794195e-06, + "loss": 0.4915, + "step": 1248 + }, + { + "epoch": 3.3920357020254035, + "grad_norm": 0.050048828125, + "learning_rate": 1.4915327944065544e-06, + "loss": 0.5856, + "step": 1249 + }, + { + "epoch": 3.394782011671816, + "grad_norm": 0.048583984375, + "learning_rate": 1.4773974579805783e-06, + "loss": 0.5155, + "step": 1250 + }, + { + "epoch": 3.397528321318229, + "grad_norm": 0.046142578125, + "learning_rate": 1.463325951323496e-06, + "loss": 0.5409, + "step": 1251 + }, + { + "epoch": 3.4002746309646414, + "grad_norm": 0.045166015625, + "learning_rate": 1.4493183408560361e-06, + "loss": 0.4235, + "step": 1252 + }, + { + "epoch": 3.403020940611054, + "grad_norm": 0.0458984375, + "learning_rate": 1.4353746926973077e-06, + "loss": 0.4903, + "step": 1253 + }, + { + "epoch": 3.4057672502574663, + "grad_norm": 0.046142578125, + "learning_rate": 1.421495072664522e-06, + "loss": 0.5838, + "step": 1254 + }, + { + "epoch": 3.408513559903879, + "grad_norm": 0.041748046875, + "learning_rate": 1.407679546272641e-06, + "loss": 0.4681, + "step": 1255 + }, + { + "epoch": 3.4112598695502916, + "grad_norm": 0.04345703125, + "learning_rate": 1.3939281787341102e-06, + "loss": 0.5009, + "step": 1256 + }, + { + "epoch": 3.4140061791967042, + "grad_norm": 0.04638671875, + "learning_rate": 1.3802410349585153e-06, + "loss": 0.5052, + "step": 1257 + }, + { + "epoch": 3.416752488843117, + "grad_norm": 0.0498046875, + "learning_rate": 1.3666181795523004e-06, + "loss": 0.6236, + "step": 1258 + }, + { + "epoch": 3.4194987984895295, + "grad_norm": 0.053466796875, + "learning_rate": 1.3530596768184478e-06, + "loss": 0.5415, + "step": 1259 + }, + { + "epoch": 3.422245108135942, + "grad_norm": 0.04443359375, + "learning_rate": 1.3395655907561871e-06, + "loss": 0.4757, + "step": 1260 + }, + { + "epoch": 3.424991417782355, + "grad_norm": 0.045166015625, + "learning_rate": 1.3261359850606792e-06, + "loss": 0.4161, + "step": 1261 + }, + { + "epoch": 3.4277377274287675, + "grad_norm": 0.048095703125, + "learning_rate": 1.3127709231227281e-06, + "loss": 0.5019, + "step": 1262 + }, + { + "epoch": 3.43048403707518, + "grad_norm": 0.04248046875, + "learning_rate": 1.2994704680284786e-06, + "loss": 0.4118, + "step": 1263 + }, + { + "epoch": 3.4332303467215928, + "grad_norm": 0.06298828125, + "learning_rate": 1.2862346825591075e-06, + "loss": 0.495, + "step": 1264 + }, + { + "epoch": 3.4359766563680054, + "grad_norm": 0.049072265625, + "learning_rate": 1.2730636291905462e-06, + "loss": 0.4988, + "step": 1265 + }, + { + "epoch": 3.438722966014418, + "grad_norm": 0.04345703125, + "learning_rate": 1.2599573700931666e-06, + "loss": 0.3351, + "step": 1266 + }, + { + "epoch": 3.4414692756608307, + "grad_norm": 0.051025390625, + "learning_rate": 1.2469159671315072e-06, + "loss": 0.5976, + "step": 1267 + }, + { + "epoch": 3.4442155853072434, + "grad_norm": 0.04541015625, + "learning_rate": 1.2339394818639583e-06, + "loss": 0.5267, + "step": 1268 + }, + { + "epoch": 3.446961894953656, + "grad_norm": 0.05126953125, + "learning_rate": 1.2210279755424981e-06, + "loss": 0.4385, + "step": 1269 + }, + { + "epoch": 3.4497082046000687, + "grad_norm": 0.04150390625, + "learning_rate": 1.2081815091123755e-06, + "loss": 0.4201, + "step": 1270 + }, + { + "epoch": 3.4524545142464813, + "grad_norm": 0.049072265625, + "learning_rate": 1.1954001432118482e-06, + "loss": 0.5371, + "step": 1271 + }, + { + "epoch": 3.455200823892894, + "grad_norm": 0.04296875, + "learning_rate": 1.1826839381718752e-06, + "loss": 0.4782, + "step": 1272 + }, + { + "epoch": 3.4579471335393066, + "grad_norm": 0.046630859375, + "learning_rate": 1.1700329540158473e-06, + "loss": 0.4626, + "step": 1273 + }, + { + "epoch": 3.4606934431857193, + "grad_norm": 0.048095703125, + "learning_rate": 1.157447250459292e-06, + "loss": 0.5723, + "step": 1274 + }, + { + "epoch": 3.4606934431857193, + "eval_loss": 0.5037119388580322, + "eval_runtime": 636.191, + "eval_samples_per_second": 14.408, + "eval_steps_per_second": 14.408, + "step": 1274 + } + ], + "logging_steps": 1, + "max_steps": 1456, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 91, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.582524910880162e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}