diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" --- "a/last-checkpoint/trainer_state.json" +++ "b/last-checkpoint/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.2499897072748981, + "epoch": 0.4999794145497962, "eval_steps": 759, - "global_step": 759, + "global_step": 1518, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -5336,6 +5336,5327 @@ "eval_samples_per_second": 3.215, "eval_steps_per_second": 1.608, "step": 759 + }, + { + "epoch": 0.2503190744781588, + "grad_norm": 2.326564073562622, + "learning_rate": 8.668665207923482e-05, + "loss": 2.0632, + "step": 760 + }, + { + "epoch": 0.2506484416814196, + "grad_norm": 2.884378671646118, + "learning_rate": 8.665088970082331e-05, + "loss": 2.5123, + "step": 761 + }, + { + "epoch": 0.2509778088846803, + "grad_norm": 2.3617143630981445, + "learning_rate": 8.661508675244801e-05, + "loss": 2.1108, + "step": 762 + }, + { + "epoch": 0.25130717608794106, + "grad_norm": 2.4837584495544434, + "learning_rate": 8.657924327374025e-05, + "loss": 2.3088, + "step": 763 + }, + { + "epoch": 0.2516365432912018, + "grad_norm": 2.4218530654907227, + "learning_rate": 8.654335930437627e-05, + "loss": 1.8391, + "step": 764 + }, + { + "epoch": 0.25196591049446254, + "grad_norm": 2.5416500568389893, + "learning_rate": 8.650743488407709e-05, + "loss": 2.1901, + "step": 765 + }, + { + "epoch": 0.25229527769772325, + "grad_norm": 2.6304333209991455, + "learning_rate": 8.647147005260854e-05, + "loss": 2.4257, + "step": 766 + }, + { + "epoch": 0.25262464490098396, + "grad_norm": 3.04967999458313, + "learning_rate": 8.643546484978115e-05, + "loss": 2.1485, + "step": 767 + }, + { + "epoch": 0.25295401210424473, + "grad_norm": 2.736220359802246, + "learning_rate": 8.639941931545017e-05, + "loss": 2.0484, + "step": 768 + }, + { + "epoch": 0.25328337930750544, + "grad_norm": 2.4498915672302246, + "learning_rate": 8.636333348951546e-05, + "loss": 1.9845, + "step": 769 + }, + { + "epoch": 0.2536127465107662, + "grad_norm": 3.074000597000122, + "learning_rate": 8.63272074119215e-05, + "loss": 2.0217, + "step": 770 + }, + { + "epoch": 0.2539421137140269, + "grad_norm": 2.7875514030456543, + "learning_rate": 8.629104112265735e-05, + "loss": 2.1935, + "step": 771 + }, + { + "epoch": 0.2542714809172877, + "grad_norm": 2.7638540267944336, + "learning_rate": 8.625483466175652e-05, + "loss": 2.1215, + "step": 772 + }, + { + "epoch": 0.2546008481205484, + "grad_norm": 2.99078106880188, + "learning_rate": 8.621858806929705e-05, + "loss": 1.8759, + "step": 773 + }, + { + "epoch": 0.2549302153238091, + "grad_norm": 3.0909841060638428, + "learning_rate": 8.618230138540136e-05, + "loss": 1.8021, + "step": 774 + }, + { + "epoch": 0.2552595825270699, + "grad_norm": 3.437960147857666, + "learning_rate": 8.614597465023626e-05, + "loss": 1.9589, + "step": 775 + }, + { + "epoch": 0.2555889497303306, + "grad_norm": 2.2460529804229736, + "learning_rate": 8.61096079040129e-05, + "loss": 2.2812, + "step": 776 + }, + { + "epoch": 0.25591831693359135, + "grad_norm": 2.4062023162841797, + "learning_rate": 8.607320118698674e-05, + "loss": 2.1588, + "step": 777 + }, + { + "epoch": 0.25624768413685206, + "grad_norm": 3.011057138442993, + "learning_rate": 8.603675453945742e-05, + "loss": 2.5091, + "step": 778 + }, + { + "epoch": 0.2565770513401128, + "grad_norm": 2.4397964477539062, + "learning_rate": 8.600026800176885e-05, + "loss": 2.4397, + "step": 779 + }, + { + "epoch": 0.25690641854337354, + "grad_norm": 2.548110246658325, + "learning_rate": 8.596374161430907e-05, + "loss": 2.4302, + "step": 780 + }, + { + "epoch": 0.2572357857466343, + "grad_norm": 2.477574586868286, + "learning_rate": 8.592717541751021e-05, + "loss": 2.2464, + "step": 781 + }, + { + "epoch": 0.257565152949895, + "grad_norm": 2.5044217109680176, + "learning_rate": 8.58905694518485e-05, + "loss": 2.13, + "step": 782 + }, + { + "epoch": 0.2578945201531557, + "grad_norm": 2.3912570476531982, + "learning_rate": 8.585392375784418e-05, + "loss": 2.3024, + "step": 783 + }, + { + "epoch": 0.2582238873564165, + "grad_norm": 2.51187801361084, + "learning_rate": 8.581723837606144e-05, + "loss": 2.3613, + "step": 784 + }, + { + "epoch": 0.2585532545596772, + "grad_norm": 2.4491524696350098, + "learning_rate": 8.578051334710844e-05, + "loss": 2.0575, + "step": 785 + }, + { + "epoch": 0.25888262176293797, + "grad_norm": 2.263449192047119, + "learning_rate": 8.574374871163721e-05, + "loss": 2.0833, + "step": 786 + }, + { + "epoch": 0.2592119889661987, + "grad_norm": 2.714989423751831, + "learning_rate": 8.570694451034362e-05, + "loss": 2.2463, + "step": 787 + }, + { + "epoch": 0.25954135616945945, + "grad_norm": 2.5804946422576904, + "learning_rate": 8.567010078396735e-05, + "loss": 2.2576, + "step": 788 + }, + { + "epoch": 0.25987072337272016, + "grad_norm": 2.759988784790039, + "learning_rate": 8.56332175732918e-05, + "loss": 2.2353, + "step": 789 + }, + { + "epoch": 0.26020009057598087, + "grad_norm": 2.70926570892334, + "learning_rate": 8.559629491914412e-05, + "loss": 2.0866, + "step": 790 + }, + { + "epoch": 0.26052945777924164, + "grad_norm": 3.139805555343628, + "learning_rate": 8.555933286239508e-05, + "loss": 2.4547, + "step": 791 + }, + { + "epoch": 0.26085882498250235, + "grad_norm": 2.390231132507324, + "learning_rate": 8.552233144395907e-05, + "loss": 1.9382, + "step": 792 + }, + { + "epoch": 0.2611881921857631, + "grad_norm": 2.88167142868042, + "learning_rate": 8.54852907047941e-05, + "loss": 2.1961, + "step": 793 + }, + { + "epoch": 0.2615175593890238, + "grad_norm": 2.3817718029022217, + "learning_rate": 8.544821068590165e-05, + "loss": 1.7874, + "step": 794 + }, + { + "epoch": 0.2618469265922846, + "grad_norm": 2.8125686645507812, + "learning_rate": 8.54110914283267e-05, + "loss": 2.1755, + "step": 795 + }, + { + "epoch": 0.2621762937955453, + "grad_norm": 2.719560384750366, + "learning_rate": 8.537393297315767e-05, + "loss": 2.0847, + "step": 796 + }, + { + "epoch": 0.26250566099880607, + "grad_norm": 2.7798001766204834, + "learning_rate": 8.533673536152638e-05, + "loss": 1.8334, + "step": 797 + }, + { + "epoch": 0.2628350282020668, + "grad_norm": 4.041043758392334, + "learning_rate": 8.529949863460793e-05, + "loss": 2.3578, + "step": 798 + }, + { + "epoch": 0.2631643954053275, + "grad_norm": 3.4026458263397217, + "learning_rate": 8.52622228336208e-05, + "loss": 2.0998, + "step": 799 + }, + { + "epoch": 0.26349376260858826, + "grad_norm": 4.440029144287109, + "learning_rate": 8.522490799982669e-05, + "loss": 1.7791, + "step": 800 + }, + { + "epoch": 0.26382312981184897, + "grad_norm": 2.11838436126709, + "learning_rate": 8.518755417453048e-05, + "loss": 2.4855, + "step": 801 + }, + { + "epoch": 0.26415249701510973, + "grad_norm": 2.5172719955444336, + "learning_rate": 8.515016139908024e-05, + "loss": 2.2894, + "step": 802 + }, + { + "epoch": 0.26448186421837044, + "grad_norm": 2.1991279125213623, + "learning_rate": 8.511272971486717e-05, + "loss": 2.2683, + "step": 803 + }, + { + "epoch": 0.2648112314216312, + "grad_norm": 2.425428867340088, + "learning_rate": 8.507525916332549e-05, + "loss": 2.2494, + "step": 804 + }, + { + "epoch": 0.2651405986248919, + "grad_norm": 2.5921618938446045, + "learning_rate": 8.503774978593248e-05, + "loss": 2.2534, + "step": 805 + }, + { + "epoch": 0.2654699658281527, + "grad_norm": 2.6209821701049805, + "learning_rate": 8.500020162420841e-05, + "loss": 2.3588, + "step": 806 + }, + { + "epoch": 0.2657993330314134, + "grad_norm": 2.933804750442505, + "learning_rate": 8.496261471971645e-05, + "loss": 2.5508, + "step": 807 + }, + { + "epoch": 0.2661287002346741, + "grad_norm": 2.4999735355377197, + "learning_rate": 8.492498911406266e-05, + "loss": 2.0871, + "step": 808 + }, + { + "epoch": 0.2664580674379349, + "grad_norm": 2.6054482460021973, + "learning_rate": 8.488732484889594e-05, + "loss": 2.1609, + "step": 809 + }, + { + "epoch": 0.2667874346411956, + "grad_norm": 2.588648557662964, + "learning_rate": 8.4849621965908e-05, + "loss": 2.2175, + "step": 810 + }, + { + "epoch": 0.26711680184445635, + "grad_norm": 2.7105255126953125, + "learning_rate": 8.481188050683328e-05, + "loss": 2.1727, + "step": 811 + }, + { + "epoch": 0.26744616904771706, + "grad_norm": 2.858750581741333, + "learning_rate": 8.477410051344895e-05, + "loss": 2.6405, + "step": 812 + }, + { + "epoch": 0.26777553625097783, + "grad_norm": 2.562743663787842, + "learning_rate": 8.473628202757478e-05, + "loss": 2.2222, + "step": 813 + }, + { + "epoch": 0.26810490345423854, + "grad_norm": 2.7561135292053223, + "learning_rate": 8.46984250910732e-05, + "loss": 2.3755, + "step": 814 + }, + { + "epoch": 0.26843427065749925, + "grad_norm": 2.5868582725524902, + "learning_rate": 8.466052974584918e-05, + "loss": 2.3861, + "step": 815 + }, + { + "epoch": 0.26876363786076, + "grad_norm": 2.836190938949585, + "learning_rate": 8.46225960338502e-05, + "loss": 2.1303, + "step": 816 + }, + { + "epoch": 0.26909300506402073, + "grad_norm": 3.028157949447632, + "learning_rate": 8.458462399706623e-05, + "loss": 2.2734, + "step": 817 + }, + { + "epoch": 0.2694223722672815, + "grad_norm": 2.804187536239624, + "learning_rate": 8.454661367752965e-05, + "loss": 2.1357, + "step": 818 + }, + { + "epoch": 0.2697517394705422, + "grad_norm": 2.80621075630188, + "learning_rate": 8.450856511731519e-05, + "loss": 1.732, + "step": 819 + }, + { + "epoch": 0.270081106673803, + "grad_norm": 2.5028510093688965, + "learning_rate": 8.447047835853999e-05, + "loss": 1.8806, + "step": 820 + }, + { + "epoch": 0.2704104738770637, + "grad_norm": 2.597686529159546, + "learning_rate": 8.443235344336337e-05, + "loss": 2.1617, + "step": 821 + }, + { + "epoch": 0.27073984108032445, + "grad_norm": 2.7407145500183105, + "learning_rate": 8.439419041398698e-05, + "loss": 2.157, + "step": 822 + }, + { + "epoch": 0.27106920828358516, + "grad_norm": 3.096123695373535, + "learning_rate": 8.435598931265459e-05, + "loss": 1.9549, + "step": 823 + }, + { + "epoch": 0.27139857548684587, + "grad_norm": 4.0686516761779785, + "learning_rate": 8.431775018165211e-05, + "loss": 2.139, + "step": 824 + }, + { + "epoch": 0.27172794269010664, + "grad_norm": 2.7538039684295654, + "learning_rate": 8.427947306330764e-05, + "loss": 1.8598, + "step": 825 + }, + { + "epoch": 0.27205730989336735, + "grad_norm": 2.3142900466918945, + "learning_rate": 8.424115799999122e-05, + "loss": 2.3159, + "step": 826 + }, + { + "epoch": 0.2723866770966281, + "grad_norm": 2.2066521644592285, + "learning_rate": 8.420280503411495e-05, + "loss": 2.2287, + "step": 827 + }, + { + "epoch": 0.2727160442998888, + "grad_norm": 2.188645839691162, + "learning_rate": 8.416441420813288e-05, + "loss": 2.2174, + "step": 828 + }, + { + "epoch": 0.2730454115031496, + "grad_norm": 2.808419704437256, + "learning_rate": 8.412598556454096e-05, + "loss": 2.6173, + "step": 829 + }, + { + "epoch": 0.2733747787064103, + "grad_norm": 2.461317300796509, + "learning_rate": 8.408751914587699e-05, + "loss": 2.5954, + "step": 830 + }, + { + "epoch": 0.27370414590967107, + "grad_norm": 2.9252431392669678, + "learning_rate": 8.40490149947206e-05, + "loss": 2.4617, + "step": 831 + }, + { + "epoch": 0.2740335131129318, + "grad_norm": 2.5286619663238525, + "learning_rate": 8.401047315369319e-05, + "loss": 2.5215, + "step": 832 + }, + { + "epoch": 0.2743628803161925, + "grad_norm": 2.650164842605591, + "learning_rate": 8.397189366545786e-05, + "loss": 2.1433, + "step": 833 + }, + { + "epoch": 0.27469224751945326, + "grad_norm": 2.2741010189056396, + "learning_rate": 8.393327657271944e-05, + "loss": 2.0067, + "step": 834 + }, + { + "epoch": 0.27502161472271397, + "grad_norm": 2.3971540927886963, + "learning_rate": 8.389462191822428e-05, + "loss": 2.2778, + "step": 835 + }, + { + "epoch": 0.27535098192597474, + "grad_norm": 2.9199283123016357, + "learning_rate": 8.385592974476042e-05, + "loss": 2.375, + "step": 836 + }, + { + "epoch": 0.27568034912923545, + "grad_norm": 2.3536462783813477, + "learning_rate": 8.381720009515736e-05, + "loss": 2.2465, + "step": 837 + }, + { + "epoch": 0.2760097163324962, + "grad_norm": 2.7381389141082764, + "learning_rate": 8.377843301228611e-05, + "loss": 2.201, + "step": 838 + }, + { + "epoch": 0.2763390835357569, + "grad_norm": 2.795403003692627, + "learning_rate": 8.373962853905912e-05, + "loss": 2.1858, + "step": 839 + }, + { + "epoch": 0.27666845073901764, + "grad_norm": 2.549487829208374, + "learning_rate": 8.37007867184302e-05, + "loss": 2.0456, + "step": 840 + }, + { + "epoch": 0.2769978179422784, + "grad_norm": 2.419311046600342, + "learning_rate": 8.366190759339453e-05, + "loss": 1.8163, + "step": 841 + }, + { + "epoch": 0.2773271851455391, + "grad_norm": 2.8960893154144287, + "learning_rate": 8.362299120698858e-05, + "loss": 2.0316, + "step": 842 + }, + { + "epoch": 0.2776565523487999, + "grad_norm": 2.440534830093384, + "learning_rate": 8.358403760229006e-05, + "loss": 1.8316, + "step": 843 + }, + { + "epoch": 0.2779859195520606, + "grad_norm": 2.9307198524475098, + "learning_rate": 8.354504682241786e-05, + "loss": 2.5626, + "step": 844 + }, + { + "epoch": 0.27831528675532136, + "grad_norm": 2.7770323753356934, + "learning_rate": 8.350601891053207e-05, + "loss": 2.2409, + "step": 845 + }, + { + "epoch": 0.27864465395858207, + "grad_norm": 2.7087655067443848, + "learning_rate": 8.346695390983382e-05, + "loss": 1.9896, + "step": 846 + }, + { + "epoch": 0.27897402116184283, + "grad_norm": 2.673274517059326, + "learning_rate": 8.342785186356534e-05, + "loss": 2.0128, + "step": 847 + }, + { + "epoch": 0.27930338836510354, + "grad_norm": 3.0754430294036865, + "learning_rate": 8.338871281500984e-05, + "loss": 2.008, + "step": 848 + }, + { + "epoch": 0.27963275556836426, + "grad_norm": 2.8769547939300537, + "learning_rate": 8.334953680749152e-05, + "loss": 1.8857, + "step": 849 + }, + { + "epoch": 0.279962122771625, + "grad_norm": 3.0530593395233154, + "learning_rate": 8.331032388437546e-05, + "loss": 1.967, + "step": 850 + }, + { + "epoch": 0.28029148997488573, + "grad_norm": 1.8923742771148682, + "learning_rate": 8.32710740890676e-05, + "loss": 2.4918, + "step": 851 + }, + { + "epoch": 0.2806208571781465, + "grad_norm": 2.0591697692871094, + "learning_rate": 8.323178746501475e-05, + "loss": 2.2662, + "step": 852 + }, + { + "epoch": 0.2809502243814072, + "grad_norm": 2.4539167881011963, + "learning_rate": 8.319246405570441e-05, + "loss": 2.4903, + "step": 853 + }, + { + "epoch": 0.281279591584668, + "grad_norm": 2.1013600826263428, + "learning_rate": 8.315310390466487e-05, + "loss": 2.1026, + "step": 854 + }, + { + "epoch": 0.2816089587879287, + "grad_norm": 2.0880558490753174, + "learning_rate": 8.311370705546501e-05, + "loss": 2.3886, + "step": 855 + }, + { + "epoch": 0.28193832599118945, + "grad_norm": 2.4675230979919434, + "learning_rate": 8.307427355171443e-05, + "loss": 2.2145, + "step": 856 + }, + { + "epoch": 0.28226769319445016, + "grad_norm": 2.3811771869659424, + "learning_rate": 8.303480343706321e-05, + "loss": 2.6376, + "step": 857 + }, + { + "epoch": 0.2825970603977109, + "grad_norm": 2.4288272857666016, + "learning_rate": 8.299529675520201e-05, + "loss": 2.2381, + "step": 858 + }, + { + "epoch": 0.28292642760097164, + "grad_norm": 2.4424099922180176, + "learning_rate": 8.295575354986196e-05, + "loss": 2.4726, + "step": 859 + }, + { + "epoch": 0.28325579480423235, + "grad_norm": 2.6503515243530273, + "learning_rate": 8.291617386481463e-05, + "loss": 2.4256, + "step": 860 + }, + { + "epoch": 0.2835851620074931, + "grad_norm": 2.2516067028045654, + "learning_rate": 8.287655774387193e-05, + "loss": 1.9415, + "step": 861 + }, + { + "epoch": 0.28391452921075383, + "grad_norm": 2.435704231262207, + "learning_rate": 8.283690523088613e-05, + "loss": 2.0356, + "step": 862 + }, + { + "epoch": 0.2842438964140146, + "grad_norm": 2.6750681400299072, + "learning_rate": 8.279721636974978e-05, + "loss": 2.2815, + "step": 863 + }, + { + "epoch": 0.2845732636172753, + "grad_norm": 3.6107234954833984, + "learning_rate": 8.275749120439569e-05, + "loss": 2.2028, + "step": 864 + }, + { + "epoch": 0.284902630820536, + "grad_norm": 2.5893361568450928, + "learning_rate": 8.27177297787968e-05, + "loss": 2.1027, + "step": 865 + }, + { + "epoch": 0.2852319980237968, + "grad_norm": 2.4202561378479004, + "learning_rate": 8.267793213696624e-05, + "loss": 2.0077, + "step": 866 + }, + { + "epoch": 0.2855613652270575, + "grad_norm": 2.8321430683135986, + "learning_rate": 8.263809832295719e-05, + "loss": 1.9374, + "step": 867 + }, + { + "epoch": 0.28589073243031826, + "grad_norm": 2.830946683883667, + "learning_rate": 8.25982283808629e-05, + "loss": 2.3701, + "step": 868 + }, + { + "epoch": 0.286220099633579, + "grad_norm": 2.626882791519165, + "learning_rate": 8.255832235481659e-05, + "loss": 2.2236, + "step": 869 + }, + { + "epoch": 0.28654946683683974, + "grad_norm": 2.398036003112793, + "learning_rate": 8.251838028899143e-05, + "loss": 2.2344, + "step": 870 + }, + { + "epoch": 0.28687883404010045, + "grad_norm": 3.660982608795166, + "learning_rate": 8.247840222760052e-05, + "loss": 2.2156, + "step": 871 + }, + { + "epoch": 0.2872082012433612, + "grad_norm": 3.3050460815429688, + "learning_rate": 8.243838821489671e-05, + "loss": 2.0017, + "step": 872 + }, + { + "epoch": 0.2875375684466219, + "grad_norm": 3.096376895904541, + "learning_rate": 8.239833829517276e-05, + "loss": 1.9194, + "step": 873 + }, + { + "epoch": 0.28786693564988264, + "grad_norm": 2.761716365814209, + "learning_rate": 8.235825251276108e-05, + "loss": 1.9807, + "step": 874 + }, + { + "epoch": 0.2881963028531434, + "grad_norm": 3.867349624633789, + "learning_rate": 8.231813091203385e-05, + "loss": 1.9222, + "step": 875 + }, + { + "epoch": 0.2885256700564041, + "grad_norm": 2.335017442703247, + "learning_rate": 8.227797353740286e-05, + "loss": 2.4247, + "step": 876 + }, + { + "epoch": 0.2888550372596649, + "grad_norm": 2.4474287033081055, + "learning_rate": 8.223778043331948e-05, + "loss": 2.6098, + "step": 877 + }, + { + "epoch": 0.2891844044629256, + "grad_norm": 2.22808575630188, + "learning_rate": 8.219755164427469e-05, + "loss": 2.2784, + "step": 878 + }, + { + "epoch": 0.28951377166618636, + "grad_norm": 2.265848398208618, + "learning_rate": 8.215728721479892e-05, + "loss": 2.2767, + "step": 879 + }, + { + "epoch": 0.28984313886944707, + "grad_norm": 2.5640437602996826, + "learning_rate": 8.211698718946208e-05, + "loss": 1.9147, + "step": 880 + }, + { + "epoch": 0.29017250607270784, + "grad_norm": 2.4126553535461426, + "learning_rate": 8.207665161287345e-05, + "loss": 2.2837, + "step": 881 + }, + { + "epoch": 0.29050187327596855, + "grad_norm": 2.4454050064086914, + "learning_rate": 8.20362805296817e-05, + "loss": 2.3148, + "step": 882 + }, + { + "epoch": 0.29083124047922926, + "grad_norm": 2.776961088180542, + "learning_rate": 8.19958739845748e-05, + "loss": 2.7321, + "step": 883 + }, + { + "epoch": 0.29116060768249, + "grad_norm": 2.721742630004883, + "learning_rate": 8.195543202227993e-05, + "loss": 2.3516, + "step": 884 + }, + { + "epoch": 0.29148997488575074, + "grad_norm": 2.417945384979248, + "learning_rate": 8.191495468756354e-05, + "loss": 2.0815, + "step": 885 + }, + { + "epoch": 0.2918193420890115, + "grad_norm": 2.7946279048919678, + "learning_rate": 8.187444202523116e-05, + "loss": 2.5238, + "step": 886 + }, + { + "epoch": 0.2921487092922722, + "grad_norm": 2.2200636863708496, + "learning_rate": 8.183389408012752e-05, + "loss": 2.2116, + "step": 887 + }, + { + "epoch": 0.292478076495533, + "grad_norm": 2.7158875465393066, + "learning_rate": 8.179331089713629e-05, + "loss": 2.2724, + "step": 888 + }, + { + "epoch": 0.2928074436987937, + "grad_norm": 3.0811691284179688, + "learning_rate": 8.175269252118023e-05, + "loss": 2.481, + "step": 889 + }, + { + "epoch": 0.2931368109020544, + "grad_norm": 2.898715019226074, + "learning_rate": 8.171203899722105e-05, + "loss": 1.9759, + "step": 890 + }, + { + "epoch": 0.29346617810531517, + "grad_norm": 3.110511541366577, + "learning_rate": 8.167135037025933e-05, + "loss": 2.3635, + "step": 891 + }, + { + "epoch": 0.2937955453085759, + "grad_norm": 2.4876153469085693, + "learning_rate": 8.163062668533454e-05, + "loss": 2.2224, + "step": 892 + }, + { + "epoch": 0.29412491251183664, + "grad_norm": 2.962139844894409, + "learning_rate": 8.158986798752492e-05, + "loss": 2.1494, + "step": 893 + }, + { + "epoch": 0.29445427971509736, + "grad_norm": 2.6857833862304688, + "learning_rate": 8.154907432194751e-05, + "loss": 1.9066, + "step": 894 + }, + { + "epoch": 0.2947836469183581, + "grad_norm": 3.1264843940734863, + "learning_rate": 8.150824573375804e-05, + "loss": 2.1503, + "step": 895 + }, + { + "epoch": 0.29511301412161883, + "grad_norm": 3.0480618476867676, + "learning_rate": 8.146738226815087e-05, + "loss": 2.2263, + "step": 896 + }, + { + "epoch": 0.2954423813248796, + "grad_norm": 2.6517231464385986, + "learning_rate": 8.142648397035899e-05, + "loss": 1.98, + "step": 897 + }, + { + "epoch": 0.2957717485281403, + "grad_norm": 2.765357255935669, + "learning_rate": 8.138555088565398e-05, + "loss": 1.8292, + "step": 898 + }, + { + "epoch": 0.296101115731401, + "grad_norm": 2.788534641265869, + "learning_rate": 8.134458305934587e-05, + "loss": 1.7272, + "step": 899 + }, + { + "epoch": 0.2964304829346618, + "grad_norm": 3.0059924125671387, + "learning_rate": 8.130358053678315e-05, + "loss": 2.0136, + "step": 900 + }, + { + "epoch": 0.2967598501379225, + "grad_norm": 2.0941505432128906, + "learning_rate": 8.126254336335279e-05, + "loss": 2.5732, + "step": 901 + }, + { + "epoch": 0.29708921734118326, + "grad_norm": 2.0665740966796875, + "learning_rate": 8.122147158448002e-05, + "loss": 2.4054, + "step": 902 + }, + { + "epoch": 0.297418584544444, + "grad_norm": 2.2430355548858643, + "learning_rate": 8.118036524562841e-05, + "loss": 2.4054, + "step": 903 + }, + { + "epoch": 0.29774795174770474, + "grad_norm": 2.2699286937713623, + "learning_rate": 8.113922439229982e-05, + "loss": 2.1162, + "step": 904 + }, + { + "epoch": 0.29807731895096545, + "grad_norm": 2.1671202182769775, + "learning_rate": 8.109804907003429e-05, + "loss": 2.3838, + "step": 905 + }, + { + "epoch": 0.2984066861542262, + "grad_norm": 2.175811529159546, + "learning_rate": 8.105683932441e-05, + "loss": 2.0848, + "step": 906 + }, + { + "epoch": 0.29873605335748693, + "grad_norm": 3.168586492538452, + "learning_rate": 8.101559520104323e-05, + "loss": 2.4891, + "step": 907 + }, + { + "epoch": 0.29906542056074764, + "grad_norm": 2.39568829536438, + "learning_rate": 8.097431674558838e-05, + "loss": 2.3179, + "step": 908 + }, + { + "epoch": 0.2993947877640084, + "grad_norm": 2.7230722904205322, + "learning_rate": 8.093300400373775e-05, + "loss": 2.2789, + "step": 909 + }, + { + "epoch": 0.2997241549672691, + "grad_norm": 2.686748504638672, + "learning_rate": 8.08916570212217e-05, + "loss": 2.1697, + "step": 910 + }, + { + "epoch": 0.3000535221705299, + "grad_norm": 2.3742809295654297, + "learning_rate": 8.08502758438084e-05, + "loss": 1.9485, + "step": 911 + }, + { + "epoch": 0.3003828893737906, + "grad_norm": 2.234506607055664, + "learning_rate": 8.080886051730391e-05, + "loss": 2.2139, + "step": 912 + }, + { + "epoch": 0.30071225657705136, + "grad_norm": 2.438096284866333, + "learning_rate": 8.076741108755212e-05, + "loss": 2.338, + "step": 913 + }, + { + "epoch": 0.3010416237803121, + "grad_norm": 2.3559389114379883, + "learning_rate": 8.072592760043463e-05, + "loss": 2.3608, + "step": 914 + }, + { + "epoch": 0.3013709909835728, + "grad_norm": 2.8507134914398193, + "learning_rate": 8.068441010187073e-05, + "loss": 2.3359, + "step": 915 + }, + { + "epoch": 0.30170035818683355, + "grad_norm": 3.0053601264953613, + "learning_rate": 8.06428586378174e-05, + "loss": 2.2598, + "step": 916 + }, + { + "epoch": 0.30202972539009426, + "grad_norm": 2.6545512676239014, + "learning_rate": 8.06012732542692e-05, + "loss": 2.2021, + "step": 917 + }, + { + "epoch": 0.302359092593355, + "grad_norm": 2.2311315536499023, + "learning_rate": 8.05596539972582e-05, + "loss": 2.0849, + "step": 918 + }, + { + "epoch": 0.30268845979661574, + "grad_norm": 3.3348686695098877, + "learning_rate": 8.051800091285404e-05, + "loss": 2.2344, + "step": 919 + }, + { + "epoch": 0.3030178269998765, + "grad_norm": 2.89279842376709, + "learning_rate": 8.047631404716374e-05, + "loss": 2.5125, + "step": 920 + }, + { + "epoch": 0.3033471942031372, + "grad_norm": 2.6193151473999023, + "learning_rate": 8.043459344633173e-05, + "loss": 1.7652, + "step": 921 + }, + { + "epoch": 0.303676561406398, + "grad_norm": 2.6484382152557373, + "learning_rate": 8.039283915653979e-05, + "loss": 1.9555, + "step": 922 + }, + { + "epoch": 0.3040059286096587, + "grad_norm": 2.553636312484741, + "learning_rate": 8.035105122400701e-05, + "loss": 1.8701, + "step": 923 + }, + { + "epoch": 0.3043352958129194, + "grad_norm": 3.712156057357788, + "learning_rate": 8.030922969498968e-05, + "loss": 2.2537, + "step": 924 + }, + { + "epoch": 0.30466466301618017, + "grad_norm": 3.272002696990967, + "learning_rate": 8.026737461578132e-05, + "loss": 2.1644, + "step": 925 + }, + { + "epoch": 0.3049940302194409, + "grad_norm": 2.4612069129943848, + "learning_rate": 8.022548603271252e-05, + "loss": 2.4379, + "step": 926 + }, + { + "epoch": 0.30532339742270165, + "grad_norm": 3.1458165645599365, + "learning_rate": 8.018356399215104e-05, + "loss": 2.6069, + "step": 927 + }, + { + "epoch": 0.30565276462596236, + "grad_norm": 2.0101094245910645, + "learning_rate": 8.014160854050164e-05, + "loss": 2.185, + "step": 928 + }, + { + "epoch": 0.3059821318292231, + "grad_norm": 2.1659107208251953, + "learning_rate": 8.009961972420607e-05, + "loss": 2.1568, + "step": 929 + }, + { + "epoch": 0.30631149903248384, + "grad_norm": 2.243806838989258, + "learning_rate": 8.005759758974296e-05, + "loss": 2.4728, + "step": 930 + }, + { + "epoch": 0.3066408662357446, + "grad_norm": 2.525348424911499, + "learning_rate": 8.001554218362791e-05, + "loss": 2.6265, + "step": 931 + }, + { + "epoch": 0.3069702334390053, + "grad_norm": 2.3284335136413574, + "learning_rate": 7.997345355241328e-05, + "loss": 2.3675, + "step": 932 + }, + { + "epoch": 0.307299600642266, + "grad_norm": 2.4215810298919678, + "learning_rate": 7.993133174268826e-05, + "loss": 2.1221, + "step": 933 + }, + { + "epoch": 0.3076289678455268, + "grad_norm": 2.8312551975250244, + "learning_rate": 7.988917680107871e-05, + "loss": 2.5939, + "step": 934 + }, + { + "epoch": 0.3079583350487875, + "grad_norm": 2.665449857711792, + "learning_rate": 7.984698877424718e-05, + "loss": 2.24, + "step": 935 + }, + { + "epoch": 0.30828770225204827, + "grad_norm": 2.5410232543945312, + "learning_rate": 7.980476770889289e-05, + "loss": 2.2135, + "step": 936 + }, + { + "epoch": 0.308617069455309, + "grad_norm": 2.556459426879883, + "learning_rate": 7.976251365175158e-05, + "loss": 2.2969, + "step": 937 + }, + { + "epoch": 0.30894643665856975, + "grad_norm": 3.4073662757873535, + "learning_rate": 7.972022664959554e-05, + "loss": 2.3814, + "step": 938 + }, + { + "epoch": 0.30927580386183046, + "grad_norm": 2.888021230697632, + "learning_rate": 7.96779067492335e-05, + "loss": 2.4569, + "step": 939 + }, + { + "epoch": 0.30960517106509117, + "grad_norm": 2.819286584854126, + "learning_rate": 7.963555399751063e-05, + "loss": 2.1128, + "step": 940 + }, + { + "epoch": 0.30993453826835193, + "grad_norm": 2.760483741760254, + "learning_rate": 7.959316844130846e-05, + "loss": 2.4026, + "step": 941 + }, + { + "epoch": 0.31026390547161264, + "grad_norm": 2.6997532844543457, + "learning_rate": 7.95507501275448e-05, + "loss": 2.0134, + "step": 942 + }, + { + "epoch": 0.3105932726748734, + "grad_norm": 2.7757112979888916, + "learning_rate": 7.950829910317379e-05, + "loss": 2.207, + "step": 943 + }, + { + "epoch": 0.3109226398781341, + "grad_norm": 2.5524275302886963, + "learning_rate": 7.946581541518569e-05, + "loss": 1.7721, + "step": 944 + }, + { + "epoch": 0.3112520070813949, + "grad_norm": 2.4421706199645996, + "learning_rate": 7.942329911060703e-05, + "loss": 2.1257, + "step": 945 + }, + { + "epoch": 0.3115813742846556, + "grad_norm": 2.7249844074249268, + "learning_rate": 7.938075023650029e-05, + "loss": 1.9599, + "step": 946 + }, + { + "epoch": 0.31191074148791637, + "grad_norm": 2.9589931964874268, + "learning_rate": 7.933816883996415e-05, + "loss": 2.3046, + "step": 947 + }, + { + "epoch": 0.3122401086911771, + "grad_norm": 2.614107608795166, + "learning_rate": 7.92955549681332e-05, + "loss": 1.8874, + "step": 948 + }, + { + "epoch": 0.3125694758944378, + "grad_norm": 3.2241110801696777, + "learning_rate": 7.925290866817802e-05, + "loss": 2.3943, + "step": 949 + }, + { + "epoch": 0.31289884309769855, + "grad_norm": 3.045088291168213, + "learning_rate": 7.921022998730507e-05, + "loss": 1.961, + "step": 950 + }, + { + "epoch": 0.31322821030095926, + "grad_norm": 2.1225154399871826, + "learning_rate": 7.916751897275665e-05, + "loss": 2.749, + "step": 951 + }, + { + "epoch": 0.31355757750422003, + "grad_norm": 2.060499668121338, + "learning_rate": 7.912477567181086e-05, + "loss": 2.2708, + "step": 952 + }, + { + "epoch": 0.31388694470748074, + "grad_norm": 2.4853827953338623, + "learning_rate": 7.908200013178156e-05, + "loss": 2.2059, + "step": 953 + }, + { + "epoch": 0.3142163119107415, + "grad_norm": 2.317662477493286, + "learning_rate": 7.903919240001824e-05, + "loss": 2.4268, + "step": 954 + }, + { + "epoch": 0.3145456791140022, + "grad_norm": 2.5555713176727295, + "learning_rate": 7.899635252390606e-05, + "loss": 2.4417, + "step": 955 + }, + { + "epoch": 0.314875046317263, + "grad_norm": 2.6387650966644287, + "learning_rate": 7.895348055086577e-05, + "loss": 2.3655, + "step": 956 + }, + { + "epoch": 0.3152044135205237, + "grad_norm": 2.4570939540863037, + "learning_rate": 7.891057652835361e-05, + "loss": 2.4014, + "step": 957 + }, + { + "epoch": 0.3155337807237844, + "grad_norm": 2.4972894191741943, + "learning_rate": 7.886764050386135e-05, + "loss": 2.3593, + "step": 958 + }, + { + "epoch": 0.3158631479270452, + "grad_norm": 2.601262092590332, + "learning_rate": 7.882467252491617e-05, + "loss": 2.4721, + "step": 959 + }, + { + "epoch": 0.3161925151303059, + "grad_norm": 2.5913026332855225, + "learning_rate": 7.878167263908056e-05, + "loss": 2.2818, + "step": 960 + }, + { + "epoch": 0.31652188233356665, + "grad_norm": 2.6688802242279053, + "learning_rate": 7.873864089395243e-05, + "loss": 2.1604, + "step": 961 + }, + { + "epoch": 0.31685124953682736, + "grad_norm": 2.4998068809509277, + "learning_rate": 7.869557733716488e-05, + "loss": 1.95, + "step": 962 + }, + { + "epoch": 0.31718061674008813, + "grad_norm": 2.5691847801208496, + "learning_rate": 7.865248201638623e-05, + "loss": 2.1274, + "step": 963 + }, + { + "epoch": 0.31750998394334884, + "grad_norm": 2.2748234272003174, + "learning_rate": 7.860935497932e-05, + "loss": 1.9988, + "step": 964 + }, + { + "epoch": 0.31783935114660955, + "grad_norm": 2.8164100646972656, + "learning_rate": 7.856619627370479e-05, + "loss": 2.3755, + "step": 965 + }, + { + "epoch": 0.3181687183498703, + "grad_norm": 2.8128836154937744, + "learning_rate": 7.852300594731425e-05, + "loss": 2.1516, + "step": 966 + }, + { + "epoch": 0.318498085553131, + "grad_norm": 2.7594070434570312, + "learning_rate": 7.847978404795704e-05, + "loss": 2.177, + "step": 967 + }, + { + "epoch": 0.3188274527563918, + "grad_norm": 2.687586784362793, + "learning_rate": 7.843653062347679e-05, + "loss": 2.4126, + "step": 968 + }, + { + "epoch": 0.3191568199596525, + "grad_norm": 2.610460042953491, + "learning_rate": 7.8393245721752e-05, + "loss": 2.3253, + "step": 969 + }, + { + "epoch": 0.31948618716291327, + "grad_norm": 2.525682210922241, + "learning_rate": 7.8349929390696e-05, + "loss": 1.9292, + "step": 970 + }, + { + "epoch": 0.319815554366174, + "grad_norm": 3.1786725521087646, + "learning_rate": 7.830658167825696e-05, + "loss": 1.956, + "step": 971 + }, + { + "epoch": 0.32014492156943475, + "grad_norm": 2.809406280517578, + "learning_rate": 7.826320263241771e-05, + "loss": 2.1892, + "step": 972 + }, + { + "epoch": 0.32047428877269546, + "grad_norm": 3.148411750793457, + "learning_rate": 7.821979230119587e-05, + "loss": 2.0155, + "step": 973 + }, + { + "epoch": 0.32080365597595617, + "grad_norm": 3.3248815536499023, + "learning_rate": 7.81763507326436e-05, + "loss": 1.7539, + "step": 974 + }, + { + "epoch": 0.32113302317921694, + "grad_norm": 3.2047150135040283, + "learning_rate": 7.813287797484768e-05, + "loss": 2.028, + "step": 975 + }, + { + "epoch": 0.32146239038247765, + "grad_norm": 2.169050455093384, + "learning_rate": 7.808937407592938e-05, + "loss": 2.4696, + "step": 976 + }, + { + "epoch": 0.3217917575857384, + "grad_norm": 2.0645432472229004, + "learning_rate": 7.804583908404448e-05, + "loss": 2.1029, + "step": 977 + }, + { + "epoch": 0.3221211247889991, + "grad_norm": 2.496066093444824, + "learning_rate": 7.800227304738317e-05, + "loss": 2.3193, + "step": 978 + }, + { + "epoch": 0.3224504919922599, + "grad_norm": 2.2872400283813477, + "learning_rate": 7.795867601416998e-05, + "loss": 2.4506, + "step": 979 + }, + { + "epoch": 0.3227798591955206, + "grad_norm": 2.6444544792175293, + "learning_rate": 7.791504803266377e-05, + "loss": 2.818, + "step": 980 + }, + { + "epoch": 0.32310922639878137, + "grad_norm": 2.2178843021392822, + "learning_rate": 7.787138915115768e-05, + "loss": 1.9535, + "step": 981 + }, + { + "epoch": 0.3234385936020421, + "grad_norm": 2.528449535369873, + "learning_rate": 7.782769941797899e-05, + "loss": 2.1525, + "step": 982 + }, + { + "epoch": 0.3237679608053028, + "grad_norm": 2.597266435623169, + "learning_rate": 7.778397888148921e-05, + "loss": 2.4295, + "step": 983 + }, + { + "epoch": 0.32409732800856356, + "grad_norm": 2.5413691997528076, + "learning_rate": 7.774022759008386e-05, + "loss": 2.3796, + "step": 984 + }, + { + "epoch": 0.32442669521182427, + "grad_norm": 2.5218312740325928, + "learning_rate": 7.76964455921926e-05, + "loss": 2.4834, + "step": 985 + }, + { + "epoch": 0.32475606241508503, + "grad_norm": 2.420651435852051, + "learning_rate": 7.7652632936279e-05, + "loss": 2.1914, + "step": 986 + }, + { + "epoch": 0.32508542961834574, + "grad_norm": 2.723442792892456, + "learning_rate": 7.760878967084059e-05, + "loss": 2.5173, + "step": 987 + }, + { + "epoch": 0.3254147968216065, + "grad_norm": 2.400263786315918, + "learning_rate": 7.756491584440882e-05, + "loss": 2.1829, + "step": 988 + }, + { + "epoch": 0.3257441640248672, + "grad_norm": 2.4888710975646973, + "learning_rate": 7.75210115055489e-05, + "loss": 2.1607, + "step": 989 + }, + { + "epoch": 0.32607353122812793, + "grad_norm": 2.6729047298431396, + "learning_rate": 7.747707670285989e-05, + "loss": 2.3433, + "step": 990 + }, + { + "epoch": 0.3264028984313887, + "grad_norm": 2.9809768199920654, + "learning_rate": 7.743311148497452e-05, + "loss": 2.3124, + "step": 991 + }, + { + "epoch": 0.3267322656346494, + "grad_norm": 2.287365198135376, + "learning_rate": 7.73891159005592e-05, + "loss": 2.0576, + "step": 992 + }, + { + "epoch": 0.3270616328379102, + "grad_norm": 2.4659500122070312, + "learning_rate": 7.734508999831394e-05, + "loss": 2.0228, + "step": 993 + }, + { + "epoch": 0.3273910000411709, + "grad_norm": 2.66711163520813, + "learning_rate": 7.730103382697236e-05, + "loss": 2.1277, + "step": 994 + }, + { + "epoch": 0.32772036724443165, + "grad_norm": 2.902860641479492, + "learning_rate": 7.725694743530153e-05, + "loss": 2.3815, + "step": 995 + }, + { + "epoch": 0.32804973444769236, + "grad_norm": 2.8413193225860596, + "learning_rate": 7.721283087210199e-05, + "loss": 2.2505, + "step": 996 + }, + { + "epoch": 0.32837910165095313, + "grad_norm": 3.3534953594207764, + "learning_rate": 7.716868418620768e-05, + "loss": 2.2123, + "step": 997 + }, + { + "epoch": 0.32870846885421384, + "grad_norm": 2.6357085704803467, + "learning_rate": 7.71245074264859e-05, + "loss": 1.8754, + "step": 998 + }, + { + "epoch": 0.32903783605747455, + "grad_norm": 3.168962001800537, + "learning_rate": 7.70803006418372e-05, + "loss": 1.9192, + "step": 999 + }, + { + "epoch": 0.3293672032607353, + "grad_norm": 3.3432531356811523, + "learning_rate": 7.703606388119542e-05, + "loss": 1.7519, + "step": 1000 + }, + { + "epoch": 0.32969657046399603, + "grad_norm": 1.857453465461731, + "learning_rate": 7.699179719352752e-05, + "loss": 2.5652, + "step": 1001 + }, + { + "epoch": 0.3300259376672568, + "grad_norm": 1.9321167469024658, + "learning_rate": 7.694750062783363e-05, + "loss": 2.1067, + "step": 1002 + }, + { + "epoch": 0.3303553048705175, + "grad_norm": 2.123394250869751, + "learning_rate": 7.690317423314696e-05, + "loss": 2.311, + "step": 1003 + }, + { + "epoch": 0.3306846720737783, + "grad_norm": 2.345137357711792, + "learning_rate": 7.685881805853369e-05, + "loss": 2.1725, + "step": 1004 + }, + { + "epoch": 0.331014039277039, + "grad_norm": 2.3156449794769287, + "learning_rate": 7.6814432153093e-05, + "loss": 2.3528, + "step": 1005 + }, + { + "epoch": 0.33134340648029975, + "grad_norm": 2.4214887619018555, + "learning_rate": 7.6770016565957e-05, + "loss": 2.0471, + "step": 1006 + }, + { + "epoch": 0.33167277368356046, + "grad_norm": 2.3343491554260254, + "learning_rate": 7.672557134629059e-05, + "loss": 2.1914, + "step": 1007 + }, + { + "epoch": 0.3320021408868212, + "grad_norm": 2.713235378265381, + "learning_rate": 7.668109654329154e-05, + "loss": 2.4468, + "step": 1008 + }, + { + "epoch": 0.33233150809008194, + "grad_norm": 3.0459039211273193, + "learning_rate": 7.663659220619033e-05, + "loss": 2.4859, + "step": 1009 + }, + { + "epoch": 0.33266087529334265, + "grad_norm": 2.7493438720703125, + "learning_rate": 7.659205838425013e-05, + "loss": 2.4392, + "step": 1010 + }, + { + "epoch": 0.3329902424966034, + "grad_norm": 2.3039515018463135, + "learning_rate": 7.654749512676676e-05, + "loss": 2.0348, + "step": 1011 + }, + { + "epoch": 0.33331960969986413, + "grad_norm": 2.667750597000122, + "learning_rate": 7.650290248306863e-05, + "loss": 1.949, + "step": 1012 + }, + { + "epoch": 0.3336489769031249, + "grad_norm": 2.9373788833618164, + "learning_rate": 7.645828050251665e-05, + "loss": 2.591, + "step": 1013 + }, + { + "epoch": 0.3339783441063856, + "grad_norm": 3.202880620956421, + "learning_rate": 7.641362923450424e-05, + "loss": 2.5421, + "step": 1014 + }, + { + "epoch": 0.3343077113096463, + "grad_norm": 2.4246268272399902, + "learning_rate": 7.636894872845722e-05, + "loss": 2.1465, + "step": 1015 + }, + { + "epoch": 0.3346370785129071, + "grad_norm": 2.732734203338623, + "learning_rate": 7.632423903383374e-05, + "loss": 2.1029, + "step": 1016 + }, + { + "epoch": 0.3349664457161678, + "grad_norm": 3.3760523796081543, + "learning_rate": 7.627950020012434e-05, + "loss": 2.0353, + "step": 1017 + }, + { + "epoch": 0.33529581291942856, + "grad_norm": 3.0070247650146484, + "learning_rate": 7.623473227685176e-05, + "loss": 2.365, + "step": 1018 + }, + { + "epoch": 0.33562518012268927, + "grad_norm": 2.8173720836639404, + "learning_rate": 7.618993531357094e-05, + "loss": 2.0214, + "step": 1019 + }, + { + "epoch": 0.33595454732595004, + "grad_norm": 2.5577118396759033, + "learning_rate": 7.614510935986898e-05, + "loss": 1.8253, + "step": 1020 + }, + { + "epoch": 0.33628391452921075, + "grad_norm": 2.8958218097686768, + "learning_rate": 7.610025446536509e-05, + "loss": 2.2123, + "step": 1021 + }, + { + "epoch": 0.3366132817324715, + "grad_norm": 3.0846564769744873, + "learning_rate": 7.605537067971045e-05, + "loss": 2.0833, + "step": 1022 + }, + { + "epoch": 0.3369426489357322, + "grad_norm": 3.0642452239990234, + "learning_rate": 7.601045805258828e-05, + "loss": 1.8784, + "step": 1023 + }, + { + "epoch": 0.33727201613899294, + "grad_norm": 4.532246112823486, + "learning_rate": 7.596551663371372e-05, + "loss": 2.4327, + "step": 1024 + }, + { + "epoch": 0.3376013833422537, + "grad_norm": 3.3755955696105957, + "learning_rate": 7.592054647283375e-05, + "loss": 1.7789, + "step": 1025 + }, + { + "epoch": 0.3379307505455144, + "grad_norm": 1.943467140197754, + "learning_rate": 7.587554761972718e-05, + "loss": 2.3184, + "step": 1026 + }, + { + "epoch": 0.3382601177487752, + "grad_norm": 2.520038604736328, + "learning_rate": 7.583052012420461e-05, + "loss": 2.2305, + "step": 1027 + }, + { + "epoch": 0.3385894849520359, + "grad_norm": 2.3161160945892334, + "learning_rate": 7.57854640361083e-05, + "loss": 2.2228, + "step": 1028 + }, + { + "epoch": 0.33891885215529666, + "grad_norm": 2.4692294597625732, + "learning_rate": 7.574037940531218e-05, + "loss": 2.251, + "step": 1029 + }, + { + "epoch": 0.33924821935855737, + "grad_norm": 2.4752914905548096, + "learning_rate": 7.569526628172177e-05, + "loss": 2.2454, + "step": 1030 + }, + { + "epoch": 0.33957758656181813, + "grad_norm": 2.6334524154663086, + "learning_rate": 7.565012471527416e-05, + "loss": 2.5675, + "step": 1031 + }, + { + "epoch": 0.33990695376507885, + "grad_norm": 2.3727474212646484, + "learning_rate": 7.560495475593785e-05, + "loss": 2.2345, + "step": 1032 + }, + { + "epoch": 0.34023632096833956, + "grad_norm": 2.6379683017730713, + "learning_rate": 7.555975645371285e-05, + "loss": 2.4907, + "step": 1033 + }, + { + "epoch": 0.3405656881716003, + "grad_norm": 2.5622193813323975, + "learning_rate": 7.55145298586305e-05, + "loss": 2.3077, + "step": 1034 + }, + { + "epoch": 0.34089505537486103, + "grad_norm": 2.888123035430908, + "learning_rate": 7.546927502075348e-05, + "loss": 2.2664, + "step": 1035 + }, + { + "epoch": 0.3412244225781218, + "grad_norm": 2.4946439266204834, + "learning_rate": 7.542399199017568e-05, + "loss": 2.3239, + "step": 1036 + }, + { + "epoch": 0.3415537897813825, + "grad_norm": 2.6942648887634277, + "learning_rate": 7.53786808170223e-05, + "loss": 2.501, + "step": 1037 + }, + { + "epoch": 0.3418831569846433, + "grad_norm": 2.577432632446289, + "learning_rate": 7.53333415514496e-05, + "loss": 2.3515, + "step": 1038 + }, + { + "epoch": 0.342212524187904, + "grad_norm": 2.2709529399871826, + "learning_rate": 7.528797424364496e-05, + "loss": 1.9878, + "step": 1039 + }, + { + "epoch": 0.3425418913911647, + "grad_norm": 2.2214181423187256, + "learning_rate": 7.524257894382681e-05, + "loss": 1.8414, + "step": 1040 + }, + { + "epoch": 0.34287125859442547, + "grad_norm": 2.8622872829437256, + "learning_rate": 7.519715570224457e-05, + "loss": 1.9751, + "step": 1041 + }, + { + "epoch": 0.3432006257976862, + "grad_norm": 2.7308106422424316, + "learning_rate": 7.515170456917857e-05, + "loss": 2.1777, + "step": 1042 + }, + { + "epoch": 0.34352999300094694, + "grad_norm": 3.008892297744751, + "learning_rate": 7.510622559494002e-05, + "loss": 2.1712, + "step": 1043 + }, + { + "epoch": 0.34385936020420765, + "grad_norm": 2.538496732711792, + "learning_rate": 7.5060718829871e-05, + "loss": 1.5176, + "step": 1044 + }, + { + "epoch": 0.3441887274074684, + "grad_norm": 2.9815316200256348, + "learning_rate": 7.501518432434424e-05, + "loss": 1.9269, + "step": 1045 + }, + { + "epoch": 0.34451809461072913, + "grad_norm": 2.786571741104126, + "learning_rate": 7.49696221287633e-05, + "loss": 2.0934, + "step": 1046 + }, + { + "epoch": 0.3448474618139899, + "grad_norm": 2.8320257663726807, + "learning_rate": 7.49240322935623e-05, + "loss": 1.7772, + "step": 1047 + }, + { + "epoch": 0.3451768290172506, + "grad_norm": 3.3367862701416016, + "learning_rate": 7.487841486920599e-05, + "loss": 2.2513, + "step": 1048 + }, + { + "epoch": 0.3455061962205113, + "grad_norm": 3.6575772762298584, + "learning_rate": 7.48327699061897e-05, + "loss": 2.2954, + "step": 1049 + }, + { + "epoch": 0.3458355634237721, + "grad_norm": 3.485363721847534, + "learning_rate": 7.478709745503913e-05, + "loss": 1.9509, + "step": 1050 + }, + { + "epoch": 0.3461649306270328, + "grad_norm": 2.551144599914551, + "learning_rate": 7.474139756631056e-05, + "loss": 2.5046, + "step": 1051 + }, + { + "epoch": 0.34649429783029356, + "grad_norm": 2.3824918270111084, + "learning_rate": 7.46956702905905e-05, + "loss": 2.3288, + "step": 1052 + }, + { + "epoch": 0.3468236650335543, + "grad_norm": 2.3063457012176514, + "learning_rate": 7.464991567849586e-05, + "loss": 2.2688, + "step": 1053 + }, + { + "epoch": 0.34715303223681504, + "grad_norm": 2.4683382511138916, + "learning_rate": 7.460413378067379e-05, + "loss": 2.4154, + "step": 1054 + }, + { + "epoch": 0.34748239944007575, + "grad_norm": 2.572463035583496, + "learning_rate": 7.455832464780162e-05, + "loss": 2.4033, + "step": 1055 + }, + { + "epoch": 0.3478117666433365, + "grad_norm": 2.575000762939453, + "learning_rate": 7.451248833058687e-05, + "loss": 2.2997, + "step": 1056 + }, + { + "epoch": 0.34814113384659723, + "grad_norm": 2.226249933242798, + "learning_rate": 7.446662487976713e-05, + "loss": 2.1283, + "step": 1057 + }, + { + "epoch": 0.34847050104985794, + "grad_norm": 2.5053114891052246, + "learning_rate": 7.442073434610997e-05, + "loss": 2.1099, + "step": 1058 + }, + { + "epoch": 0.3487998682531187, + "grad_norm": 2.71102237701416, + "learning_rate": 7.437481678041307e-05, + "loss": 2.3652, + "step": 1059 + }, + { + "epoch": 0.3491292354563794, + "grad_norm": 3.036839723587036, + "learning_rate": 7.43288722335039e-05, + "loss": 1.9926, + "step": 1060 + }, + { + "epoch": 0.3494586026596402, + "grad_norm": 2.7319066524505615, + "learning_rate": 7.428290075623987e-05, + "loss": 2.5434, + "step": 1061 + }, + { + "epoch": 0.3497879698629009, + "grad_norm": 2.671752691268921, + "learning_rate": 7.423690239950818e-05, + "loss": 2.1477, + "step": 1062 + }, + { + "epoch": 0.35011733706616166, + "grad_norm": 3.032055616378784, + "learning_rate": 7.419087721422576e-05, + "loss": 2.3655, + "step": 1063 + }, + { + "epoch": 0.35044670426942237, + "grad_norm": 2.8573625087738037, + "learning_rate": 7.414482525133928e-05, + "loss": 2.6337, + "step": 1064 + }, + { + "epoch": 0.3507760714726831, + "grad_norm": 2.5970046520233154, + "learning_rate": 7.409874656182506e-05, + "loss": 1.9232, + "step": 1065 + }, + { + "epoch": 0.35110543867594385, + "grad_norm": 2.5858652591705322, + "learning_rate": 7.405264119668894e-05, + "loss": 2.4204, + "step": 1066 + }, + { + "epoch": 0.35143480587920456, + "grad_norm": 3.2637603282928467, + "learning_rate": 7.400650920696633e-05, + "loss": 2.6369, + "step": 1067 + }, + { + "epoch": 0.3517641730824653, + "grad_norm": 2.568941831588745, + "learning_rate": 7.396035064372214e-05, + "loss": 2.2192, + "step": 1068 + }, + { + "epoch": 0.35209354028572604, + "grad_norm": 3.0660107135772705, + "learning_rate": 7.39141655580506e-05, + "loss": 2.1387, + "step": 1069 + }, + { + "epoch": 0.3524229074889868, + "grad_norm": 3.537260055541992, + "learning_rate": 7.386795400107539e-05, + "loss": 2.1265, + "step": 1070 + }, + { + "epoch": 0.3527522746922475, + "grad_norm": 2.7157890796661377, + "learning_rate": 7.382171602394948e-05, + "loss": 1.7636, + "step": 1071 + }, + { + "epoch": 0.3530816418955083, + "grad_norm": 3.227700710296631, + "learning_rate": 7.377545167785506e-05, + "loss": 2.1963, + "step": 1072 + }, + { + "epoch": 0.353411009098769, + "grad_norm": 2.9469199180603027, + "learning_rate": 7.372916101400349e-05, + "loss": 1.8702, + "step": 1073 + }, + { + "epoch": 0.3537403763020297, + "grad_norm": 2.9933621883392334, + "learning_rate": 7.368284408363531e-05, + "loss": 1.6875, + "step": 1074 + }, + { + "epoch": 0.35406974350529047, + "grad_norm": 3.143301248550415, + "learning_rate": 7.363650093802012e-05, + "loss": 1.8475, + "step": 1075 + }, + { + "epoch": 0.3543991107085512, + "grad_norm": 1.9010746479034424, + "learning_rate": 7.35901316284565e-05, + "loss": 2.4547, + "step": 1076 + }, + { + "epoch": 0.35472847791181195, + "grad_norm": 1.9302737712860107, + "learning_rate": 7.354373620627205e-05, + "loss": 2.2665, + "step": 1077 + }, + { + "epoch": 0.35505784511507266, + "grad_norm": 2.267845392227173, + "learning_rate": 7.349731472282325e-05, + "loss": 2.3649, + "step": 1078 + }, + { + "epoch": 0.3553872123183334, + "grad_norm": 2.5052695274353027, + "learning_rate": 7.345086722949539e-05, + "loss": 2.3261, + "step": 1079 + }, + { + "epoch": 0.35571657952159413, + "grad_norm": 2.4212067127227783, + "learning_rate": 7.340439377770263e-05, + "loss": 2.1668, + "step": 1080 + }, + { + "epoch": 0.35604594672485484, + "grad_norm": 2.1286873817443848, + "learning_rate": 7.335789441888781e-05, + "loss": 2.1269, + "step": 1081 + }, + { + "epoch": 0.3563753139281156, + "grad_norm": 2.6201021671295166, + "learning_rate": 7.331136920452244e-05, + "loss": 2.498, + "step": 1082 + }, + { + "epoch": 0.3567046811313763, + "grad_norm": 2.8522377014160156, + "learning_rate": 7.326481818610668e-05, + "loss": 2.2456, + "step": 1083 + }, + { + "epoch": 0.3570340483346371, + "grad_norm": 2.749220609664917, + "learning_rate": 7.321824141516926e-05, + "loss": 2.2296, + "step": 1084 + }, + { + "epoch": 0.3573634155378978, + "grad_norm": 2.6423492431640625, + "learning_rate": 7.317163894326735e-05, + "loss": 2.0497, + "step": 1085 + }, + { + "epoch": 0.35769278274115857, + "grad_norm": 2.9978692531585693, + "learning_rate": 7.312501082198666e-05, + "loss": 2.4605, + "step": 1086 + }, + { + "epoch": 0.3580221499444193, + "grad_norm": 2.5553252696990967, + "learning_rate": 7.307835710294125e-05, + "loss": 1.9969, + "step": 1087 + }, + { + "epoch": 0.35835151714768004, + "grad_norm": 3.3244998455047607, + "learning_rate": 7.303167783777349e-05, + "loss": 2.2522, + "step": 1088 + }, + { + "epoch": 0.35868088435094075, + "grad_norm": 2.7964744567871094, + "learning_rate": 7.298497307815406e-05, + "loss": 2.2721, + "step": 1089 + }, + { + "epoch": 0.35901025155420146, + "grad_norm": 2.972792625427246, + "learning_rate": 7.293824287578185e-05, + "loss": 2.2599, + "step": 1090 + }, + { + "epoch": 0.35933961875746223, + "grad_norm": 2.953064441680908, + "learning_rate": 7.289148728238392e-05, + "loss": 2.4102, + "step": 1091 + }, + { + "epoch": 0.35966898596072294, + "grad_norm": 3.144014835357666, + "learning_rate": 7.284470634971544e-05, + "loss": 2.4154, + "step": 1092 + }, + { + "epoch": 0.3599983531639837, + "grad_norm": 2.5227506160736084, + "learning_rate": 7.279790012955961e-05, + "loss": 1.9305, + "step": 1093 + }, + { + "epoch": 0.3603277203672444, + "grad_norm": 3.1171419620513916, + "learning_rate": 7.275106867372762e-05, + "loss": 2.2716, + "step": 1094 + }, + { + "epoch": 0.3606570875705052, + "grad_norm": 2.6330418586730957, + "learning_rate": 7.270421203405863e-05, + "loss": 1.8731, + "step": 1095 + }, + { + "epoch": 0.3609864547737659, + "grad_norm": 2.5206286907196045, + "learning_rate": 7.265733026241966e-05, + "loss": 1.8349, + "step": 1096 + }, + { + "epoch": 0.36131582197702666, + "grad_norm": 2.8334505558013916, + "learning_rate": 7.261042341070552e-05, + "loss": 1.4885, + "step": 1097 + }, + { + "epoch": 0.3616451891802874, + "grad_norm": 3.205961227416992, + "learning_rate": 7.256349153083881e-05, + "loss": 1.9328, + "step": 1098 + }, + { + "epoch": 0.3619745563835481, + "grad_norm": 3.352240562438965, + "learning_rate": 7.251653467476983e-05, + "loss": 2.0997, + "step": 1099 + }, + { + "epoch": 0.36230392358680885, + "grad_norm": 3.4709959030151367, + "learning_rate": 7.246955289447653e-05, + "loss": 1.7248, + "step": 1100 + }, + { + "epoch": 0.36263329079006956, + "grad_norm": 2.071342945098877, + "learning_rate": 7.242254624196443e-05, + "loss": 2.2652, + "step": 1101 + }, + { + "epoch": 0.36296265799333033, + "grad_norm": 2.2652060985565186, + "learning_rate": 7.237551476926661e-05, + "loss": 2.3592, + "step": 1102 + }, + { + "epoch": 0.36329202519659104, + "grad_norm": 2.8345634937286377, + "learning_rate": 7.232845852844361e-05, + "loss": 2.6346, + "step": 1103 + }, + { + "epoch": 0.3636213923998518, + "grad_norm": 2.3296942710876465, + "learning_rate": 7.228137757158338e-05, + "loss": 2.0973, + "step": 1104 + }, + { + "epoch": 0.3639507596031125, + "grad_norm": 2.4769644737243652, + "learning_rate": 7.223427195080126e-05, + "loss": 2.2648, + "step": 1105 + }, + { + "epoch": 0.36428012680637323, + "grad_norm": 2.285839080810547, + "learning_rate": 7.218714171823984e-05, + "loss": 2.1648, + "step": 1106 + }, + { + "epoch": 0.364609494009634, + "grad_norm": 2.404283046722412, + "learning_rate": 7.2139986926069e-05, + "loss": 2.3887, + "step": 1107 + }, + { + "epoch": 0.3649388612128947, + "grad_norm": 2.4443814754486084, + "learning_rate": 7.209280762648576e-05, + "loss": 2.227, + "step": 1108 + }, + { + "epoch": 0.36526822841615547, + "grad_norm": 2.4258129596710205, + "learning_rate": 7.204560387171432e-05, + "loss": 2.2972, + "step": 1109 + }, + { + "epoch": 0.3655975956194162, + "grad_norm": 2.630659341812134, + "learning_rate": 7.199837571400591e-05, + "loss": 2.3323, + "step": 1110 + }, + { + "epoch": 0.36592696282267695, + "grad_norm": 2.332148313522339, + "learning_rate": 7.195112320563881e-05, + "loss": 2.0955, + "step": 1111 + }, + { + "epoch": 0.36625633002593766, + "grad_norm": 2.656656503677368, + "learning_rate": 7.190384639891822e-05, + "loss": 2.016, + "step": 1112 + }, + { + "epoch": 0.3665856972291984, + "grad_norm": 2.663341760635376, + "learning_rate": 7.185654534617623e-05, + "loss": 2.3012, + "step": 1113 + }, + { + "epoch": 0.36691506443245914, + "grad_norm": 2.5622754096984863, + "learning_rate": 7.180922009977181e-05, + "loss": 1.9749, + "step": 1114 + }, + { + "epoch": 0.36724443163571985, + "grad_norm": 2.865852117538452, + "learning_rate": 7.176187071209069e-05, + "loss": 1.648, + "step": 1115 + }, + { + "epoch": 0.3675737988389806, + "grad_norm": 2.8726089000701904, + "learning_rate": 7.171449723554531e-05, + "loss": 2.0493, + "step": 1116 + }, + { + "epoch": 0.3679031660422413, + "grad_norm": 2.720703125, + "learning_rate": 7.166709972257478e-05, + "loss": 2.3402, + "step": 1117 + }, + { + "epoch": 0.3682325332455021, + "grad_norm": 3.307445764541626, + "learning_rate": 7.161967822564483e-05, + "loss": 1.8724, + "step": 1118 + }, + { + "epoch": 0.3685619004487628, + "grad_norm": 3.013970375061035, + "learning_rate": 7.157223279724775e-05, + "loss": 2.1087, + "step": 1119 + }, + { + "epoch": 0.36889126765202357, + "grad_norm": 2.9190618991851807, + "learning_rate": 7.152476348990224e-05, + "loss": 2.1204, + "step": 1120 + }, + { + "epoch": 0.3692206348552843, + "grad_norm": 2.860560178756714, + "learning_rate": 7.147727035615355e-05, + "loss": 2.0234, + "step": 1121 + }, + { + "epoch": 0.36955000205854505, + "grad_norm": 3.1333353519439697, + "learning_rate": 7.142975344857325e-05, + "loss": 2.1354, + "step": 1122 + }, + { + "epoch": 0.36987936926180576, + "grad_norm": 3.1250181198120117, + "learning_rate": 7.138221281975919e-05, + "loss": 2.2142, + "step": 1123 + }, + { + "epoch": 0.37020873646506647, + "grad_norm": 3.049356460571289, + "learning_rate": 7.133464852233553e-05, + "loss": 1.9174, + "step": 1124 + }, + { + "epoch": 0.37053810366832723, + "grad_norm": 3.432244062423706, + "learning_rate": 7.12870606089526e-05, + "loss": 1.4808, + "step": 1125 + }, + { + "epoch": 0.37086747087158795, + "grad_norm": 1.7517294883728027, + "learning_rate": 7.123944913228688e-05, + "loss": 2.5009, + "step": 1126 + }, + { + "epoch": 0.3711968380748487, + "grad_norm": 2.627819299697876, + "learning_rate": 7.119181414504095e-05, + "loss": 2.4364, + "step": 1127 + }, + { + "epoch": 0.3715262052781094, + "grad_norm": 2.8850955963134766, + "learning_rate": 7.11441556999434e-05, + "loss": 2.4496, + "step": 1128 + }, + { + "epoch": 0.3718555724813702, + "grad_norm": 2.320338726043701, + "learning_rate": 7.109647384974876e-05, + "loss": 1.9873, + "step": 1129 + }, + { + "epoch": 0.3721849396846309, + "grad_norm": 2.7479662895202637, + "learning_rate": 7.104876864723751e-05, + "loss": 2.4404, + "step": 1130 + }, + { + "epoch": 0.3725143068878916, + "grad_norm": 2.618581533432007, + "learning_rate": 7.100104014521598e-05, + "loss": 2.0846, + "step": 1131 + }, + { + "epoch": 0.3728436740911524, + "grad_norm": 3.1042213439941406, + "learning_rate": 7.095328839651625e-05, + "loss": 2.6769, + "step": 1132 + }, + { + "epoch": 0.3731730412944131, + "grad_norm": 2.402482271194458, + "learning_rate": 7.090551345399616e-05, + "loss": 2.1385, + "step": 1133 + }, + { + "epoch": 0.37350240849767385, + "grad_norm": 2.716562271118164, + "learning_rate": 7.085771537053923e-05, + "loss": 2.2691, + "step": 1134 + }, + { + "epoch": 0.37383177570093457, + "grad_norm": 2.5292835235595703, + "learning_rate": 7.080989419905456e-05, + "loss": 2.0973, + "step": 1135 + }, + { + "epoch": 0.37416114290419533, + "grad_norm": 2.4309535026550293, + "learning_rate": 7.076204999247686e-05, + "loss": 2.2923, + "step": 1136 + }, + { + "epoch": 0.37449051010745604, + "grad_norm": 2.3152859210968018, + "learning_rate": 7.071418280376629e-05, + "loss": 1.898, + "step": 1137 + }, + { + "epoch": 0.3748198773107168, + "grad_norm": 2.7684056758880615, + "learning_rate": 7.06662926859085e-05, + "loss": 2.192, + "step": 1138 + }, + { + "epoch": 0.3751492445139775, + "grad_norm": 2.886624813079834, + "learning_rate": 7.061837969191445e-05, + "loss": 2.2443, + "step": 1139 + }, + { + "epoch": 0.37547861171723823, + "grad_norm": 2.6988983154296875, + "learning_rate": 7.05704438748205e-05, + "loss": 2.0046, + "step": 1140 + }, + { + "epoch": 0.375807978920499, + "grad_norm": 2.849220037460327, + "learning_rate": 7.05224852876882e-05, + "loss": 2.2765, + "step": 1141 + }, + { + "epoch": 0.3761373461237597, + "grad_norm": 2.494020462036133, + "learning_rate": 7.047450398360438e-05, + "loss": 2.1473, + "step": 1142 + }, + { + "epoch": 0.3764667133270205, + "grad_norm": 2.90225887298584, + "learning_rate": 7.042650001568097e-05, + "loss": 2.3484, + "step": 1143 + }, + { + "epoch": 0.3767960805302812, + "grad_norm": 2.873544692993164, + "learning_rate": 7.037847343705496e-05, + "loss": 2.0771, + "step": 1144 + }, + { + "epoch": 0.37712544773354195, + "grad_norm": 2.2210004329681396, + "learning_rate": 7.033042430088844e-05, + "loss": 1.6013, + "step": 1145 + }, + { + "epoch": 0.37745481493680266, + "grad_norm": 2.627661943435669, + "learning_rate": 7.028235266036841e-05, + "loss": 2.0004, + "step": 1146 + }, + { + "epoch": 0.37778418214006343, + "grad_norm": 2.7000410556793213, + "learning_rate": 7.023425856870683e-05, + "loss": 1.8592, + "step": 1147 + }, + { + "epoch": 0.37811354934332414, + "grad_norm": 3.242762327194214, + "learning_rate": 7.018614207914047e-05, + "loss": 2.1389, + "step": 1148 + }, + { + "epoch": 0.37844291654658485, + "grad_norm": 3.2626426219940186, + "learning_rate": 7.013800324493089e-05, + "loss": 2.0661, + "step": 1149 + }, + { + "epoch": 0.3787722837498456, + "grad_norm": 4.664367198944092, + "learning_rate": 7.008984211936446e-05, + "loss": 2.0999, + "step": 1150 + }, + { + "epoch": 0.37910165095310633, + "grad_norm": 1.8425556421279907, + "learning_rate": 7.00416587557521e-05, + "loss": 2.4092, + "step": 1151 + }, + { + "epoch": 0.3794310181563671, + "grad_norm": 2.1894047260284424, + "learning_rate": 6.999345320742945e-05, + "loss": 2.4542, + "step": 1152 + }, + { + "epoch": 0.3797603853596278, + "grad_norm": 2.095588207244873, + "learning_rate": 6.994522552775666e-05, + "loss": 2.1205, + "step": 1153 + }, + { + "epoch": 0.38008975256288857, + "grad_norm": 2.427074909210205, + "learning_rate": 6.98969757701184e-05, + "loss": 2.2412, + "step": 1154 + }, + { + "epoch": 0.3804191197661493, + "grad_norm": 2.322599411010742, + "learning_rate": 6.984870398792374e-05, + "loss": 2.0839, + "step": 1155 + }, + { + "epoch": 0.38074848696941, + "grad_norm": 2.316253900527954, + "learning_rate": 6.980041023460619e-05, + "loss": 2.2265, + "step": 1156 + }, + { + "epoch": 0.38107785417267076, + "grad_norm": 2.33427095413208, + "learning_rate": 6.975209456362353e-05, + "loss": 2.3765, + "step": 1157 + }, + { + "epoch": 0.38140722137593147, + "grad_norm": 2.23207426071167, + "learning_rate": 6.97037570284578e-05, + "loss": 1.9018, + "step": 1158 + }, + { + "epoch": 0.38173658857919224, + "grad_norm": 2.3566267490386963, + "learning_rate": 6.965539768261531e-05, + "loss": 2.1916, + "step": 1159 + }, + { + "epoch": 0.38206595578245295, + "grad_norm": 2.377549409866333, + "learning_rate": 6.960701657962641e-05, + "loss": 2.4266, + "step": 1160 + }, + { + "epoch": 0.3823953229857137, + "grad_norm": 2.494011163711548, + "learning_rate": 6.955861377304564e-05, + "loss": 2.1707, + "step": 1161 + }, + { + "epoch": 0.3827246901889744, + "grad_norm": 2.6809515953063965, + "learning_rate": 6.951018931645146e-05, + "loss": 2.3089, + "step": 1162 + }, + { + "epoch": 0.3830540573922352, + "grad_norm": 2.6300032138824463, + "learning_rate": 6.946174326344637e-05, + "loss": 2.1554, + "step": 1163 + }, + { + "epoch": 0.3833834245954959, + "grad_norm": 2.4809701442718506, + "learning_rate": 6.941327566765675e-05, + "loss": 2.1591, + "step": 1164 + }, + { + "epoch": 0.3837127917987566, + "grad_norm": 2.6750636100769043, + "learning_rate": 6.936478658273285e-05, + "loss": 2.3151, + "step": 1165 + }, + { + "epoch": 0.3840421590020174, + "grad_norm": 2.421334981918335, + "learning_rate": 6.931627606234865e-05, + "loss": 1.9933, + "step": 1166 + }, + { + "epoch": 0.3843715262052781, + "grad_norm": 2.758087158203125, + "learning_rate": 6.92677441602019e-05, + "loss": 2.2134, + "step": 1167 + }, + { + "epoch": 0.38470089340853886, + "grad_norm": 3.3581409454345703, + "learning_rate": 6.921919093001402e-05, + "loss": 2.3097, + "step": 1168 + }, + { + "epoch": 0.38503026061179957, + "grad_norm": 3.3593056201934814, + "learning_rate": 6.917061642553005e-05, + "loss": 1.9195, + "step": 1169 + }, + { + "epoch": 0.38535962781506033, + "grad_norm": 2.783876895904541, + "learning_rate": 6.91220207005185e-05, + "loss": 1.8634, + "step": 1170 + }, + { + "epoch": 0.38568899501832105, + "grad_norm": 2.8132247924804688, + "learning_rate": 6.907340380877149e-05, + "loss": 2.1477, + "step": 1171 + }, + { + "epoch": 0.3860183622215818, + "grad_norm": 2.878632068634033, + "learning_rate": 6.902476580410449e-05, + "loss": 2.254, + "step": 1172 + }, + { + "epoch": 0.3863477294248425, + "grad_norm": 3.4279277324676514, + "learning_rate": 6.897610674035634e-05, + "loss": 1.95, + "step": 1173 + }, + { + "epoch": 0.38667709662810323, + "grad_norm": 3.4778835773468018, + "learning_rate": 6.892742667138923e-05, + "loss": 1.8198, + "step": 1174 + }, + { + "epoch": 0.387006463831364, + "grad_norm": 3.2819323539733887, + "learning_rate": 6.887872565108859e-05, + "loss": 1.8538, + "step": 1175 + }, + { + "epoch": 0.3873358310346247, + "grad_norm": 1.8319649696350098, + "learning_rate": 6.883000373336299e-05, + "loss": 2.4531, + "step": 1176 + }, + { + "epoch": 0.3876651982378855, + "grad_norm": 2.054316520690918, + "learning_rate": 6.878126097214421e-05, + "loss": 1.9968, + "step": 1177 + }, + { + "epoch": 0.3879945654411462, + "grad_norm": 2.3420443534851074, + "learning_rate": 6.873249742138709e-05, + "loss": 2.2451, + "step": 1178 + }, + { + "epoch": 0.38832393264440696, + "grad_norm": 1.8880903720855713, + "learning_rate": 6.868371313506941e-05, + "loss": 2.3786, + "step": 1179 + }, + { + "epoch": 0.38865329984766767, + "grad_norm": 2.393928289413452, + "learning_rate": 6.863490816719196e-05, + "loss": 2.1902, + "step": 1180 + }, + { + "epoch": 0.3889826670509284, + "grad_norm": 2.3493289947509766, + "learning_rate": 6.858608257177846e-05, + "loss": 2.053, + "step": 1181 + }, + { + "epoch": 0.38931203425418914, + "grad_norm": 2.4995625019073486, + "learning_rate": 6.853723640287535e-05, + "loss": 2.3704, + "step": 1182 + }, + { + "epoch": 0.38964140145744985, + "grad_norm": 2.5445573329925537, + "learning_rate": 6.848836971455197e-05, + "loss": 1.9575, + "step": 1183 + }, + { + "epoch": 0.3899707686607106, + "grad_norm": 2.5418543815612793, + "learning_rate": 6.84394825609003e-05, + "loss": 2.4032, + "step": 1184 + }, + { + "epoch": 0.39030013586397133, + "grad_norm": 3.205739736557007, + "learning_rate": 6.839057499603497e-05, + "loss": 2.6626, + "step": 1185 + }, + { + "epoch": 0.3906295030672321, + "grad_norm": 2.529710531234741, + "learning_rate": 6.834164707409326e-05, + "loss": 2.1196, + "step": 1186 + }, + { + "epoch": 0.3909588702704928, + "grad_norm": 2.9174511432647705, + "learning_rate": 6.829269884923491e-05, + "loss": 2.4384, + "step": 1187 + }, + { + "epoch": 0.3912882374737536, + "grad_norm": 2.5731706619262695, + "learning_rate": 6.82437303756422e-05, + "loss": 2.21, + "step": 1188 + }, + { + "epoch": 0.3916176046770143, + "grad_norm": 3.202188014984131, + "learning_rate": 6.819474170751978e-05, + "loss": 2.5139, + "step": 1189 + }, + { + "epoch": 0.391946971880275, + "grad_norm": 2.6559245586395264, + "learning_rate": 6.814573289909466e-05, + "loss": 2.3044, + "step": 1190 + }, + { + "epoch": 0.39227633908353576, + "grad_norm": 3.1062824726104736, + "learning_rate": 6.809670400461618e-05, + "loss": 2.0021, + "step": 1191 + }, + { + "epoch": 0.3926057062867965, + "grad_norm": 2.513209342956543, + "learning_rate": 6.804765507835587e-05, + "loss": 1.8991, + "step": 1192 + }, + { + "epoch": 0.39293507349005724, + "grad_norm": 2.8009233474731445, + "learning_rate": 6.799858617460744e-05, + "loss": 2.3419, + "step": 1193 + }, + { + "epoch": 0.39326444069331795, + "grad_norm": 2.8235421180725098, + "learning_rate": 6.794949734768674e-05, + "loss": 2.2565, + "step": 1194 + }, + { + "epoch": 0.3935938078965787, + "grad_norm": 2.455665111541748, + "learning_rate": 6.790038865193167e-05, + "loss": 1.9415, + "step": 1195 + }, + { + "epoch": 0.39392317509983943, + "grad_norm": 2.5987555980682373, + "learning_rate": 6.785126014170207e-05, + "loss": 1.8636, + "step": 1196 + }, + { + "epoch": 0.3942525423031002, + "grad_norm": 2.7285478115081787, + "learning_rate": 6.780211187137981e-05, + "loss": 2.0818, + "step": 1197 + }, + { + "epoch": 0.3945819095063609, + "grad_norm": 3.124101161956787, + "learning_rate": 6.775294389536852e-05, + "loss": 2.1855, + "step": 1198 + }, + { + "epoch": 0.3949112767096216, + "grad_norm": 2.466757297515869, + "learning_rate": 6.770375626809373e-05, + "loss": 1.6696, + "step": 1199 + }, + { + "epoch": 0.3952406439128824, + "grad_norm": 3.0805959701538086, + "learning_rate": 6.76545490440027e-05, + "loss": 2.1575, + "step": 1200 + }, + { + "epoch": 0.3955700111161431, + "grad_norm": 2.310868978500366, + "learning_rate": 6.760532227756435e-05, + "loss": 2.3745, + "step": 1201 + }, + { + "epoch": 0.39589937831940386, + "grad_norm": 2.0340662002563477, + "learning_rate": 6.755607602326928e-05, + "loss": 2.2478, + "step": 1202 + }, + { + "epoch": 0.39622874552266457, + "grad_norm": 2.500797748565674, + "learning_rate": 6.750681033562964e-05, + "loss": 2.593, + "step": 1203 + }, + { + "epoch": 0.39655811272592534, + "grad_norm": 2.07173228263855, + "learning_rate": 6.745752526917907e-05, + "loss": 2.3154, + "step": 1204 + }, + { + "epoch": 0.39688747992918605, + "grad_norm": 2.0684971809387207, + "learning_rate": 6.74082208784727e-05, + "loss": 2.0726, + "step": 1205 + }, + { + "epoch": 0.39721684713244676, + "grad_norm": 2.2718148231506348, + "learning_rate": 6.735889721808703e-05, + "loss": 2.2801, + "step": 1206 + }, + { + "epoch": 0.3975462143357075, + "grad_norm": 2.2261784076690674, + "learning_rate": 6.730955434261986e-05, + "loss": 2.1202, + "step": 1207 + }, + { + "epoch": 0.39787558153896824, + "grad_norm": 2.9758946895599365, + "learning_rate": 6.726019230669034e-05, + "loss": 2.4945, + "step": 1208 + }, + { + "epoch": 0.398204948742229, + "grad_norm": 2.5161709785461426, + "learning_rate": 6.721081116493874e-05, + "loss": 2.1904, + "step": 1209 + }, + { + "epoch": 0.3985343159454897, + "grad_norm": 2.922379493713379, + "learning_rate": 6.716141097202657e-05, + "loss": 2.5583, + "step": 1210 + }, + { + "epoch": 0.3988636831487505, + "grad_norm": 2.4563610553741455, + "learning_rate": 6.711199178263632e-05, + "loss": 2.355, + "step": 1211 + }, + { + "epoch": 0.3991930503520112, + "grad_norm": 3.078686237335205, + "learning_rate": 6.706255365147161e-05, + "loss": 2.604, + "step": 1212 + }, + { + "epoch": 0.39952241755527196, + "grad_norm": 2.5945472717285156, + "learning_rate": 6.701309663325696e-05, + "loss": 2.3826, + "step": 1213 + }, + { + "epoch": 0.39985178475853267, + "grad_norm": 2.8002431392669678, + "learning_rate": 6.696362078273781e-05, + "loss": 2.1889, + "step": 1214 + }, + { + "epoch": 0.4001811519617934, + "grad_norm": 2.4396965503692627, + "learning_rate": 6.69141261546805e-05, + "loss": 1.9122, + "step": 1215 + }, + { + "epoch": 0.40051051916505415, + "grad_norm": 2.8927268981933594, + "learning_rate": 6.686461280387203e-05, + "loss": 2.2138, + "step": 1216 + }, + { + "epoch": 0.40083988636831486, + "grad_norm": 2.633314847946167, + "learning_rate": 6.681508078512031e-05, + "loss": 1.7317, + "step": 1217 + }, + { + "epoch": 0.4011692535715756, + "grad_norm": 3.017258882522583, + "learning_rate": 6.676553015325372e-05, + "loss": 2.1284, + "step": 1218 + }, + { + "epoch": 0.40149862077483633, + "grad_norm": 2.7072062492370605, + "learning_rate": 6.67159609631214e-05, + "loss": 1.8943, + "step": 1219 + }, + { + "epoch": 0.4018279879780971, + "grad_norm": 2.961946487426758, + "learning_rate": 6.666637326959293e-05, + "loss": 2.2615, + "step": 1220 + }, + { + "epoch": 0.4021573551813578, + "grad_norm": 2.648956775665283, + "learning_rate": 6.661676712755842e-05, + "loss": 1.9888, + "step": 1221 + }, + { + "epoch": 0.4024867223846186, + "grad_norm": 2.841571092605591, + "learning_rate": 6.656714259192839e-05, + "loss": 2.2434, + "step": 1222 + }, + { + "epoch": 0.4028160895878793, + "grad_norm": 2.6502773761749268, + "learning_rate": 6.65174997176337e-05, + "loss": 1.9564, + "step": 1223 + }, + { + "epoch": 0.40314545679114, + "grad_norm": 2.589456796646118, + "learning_rate": 6.646783855962555e-05, + "loss": 1.664, + "step": 1224 + }, + { + "epoch": 0.40347482399440077, + "grad_norm": 2.849358320236206, + "learning_rate": 6.641815917287535e-05, + "loss": 1.6624, + "step": 1225 + }, + { + "epoch": 0.4038041911976615, + "grad_norm": 2.1147572994232178, + "learning_rate": 6.63684616123747e-05, + "loss": 2.2939, + "step": 1226 + }, + { + "epoch": 0.40413355840092224, + "grad_norm": 2.6644229888916016, + "learning_rate": 6.631874593313531e-05, + "loss": 2.2582, + "step": 1227 + }, + { + "epoch": 0.40446292560418295, + "grad_norm": 2.094411849975586, + "learning_rate": 6.626901219018895e-05, + "loss": 2.3132, + "step": 1228 + }, + { + "epoch": 0.4047922928074437, + "grad_norm": 2.631877899169922, + "learning_rate": 6.621926043858739e-05, + "loss": 2.1575, + "step": 1229 + }, + { + "epoch": 0.40512166001070443, + "grad_norm": 2.610651731491089, + "learning_rate": 6.616949073340232e-05, + "loss": 2.3285, + "step": 1230 + }, + { + "epoch": 0.40545102721396514, + "grad_norm": 2.4871439933776855, + "learning_rate": 6.611970312972531e-05, + "loss": 2.0776, + "step": 1231 + }, + { + "epoch": 0.4057803944172259, + "grad_norm": 2.7471463680267334, + "learning_rate": 6.606989768266776e-05, + "loss": 2.5548, + "step": 1232 + }, + { + "epoch": 0.4061097616204866, + "grad_norm": 2.746669292449951, + "learning_rate": 6.602007444736077e-05, + "loss": 2.2351, + "step": 1233 + }, + { + "epoch": 0.4064391288237474, + "grad_norm": 2.907914161682129, + "learning_rate": 6.597023347895524e-05, + "loss": 2.7394, + "step": 1234 + }, + { + "epoch": 0.4067684960270081, + "grad_norm": 2.477755308151245, + "learning_rate": 6.592037483262156e-05, + "loss": 2.0522, + "step": 1235 + }, + { + "epoch": 0.40709786323026886, + "grad_norm": 2.537019968032837, + "learning_rate": 6.587049856354977e-05, + "loss": 2.2179, + "step": 1236 + }, + { + "epoch": 0.4074272304335296, + "grad_norm": 2.399907350540161, + "learning_rate": 6.582060472694939e-05, + "loss": 2.2994, + "step": 1237 + }, + { + "epoch": 0.40775659763679034, + "grad_norm": 2.9592201709747314, + "learning_rate": 6.577069337804944e-05, + "loss": 2.246, + "step": 1238 + }, + { + "epoch": 0.40808596484005105, + "grad_norm": 2.831124782562256, + "learning_rate": 6.572076457209822e-05, + "loss": 2.0093, + "step": 1239 + }, + { + "epoch": 0.40841533204331176, + "grad_norm": 2.620274066925049, + "learning_rate": 6.567081836436346e-05, + "loss": 1.8906, + "step": 1240 + }, + { + "epoch": 0.40874469924657253, + "grad_norm": 2.5152428150177, + "learning_rate": 6.562085481013211e-05, + "loss": 2.2982, + "step": 1241 + }, + { + "epoch": 0.40907406644983324, + "grad_norm": 3.2875349521636963, + "learning_rate": 6.55708739647103e-05, + "loss": 2.4582, + "step": 1242 + }, + { + "epoch": 0.409403433653094, + "grad_norm": 2.650205612182617, + "learning_rate": 6.552087588342332e-05, + "loss": 2.1196, + "step": 1243 + }, + { + "epoch": 0.4097328008563547, + "grad_norm": 2.7873198986053467, + "learning_rate": 6.547086062161555e-05, + "loss": 1.9238, + "step": 1244 + }, + { + "epoch": 0.4100621680596155, + "grad_norm": 2.823976993560791, + "learning_rate": 6.542082823465037e-05, + "loss": 1.9929, + "step": 1245 + }, + { + "epoch": 0.4103915352628762, + "grad_norm": 3.1395912170410156, + "learning_rate": 6.537077877791011e-05, + "loss": 2.3549, + "step": 1246 + }, + { + "epoch": 0.41072090246613696, + "grad_norm": 2.8609724044799805, + "learning_rate": 6.532071230679604e-05, + "loss": 2.1222, + "step": 1247 + }, + { + "epoch": 0.41105026966939767, + "grad_norm": 2.8728225231170654, + "learning_rate": 6.527062887672819e-05, + "loss": 2.1451, + "step": 1248 + }, + { + "epoch": 0.4113796368726584, + "grad_norm": 3.3821842670440674, + "learning_rate": 6.522052854314544e-05, + "loss": 2.3055, + "step": 1249 + }, + { + "epoch": 0.41170900407591915, + "grad_norm": 3.682586431503296, + "learning_rate": 6.517041136150534e-05, + "loss": 1.9983, + "step": 1250 + }, + { + "epoch": 0.41203837127917986, + "grad_norm": 2.3784186840057373, + "learning_rate": 6.512027738728407e-05, + "loss": 2.4458, + "step": 1251 + }, + { + "epoch": 0.4123677384824406, + "grad_norm": 2.384232997894287, + "learning_rate": 6.507012667597643e-05, + "loss": 2.5749, + "step": 1252 + }, + { + "epoch": 0.41269710568570134, + "grad_norm": 2.3909189701080322, + "learning_rate": 6.501995928309577e-05, + "loss": 2.3923, + "step": 1253 + }, + { + "epoch": 0.4130264728889621, + "grad_norm": 2.478942394256592, + "learning_rate": 6.496977526417383e-05, + "loss": 2.5673, + "step": 1254 + }, + { + "epoch": 0.4133558400922228, + "grad_norm": 2.6806704998016357, + "learning_rate": 6.491957467476081e-05, + "loss": 2.1562, + "step": 1255 + }, + { + "epoch": 0.4136852072954835, + "grad_norm": 2.777129888534546, + "learning_rate": 6.486935757042529e-05, + "loss": 2.3145, + "step": 1256 + }, + { + "epoch": 0.4140145744987443, + "grad_norm": 2.46478533744812, + "learning_rate": 6.481912400675402e-05, + "loss": 2.1051, + "step": 1257 + }, + { + "epoch": 0.414343941702005, + "grad_norm": 2.9172468185424805, + "learning_rate": 6.476887403935204e-05, + "loss": 2.32, + "step": 1258 + }, + { + "epoch": 0.41467330890526577, + "grad_norm": 2.7249600887298584, + "learning_rate": 6.471860772384256e-05, + "loss": 2.2816, + "step": 1259 + }, + { + "epoch": 0.4150026761085265, + "grad_norm": 2.6038973331451416, + "learning_rate": 6.466832511586687e-05, + "loss": 2.1756, + "step": 1260 + }, + { + "epoch": 0.41533204331178725, + "grad_norm": 2.6487905979156494, + "learning_rate": 6.461802627108426e-05, + "loss": 2.6021, + "step": 1261 + }, + { + "epoch": 0.41566141051504796, + "grad_norm": 2.545063018798828, + "learning_rate": 6.456771124517205e-05, + "loss": 2.0434, + "step": 1262 + }, + { + "epoch": 0.4159907777183087, + "grad_norm": 2.7668240070343018, + "learning_rate": 6.451738009382542e-05, + "loss": 2.3299, + "step": 1263 + }, + { + "epoch": 0.41632014492156943, + "grad_norm": 2.4981627464294434, + "learning_rate": 6.446703287275745e-05, + "loss": 1.9392, + "step": 1264 + }, + { + "epoch": 0.41664951212483015, + "grad_norm": 2.7257003784179688, + "learning_rate": 6.441666963769897e-05, + "loss": 2.0421, + "step": 1265 + }, + { + "epoch": 0.4169788793280909, + "grad_norm": 2.8627986907958984, + "learning_rate": 6.436629044439854e-05, + "loss": 2.3554, + "step": 1266 + }, + { + "epoch": 0.4173082465313516, + "grad_norm": 2.81748628616333, + "learning_rate": 6.43158953486224e-05, + "loss": 1.9084, + "step": 1267 + }, + { + "epoch": 0.4176376137346124, + "grad_norm": 2.941399335861206, + "learning_rate": 6.426548440615438e-05, + "loss": 2.0619, + "step": 1268 + }, + { + "epoch": 0.4179669809378731, + "grad_norm": 3.05411434173584, + "learning_rate": 6.421505767279588e-05, + "loss": 2.1444, + "step": 1269 + }, + { + "epoch": 0.41829634814113387, + "grad_norm": 2.318077564239502, + "learning_rate": 6.416461520436571e-05, + "loss": 1.7884, + "step": 1270 + }, + { + "epoch": 0.4186257153443946, + "grad_norm": 2.866882562637329, + "learning_rate": 6.411415705670021e-05, + "loss": 2.0699, + "step": 1271 + }, + { + "epoch": 0.41895508254765534, + "grad_norm": 2.7729244232177734, + "learning_rate": 6.406368328565295e-05, + "loss": 2.2929, + "step": 1272 + }, + { + "epoch": 0.41928444975091606, + "grad_norm": 3.3135483264923096, + "learning_rate": 6.401319394709489e-05, + "loss": 2.4658, + "step": 1273 + }, + { + "epoch": 0.41961381695417677, + "grad_norm": 3.1503915786743164, + "learning_rate": 6.396268909691414e-05, + "loss": 2.117, + "step": 1274 + }, + { + "epoch": 0.41994318415743753, + "grad_norm": 2.9462268352508545, + "learning_rate": 6.391216879101608e-05, + "loss": 2.0571, + "step": 1275 + }, + { + "epoch": 0.42027255136069824, + "grad_norm": 2.186408042907715, + "learning_rate": 6.386163308532314e-05, + "loss": 2.5956, + "step": 1276 + }, + { + "epoch": 0.420601918563959, + "grad_norm": 2.4059898853302, + "learning_rate": 6.381108203577476e-05, + "loss": 2.4601, + "step": 1277 + }, + { + "epoch": 0.4209312857672197, + "grad_norm": 2.4408154487609863, + "learning_rate": 6.376051569832742e-05, + "loss": 2.537, + "step": 1278 + }, + { + "epoch": 0.4212606529704805, + "grad_norm": 2.3048956394195557, + "learning_rate": 6.370993412895454e-05, + "loss": 2.4092, + "step": 1279 + }, + { + "epoch": 0.4215900201737412, + "grad_norm": 2.4109179973602295, + "learning_rate": 6.365933738364634e-05, + "loss": 2.2523, + "step": 1280 + }, + { + "epoch": 0.4219193873770019, + "grad_norm": 2.6136887073516846, + "learning_rate": 6.360872551840988e-05, + "loss": 2.0975, + "step": 1281 + }, + { + "epoch": 0.4222487545802627, + "grad_norm": 2.697741985321045, + "learning_rate": 6.355809858926893e-05, + "loss": 2.5818, + "step": 1282 + }, + { + "epoch": 0.4225781217835234, + "grad_norm": 2.3792057037353516, + "learning_rate": 6.350745665226396e-05, + "loss": 2.1916, + "step": 1283 + }, + { + "epoch": 0.42290748898678415, + "grad_norm": 2.742786169052124, + "learning_rate": 6.345679976345205e-05, + "loss": 2.5287, + "step": 1284 + }, + { + "epoch": 0.42323685619004486, + "grad_norm": 2.550384998321533, + "learning_rate": 6.34061279789068e-05, + "loss": 2.3782, + "step": 1285 + }, + { + "epoch": 0.42356622339330563, + "grad_norm": 3.0929136276245117, + "learning_rate": 6.335544135471834e-05, + "loss": 2.3501, + "step": 1286 + }, + { + "epoch": 0.42389559059656634, + "grad_norm": 2.920880079269409, + "learning_rate": 6.330473994699318e-05, + "loss": 2.3128, + "step": 1287 + }, + { + "epoch": 0.4242249577998271, + "grad_norm": 2.258695125579834, + "learning_rate": 6.325402381185426e-05, + "loss": 2.0221, + "step": 1288 + }, + { + "epoch": 0.4245543250030878, + "grad_norm": 2.6346912384033203, + "learning_rate": 6.320329300544076e-05, + "loss": 2.2641, + "step": 1289 + }, + { + "epoch": 0.42488369220634853, + "grad_norm": 3.050056219100952, + "learning_rate": 6.315254758390814e-05, + "loss": 2.4125, + "step": 1290 + }, + { + "epoch": 0.4252130594096093, + "grad_norm": 2.530167818069458, + "learning_rate": 6.3101787603428e-05, + "loss": 2.0573, + "step": 1291 + }, + { + "epoch": 0.42554242661287, + "grad_norm": 3.0110995769500732, + "learning_rate": 6.305101312018809e-05, + "loss": 2.6722, + "step": 1292 + }, + { + "epoch": 0.4258717938161308, + "grad_norm": 2.6352243423461914, + "learning_rate": 6.300022419039219e-05, + "loss": 2.0293, + "step": 1293 + }, + { + "epoch": 0.4262011610193915, + "grad_norm": 2.864758253097534, + "learning_rate": 6.294942087026011e-05, + "loss": 1.9841, + "step": 1294 + }, + { + "epoch": 0.42653052822265225, + "grad_norm": 3.216989517211914, + "learning_rate": 6.289860321602754e-05, + "loss": 2.005, + "step": 1295 + }, + { + "epoch": 0.42685989542591296, + "grad_norm": 3.134946584701538, + "learning_rate": 6.284777128394603e-05, + "loss": 2.3214, + "step": 1296 + }, + { + "epoch": 0.4271892626291737, + "grad_norm": 3.151312828063965, + "learning_rate": 6.279692513028304e-05, + "loss": 2.2082, + "step": 1297 + }, + { + "epoch": 0.42751862983243444, + "grad_norm": 2.858823776245117, + "learning_rate": 6.274606481132163e-05, + "loss": 2.0439, + "step": 1298 + }, + { + "epoch": 0.42784799703569515, + "grad_norm": 2.6305992603302, + "learning_rate": 6.269519038336062e-05, + "loss": 1.7326, + "step": 1299 + }, + { + "epoch": 0.4281773642389559, + "grad_norm": 4.182598114013672, + "learning_rate": 6.264430190271444e-05, + "loss": 1.8915, + "step": 1300 + }, + { + "epoch": 0.4285067314422166, + "grad_norm": 2.144503116607666, + "learning_rate": 6.259339942571307e-05, + "loss": 2.3162, + "step": 1301 + }, + { + "epoch": 0.4288360986454774, + "grad_norm": 2.138836622238159, + "learning_rate": 6.254248300870198e-05, + "loss": 2.3545, + "step": 1302 + }, + { + "epoch": 0.4291654658487381, + "grad_norm": 2.6747050285339355, + "learning_rate": 6.249155270804206e-05, + "loss": 2.6766, + "step": 1303 + }, + { + "epoch": 0.42949483305199887, + "grad_norm": 2.2941064834594727, + "learning_rate": 6.24406085801096e-05, + "loss": 2.6815, + "step": 1304 + }, + { + "epoch": 0.4298242002552596, + "grad_norm": 2.450798988342285, + "learning_rate": 6.238965068129616e-05, + "loss": 2.3887, + "step": 1305 + }, + { + "epoch": 0.4301535674585203, + "grad_norm": 2.7236177921295166, + "learning_rate": 6.233867906800856e-05, + "loss": 2.6746, + "step": 1306 + }, + { + "epoch": 0.43048293466178106, + "grad_norm": 2.221660852432251, + "learning_rate": 6.22876937966688e-05, + "loss": 2.4207, + "step": 1307 + }, + { + "epoch": 0.43081230186504177, + "grad_norm": 2.9190986156463623, + "learning_rate": 6.2236694923714e-05, + "loss": 2.5129, + "step": 1308 + }, + { + "epoch": 0.43114166906830254, + "grad_norm": 2.183797597885132, + "learning_rate": 6.218568250559634e-05, + "loss": 2.209, + "step": 1309 + }, + { + "epoch": 0.43147103627156325, + "grad_norm": 2.7680306434631348, + "learning_rate": 6.2134656598783e-05, + "loss": 1.9334, + "step": 1310 + }, + { + "epoch": 0.431800403474824, + "grad_norm": 2.6052422523498535, + "learning_rate": 6.208361725975605e-05, + "loss": 2.2873, + "step": 1311 + }, + { + "epoch": 0.4321297706780847, + "grad_norm": 2.873652696609497, + "learning_rate": 6.203256454501248e-05, + "loss": 2.5024, + "step": 1312 + }, + { + "epoch": 0.4324591378813455, + "grad_norm": 3.202440023422241, + "learning_rate": 6.198149851106407e-05, + "loss": 2.2169, + "step": 1313 + }, + { + "epoch": 0.4327885050846062, + "grad_norm": 2.7076804637908936, + "learning_rate": 6.19304192144373e-05, + "loss": 2.0037, + "step": 1314 + }, + { + "epoch": 0.4331178722878669, + "grad_norm": 3.2877442836761475, + "learning_rate": 6.187932671167342e-05, + "loss": 2.6063, + "step": 1315 + }, + { + "epoch": 0.4334472394911277, + "grad_norm": 2.6223089694976807, + "learning_rate": 6.18282210593282e-05, + "loss": 1.959, + "step": 1316 + }, + { + "epoch": 0.4337766066943884, + "grad_norm": 3.001115083694458, + "learning_rate": 6.177710231397203e-05, + "loss": 2.1598, + "step": 1317 + }, + { + "epoch": 0.43410597389764916, + "grad_norm": 2.4432201385498047, + "learning_rate": 6.172597053218978e-05, + "loss": 1.7752, + "step": 1318 + }, + { + "epoch": 0.43443534110090987, + "grad_norm": 2.4193572998046875, + "learning_rate": 6.167482577058075e-05, + "loss": 2.0442, + "step": 1319 + }, + { + "epoch": 0.43476470830417063, + "grad_norm": 2.7738735675811768, + "learning_rate": 6.162366808575857e-05, + "loss": 2.1012, + "step": 1320 + }, + { + "epoch": 0.43509407550743134, + "grad_norm": 2.398259401321411, + "learning_rate": 6.157249753435124e-05, + "loss": 1.641, + "step": 1321 + }, + { + "epoch": 0.4354234427106921, + "grad_norm": 2.803773880004883, + "learning_rate": 6.152131417300098e-05, + "loss": 2.1607, + "step": 1322 + }, + { + "epoch": 0.4357528099139528, + "grad_norm": 3.041407346725464, + "learning_rate": 6.147011805836414e-05, + "loss": 1.8865, + "step": 1323 + }, + { + "epoch": 0.43608217711721353, + "grad_norm": 4.000000476837158, + "learning_rate": 6.141890924711126e-05, + "loss": 1.8523, + "step": 1324 + }, + { + "epoch": 0.4364115443204743, + "grad_norm": 3.650296688079834, + "learning_rate": 6.136768779592691e-05, + "loss": 2.2463, + "step": 1325 + }, + { + "epoch": 0.436740911523735, + "grad_norm": 2.288025140762329, + "learning_rate": 6.13164537615096e-05, + "loss": 2.5091, + "step": 1326 + }, + { + "epoch": 0.4370702787269958, + "grad_norm": 2.0283560752868652, + "learning_rate": 6.126520720057186e-05, + "loss": 2.2552, + "step": 1327 + }, + { + "epoch": 0.4373996459302565, + "grad_norm": 2.147888422012329, + "learning_rate": 6.121394816984e-05, + "loss": 2.3754, + "step": 1328 + }, + { + "epoch": 0.43772901313351725, + "grad_norm": 2.3866569995880127, + "learning_rate": 6.11626767260542e-05, + "loss": 2.2485, + "step": 1329 + }, + { + "epoch": 0.43805838033677796, + "grad_norm": 2.4724600315093994, + "learning_rate": 6.111139292596834e-05, + "loss": 2.3486, + "step": 1330 + }, + { + "epoch": 0.4383877475400387, + "grad_norm": 2.1698498725891113, + "learning_rate": 6.106009682634997e-05, + "loss": 2.2673, + "step": 1331 + }, + { + "epoch": 0.43871711474329944, + "grad_norm": 2.1860690116882324, + "learning_rate": 6.100878848398032e-05, + "loss": 2.1531, + "step": 1332 + }, + { + "epoch": 0.43904648194656015, + "grad_norm": 2.3432462215423584, + "learning_rate": 6.095746795565408e-05, + "loss": 2.3907, + "step": 1333 + }, + { + "epoch": 0.4393758491498209, + "grad_norm": 3.00425386428833, + "learning_rate": 6.090613529817949e-05, + "loss": 2.5884, + "step": 1334 + }, + { + "epoch": 0.43970521635308163, + "grad_norm": 3.2894668579101562, + "learning_rate": 6.085479056837821e-05, + "loss": 2.2589, + "step": 1335 + }, + { + "epoch": 0.4400345835563424, + "grad_norm": 2.9489943981170654, + "learning_rate": 6.0803433823085244e-05, + "loss": 2.2962, + "step": 1336 + }, + { + "epoch": 0.4403639507596031, + "grad_norm": 3.29754638671875, + "learning_rate": 6.075206511914891e-05, + "loss": 2.4655, + "step": 1337 + }, + { + "epoch": 0.4406933179628639, + "grad_norm": 2.559438705444336, + "learning_rate": 6.070068451343074e-05, + "loss": 2.2637, + "step": 1338 + }, + { + "epoch": 0.4410226851661246, + "grad_norm": 3.065319061279297, + "learning_rate": 6.0649292062805494e-05, + "loss": 2.1736, + "step": 1339 + }, + { + "epoch": 0.4413520523693853, + "grad_norm": 2.943509101867676, + "learning_rate": 6.059788782416099e-05, + "loss": 2.0683, + "step": 1340 + }, + { + "epoch": 0.44168141957264606, + "grad_norm": 2.7584707736968994, + "learning_rate": 6.054647185439814e-05, + "loss": 2.3206, + "step": 1341 + }, + { + "epoch": 0.44201078677590677, + "grad_norm": 2.789400577545166, + "learning_rate": 6.049504421043078e-05, + "loss": 1.8737, + "step": 1342 + }, + { + "epoch": 0.44234015397916754, + "grad_norm": 2.9476754665374756, + "learning_rate": 6.0443604949185706e-05, + "loss": 2.2538, + "step": 1343 + }, + { + "epoch": 0.44266952118242825, + "grad_norm": 2.6708343029022217, + "learning_rate": 6.0392154127602595e-05, + "loss": 2.156, + "step": 1344 + }, + { + "epoch": 0.442998888385689, + "grad_norm": 2.7900948524475098, + "learning_rate": 6.0340691802633884e-05, + "loss": 1.8873, + "step": 1345 + }, + { + "epoch": 0.4433282555889497, + "grad_norm": 2.6233713626861572, + "learning_rate": 6.028921803124476e-05, + "loss": 1.7659, + "step": 1346 + }, + { + "epoch": 0.4436576227922105, + "grad_norm": 2.7721614837646484, + "learning_rate": 6.023773287041308e-05, + "loss": 2.0038, + "step": 1347 + }, + { + "epoch": 0.4439869899954712, + "grad_norm": 2.7499945163726807, + "learning_rate": 6.01862363771293e-05, + "loss": 2.1821, + "step": 1348 + }, + { + "epoch": 0.4443163571987319, + "grad_norm": 2.655479907989502, + "learning_rate": 6.013472860839642e-05, + "loss": 1.9229, + "step": 1349 + }, + { + "epoch": 0.4446457244019927, + "grad_norm": 2.927269458770752, + "learning_rate": 6.008320962122994e-05, + "loss": 1.8664, + "step": 1350 + }, + { + "epoch": 0.4449750916052534, + "grad_norm": 1.9483531713485718, + "learning_rate": 6.003167947265777e-05, + "loss": 2.5558, + "step": 1351 + }, + { + "epoch": 0.44530445880851416, + "grad_norm": 2.5765058994293213, + "learning_rate": 5.9980138219720125e-05, + "loss": 2.6047, + "step": 1352 + }, + { + "epoch": 0.44563382601177487, + "grad_norm": 2.483147382736206, + "learning_rate": 5.992858591946961e-05, + "loss": 2.324, + "step": 1353 + }, + { + "epoch": 0.44596319321503564, + "grad_norm": 2.8535609245300293, + "learning_rate": 5.987702262897098e-05, + "loss": 2.395, + "step": 1354 + }, + { + "epoch": 0.44629256041829635, + "grad_norm": 2.455509662628174, + "learning_rate": 5.9825448405301175e-05, + "loss": 2.3842, + "step": 1355 + }, + { + "epoch": 0.44662192762155706, + "grad_norm": 2.347088575363159, + "learning_rate": 5.977386330554926e-05, + "loss": 2.2313, + "step": 1356 + }, + { + "epoch": 0.4469512948248178, + "grad_norm": 2.4735326766967773, + "learning_rate": 5.9722267386816324e-05, + "loss": 2.1965, + "step": 1357 + }, + { + "epoch": 0.44728066202807854, + "grad_norm": 2.5871574878692627, + "learning_rate": 5.967066070621541e-05, + "loss": 2.2823, + "step": 1358 + }, + { + "epoch": 0.4476100292313393, + "grad_norm": 2.4709925651550293, + "learning_rate": 5.9619043320871494e-05, + "loss": 2.2467, + "step": 1359 + }, + { + "epoch": 0.4479393964346, + "grad_norm": 2.6086020469665527, + "learning_rate": 5.956741528792142e-05, + "loss": 2.2654, + "step": 1360 + }, + { + "epoch": 0.4482687636378608, + "grad_norm": 2.7858760356903076, + "learning_rate": 5.951577666451379e-05, + "loss": 2.3658, + "step": 1361 + }, + { + "epoch": 0.4485981308411215, + "grad_norm": 2.6391570568084717, + "learning_rate": 5.946412750780892e-05, + "loss": 2.1763, + "step": 1362 + }, + { + "epoch": 0.44892749804438226, + "grad_norm": 2.597666025161743, + "learning_rate": 5.941246787497884e-05, + "loss": 2.1193, + "step": 1363 + }, + { + "epoch": 0.44925686524764297, + "grad_norm": 2.591607093811035, + "learning_rate": 5.9360797823207104e-05, + "loss": 1.9894, + "step": 1364 + }, + { + "epoch": 0.4495862324509037, + "grad_norm": 2.4428396224975586, + "learning_rate": 5.930911740968884e-05, + "loss": 2.1478, + "step": 1365 + }, + { + "epoch": 0.44991559965416444, + "grad_norm": 2.8427088260650635, + "learning_rate": 5.9257426691630656e-05, + "loss": 2.0804, + "step": 1366 + }, + { + "epoch": 0.45024496685742516, + "grad_norm": 2.4200117588043213, + "learning_rate": 5.920572572625056e-05, + "loss": 2.0263, + "step": 1367 + }, + { + "epoch": 0.4505743340606859, + "grad_norm": 2.4792933464050293, + "learning_rate": 5.915401457077785e-05, + "loss": 2.1457, + "step": 1368 + }, + { + "epoch": 0.45090370126394663, + "grad_norm": 3.0544183254241943, + "learning_rate": 5.910229328245319e-05, + "loss": 2.476, + "step": 1369 + }, + { + "epoch": 0.4512330684672074, + "grad_norm": 3.122922897338867, + "learning_rate": 5.90505619185284e-05, + "loss": 1.8849, + "step": 1370 + }, + { + "epoch": 0.4515624356704681, + "grad_norm": 3.2922286987304688, + "learning_rate": 5.899882053626646e-05, + "loss": 2.1517, + "step": 1371 + }, + { + "epoch": 0.4518918028737289, + "grad_norm": 3.095803737640381, + "learning_rate": 5.8947069192941493e-05, + "loss": 2.1022, + "step": 1372 + }, + { + "epoch": 0.4522211700769896, + "grad_norm": 3.426305055618286, + "learning_rate": 5.889530794583855e-05, + "loss": 2.1558, + "step": 1373 + }, + { + "epoch": 0.4525505372802503, + "grad_norm": 3.2254509925842285, + "learning_rate": 5.8843536852253745e-05, + "loss": 2.2135, + "step": 1374 + }, + { + "epoch": 0.45287990448351106, + "grad_norm": 3.3690662384033203, + "learning_rate": 5.879175596949401e-05, + "loss": 2.0148, + "step": 1375 + }, + { + "epoch": 0.4532092716867718, + "grad_norm": 2.666074275970459, + "learning_rate": 5.8739965354877194e-05, + "loss": 2.445, + "step": 1376 + }, + { + "epoch": 0.45353863889003254, + "grad_norm": 2.3343346118927, + "learning_rate": 5.8688165065731826e-05, + "loss": 2.332, + "step": 1377 + }, + { + "epoch": 0.45386800609329325, + "grad_norm": 2.3201119899749756, + "learning_rate": 5.8636355159397225e-05, + "loss": 2.5834, + "step": 1378 + }, + { + "epoch": 0.454197373296554, + "grad_norm": 2.493422269821167, + "learning_rate": 5.858453569322332e-05, + "loss": 2.5553, + "step": 1379 + }, + { + "epoch": 0.45452674049981473, + "grad_norm": 2.374323606491089, + "learning_rate": 5.853270672457061e-05, + "loss": 2.1962, + "step": 1380 + }, + { + "epoch": 0.45485610770307544, + "grad_norm": 2.812485694885254, + "learning_rate": 5.8480868310810124e-05, + "loss": 2.3164, + "step": 1381 + }, + { + "epoch": 0.4551854749063362, + "grad_norm": 2.3375439643859863, + "learning_rate": 5.8429020509323385e-05, + "loss": 2.4357, + "step": 1382 + }, + { + "epoch": 0.4555148421095969, + "grad_norm": 2.376716136932373, + "learning_rate": 5.837716337750223e-05, + "loss": 2.5416, + "step": 1383 + }, + { + "epoch": 0.4558442093128577, + "grad_norm": 2.5697383880615234, + "learning_rate": 5.8325296972748864e-05, + "loss": 2.3119, + "step": 1384 + }, + { + "epoch": 0.4561735765161184, + "grad_norm": 2.5520308017730713, + "learning_rate": 5.827342135247581e-05, + "loss": 2.3784, + "step": 1385 + }, + { + "epoch": 0.45650294371937916, + "grad_norm": 2.750321865081787, + "learning_rate": 5.8221536574105694e-05, + "loss": 2.5948, + "step": 1386 + }, + { + "epoch": 0.4568323109226399, + "grad_norm": 2.980220317840576, + "learning_rate": 5.816964269507135e-05, + "loss": 2.5563, + "step": 1387 + }, + { + "epoch": 0.45716167812590064, + "grad_norm": 2.898198366165161, + "learning_rate": 5.811773977281565e-05, + "loss": 2.2648, + "step": 1388 + }, + { + "epoch": 0.45749104532916135, + "grad_norm": 3.0173451900482178, + "learning_rate": 5.806582786479149e-05, + "loss": 2.2571, + "step": 1389 + }, + { + "epoch": 0.45782041253242206, + "grad_norm": 2.6375420093536377, + "learning_rate": 5.801390702846171e-05, + "loss": 2.0654, + "step": 1390 + }, + { + "epoch": 0.4581497797356828, + "grad_norm": 2.8282692432403564, + "learning_rate": 5.796197732129905e-05, + "loss": 2.3666, + "step": 1391 + }, + { + "epoch": 0.45847914693894354, + "grad_norm": 2.815434694290161, + "learning_rate": 5.7910038800786e-05, + "loss": 2.2048, + "step": 1392 + }, + { + "epoch": 0.4588085141422043, + "grad_norm": 2.911081075668335, + "learning_rate": 5.7858091524414926e-05, + "loss": 2.3954, + "step": 1393 + }, + { + "epoch": 0.459137881345465, + "grad_norm": 2.4357879161834717, + "learning_rate": 5.780613554968777e-05, + "loss": 1.9379, + "step": 1394 + }, + { + "epoch": 0.4594672485487258, + "grad_norm": 3.2798242568969727, + "learning_rate": 5.775417093411619e-05, + "loss": 2.3849, + "step": 1395 + }, + { + "epoch": 0.4597966157519865, + "grad_norm": 2.6205365657806396, + "learning_rate": 5.770219773522133e-05, + "loss": 2.2528, + "step": 1396 + }, + { + "epoch": 0.4601259829552472, + "grad_norm": 2.84399676322937, + "learning_rate": 5.765021601053391e-05, + "loss": 1.8288, + "step": 1397 + }, + { + "epoch": 0.46045535015850797, + "grad_norm": 2.869101047515869, + "learning_rate": 5.7598225817594035e-05, + "loss": 2.3284, + "step": 1398 + }, + { + "epoch": 0.4607847173617687, + "grad_norm": 2.6911323070526123, + "learning_rate": 5.754622721395119e-05, + "loss": 2.0477, + "step": 1399 + }, + { + "epoch": 0.46111408456502945, + "grad_norm": 3.472745656967163, + "learning_rate": 5.74942202571642e-05, + "loss": 1.8559, + "step": 1400 + }, + { + "epoch": 0.46144345176829016, + "grad_norm": 2.722027063369751, + "learning_rate": 5.744220500480113e-05, + "loss": 2.5318, + "step": 1401 + }, + { + "epoch": 0.4617728189715509, + "grad_norm": 2.0918314456939697, + "learning_rate": 5.739018151443918e-05, + "loss": 2.1847, + "step": 1402 + }, + { + "epoch": 0.46210218617481164, + "grad_norm": 2.0019867420196533, + "learning_rate": 5.733814984366474e-05, + "loss": 2.2064, + "step": 1403 + }, + { + "epoch": 0.4624315533780724, + "grad_norm": 2.3096742630004883, + "learning_rate": 5.7286110050073194e-05, + "loss": 2.0878, + "step": 1404 + }, + { + "epoch": 0.4627609205813331, + "grad_norm": 2.7274067401885986, + "learning_rate": 5.723406219126895e-05, + "loss": 2.2556, + "step": 1405 + }, + { + "epoch": 0.4630902877845938, + "grad_norm": 2.2362143993377686, + "learning_rate": 5.718200632486534e-05, + "loss": 2.0013, + "step": 1406 + }, + { + "epoch": 0.4634196549878546, + "grad_norm": 2.458976984024048, + "learning_rate": 5.7129942508484556e-05, + "loss": 2.1049, + "step": 1407 + }, + { + "epoch": 0.4637490221911153, + "grad_norm": 2.278069496154785, + "learning_rate": 5.707787079975758e-05, + "loss": 2.0951, + "step": 1408 + }, + { + "epoch": 0.46407838939437607, + "grad_norm": 2.235696315765381, + "learning_rate": 5.702579125632416e-05, + "loss": 2.4472, + "step": 1409 + }, + { + "epoch": 0.4644077565976368, + "grad_norm": 2.662332773208618, + "learning_rate": 5.697370393583269e-05, + "loss": 2.1896, + "step": 1410 + }, + { + "epoch": 0.46473712380089754, + "grad_norm": 2.677049398422241, + "learning_rate": 5.692160889594017e-05, + "loss": 2.0806, + "step": 1411 + }, + { + "epoch": 0.46506649100415826, + "grad_norm": 2.7263095378875732, + "learning_rate": 5.686950619431215e-05, + "loss": 2.1826, + "step": 1412 + }, + { + "epoch": 0.465395858207419, + "grad_norm": 2.625284194946289, + "learning_rate": 5.6817395888622694e-05, + "loss": 2.4464, + "step": 1413 + }, + { + "epoch": 0.46572522541067973, + "grad_norm": 2.6425976753234863, + "learning_rate": 5.6765278036554225e-05, + "loss": 2.0412, + "step": 1414 + }, + { + "epoch": 0.46605459261394044, + "grad_norm": 2.918874502182007, + "learning_rate": 5.671315269579756e-05, + "loss": 2.3, + "step": 1415 + }, + { + "epoch": 0.4663839598172012, + "grad_norm": 2.5336811542510986, + "learning_rate": 5.6661019924051814e-05, + "loss": 2.4006, + "step": 1416 + }, + { + "epoch": 0.4667133270204619, + "grad_norm": 2.6169357299804688, + "learning_rate": 5.6608879779024274e-05, + "loss": 2.1464, + "step": 1417 + }, + { + "epoch": 0.4670426942237227, + "grad_norm": 2.734600305557251, + "learning_rate": 5.6556732318430437e-05, + "loss": 2.0906, + "step": 1418 + }, + { + "epoch": 0.4673720614269834, + "grad_norm": 2.7634074687957764, + "learning_rate": 5.650457759999389e-05, + "loss": 2.1538, + "step": 1419 + }, + { + "epoch": 0.46770142863024416, + "grad_norm": 2.391335964202881, + "learning_rate": 5.6452415681446256e-05, + "loss": 1.7893, + "step": 1420 + }, + { + "epoch": 0.4680307958335049, + "grad_norm": 2.9604604244232178, + "learning_rate": 5.6400246620527096e-05, + "loss": 2.1226, + "step": 1421 + }, + { + "epoch": 0.4683601630367656, + "grad_norm": 2.650703191757202, + "learning_rate": 5.6348070474983905e-05, + "loss": 1.8164, + "step": 1422 + }, + { + "epoch": 0.46868953024002635, + "grad_norm": 2.603179454803467, + "learning_rate": 5.629588730257205e-05, + "loss": 1.6553, + "step": 1423 + }, + { + "epoch": 0.46901889744328706, + "grad_norm": 2.903170108795166, + "learning_rate": 5.6243697161054584e-05, + "loss": 1.8419, + "step": 1424 + }, + { + "epoch": 0.46934826464654783, + "grad_norm": 3.1051528453826904, + "learning_rate": 5.619150010820238e-05, + "loss": 1.6226, + "step": 1425 + }, + { + "epoch": 0.46967763184980854, + "grad_norm": 1.8390017747879028, + "learning_rate": 5.613929620179389e-05, + "loss": 2.2257, + "step": 1426 + }, + { + "epoch": 0.4700069990530693, + "grad_norm": 2.490265130996704, + "learning_rate": 5.608708549961519e-05, + "loss": 2.3091, + "step": 1427 + }, + { + "epoch": 0.47033636625633, + "grad_norm": 2.267498254776001, + "learning_rate": 5.603486805945984e-05, + "loss": 2.4721, + "step": 1428 + }, + { + "epoch": 0.4706657334595908, + "grad_norm": 2.788170337677002, + "learning_rate": 5.598264393912891e-05, + "loss": 2.453, + "step": 1429 + }, + { + "epoch": 0.4709951006628515, + "grad_norm": 2.2585389614105225, + "learning_rate": 5.593041319643083e-05, + "loss": 2.2228, + "step": 1430 + }, + { + "epoch": 0.4713244678661122, + "grad_norm": 2.556220531463623, + "learning_rate": 5.587817588918137e-05, + "loss": 2.3444, + "step": 1431 + }, + { + "epoch": 0.471653835069373, + "grad_norm": 2.604313611984253, + "learning_rate": 5.582593207520357e-05, + "loss": 2.1515, + "step": 1432 + }, + { + "epoch": 0.4719832022726337, + "grad_norm": 2.6779942512512207, + "learning_rate": 5.577368181232764e-05, + "loss": 2.1363, + "step": 1433 + }, + { + "epoch": 0.47231256947589445, + "grad_norm": 2.4980621337890625, + "learning_rate": 5.572142515839098e-05, + "loss": 2.0295, + "step": 1434 + }, + { + "epoch": 0.47264193667915516, + "grad_norm": 2.331618309020996, + "learning_rate": 5.5669162171238046e-05, + "loss": 2.1247, + "step": 1435 + }, + { + "epoch": 0.4729713038824159, + "grad_norm": 3.248292922973633, + "learning_rate": 5.5616892908720274e-05, + "loss": 2.2727, + "step": 1436 + }, + { + "epoch": 0.47330067108567664, + "grad_norm": 2.3975675106048584, + "learning_rate": 5.556461742869609e-05, + "loss": 2.3504, + "step": 1437 + }, + { + "epoch": 0.4736300382889374, + "grad_norm": 2.7226855754852295, + "learning_rate": 5.551233578903078e-05, + "loss": 2.0893, + "step": 1438 + }, + { + "epoch": 0.4739594054921981, + "grad_norm": 2.8404018878936768, + "learning_rate": 5.5460048047596434e-05, + "loss": 2.4199, + "step": 1439 + }, + { + "epoch": 0.4742887726954588, + "grad_norm": 2.4289791584014893, + "learning_rate": 5.540775426227194e-05, + "loss": 2.4747, + "step": 1440 + }, + { + "epoch": 0.4746181398987196, + "grad_norm": 2.772507905960083, + "learning_rate": 5.535545449094283e-05, + "loss": 2.054, + "step": 1441 + }, + { + "epoch": 0.4749475071019803, + "grad_norm": 2.7915074825286865, + "learning_rate": 5.5303148791501305e-05, + "loss": 2.3271, + "step": 1442 + }, + { + "epoch": 0.47527687430524107, + "grad_norm": 2.894771099090576, + "learning_rate": 5.525083722184607e-05, + "loss": 2.2738, + "step": 1443 + }, + { + "epoch": 0.4756062415085018, + "grad_norm": 2.797031879425049, + "learning_rate": 5.519851983988239e-05, + "loss": 2.3225, + "step": 1444 + }, + { + "epoch": 0.47593560871176255, + "grad_norm": 3.053269386291504, + "learning_rate": 5.514619670352192e-05, + "loss": 2.1418, + "step": 1445 + }, + { + "epoch": 0.47626497591502326, + "grad_norm": 2.743236780166626, + "learning_rate": 5.5093867870682725e-05, + "loss": 1.966, + "step": 1446 + }, + { + "epoch": 0.47659434311828397, + "grad_norm": 2.998206615447998, + "learning_rate": 5.504153339928914e-05, + "loss": 2.2283, + "step": 1447 + }, + { + "epoch": 0.47692371032154474, + "grad_norm": 3.0275509357452393, + "learning_rate": 5.498919334727175e-05, + "loss": 2.0081, + "step": 1448 + }, + { + "epoch": 0.47725307752480545, + "grad_norm": 3.7755558490753174, + "learning_rate": 5.4936847772567314e-05, + "loss": 2.3072, + "step": 1449 + }, + { + "epoch": 0.4775824447280662, + "grad_norm": 3.9976677894592285, + "learning_rate": 5.488449673311872e-05, + "loss": 2.2594, + "step": 1450 + }, + { + "epoch": 0.4779118119313269, + "grad_norm": 2.3137824535369873, + "learning_rate": 5.48321402868749e-05, + "loss": 2.6206, + "step": 1451 + }, + { + "epoch": 0.4782411791345877, + "grad_norm": 2.003905773162842, + "learning_rate": 5.477977849179076e-05, + "loss": 2.221, + "step": 1452 + }, + { + "epoch": 0.4785705463378484, + "grad_norm": 2.37841534614563, + "learning_rate": 5.4727411405827136e-05, + "loss": 2.5322, + "step": 1453 + }, + { + "epoch": 0.47889991354110917, + "grad_norm": 2.259744644165039, + "learning_rate": 5.467503908695073e-05, + "loss": 1.9622, + "step": 1454 + }, + { + "epoch": 0.4792292807443699, + "grad_norm": 2.1672475337982178, + "learning_rate": 5.4622661593133996e-05, + "loss": 2.0246, + "step": 1455 + }, + { + "epoch": 0.4795586479476306, + "grad_norm": 2.359017848968506, + "learning_rate": 5.457027898235517e-05, + "loss": 2.0392, + "step": 1456 + }, + { + "epoch": 0.47988801515089136, + "grad_norm": 2.5766847133636475, + "learning_rate": 5.451789131259814e-05, + "loss": 2.2329, + "step": 1457 + }, + { + "epoch": 0.48021738235415207, + "grad_norm": 2.3741567134857178, + "learning_rate": 5.446549864185233e-05, + "loss": 2.2631, + "step": 1458 + }, + { + "epoch": 0.48054674955741283, + "grad_norm": 2.526066541671753, + "learning_rate": 5.44131010281128e-05, + "loss": 2.1493, + "step": 1459 + }, + { + "epoch": 0.48087611676067354, + "grad_norm": 2.551849126815796, + "learning_rate": 5.4360698529380004e-05, + "loss": 2.3144, + "step": 1460 + }, + { + "epoch": 0.4812054839639343, + "grad_norm": 2.2339227199554443, + "learning_rate": 5.4308291203659855e-05, + "loss": 2.2004, + "step": 1461 + }, + { + "epoch": 0.481534851167195, + "grad_norm": 2.5119829177856445, + "learning_rate": 5.425587910896357e-05, + "loss": 2.0944, + "step": 1462 + }, + { + "epoch": 0.4818642183704558, + "grad_norm": 2.8552167415618896, + "learning_rate": 5.4203462303307685e-05, + "loss": 2.1718, + "step": 1463 + }, + { + "epoch": 0.4821935855737165, + "grad_norm": 2.6687982082366943, + "learning_rate": 5.4151040844713886e-05, + "loss": 2.2164, + "step": 1464 + }, + { + "epoch": 0.4825229527769772, + "grad_norm": 2.963787317276001, + "learning_rate": 5.409861479120908e-05, + "loss": 2.22, + "step": 1465 + }, + { + "epoch": 0.482852319980238, + "grad_norm": 2.990370750427246, + "learning_rate": 5.404618420082521e-05, + "loss": 2.2234, + "step": 1466 + }, + { + "epoch": 0.4831816871834987, + "grad_norm": 2.695317506790161, + "learning_rate": 5.39937491315993e-05, + "loss": 2.2448, + "step": 1467 + }, + { + "epoch": 0.48351105438675945, + "grad_norm": 2.775763988494873, + "learning_rate": 5.394130964157324e-05, + "loss": 2.1806, + "step": 1468 + }, + { + "epoch": 0.48384042159002016, + "grad_norm": 2.3641860485076904, + "learning_rate": 5.388886578879392e-05, + "loss": 1.927, + "step": 1469 + }, + { + "epoch": 0.48416978879328093, + "grad_norm": 2.720329999923706, + "learning_rate": 5.383641763131297e-05, + "loss": 1.9248, + "step": 1470 + }, + { + "epoch": 0.48449915599654164, + "grad_norm": 2.7788572311401367, + "learning_rate": 5.378396522718683e-05, + "loss": 1.9824, + "step": 1471 + }, + { + "epoch": 0.48482852319980235, + "grad_norm": 3.20474910736084, + "learning_rate": 5.373150863447662e-05, + "loss": 1.5423, + "step": 1472 + }, + { + "epoch": 0.4851578904030631, + "grad_norm": 2.918916940689087, + "learning_rate": 5.367904791124815e-05, + "loss": 1.9387, + "step": 1473 + }, + { + "epoch": 0.48548725760632383, + "grad_norm": 3.4597556591033936, + "learning_rate": 5.3626583115571716e-05, + "loss": 2.226, + "step": 1474 + }, + { + "epoch": 0.4858166248095846, + "grad_norm": 3.169464588165283, + "learning_rate": 5.357411430552216e-05, + "loss": 2.0575, + "step": 1475 + }, + { + "epoch": 0.4861459920128453, + "grad_norm": 2.0502803325653076, + "learning_rate": 5.352164153917882e-05, + "loss": 2.3078, + "step": 1476 + }, + { + "epoch": 0.4864753592161061, + "grad_norm": 2.205932855606079, + "learning_rate": 5.3469164874625345e-05, + "loss": 2.3107, + "step": 1477 + }, + { + "epoch": 0.4868047264193668, + "grad_norm": 1.9236304759979248, + "learning_rate": 5.341668436994971e-05, + "loss": 2.22, + "step": 1478 + }, + { + "epoch": 0.48713409362262755, + "grad_norm": 2.331031322479248, + "learning_rate": 5.3364200083244175e-05, + "loss": 2.1931, + "step": 1479 + }, + { + "epoch": 0.48746346082588826, + "grad_norm": 2.279069185256958, + "learning_rate": 5.3311712072605136e-05, + "loss": 2.2918, + "step": 1480 + }, + { + "epoch": 0.487792828029149, + "grad_norm": 2.3501081466674805, + "learning_rate": 5.325922039613316e-05, + "loss": 2.28, + "step": 1481 + }, + { + "epoch": 0.48812219523240974, + "grad_norm": 2.4179487228393555, + "learning_rate": 5.320672511193285e-05, + "loss": 2.4011, + "step": 1482 + }, + { + "epoch": 0.48845156243567045, + "grad_norm": 2.3031511306762695, + "learning_rate": 5.315422627811278e-05, + "loss": 2.0346, + "step": 1483 + }, + { + "epoch": 0.4887809296389312, + "grad_norm": 3.2005553245544434, + "learning_rate": 5.310172395278551e-05, + "loss": 2.5121, + "step": 1484 + }, + { + "epoch": 0.4891102968421919, + "grad_norm": 2.487902879714966, + "learning_rate": 5.3049218194067394e-05, + "loss": 2.0688, + "step": 1485 + }, + { + "epoch": 0.4894396640454527, + "grad_norm": 2.2701635360717773, + "learning_rate": 5.299670906007866e-05, + "loss": 2.0644, + "step": 1486 + }, + { + "epoch": 0.4897690312487134, + "grad_norm": 3.022916078567505, + "learning_rate": 5.294419660894322e-05, + "loss": 2.3781, + "step": 1487 + }, + { + "epoch": 0.49009839845197417, + "grad_norm": 2.469449758529663, + "learning_rate": 5.2891680898788665e-05, + "loss": 1.7029, + "step": 1488 + }, + { + "epoch": 0.4904277656552349, + "grad_norm": 2.625373363494873, + "learning_rate": 5.283916198774621e-05, + "loss": 2.0506, + "step": 1489 + }, + { + "epoch": 0.4907571328584956, + "grad_norm": 2.7571139335632324, + "learning_rate": 5.2786639933950597e-05, + "loss": 2.1408, + "step": 1490 + }, + { + "epoch": 0.49108650006175636, + "grad_norm": 2.4763495922088623, + "learning_rate": 5.273411479554008e-05, + "loss": 1.9425, + "step": 1491 + }, + { + "epoch": 0.49141586726501707, + "grad_norm": 3.169473171234131, + "learning_rate": 5.2681586630656276e-05, + "loss": 2.4006, + "step": 1492 + }, + { + "epoch": 0.49174523446827784, + "grad_norm": 2.6018457412719727, + "learning_rate": 5.262905549744419e-05, + "loss": 2.0221, + "step": 1493 + }, + { + "epoch": 0.49207460167153855, + "grad_norm": 2.7356503009796143, + "learning_rate": 5.25765214540521e-05, + "loss": 2.1555, + "step": 1494 + }, + { + "epoch": 0.4924039688747993, + "grad_norm": 2.443387269973755, + "learning_rate": 5.2523984558631514e-05, + "loss": 2.0985, + "step": 1495 + }, + { + "epoch": 0.49273333607806, + "grad_norm": 2.6359920501708984, + "learning_rate": 5.247144486933706e-05, + "loss": 2.217, + "step": 1496 + }, + { + "epoch": 0.49306270328132074, + "grad_norm": 2.7415084838867188, + "learning_rate": 5.241890244432652e-05, + "loss": 1.8597, + "step": 1497 + }, + { + "epoch": 0.4933920704845815, + "grad_norm": 2.734299421310425, + "learning_rate": 5.236635734176069e-05, + "loss": 2.2436, + "step": 1498 + }, + { + "epoch": 0.4937214376878422, + "grad_norm": 2.9822301864624023, + "learning_rate": 5.231380961980326e-05, + "loss": 2.0025, + "step": 1499 + }, + { + "epoch": 0.494050804891103, + "grad_norm": 3.2635011672973633, + "learning_rate": 5.226125933662088e-05, + "loss": 2.073, + "step": 1500 + }, + { + "epoch": 0.4943801720943637, + "grad_norm": 2.0584778785705566, + "learning_rate": 5.220870655038308e-05, + "loss": 2.2952, + "step": 1501 + }, + { + "epoch": 0.49470953929762446, + "grad_norm": 2.4790139198303223, + "learning_rate": 5.2156151319262045e-05, + "loss": 2.5564, + "step": 1502 + }, + { + "epoch": 0.49503890650088517, + "grad_norm": 2.687093496322632, + "learning_rate": 5.2103593701432776e-05, + "loss": 2.0692, + "step": 1503 + }, + { + "epoch": 0.49536827370414593, + "grad_norm": 2.1168501377105713, + "learning_rate": 5.2051033755072834e-05, + "loss": 2.065, + "step": 1504 + }, + { + "epoch": 0.49569764090740664, + "grad_norm": 2.593621253967285, + "learning_rate": 5.199847153836241e-05, + "loss": 2.2862, + "step": 1505 + }, + { + "epoch": 0.49602700811066736, + "grad_norm": 2.273329257965088, + "learning_rate": 5.194590710948419e-05, + "loss": 1.9901, + "step": 1506 + }, + { + "epoch": 0.4963563753139281, + "grad_norm": 2.755690336227417, + "learning_rate": 5.189334052662331e-05, + "loss": 2.1752, + "step": 1507 + }, + { + "epoch": 0.49668574251718883, + "grad_norm": 2.4404118061065674, + "learning_rate": 5.1840771847967286e-05, + "loss": 2.1709, + "step": 1508 + }, + { + "epoch": 0.4970151097204496, + "grad_norm": 2.30769681930542, + "learning_rate": 5.178820113170596e-05, + "loss": 2.2124, + "step": 1509 + }, + { + "epoch": 0.4973444769237103, + "grad_norm": 2.8207881450653076, + "learning_rate": 5.1735628436031436e-05, + "loss": 2.2862, + "step": 1510 + }, + { + "epoch": 0.4976738441269711, + "grad_norm": 2.278991460800171, + "learning_rate": 5.1683053819137975e-05, + "loss": 2.0548, + "step": 1511 + }, + { + "epoch": 0.4980032113302318, + "grad_norm": 2.4886574745178223, + "learning_rate": 5.1630477339221994e-05, + "loss": 2.3737, + "step": 1512 + }, + { + "epoch": 0.49833257853349255, + "grad_norm": 2.911540985107422, + "learning_rate": 5.1577899054481996e-05, + "loss": 2.0253, + "step": 1513 + }, + { + "epoch": 0.49866194573675326, + "grad_norm": 2.7352163791656494, + "learning_rate": 5.1525319023118434e-05, + "loss": 2.3057, + "step": 1514 + }, + { + "epoch": 0.498991312940014, + "grad_norm": 2.4180266857147217, + "learning_rate": 5.147273730333372e-05, + "loss": 2.3452, + "step": 1515 + }, + { + "epoch": 0.49932068014327474, + "grad_norm": 2.5061450004577637, + "learning_rate": 5.14201539533321e-05, + "loss": 1.8195, + "step": 1516 + }, + { + "epoch": 0.49965004734653545, + "grad_norm": 2.4668986797332764, + "learning_rate": 5.1367569031319715e-05, + "loss": 1.9367, + "step": 1517 + }, + { + "epoch": 0.4999794145497962, + "grad_norm": 2.661878824234009, + "learning_rate": 5.1314982595504335e-05, + "loss": 1.9323, + "step": 1518 + }, + { + "epoch": 0.4999794145497962, + "eval_loss": 2.212698221206665, + "eval_runtime": 764.8335, + "eval_samples_per_second": 3.343, + "eval_steps_per_second": 1.672, + "step": 1518 } ], "logging_steps": 1, @@ -5355,7 +10676,7 @@ "attributes": {} } }, - "total_flos": 1.164145589886124e+18, + "total_flos": 2.328291179772248e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null