{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 100, "global_step": 3860, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05181347150259067, "grad_norm": 0.1298828125, "learning_rate": 1.2953367875647668e-07, "loss": 1.0187, "step": 20 }, { "epoch": 0.10362694300518134, "grad_norm": 0.11962890625, "learning_rate": 2.5906735751295336e-07, "loss": 1.016, "step": 40 }, { "epoch": 0.15544041450777202, "grad_norm": 0.11962890625, "learning_rate": 3.886010362694301e-07, "loss": 1.0219, "step": 60 }, { "epoch": 0.20725388601036268, "grad_norm": 0.11767578125, "learning_rate": 5.181347150259067e-07, "loss": 1.0164, "step": 80 }, { "epoch": 0.25906735751295334, "grad_norm": 0.126953125, "learning_rate": 6.476683937823834e-07, "loss": 1.0119, "step": 100 }, { "epoch": 0.25906735751295334, "eval_main_loss": 1.0200841426849365, "eval_main_runtime": 50.8504, "eval_main_samples_per_second": 30.206, "eval_main_steps_per_second": 3.776, "step": 100 }, { "epoch": 0.25906735751295334, "eval_anatomy_loss": 2.9678244590759277, "eval_anatomy_runtime": 0.2681, "eval_anatomy_samples_per_second": 7.459, "eval_anatomy_steps_per_second": 3.729, "step": 100 }, { "epoch": 0.25906735751295334, "eval_college_mathematics_loss": 2.1804275512695312, "eval_college_mathematics_runtime": 0.2684, "eval_college_mathematics_samples_per_second": 7.451, "eval_college_mathematics_steps_per_second": 3.726, "step": 100 }, { "epoch": 0.25906735751295334, "eval_international_law_loss": 3.191988229751587, "eval_international_law_runtime": 0.2665, "eval_international_law_samples_per_second": 7.505, "eval_international_law_steps_per_second": 3.752, "step": 100 }, { "epoch": 0.31088082901554404, "grad_norm": 0.12890625, "learning_rate": 7.772020725388602e-07, "loss": 1.0155, "step": 120 }, { "epoch": 0.3626943005181347, "grad_norm": 0.134765625, "learning_rate": 9.067357512953369e-07, "loss": 1.022, "step": 140 }, { "epoch": 0.41450777202072536, "grad_norm": 0.1328125, "learning_rate": 1.0362694300518134e-06, "loss": 1.0071, "step": 160 }, { "epoch": 0.46632124352331605, "grad_norm": 0.13671875, "learning_rate": 1.1658031088082903e-06, "loss": 1.0226, "step": 180 }, { "epoch": 0.5181347150259067, "grad_norm": 0.1494140625, "learning_rate": 1.2953367875647669e-06, "loss": 1.0175, "step": 200 }, { "epoch": 0.5181347150259067, "eval_main_loss": 1.017417311668396, "eval_main_runtime": 50.9406, "eval_main_samples_per_second": 30.153, "eval_main_steps_per_second": 3.769, "step": 200 }, { "epoch": 0.5181347150259067, "eval_anatomy_loss": 2.9614720344543457, "eval_anatomy_runtime": 0.2677, "eval_anatomy_samples_per_second": 7.471, "eval_anatomy_steps_per_second": 3.735, "step": 200 }, { "epoch": 0.5181347150259067, "eval_college_mathematics_loss": 2.1731653213500977, "eval_college_mathematics_runtime": 0.2679, "eval_college_mathematics_samples_per_second": 7.466, "eval_college_mathematics_steps_per_second": 3.733, "step": 200 }, { "epoch": 0.5181347150259067, "eval_international_law_loss": 3.1835579872131348, "eval_international_law_runtime": 0.2687, "eval_international_law_samples_per_second": 7.443, "eval_international_law_steps_per_second": 3.722, "step": 200 }, { "epoch": 0.5699481865284974, "grad_norm": 0.1591796875, "learning_rate": 1.4248704663212437e-06, "loss": 1.0125, "step": 220 }, { "epoch": 0.6217616580310881, "grad_norm": 0.1767578125, "learning_rate": 1.5544041450777204e-06, "loss": 1.0142, "step": 240 }, { "epoch": 0.6735751295336787, "grad_norm": 0.1728515625, "learning_rate": 1.683937823834197e-06, "loss": 1.0175, "step": 260 }, { "epoch": 0.7253886010362695, "grad_norm": 0.205078125, "learning_rate": 1.8134715025906738e-06, "loss": 1.0147, "step": 280 }, { "epoch": 0.7772020725388601, "grad_norm": 0.2294921875, "learning_rate": 1.9430051813471504e-06, "loss": 1.0072, "step": 300 }, { "epoch": 0.7772020725388601, "eval_main_loss": 1.0116238594055176, "eval_main_runtime": 50.9159, "eval_main_samples_per_second": 30.167, "eval_main_steps_per_second": 3.771, "step": 300 }, { "epoch": 0.7772020725388601, "eval_anatomy_loss": 2.944845676422119, "eval_anatomy_runtime": 0.2687, "eval_anatomy_samples_per_second": 7.443, "eval_anatomy_steps_per_second": 3.721, "step": 300 }, { "epoch": 0.7772020725388601, "eval_college_mathematics_loss": 2.1590933799743652, "eval_college_mathematics_runtime": 0.2683, "eval_college_mathematics_samples_per_second": 7.455, "eval_college_mathematics_steps_per_second": 3.727, "step": 300 }, { "epoch": 0.7772020725388601, "eval_international_law_loss": 3.1690821647644043, "eval_international_law_runtime": 0.2666, "eval_international_law_samples_per_second": 7.501, "eval_international_law_steps_per_second": 3.75, "step": 300 }, { "epoch": 0.8290155440414507, "grad_norm": 0.26171875, "learning_rate": 2.072538860103627e-06, "loss": 1.017, "step": 320 }, { "epoch": 0.8808290155440415, "grad_norm": 0.265625, "learning_rate": 2.2020725388601037e-06, "loss": 1.0084, "step": 340 }, { "epoch": 0.9326424870466321, "grad_norm": 0.283203125, "learning_rate": 2.3316062176165805e-06, "loss": 1.0008, "step": 360 }, { "epoch": 0.9844559585492227, "grad_norm": 0.283203125, "learning_rate": 2.461139896373057e-06, "loss": 1.0035, "step": 380 }, { "epoch": 1.0362694300518134, "grad_norm": 0.35546875, "learning_rate": 2.5906735751295338e-06, "loss": 1.0034, "step": 400 }, { "epoch": 1.0362694300518134, "eval_main_loss": 1.0014457702636719, "eval_main_runtime": 50.699, "eval_main_samples_per_second": 30.296, "eval_main_steps_per_second": 3.787, "step": 400 }, { "epoch": 1.0362694300518134, "eval_anatomy_loss": 2.91424298286438, "eval_anatomy_runtime": 0.2675, "eval_anatomy_samples_per_second": 7.475, "eval_anatomy_steps_per_second": 3.738, "step": 400 }, { "epoch": 1.0362694300518134, "eval_college_mathematics_loss": 2.136711597442627, "eval_college_mathematics_runtime": 0.2681, "eval_college_mathematics_samples_per_second": 7.46, "eval_college_mathematics_steps_per_second": 3.73, "step": 400 }, { "epoch": 1.0362694300518134, "eval_international_law_loss": 3.133229970932007, "eval_international_law_runtime": 0.2663, "eval_international_law_samples_per_second": 7.511, "eval_international_law_steps_per_second": 3.755, "step": 400 }, { "epoch": 1.0880829015544042, "grad_norm": 0.361328125, "learning_rate": 2.7202072538860106e-06, "loss": 0.9984, "step": 420 }, { "epoch": 1.1398963730569949, "grad_norm": 0.373046875, "learning_rate": 2.8497409326424875e-06, "loss": 0.9973, "step": 440 }, { "epoch": 1.1917098445595855, "grad_norm": 0.46484375, "learning_rate": 2.979274611398964e-06, "loss": 0.9898, "step": 460 }, { "epoch": 1.2435233160621761, "grad_norm": 0.462890625, "learning_rate": 3.1088082901554407e-06, "loss": 0.9841, "step": 480 }, { "epoch": 1.2953367875647668, "grad_norm": 0.498046875, "learning_rate": 3.238341968911917e-06, "loss": 0.9908, "step": 500 }, { "epoch": 1.2953367875647668, "eval_main_loss": 0.9807333946228027, "eval_main_runtime": 50.6947, "eval_main_samples_per_second": 30.299, "eval_main_steps_per_second": 3.787, "step": 500 }, { "epoch": 1.2953367875647668, "eval_anatomy_loss": 2.860694408416748, "eval_anatomy_runtime": 0.2674, "eval_anatomy_samples_per_second": 7.479, "eval_anatomy_steps_per_second": 3.739, "step": 500 }, { "epoch": 1.2953367875647668, "eval_college_mathematics_loss": 2.0933423042297363, "eval_college_mathematics_runtime": 0.2663, "eval_college_mathematics_samples_per_second": 7.51, "eval_college_mathematics_steps_per_second": 3.755, "step": 500 }, { "epoch": 1.2953367875647668, "eval_international_law_loss": 3.0646896362304688, "eval_international_law_runtime": 0.2664, "eval_international_law_samples_per_second": 7.506, "eval_international_law_steps_per_second": 3.753, "step": 500 }, { "epoch": 1.3471502590673574, "grad_norm": 0.56640625, "learning_rate": 3.367875647668394e-06, "loss": 0.9792, "step": 520 }, { "epoch": 1.3989637305699483, "grad_norm": 0.56640625, "learning_rate": 3.497409326424871e-06, "loss": 0.9697, "step": 540 }, { "epoch": 1.450777202072539, "grad_norm": 0.5234375, "learning_rate": 3.6269430051813476e-06, "loss": 0.9604, "step": 560 }, { "epoch": 1.5025906735751295, "grad_norm": 0.50390625, "learning_rate": 3.756476683937824e-06, "loss": 0.9549, "step": 580 }, { "epoch": 1.5544041450777202, "grad_norm": 0.59375, "learning_rate": 3.886010362694301e-06, "loss": 0.9634, "step": 600 }, { "epoch": 1.5544041450777202, "eval_main_loss": 0.9588530659675598, "eval_main_runtime": 50.8939, "eval_main_samples_per_second": 30.18, "eval_main_steps_per_second": 3.773, "step": 600 }, { "epoch": 1.5544041450777202, "eval_anatomy_loss": 2.800969123840332, "eval_anatomy_runtime": 0.268, "eval_anatomy_samples_per_second": 7.464, "eval_anatomy_steps_per_second": 3.732, "step": 600 }, { "epoch": 1.5544041450777202, "eval_college_mathematics_loss": 2.0425174236297607, "eval_college_mathematics_runtime": 0.2684, "eval_college_mathematics_samples_per_second": 7.452, "eval_college_mathematics_steps_per_second": 3.726, "step": 600 }, { "epoch": 1.5544041450777202, "eval_international_law_loss": 2.9860198497772217, "eval_international_law_runtime": 0.2678, "eval_international_law_samples_per_second": 7.467, "eval_international_law_steps_per_second": 3.734, "step": 600 }, { "epoch": 1.6062176165803108, "grad_norm": 0.59765625, "learning_rate": 4.015544041450777e-06, "loss": 0.9551, "step": 620 }, { "epoch": 1.6580310880829017, "grad_norm": 0.640625, "learning_rate": 4.145077720207254e-06, "loss": 0.9462, "step": 640 }, { "epoch": 1.709844559585492, "grad_norm": 0.51953125, "learning_rate": 4.274611398963731e-06, "loss": 0.9398, "step": 660 }, { "epoch": 1.761658031088083, "grad_norm": 0.490234375, "learning_rate": 4.404145077720207e-06, "loss": 0.9289, "step": 680 }, { "epoch": 1.8134715025906736, "grad_norm": 0.4921875, "learning_rate": 4.533678756476685e-06, "loss": 0.9209, "step": 700 }, { "epoch": 1.8134715025906736, "eval_main_loss": 0.9181744456291199, "eval_main_runtime": 50.9128, "eval_main_samples_per_second": 30.169, "eval_main_steps_per_second": 3.771, "step": 700 }, { "epoch": 1.8134715025906736, "eval_anatomy_loss": 2.6760823726654053, "eval_anatomy_runtime": 0.2694, "eval_anatomy_samples_per_second": 7.423, "eval_anatomy_steps_per_second": 3.711, "step": 700 }, { "epoch": 1.8134715025906736, "eval_college_mathematics_loss": 1.9530092477798462, "eval_college_mathematics_runtime": 0.2669, "eval_college_mathematics_samples_per_second": 7.494, "eval_college_mathematics_steps_per_second": 3.747, "step": 700 }, { "epoch": 1.8134715025906736, "eval_international_law_loss": 2.8536064624786377, "eval_international_law_runtime": 0.268, "eval_international_law_samples_per_second": 7.462, "eval_international_law_steps_per_second": 3.731, "step": 700 }, { "epoch": 1.8652849740932642, "grad_norm": 0.435546875, "learning_rate": 4.663212435233161e-06, "loss": 0.9132, "step": 720 }, { "epoch": 1.917098445595855, "grad_norm": 0.45703125, "learning_rate": 4.7927461139896375e-06, "loss": 0.8982, "step": 740 }, { "epoch": 1.9689119170984455, "grad_norm": 0.42578125, "learning_rate": 4.922279792746114e-06, "loss": 0.898, "step": 760 }, { "epoch": 2.0207253886010363, "grad_norm": 0.451171875, "learning_rate": 4.9999171995395824e-06, "loss": 0.8827, "step": 780 }, { "epoch": 2.0725388601036268, "grad_norm": 0.41015625, "learning_rate": 4.9989857573474595e-06, "loss": 0.8667, "step": 800 }, { "epoch": 2.0725388601036268, "eval_main_loss": 0.8773276209831238, "eval_main_runtime": 50.6915, "eval_main_samples_per_second": 30.301, "eval_main_steps_per_second": 3.788, "step": 800 }, { "epoch": 2.0725388601036268, "eval_anatomy_loss": 2.5175046920776367, "eval_anatomy_runtime": 0.2667, "eval_anatomy_samples_per_second": 7.498, "eval_anatomy_steps_per_second": 3.749, "step": 800 }, { "epoch": 2.0725388601036268, "eval_college_mathematics_loss": 1.8391788005828857, "eval_college_mathematics_runtime": 0.2681, "eval_college_mathematics_samples_per_second": 7.459, "eval_college_mathematics_steps_per_second": 3.73, "step": 800 }, { "epoch": 2.0725388601036268, "eval_international_law_loss": 2.7238972187042236, "eval_international_law_runtime": 0.2657, "eval_international_law_samples_per_second": 7.529, "eval_international_law_steps_per_second": 3.764, "step": 800 }, { "epoch": 2.1243523316062176, "grad_norm": 0.3984375, "learning_rate": 4.997019759281217e-06, "loss": 0.8706, "step": 820 }, { "epoch": 2.1761658031088085, "grad_norm": 0.38671875, "learning_rate": 4.9940200192449906e-06, "loss": 0.8744, "step": 840 }, { "epoch": 2.227979274611399, "grad_norm": 0.349609375, "learning_rate": 4.989987779102074e-06, "loss": 0.8583, "step": 860 }, { "epoch": 2.2797927461139897, "grad_norm": 0.330078125, "learning_rate": 4.984924708160789e-06, "loss": 0.8637, "step": 880 }, { "epoch": 2.33160621761658, "grad_norm": 0.296875, "learning_rate": 4.978832902483415e-06, "loss": 0.8592, "step": 900 }, { "epoch": 2.33160621761658, "eval_main_loss": 0.8600960373878479, "eval_main_runtime": 50.6644, "eval_main_samples_per_second": 30.317, "eval_main_steps_per_second": 3.79, "step": 900 }, { "epoch": 2.33160621761658, "eval_anatomy_loss": 2.4422662258148193, "eval_anatomy_runtime": 0.2671, "eval_anatomy_samples_per_second": 7.488, "eval_anatomy_steps_per_second": 3.744, "step": 900 }, { "epoch": 2.33160621761658, "eval_college_mathematics_loss": 1.7872785329818726, "eval_college_mathematics_runtime": 0.267, "eval_college_mathematics_samples_per_second": 7.492, "eval_college_mathematics_steps_per_second": 3.746, "step": 900 }, { "epoch": 2.33160621761658, "eval_international_law_loss": 2.6731436252593994, "eval_international_law_runtime": 0.2664, "eval_international_law_samples_per_second": 7.508, "eval_international_law_steps_per_second": 3.754, "step": 900 }, { "epoch": 2.383419689119171, "grad_norm": 0.326171875, "learning_rate": 4.971714884018439e-06, "loss": 0.8623, "step": 920 }, { "epoch": 2.4352331606217614, "grad_norm": 0.291015625, "learning_rate": 4.96357359955649e-06, "loss": 0.8496, "step": 940 }, { "epoch": 2.4870466321243523, "grad_norm": 0.29296875, "learning_rate": 4.9544124195104015e-06, "loss": 0.8595, "step": 960 }, { "epoch": 2.538860103626943, "grad_norm": 0.296875, "learning_rate": 4.944235136519888e-06, "loss": 0.8512, "step": 980 }, { "epoch": 2.5906735751295336, "grad_norm": 0.265625, "learning_rate": 4.933045963881431e-06, "loss": 0.8522, "step": 1000 }, { "epoch": 2.5906735751295336, "eval_main_loss": 0.8523173928260803, "eval_main_runtime": 50.9143, "eval_main_samples_per_second": 30.168, "eval_main_steps_per_second": 3.771, "step": 1000 }, { "epoch": 2.5906735751295336, "eval_anatomy_loss": 2.420971632003784, "eval_anatomy_runtime": 0.2681, "eval_anatomy_samples_per_second": 7.459, "eval_anatomy_steps_per_second": 3.729, "step": 1000 }, { "epoch": 2.5906735751295336, "eval_college_mathematics_loss": 1.7655620574951172, "eval_college_mathematics_runtime": 0.2683, "eval_college_mathematics_samples_per_second": 7.454, "eval_college_mathematics_steps_per_second": 3.727, "step": 1000 }, { "epoch": 2.5906735751295336, "eval_international_law_loss": 2.6557209491729736, "eval_international_law_runtime": 0.2675, "eval_international_law_samples_per_second": 7.475, "eval_international_law_steps_per_second": 3.738, "step": 1000 }, { "epoch": 2.6424870466321244, "grad_norm": 0.263671875, "learning_rate": 4.920849533804017e-06, "loss": 0.8525, "step": 1020 }, { "epoch": 2.694300518134715, "grad_norm": 0.275390625, "learning_rate": 4.907650895491443e-06, "loss": 0.8554, "step": 1040 }, { "epoch": 2.7461139896373057, "grad_norm": 0.2421875, "learning_rate": 4.893455513052003e-06, "loss": 0.8489, "step": 1060 }, { "epoch": 2.7979274611398965, "grad_norm": 0.25, "learning_rate": 4.878269263236391e-06, "loss": 0.844, "step": 1080 }, { "epoch": 2.849740932642487, "grad_norm": 0.275390625, "learning_rate": 4.86209843300479e-06, "loss": 0.8456, "step": 1100 }, { "epoch": 2.849740932642487, "eval_main_loss": 0.8472149968147278, "eval_main_runtime": 50.8752, "eval_main_samples_per_second": 30.192, "eval_main_steps_per_second": 3.774, "step": 1100 }, { "epoch": 2.849740932642487, "eval_anatomy_loss": 2.407443046569824, "eval_anatomy_runtime": 0.2673, "eval_anatomy_samples_per_second": 7.481, "eval_anatomy_steps_per_second": 3.741, "step": 1100 }, { "epoch": 2.849740932642487, "eval_college_mathematics_loss": 1.7581956386566162, "eval_college_mathematics_runtime": 0.2664, "eval_college_mathematics_samples_per_second": 7.507, "eval_college_mathematics_steps_per_second": 3.753, "step": 1100 }, { "epoch": 2.849740932642487, "eval_international_law_loss": 2.643361806869507, "eval_international_law_runtime": 0.266, "eval_international_law_samples_per_second": 7.518, "eval_international_law_steps_per_second": 3.759, "step": 1100 }, { "epoch": 2.901554404145078, "grad_norm": 0.2412109375, "learning_rate": 4.8449497169241285e-06, "loss": 0.847, "step": 1120 }, { "epoch": 2.9533678756476682, "grad_norm": 0.248046875, "learning_rate": 4.826830214396594e-06, "loss": 0.8444, "step": 1140 }, { "epoch": 3.005181347150259, "grad_norm": 0.2490234375, "learning_rate": 4.807747426720553e-06, "loss": 0.8454, "step": 1160 }, { "epoch": 3.05699481865285, "grad_norm": 0.2431640625, "learning_rate": 4.78770925398508e-06, "loss": 0.8409, "step": 1180 }, { "epoch": 3.1088082901554404, "grad_norm": 0.25390625, "learning_rate": 4.766723991799407e-06, "loss": 0.8392, "step": 1200 }, { "epoch": 3.1088082901554404, "eval_main_loss": 0.8436682820320129, "eval_main_runtime": 50.6014, "eval_main_samples_per_second": 30.355, "eval_main_steps_per_second": 3.794, "step": 1200 }, { "epoch": 3.1088082901554404, "eval_anatomy_loss": 2.3988683223724365, "eval_anatomy_runtime": 0.2661, "eval_anatomy_samples_per_second": 7.517, "eval_anatomy_steps_per_second": 3.758, "step": 1200 }, { "epoch": 3.1088082901554404, "eval_college_mathematics_loss": 1.7500412464141846, "eval_college_mathematics_runtime": 0.2666, "eval_college_mathematics_samples_per_second": 7.502, "eval_college_mathematics_steps_per_second": 3.751, "step": 1200 }, { "epoch": 3.1088082901554404, "eval_international_law_loss": 2.639014482498169, "eval_international_law_runtime": 0.2661, "eval_international_law_samples_per_second": 7.517, "eval_international_law_steps_per_second": 3.759, "step": 1200 }, { "epoch": 3.160621761658031, "grad_norm": 0.2109375, "learning_rate": 4.744800327858608e-06, "loss": 0.8409, "step": 1220 }, { "epoch": 3.2124352331606216, "grad_norm": 0.255859375, "learning_rate": 4.721947338346993e-06, "loss": 0.8413, "step": 1240 }, { "epoch": 3.2642487046632125, "grad_norm": 0.2197265625, "learning_rate": 4.698174484180641e-06, "loss": 0.8364, "step": 1260 }, { "epoch": 3.3160621761658033, "grad_norm": 0.236328125, "learning_rate": 4.673491607090684e-06, "loss": 0.8351, "step": 1280 }, { "epoch": 3.3678756476683938, "grad_norm": 0.2158203125, "learning_rate": 4.647908925548918e-06, "loss": 0.8354, "step": 1300 }, { "epoch": 3.3678756476683938, "eval_main_loss": 0.8413074016571045, "eval_main_runtime": 50.8355, "eval_main_samples_per_second": 30.215, "eval_main_steps_per_second": 3.777, "step": 1300 }, { "epoch": 3.3678756476683938, "eval_anatomy_loss": 2.3927080631256104, "eval_anatomy_runtime": 0.2684, "eval_anatomy_samples_per_second": 7.451, "eval_anatomy_steps_per_second": 3.725, "step": 1300 }, { "epoch": 3.3678756476683938, "eval_college_mathematics_loss": 1.7482649087905884, "eval_college_mathematics_runtime": 0.2673, "eval_college_mathematics_samples_per_second": 7.482, "eval_college_mathematics_steps_per_second": 3.741, "step": 1300 }, { "epoch": 3.3678756476683938, "eval_international_law_loss": 2.6322338581085205, "eval_international_law_runtime": 0.2677, "eval_international_law_samples_per_second": 7.47, "eval_international_law_steps_per_second": 3.735, "step": 1300 }, { "epoch": 3.4196891191709846, "grad_norm": 0.236328125, "learning_rate": 4.621437030537461e-06, "loss": 0.8428, "step": 1320 }, { "epoch": 3.471502590673575, "grad_norm": 0.232421875, "learning_rate": 4.594086881164184e-06, "loss": 0.8495, "step": 1340 }, { "epoch": 3.523316062176166, "grad_norm": 0.244140625, "learning_rate": 4.565869800125747e-06, "loss": 0.8445, "step": 1360 }, { "epoch": 3.5751295336787567, "grad_norm": 0.2314453125, "learning_rate": 4.536797469020116e-06, "loss": 0.8441, "step": 1380 }, { "epoch": 3.626943005181347, "grad_norm": 0.220703125, "learning_rate": 4.506881923510493e-06, "loss": 0.8388, "step": 1400 }, { "epoch": 3.626943005181347, "eval_main_loss": 0.8397356867790222, "eval_main_runtime": 50.8597, "eval_main_samples_per_second": 30.201, "eval_main_steps_per_second": 3.775, "step": 1400 }, { "epoch": 3.626943005181347, "eval_anatomy_loss": 2.392620325088501, "eval_anatomy_runtime": 0.268, "eval_anatomy_samples_per_second": 7.464, "eval_anatomy_steps_per_second": 3.732, "step": 1400 }, { "epoch": 3.626943005181347, "eval_college_mathematics_loss": 1.746407151222229, "eval_college_mathematics_runtime": 0.2684, "eval_college_mathematics_samples_per_second": 7.451, "eval_college_mathematics_steps_per_second": 3.725, "step": 1400 }, { "epoch": 3.626943005181347, "eval_international_law_loss": 2.632657527923584, "eval_international_law_runtime": 0.2666, "eval_international_law_samples_per_second": 7.503, "eval_international_law_steps_per_second": 3.751, "step": 1400 }, { "epoch": 3.6787564766839376, "grad_norm": 0.2197265625, "learning_rate": 4.476135548342666e-06, "loss": 0.8405, "step": 1420 }, { "epoch": 3.7305699481865284, "grad_norm": 0.2119140625, "learning_rate": 4.444571072217848e-06, "loss": 0.8353, "step": 1440 }, { "epoch": 3.7823834196891193, "grad_norm": 0.2353515625, "learning_rate": 4.4122015625231125e-06, "loss": 0.838, "step": 1460 }, { "epoch": 3.8341968911917097, "grad_norm": 0.208984375, "learning_rate": 4.37904041992163e-06, "loss": 0.8339, "step": 1480 }, { "epoch": 3.8860103626943006, "grad_norm": 0.2099609375, "learning_rate": 4.345101372804917e-06, "loss": 0.8352, "step": 1500 }, { "epoch": 3.8860103626943006, "eval_main_loss": 0.8387607932090759, "eval_main_runtime": 50.9241, "eval_main_samples_per_second": 30.163, "eval_main_steps_per_second": 3.77, "step": 1500 }, { "epoch": 3.8860103626943006, "eval_anatomy_loss": 2.3933463096618652, "eval_anatomy_runtime": 0.2694, "eval_anatomy_samples_per_second": 7.424, "eval_anatomy_steps_per_second": 3.712, "step": 1500 }, { "epoch": 3.8860103626943006, "eval_college_mathematics_loss": 1.7452200651168823, "eval_college_mathematics_runtime": 0.2686, "eval_college_mathematics_samples_per_second": 7.447, "eval_college_mathematics_steps_per_second": 3.724, "step": 1500 }, { "epoch": 3.8860103626943006, "eval_international_law_loss": 2.631213903427124, "eval_international_law_runtime": 0.2675, "eval_international_law_samples_per_second": 7.475, "eval_international_law_steps_per_second": 3.738, "step": 1500 }, { "epoch": 3.937823834196891, "grad_norm": 0.2119140625, "learning_rate": 4.310398471609416e-06, "loss": 0.8379, "step": 1520 }, { "epoch": 3.989637305699482, "grad_norm": 0.228515625, "learning_rate": 4.274946082999753e-06, "loss": 0.8366, "step": 1540 }, { "epoch": 4.041450777202073, "grad_norm": 0.240234375, "learning_rate": 4.238758883921077e-06, "loss": 0.8351, "step": 1560 }, { "epoch": 4.0932642487046635, "grad_norm": 0.244140625, "learning_rate": 4.201851855522946e-06, "loss": 0.8427, "step": 1580 }, { "epoch": 4.1450777202072535, "grad_norm": 0.2197265625, "learning_rate": 4.1642402769572775e-06, "loss": 0.8375, "step": 1600 }, { "epoch": 4.1450777202072535, "eval_main_loss": 0.8381660580635071, "eval_main_runtime": 50.8823, "eval_main_samples_per_second": 30.187, "eval_main_steps_per_second": 3.773, "step": 1600 }, { "epoch": 4.1450777202072535, "eval_anatomy_loss": 2.3937363624572754, "eval_anatomy_runtime": 0.2711, "eval_anatomy_samples_per_second": 7.376, "eval_anatomy_steps_per_second": 3.688, "step": 1600 }, { "epoch": 4.1450777202072535, "eval_college_mathematics_loss": 1.7459180355072021, "eval_college_mathematics_runtime": 0.2669, "eval_college_mathematics_samples_per_second": 7.493, "eval_college_mathematics_steps_per_second": 3.747, "step": 1600 }, { "epoch": 4.1450777202072535, "eval_international_law_loss": 2.6354634761810303, "eval_international_law_runtime": 0.2663, "eval_international_law_samples_per_second": 7.509, "eval_international_law_steps_per_second": 3.755, "step": 1600 }, { "epoch": 4.196891191709844, "grad_norm": 0.2431640625, "learning_rate": 4.125939719052927e-06, "loss": 0.8405, "step": 1620 }, { "epoch": 4.248704663212435, "grad_norm": 0.193359375, "learning_rate": 4.086966037869515e-06, "loss": 0.8346, "step": 1640 }, { "epoch": 4.300518134715026, "grad_norm": 0.244140625, "learning_rate": 4.047335368133176e-06, "loss": 0.8388, "step": 1660 }, { "epoch": 4.352331606217617, "grad_norm": 0.21484375, "learning_rate": 4.0070641165569335e-06, "loss": 0.8301, "step": 1680 }, { "epoch": 4.404145077720207, "grad_norm": 0.2275390625, "learning_rate": 3.96616895504848e-06, "loss": 0.8366, "step": 1700 }, { "epoch": 4.404145077720207, "eval_main_loss": 0.8377915024757385, "eval_main_runtime": 50.9147, "eval_main_samples_per_second": 30.168, "eval_main_steps_per_second": 3.771, "step": 1700 }, { "epoch": 4.404145077720207, "eval_anatomy_loss": 2.394949197769165, "eval_anatomy_runtime": 0.2684, "eval_anatomy_samples_per_second": 7.451, "eval_anatomy_steps_per_second": 3.726, "step": 1700 }, { "epoch": 4.404145077720207, "eval_college_mathematics_loss": 1.7438111305236816, "eval_college_mathematics_runtime": 0.2676, "eval_college_mathematics_samples_per_second": 7.475, "eval_college_mathematics_steps_per_second": 3.737, "step": 1700 }, { "epoch": 4.404145077720207, "eval_international_law_loss": 2.6339313983917236, "eval_international_law_runtime": 0.2665, "eval_international_law_samples_per_second": 7.506, "eval_international_law_steps_per_second": 3.753, "step": 1700 }, { "epoch": 4.455958549222798, "grad_norm": 0.2197265625, "learning_rate": 3.924666813808176e-06, "loss": 0.833, "step": 1720 }, { "epoch": 4.507772020725389, "grad_norm": 0.234375, "learning_rate": 3.882574874320099e-06, "loss": 0.8381, "step": 1740 }, { "epoch": 4.5595854922279795, "grad_norm": 0.2177734375, "learning_rate": 3.839910562239088e-06, "loss": 0.8438, "step": 1760 }, { "epoch": 4.61139896373057, "grad_norm": 0.2138671875, "learning_rate": 3.7966915401766845e-06, "loss": 0.832, "step": 1780 }, { "epoch": 4.66321243523316, "grad_norm": 0.2177734375, "learning_rate": 3.752935700388982e-06, "loss": 0.843, "step": 1800 }, { "epoch": 4.66321243523316, "eval_main_loss": 0.8375835418701172, "eval_main_runtime": 50.8717, "eval_main_samples_per_second": 30.194, "eval_main_steps_per_second": 3.774, "step": 1800 }, { "epoch": 4.66321243523316, "eval_anatomy_loss": 2.3957273960113525, "eval_anatomy_runtime": 0.2674, "eval_anatomy_samples_per_second": 7.479, "eval_anatomy_steps_per_second": 3.74, "step": 1800 }, { "epoch": 4.66321243523316, "eval_college_mathematics_loss": 1.745951771736145, "eval_college_mathematics_runtime": 0.2678, "eval_college_mathematics_samples_per_second": 7.468, "eval_college_mathematics_steps_per_second": 3.734, "step": 1800 }, { "epoch": 4.66321243523316, "eval_international_law_loss": 2.635204315185547, "eval_international_law_runtime": 0.2682, "eval_international_law_samples_per_second": 7.458, "eval_international_law_steps_per_second": 3.729, "step": 1800 }, { "epoch": 4.715025906735751, "grad_norm": 0.22265625, "learning_rate": 3.7086611573694107e-06, "loss": 0.8254, "step": 1820 }, { "epoch": 4.766839378238342, "grad_norm": 0.232421875, "learning_rate": 3.663886240349507e-06, "loss": 0.8342, "step": 1840 }, { "epoch": 4.818652849740933, "grad_norm": 0.228515625, "learning_rate": 3.6186294857107933e-06, "loss": 0.8368, "step": 1860 }, { "epoch": 4.870466321243523, "grad_norm": 0.2255859375, "learning_rate": 3.5729096293108935e-06, "loss": 0.8406, "step": 1880 }, { "epoch": 4.922279792746114, "grad_norm": 0.23828125, "learning_rate": 3.526745598727071e-06, "loss": 0.8317, "step": 1900 }, { "epoch": 4.922279792746114, "eval_main_loss": 0.8374476432800293, "eval_main_runtime": 50.8861, "eval_main_samples_per_second": 30.185, "eval_main_steps_per_second": 3.773, "step": 1900 }, { "epoch": 4.922279792746114, "eval_anatomy_loss": 2.3970742225646973, "eval_anatomy_runtime": 0.268, "eval_anatomy_samples_per_second": 7.462, "eval_anatomy_steps_per_second": 3.731, "step": 1900 }, { "epoch": 4.922279792746114, "eval_college_mathematics_loss": 1.7456122636795044, "eval_college_mathematics_runtime": 0.2683, "eval_college_mathematics_samples_per_second": 7.454, "eval_college_mathematics_steps_per_second": 3.727, "step": 1900 }, { "epoch": 4.922279792746114, "eval_international_law_loss": 2.6364247798919678, "eval_international_law_runtime": 0.267, "eval_international_law_samples_per_second": 7.49, "eval_international_law_steps_per_second": 3.745, "step": 1900 }, { "epoch": 4.974093264248705, "grad_norm": 0.232421875, "learning_rate": 3.4801565054203962e-06, "loss": 0.834, "step": 1920 }, { "epoch": 5.025906735751295, "grad_norm": 0.228515625, "learning_rate": 3.433161636823782e-06, "loss": 0.8409, "step": 1940 }, { "epoch": 5.077720207253886, "grad_norm": 0.2099609375, "learning_rate": 3.3857804483571803e-06, "loss": 0.8324, "step": 1960 }, { "epoch": 5.129533678756476, "grad_norm": 0.236328125, "learning_rate": 3.3380325553732223e-06, "loss": 0.8433, "step": 1980 }, { "epoch": 5.181347150259067, "grad_norm": 0.21484375, "learning_rate": 3.2899377250366536e-06, "loss": 0.8307, "step": 2000 }, { "epoch": 5.181347150259067, "eval_main_loss": 0.8373920321464539, "eval_main_runtime": 50.7102, "eval_main_samples_per_second": 30.29, "eval_main_steps_per_second": 3.786, "step": 2000 }, { "epoch": 5.181347150259067, "eval_anatomy_loss": 2.3975374698638916, "eval_anatomy_runtime": 0.2673, "eval_anatomy_samples_per_second": 7.483, "eval_anatomy_steps_per_second": 3.741, "step": 2000 }, { "epoch": 5.181347150259067, "eval_college_mathematics_loss": 1.746949315071106, "eval_college_mathematics_runtime": 0.2679, "eval_college_mathematics_samples_per_second": 7.465, "eval_college_mathematics_steps_per_second": 3.732, "step": 2000 }, { "epoch": 5.181347150259067, "eval_international_law_loss": 2.637000799179077, "eval_international_law_runtime": 0.267, "eval_international_law_samples_per_second": 7.49, "eval_international_law_steps_per_second": 3.745, "step": 2000 }, { "epoch": 5.233160621761658, "grad_norm": 0.2138671875, "learning_rate": 3.2415158681409215e-06, "loss": 0.836, "step": 2020 }, { "epoch": 5.284974093264249, "grad_norm": 0.2099609375, "learning_rate": 3.1927870308652953e-06, "loss": 0.8447, "step": 2040 }, { "epoch": 5.33678756476684, "grad_norm": 0.220703125, "learning_rate": 3.1437713864759483e-06, "loss": 0.8383, "step": 2060 }, { "epoch": 5.38860103626943, "grad_norm": 0.2451171875, "learning_rate": 3.0944892269744155e-06, "loss": 0.8412, "step": 2080 }, { "epoch": 5.4404145077720205, "grad_norm": 0.23828125, "learning_rate": 3.044960954696906e-06, "loss": 0.837, "step": 2100 }, { "epoch": 5.4404145077720205, "eval_main_loss": 0.837336003780365, "eval_main_runtime": 50.8778, "eval_main_samples_per_second": 30.19, "eval_main_steps_per_second": 3.774, "step": 2100 }, { "epoch": 5.4404145077720205, "eval_anatomy_loss": 2.39707350730896, "eval_anatomy_runtime": 0.264, "eval_anatomy_samples_per_second": 7.575, "eval_anatomy_steps_per_second": 3.787, "step": 2100 }, { "epoch": 5.4404145077720205, "eval_college_mathematics_loss": 1.746044397354126, "eval_college_mathematics_runtime": 0.2668, "eval_college_mathematics_samples_per_second": 7.497, "eval_college_mathematics_steps_per_second": 3.748, "step": 2100 }, { "epoch": 5.4404145077720205, "eval_international_law_loss": 2.639299154281616, "eval_international_law_runtime": 0.2686, "eval_international_law_samples_per_second": 7.446, "eval_international_law_steps_per_second": 3.723, "step": 2100 }, { "epoch": 5.492227979274611, "grad_norm": 0.244140625, "learning_rate": 2.9952070738679312e-06, "loss": 0.8349, "step": 2120 }, { "epoch": 5.544041450777202, "grad_norm": 0.2421875, "learning_rate": 2.9452481821117544e-06, "loss": 0.8261, "step": 2140 }, { "epoch": 5.595854922279793, "grad_norm": 0.193359375, "learning_rate": 2.895104961925179e-06, "loss": 0.8305, "step": 2160 }, { "epoch": 5.647668393782383, "grad_norm": 0.2177734375, "learning_rate": 2.844798172115185e-06, "loss": 0.8316, "step": 2180 }, { "epoch": 5.699481865284974, "grad_norm": 0.220703125, "learning_rate": 2.7943486392049972e-06, "loss": 0.8334, "step": 2200 }, { "epoch": 5.699481865284974, "eval_main_loss": 0.8372721076011658, "eval_main_runtime": 50.8569, "eval_main_samples_per_second": 30.202, "eval_main_steps_per_second": 3.775, "step": 2200 }, { "epoch": 5.699481865284974, "eval_anatomy_loss": 2.3966822624206543, "eval_anatomy_runtime": 0.2669, "eval_anatomy_samples_per_second": 7.494, "eval_anatomy_steps_per_second": 3.747, "step": 2200 }, { "epoch": 5.699481865284974, "eval_college_mathematics_loss": 1.7497589588165283, "eval_college_mathematics_runtime": 0.2678, "eval_college_mathematics_samples_per_second": 7.468, "eval_college_mathematics_steps_per_second": 3.734, "step": 2200 }, { "epoch": 5.699481865284974, "eval_international_law_loss": 2.634140968322754, "eval_international_law_runtime": 0.267, "eval_international_law_samples_per_second": 7.492, "eval_international_law_steps_per_second": 3.746, "step": 2200 }, { "epoch": 5.751295336787565, "grad_norm": 0.212890625, "learning_rate": 2.7437772488120945e-06, "loss": 0.8305, "step": 2220 }, { "epoch": 5.803108808290156, "grad_norm": 0.2109375, "learning_rate": 2.6931049370017755e-06, "loss": 0.8383, "step": 2240 }, { "epoch": 5.8549222797927465, "grad_norm": 0.2333984375, "learning_rate": 2.6423526816198253e-06, "loss": 0.8445, "step": 2260 }, { "epoch": 5.9067357512953365, "grad_norm": 0.2236328125, "learning_rate": 2.5915414936078933e-06, "loss": 0.8385, "step": 2280 }, { "epoch": 5.958549222797927, "grad_norm": 0.2294921875, "learning_rate": 2.5406924083051683e-06, "loss": 0.8359, "step": 2300 }, { "epoch": 5.958549222797927, "eval_main_loss": 0.8372599482536316, "eval_main_runtime": 50.7835, "eval_main_samples_per_second": 30.246, "eval_main_steps_per_second": 3.781, "step": 2300 }, { "epoch": 5.958549222797927, "eval_anatomy_loss": 2.3953776359558105, "eval_anatomy_runtime": 0.2668, "eval_anatomy_samples_per_second": 7.496, "eval_anatomy_steps_per_second": 3.748, "step": 2300 }, { "epoch": 5.958549222797927, "eval_college_mathematics_loss": 1.7478266954421997, "eval_college_mathematics_runtime": 0.2698, "eval_college_mathematics_samples_per_second": 7.413, "eval_college_mathematics_steps_per_second": 3.707, "step": 2300 }, { "epoch": 5.958549222797927, "eval_international_law_loss": 2.6391143798828125, "eval_international_law_runtime": 0.2663, "eval_international_law_samples_per_second": 7.509, "eval_international_law_steps_per_second": 3.755, "step": 2300 }, { "epoch": 6.010362694300518, "grad_norm": 0.2392578125, "learning_rate": 2.4898264767399445e-06, "loss": 0.8316, "step": 2320 }, { "epoch": 6.062176165803109, "grad_norm": 0.2255859375, "learning_rate": 2.438964756914712e-06, "loss": 0.8412, "step": 2340 }, { "epoch": 6.1139896373057, "grad_norm": 0.2236328125, "learning_rate": 2.3881283050883368e-06, "loss": 0.8368, "step": 2360 }, { "epoch": 6.16580310880829, "grad_norm": 0.234375, "learning_rate": 2.337338167058981e-06, "loss": 0.8392, "step": 2380 }, { "epoch": 6.217616580310881, "grad_norm": 0.212890625, "learning_rate": 2.286615369451342e-06, "loss": 0.834, "step": 2400 }, { "epoch": 6.217616580310881, "eval_main_loss": 0.8373088836669922, "eval_main_runtime": 50.8482, "eval_main_samples_per_second": 30.208, "eval_main_steps_per_second": 3.776, "step": 2400 }, { "epoch": 6.217616580310881, "eval_anatomy_loss": 2.3978271484375, "eval_anatomy_runtime": 0.2671, "eval_anatomy_samples_per_second": 7.487, "eval_anatomy_steps_per_second": 3.744, "step": 2400 }, { "epoch": 6.217616580310881, "eval_college_mathematics_loss": 1.7477797269821167, "eval_college_mathematics_runtime": 0.2685, "eval_college_mathematics_samples_per_second": 7.449, "eval_college_mathematics_steps_per_second": 3.724, "step": 2400 }, { "epoch": 6.217616580310881, "eval_international_law_loss": 2.638612985610962, "eval_international_law_runtime": 0.2668, "eval_international_law_samples_per_second": 7.495, "eval_international_law_steps_per_second": 3.748, "step": 2400 }, { "epoch": 6.269430051813472, "grad_norm": 0.2275390625, "learning_rate": 2.2359809110118358e-06, "loss": 0.8391, "step": 2420 }, { "epoch": 6.321243523316062, "grad_norm": 0.2431640625, "learning_rate": 2.1854557539153203e-06, "loss": 0.8368, "step": 2440 }, { "epoch": 6.373056994818652, "grad_norm": 0.2294921875, "learning_rate": 2.1350608150869563e-06, "loss": 0.8321, "step": 2460 }, { "epoch": 6.424870466321243, "grad_norm": 0.23828125, "learning_rate": 2.0848169575428057e-06, "loss": 0.8375, "step": 2480 }, { "epoch": 6.476683937823834, "grad_norm": 0.232421875, "learning_rate": 2.034744981752741e-06, "loss": 0.835, "step": 2500 }, { "epoch": 6.476683937823834, "eval_main_loss": 0.8372688889503479, "eval_main_runtime": 50.7481, "eval_main_samples_per_second": 30.267, "eval_main_steps_per_second": 3.783, "step": 2500 }, { "epoch": 6.476683937823834, "eval_anatomy_loss": 2.397125244140625, "eval_anatomy_runtime": 0.2662, "eval_anatomy_samples_per_second": 7.513, "eval_anatomy_steps_per_second": 3.757, "step": 2500 }, { "epoch": 6.476683937823834, "eval_college_mathematics_loss": 1.7471617460250854, "eval_college_mathematics_runtime": 0.2671, "eval_college_mathematics_samples_per_second": 7.487, "eval_college_mathematics_steps_per_second": 3.744, "step": 2500 }, { "epoch": 6.476683937823834, "eval_international_law_loss": 2.6356821060180664, "eval_international_law_runtime": 0.2665, "eval_international_law_samples_per_second": 7.504, "eval_international_law_steps_per_second": 3.752, "step": 2500 }, { "epoch": 6.528497409326425, "grad_norm": 0.2490234375, "learning_rate": 1.9848656170292556e-06, "loss": 0.8291, "step": 2520 }, { "epoch": 6.580310880829016, "grad_norm": 0.2314453125, "learning_rate": 1.9351995129457305e-06, "loss": 0.8325, "step": 2540 }, { "epoch": 6.632124352331607, "grad_norm": 0.2294921875, "learning_rate": 1.88576723078771e-06, "loss": 0.8345, "step": 2560 }, { "epoch": 6.683937823834197, "grad_norm": 0.2412109375, "learning_rate": 1.8365892350407238e-06, "loss": 0.8387, "step": 2580 }, { "epoch": 6.7357512953367875, "grad_norm": 0.212890625, "learning_rate": 1.7876858849181982e-06, "loss": 0.8311, "step": 2600 }, { "epoch": 6.7357512953367875, "eval_main_loss": 0.8372130393981934, "eval_main_runtime": 50.6463, "eval_main_samples_per_second": 30.328, "eval_main_steps_per_second": 3.791, "step": 2600 }, { "epoch": 6.7357512953367875, "eval_anatomy_loss": 2.393521308898926, "eval_anatomy_runtime": 0.267, "eval_anatomy_samples_per_second": 7.49, "eval_anatomy_steps_per_second": 3.745, "step": 2600 }, { "epoch": 6.7357512953367875, "eval_college_mathematics_loss": 1.748924970626831, "eval_college_mathematics_runtime": 0.2664, "eval_college_mathematics_samples_per_second": 7.509, "eval_college_mathematics_steps_per_second": 3.754, "step": 2600 }, { "epoch": 6.7357512953367875, "eval_international_law_loss": 2.637521505355835, "eval_international_law_runtime": 0.2667, "eval_international_law_samples_per_second": 7.499, "eval_international_law_steps_per_second": 3.749, "step": 2600 }, { "epoch": 6.787564766839378, "grad_norm": 0.2421875, "learning_rate": 1.73907742593293e-06, "loss": 0.8395, "step": 2620 }, { "epoch": 6.839378238341969, "grad_norm": 0.220703125, "learning_rate": 1.690783981515648e-06, "loss": 0.8399, "step": 2640 }, { "epoch": 6.891191709844559, "grad_norm": 0.2470703125, "learning_rate": 1.642825544684101e-06, "loss": 0.8302, "step": 2660 }, { "epoch": 6.94300518134715, "grad_norm": 0.201171875, "learning_rate": 1.5952219697661455e-06, "loss": 0.8338, "step": 2680 }, { "epoch": 6.994818652849741, "grad_norm": 0.224609375, "learning_rate": 1.5479929641802492e-06, "loss": 0.837, "step": 2700 }, { "epoch": 6.994818652849741, "eval_main_loss": 0.8372467160224915, "eval_main_runtime": 50.6632, "eval_main_samples_per_second": 30.318, "eval_main_steps_per_second": 3.79, "step": 2700 }, { "epoch": 6.994818652849741, "eval_anatomy_loss": 2.399848222732544, "eval_anatomy_runtime": 0.2663, "eval_anatomy_samples_per_second": 7.512, "eval_anatomy_steps_per_second": 3.756, "step": 2700 }, { "epoch": 6.994818652849741, "eval_college_mathematics_loss": 1.7486381530761719, "eval_college_mathematics_runtime": 0.268, "eval_college_mathematics_samples_per_second": 7.464, "eval_college_mathematics_steps_per_second": 3.732, "step": 2700 }, { "epoch": 6.994818652849741, "eval_international_law_loss": 2.637385845184326, "eval_international_law_runtime": 0.2665, "eval_international_law_samples_per_second": 7.504, "eval_international_law_steps_per_second": 3.752, "step": 2700 }, { "epoch": 7.046632124352332, "grad_norm": 0.2431640625, "learning_rate": 1.5011580802768048e-06, "loss": 0.8392, "step": 2720 }, { "epoch": 7.098445595854923, "grad_norm": 0.2333984375, "learning_rate": 1.4547367072436519e-06, "loss": 0.8326, "step": 2740 }, { "epoch": 7.150259067357513, "grad_norm": 0.2275390625, "learning_rate": 1.4087480630791405e-06, "loss": 0.8324, "step": 2760 }, { "epoch": 7.2020725388601035, "grad_norm": 0.2333984375, "learning_rate": 1.3632111866360585e-06, "loss": 0.8309, "step": 2780 }, { "epoch": 7.253886010362694, "grad_norm": 0.2265625, "learning_rate": 1.318144929739743e-06, "loss": 0.8292, "step": 2800 }, { "epoch": 7.253886010362694, "eval_main_loss": 0.8372478485107422, "eval_main_runtime": 50.677, "eval_main_samples_per_second": 30.31, "eval_main_steps_per_second": 3.789, "step": 2800 }, { "epoch": 7.253886010362694, "eval_anatomy_loss": 2.3977112770080566, "eval_anatomy_runtime": 0.2672, "eval_anatomy_samples_per_second": 7.484, "eval_anatomy_steps_per_second": 3.742, "step": 2800 }, { "epoch": 7.253886010362694, "eval_college_mathematics_loss": 1.745300531387329, "eval_college_mathematics_runtime": 0.2659, "eval_college_mathematics_samples_per_second": 7.523, "eval_college_mathematics_steps_per_second": 3.761, "step": 2800 }, { "epoch": 7.253886010362694, "eval_international_law_loss": 2.6362087726593018, "eval_international_law_runtime": 0.2666, "eval_international_law_samples_per_second": 7.502, "eval_international_law_steps_per_second": 3.751, "step": 2800 }, { "epoch": 7.305699481865285, "grad_norm": 0.2451171875, "learning_rate": 1.273567949383601e-06, "loss": 0.8384, "step": 2820 }, { "epoch": 7.357512953367876, "grad_norm": 0.21875, "learning_rate": 1.229498700005295e-06, "loss": 0.8375, "step": 2840 }, { "epoch": 7.409326424870466, "grad_norm": 0.2314453125, "learning_rate": 1.1859554258467843e-06, "loss": 0.8416, "step": 2860 }, { "epoch": 7.461139896373057, "grad_norm": 0.2265625, "learning_rate": 1.1429561534013869e-06, "loss": 0.8367, "step": 2880 }, { "epoch": 7.512953367875648, "grad_norm": 0.2412109375, "learning_rate": 1.1005186839509887e-06, "loss": 0.8372, "step": 2900 }, { "epoch": 7.512953367875648, "eval_main_loss": 0.8372209668159485, "eval_main_runtime": 50.6545, "eval_main_samples_per_second": 30.323, "eval_main_steps_per_second": 3.79, "step": 2900 }, { "epoch": 7.512953367875648, "eval_anatomy_loss": 2.3963139057159424, "eval_anatomy_runtime": 0.2679, "eval_anatomy_samples_per_second": 7.464, "eval_anatomy_steps_per_second": 3.732, "step": 2900 }, { "epoch": 7.512953367875648, "eval_college_mathematics_loss": 1.7459056377410889, "eval_college_mathematics_runtime": 0.2655, "eval_college_mathematics_samples_per_second": 7.534, "eval_college_mathematics_steps_per_second": 3.767, "step": 2900 }, { "epoch": 7.512953367875648, "eval_international_law_loss": 2.6337826251983643, "eval_international_law_runtime": 0.2662, "eval_international_law_samples_per_second": 7.512, "eval_international_law_steps_per_second": 3.756, "step": 2900 }, { "epoch": 7.564766839378239, "grad_norm": 0.2138671875, "learning_rate": 1.0586605861964804e-06, "loss": 0.8313, "step": 2920 }, { "epoch": 7.616580310880829, "grad_norm": 0.2431640625, "learning_rate": 1.01739918898449e-06, "loss": 0.8346, "step": 2940 }, { "epoch": 7.668393782383419, "grad_norm": 0.21875, "learning_rate": 9.767515741334039e-07, "loss": 0.8372, "step": 2960 }, { "epoch": 7.72020725388601, "grad_norm": 0.2177734375, "learning_rate": 9.367345693616625e-07, "loss": 0.8343, "step": 2980 }, { "epoch": 7.772020725388601, "grad_norm": 0.24609375, "learning_rate": 8.973647413212494e-07, "loss": 0.8441, "step": 3000 }, { "epoch": 7.772020725388601, "eval_main_loss": 0.8372817039489746, "eval_main_runtime": 50.6245, "eval_main_samples_per_second": 30.341, "eval_main_steps_per_second": 3.793, "step": 3000 }, { "epoch": 7.772020725388601, "eval_anatomy_loss": 2.3954248428344727, "eval_anatomy_runtime": 0.2651, "eval_anatomy_samples_per_second": 7.545, "eval_anatomy_steps_per_second": 3.773, "step": 3000 }, { "epoch": 7.772020725388601, "eval_college_mathematics_loss": 1.744511365890503, "eval_college_mathematics_runtime": 0.2656, "eval_college_mathematics_samples_per_second": 7.53, "eval_college_mathematics_steps_per_second": 3.765, "step": 3000 }, { "epoch": 7.772020725388601, "eval_international_law_loss": 2.635951280593872, "eval_international_law_runtime": 0.2679, "eval_international_law_samples_per_second": 7.464, "eval_international_law_steps_per_second": 3.732, "step": 3000 }, { "epoch": 7.823834196891192, "grad_norm": 0.22265625, "learning_rate": 8.586583887392546e-07, "loss": 0.8383, "step": 3020 }, { "epoch": 7.875647668393782, "grad_norm": 0.2177734375, "learning_rate": 8.206315356703634e-07, "loss": 0.8312, "step": 3040 }, { "epoch": 7.927461139896373, "grad_norm": 0.2265625, "learning_rate": 7.832999248630479e-07, "loss": 0.8401, "step": 3060 }, { "epoch": 7.979274611398964, "grad_norm": 0.2490234375, "learning_rate": 7.466790112422257e-07, "loss": 0.8283, "step": 3080 }, { "epoch": 8.031088082901555, "grad_norm": 0.2216796875, "learning_rate": 7.107839555110707e-07, "loss": 0.8378, "step": 3100 }, { "epoch": 8.031088082901555, "eval_main_loss": 0.8372604846954346, "eval_main_runtime": 50.6329, "eval_main_samples_per_second": 30.336, "eval_main_steps_per_second": 3.792, "step": 3100 }, { "epoch": 8.031088082901555, "eval_anatomy_loss": 2.397829532623291, "eval_anatomy_runtime": 0.2664, "eval_anatomy_samples_per_second": 7.509, "eval_anatomy_steps_per_second": 3.754, "step": 3100 }, { "epoch": 8.031088082901555, "eval_college_mathematics_loss": 1.745050311088562, "eval_college_mathematics_runtime": 0.2662, "eval_college_mathematics_samples_per_second": 7.512, "eval_college_mathematics_steps_per_second": 3.756, "step": 3100 }, { "epoch": 8.031088082901555, "eval_international_law_loss": 2.635841131210327, "eval_international_law_runtime": 0.2664, "eval_international_law_samples_per_second": 7.508, "eval_international_law_steps_per_second": 3.754, "step": 3100 }, { "epoch": 8.082901554404145, "grad_norm": 0.20703125, "learning_rate": 6.756296178746282e-07, "loss": 0.8382, "step": 3120 }, { "epoch": 8.134715025906736, "grad_norm": 0.23828125, "learning_rate": 6.412305518878343e-07, "loss": 0.8411, "step": 3140 }, { "epoch": 8.186528497409327, "grad_norm": 0.2294921875, "learning_rate": 6.076009984304837e-07, "loss": 0.8411, "step": 3160 }, { "epoch": 8.238341968911918, "grad_norm": 0.2216796875, "learning_rate": 5.747548798116451e-07, "loss": 0.8384, "step": 3180 }, { "epoch": 8.290155440414507, "grad_norm": 0.2119140625, "learning_rate": 5.427057940059607e-07, "loss": 0.8304, "step": 3200 }, { "epoch": 8.290155440414507, "eval_main_loss": 0.8372419476509094, "eval_main_runtime": 50.616, "eval_main_samples_per_second": 30.346, "eval_main_steps_per_second": 3.793, "step": 3200 }, { "epoch": 8.290155440414507, "eval_anatomy_loss": 2.398320436477661, "eval_anatomy_runtime": 0.2666, "eval_anatomy_samples_per_second": 7.501, "eval_anatomy_steps_per_second": 3.751, "step": 3200 }, { "epoch": 8.290155440414507, "eval_college_mathematics_loss": 1.7492754459381104, "eval_college_mathematics_runtime": 0.2666, "eval_college_mathematics_samples_per_second": 7.501, "eval_college_mathematics_steps_per_second": 3.75, "step": 3200 }, { "epoch": 8.290155440414507, "eval_international_law_loss": 2.6363821029663086, "eval_international_law_runtime": 0.266, "eval_international_law_samples_per_second": 7.519, "eval_international_law_steps_per_second": 3.76, "step": 3200 }, { "epoch": 8.341968911917098, "grad_norm": 0.2255859375, "learning_rate": 5.11467009024216e-07, "loss": 0.8361, "step": 3220 }, { "epoch": 8.393782383419689, "grad_norm": 0.2294921875, "learning_rate": 4.810514574205125e-07, "loss": 0.8339, "step": 3240 }, { "epoch": 8.44559585492228, "grad_norm": 0.2294921875, "learning_rate": 4.5147173093831264e-07, "loss": 0.8345, "step": 3260 }, { "epoch": 8.49740932642487, "grad_norm": 0.2265625, "learning_rate": 4.227400752975835e-07, "loss": 0.8374, "step": 3280 }, { "epoch": 8.549222797927461, "grad_norm": 0.2294921875, "learning_rate": 3.9486838512518777e-07, "loss": 0.8374, "step": 3300 }, { "epoch": 8.549222797927461, "eval_main_loss": 0.8372399806976318, "eval_main_runtime": 50.5872, "eval_main_samples_per_second": 30.363, "eval_main_steps_per_second": 3.795, "step": 3300 }, { "epoch": 8.549222797927461, "eval_anatomy_loss": 2.3960189819335938, "eval_anatomy_runtime": 0.2667, "eval_anatomy_samples_per_second": 7.498, "eval_anatomy_steps_per_second": 3.749, "step": 3300 }, { "epoch": 8.549222797927461, "eval_college_mathematics_loss": 1.7492492198944092, "eval_college_mathematics_runtime": 0.2676, "eval_college_mathematics_samples_per_second": 7.475, "eval_college_mathematics_steps_per_second": 3.738, "step": 3300 }, { "epoch": 8.549222797927461, "eval_international_law_loss": 2.6380083560943604, "eval_international_law_runtime": 0.2658, "eval_international_law_samples_per_second": 7.525, "eval_international_law_steps_per_second": 3.762, "step": 3300 }, { "epoch": 8.601036269430052, "grad_norm": 0.255859375, "learning_rate": 3.678681990306207e-07, "loss": 0.8359, "step": 3320 }, { "epoch": 8.652849740932643, "grad_norm": 0.2412109375, "learning_rate": 3.4175069482914105e-07, "loss": 0.8284, "step": 3340 }, { "epoch": 8.704663212435234, "grad_norm": 0.2138671875, "learning_rate": 3.165266849142581e-07, "loss": 0.8334, "step": 3360 }, { "epoch": 8.756476683937823, "grad_norm": 0.232421875, "learning_rate": 2.9220661178151366e-07, "loss": 0.8337, "step": 3380 }, { "epoch": 8.808290155440414, "grad_norm": 0.20703125, "learning_rate": 2.688005437053845e-07, "loss": 0.8382, "step": 3400 }, { "epoch": 8.808290155440414, "eval_main_loss": 0.8372331261634827, "eval_main_runtime": 50.6285, "eval_main_samples_per_second": 30.339, "eval_main_steps_per_second": 3.792, "step": 3400 }, { "epoch": 8.808290155440414, "eval_anatomy_loss": 2.39411997795105, "eval_anatomy_runtime": 0.2664, "eval_anatomy_samples_per_second": 7.507, "eval_anatomy_steps_per_second": 3.753, "step": 3400 }, { "epoch": 8.808290155440414, "eval_college_mathematics_loss": 1.7451952695846558, "eval_college_mathematics_runtime": 0.2673, "eval_college_mathematics_samples_per_second": 7.482, "eval_college_mathematics_steps_per_second": 3.741, "step": 3400 }, { "epoch": 8.808290155440414, "eval_international_law_loss": 2.6384801864624023, "eval_international_law_runtime": 0.2666, "eval_international_law_samples_per_second": 7.502, "eval_international_law_steps_per_second": 3.751, "step": 3400 }, { "epoch": 8.860103626943005, "grad_norm": 0.21875, "learning_rate": 2.4631817057111597e-07, "loss": 0.8363, "step": 3420 }, { "epoch": 8.911917098445596, "grad_norm": 0.2255859375, "learning_rate": 2.247687998632031e-07, "loss": 0.8389, "step": 3440 }, { "epoch": 8.963730569948186, "grad_norm": 0.21875, "learning_rate": 2.0416135281218218e-07, "loss": 0.8336, "step": 3460 }, { "epoch": 9.015544041450777, "grad_norm": 0.236328125, "learning_rate": 1.8450436070132889e-07, "loss": 0.8333, "step": 3480 }, { "epoch": 9.067357512953368, "grad_norm": 0.23828125, "learning_rate": 1.6580596133478926e-07, "loss": 0.8373, "step": 3500 }, { "epoch": 9.067357512953368, "eval_main_loss": 0.837254524230957, "eval_main_runtime": 50.6558, "eval_main_samples_per_second": 30.322, "eval_main_steps_per_second": 3.79, "step": 3500 }, { "epoch": 9.067357512953368, "eval_anatomy_loss": 2.395867347717285, "eval_anatomy_runtime": 0.2665, "eval_anatomy_samples_per_second": 7.506, "eval_anatomy_steps_per_second": 3.753, "step": 3500 }, { "epoch": 9.067357512953368, "eval_college_mathematics_loss": 1.7460345029830933, "eval_college_mathematics_runtime": 0.267, "eval_college_mathematics_samples_per_second": 7.491, "eval_college_mathematics_steps_per_second": 3.746, "step": 3500 }, { "epoch": 9.067357512953368, "eval_international_law_loss": 2.6364336013793945, "eval_international_law_runtime": 0.2659, "eval_international_law_samples_per_second": 7.522, "eval_international_law_steps_per_second": 3.761, "step": 3500 }, { "epoch": 9.119170984455959, "grad_norm": 0.2431640625, "learning_rate": 1.4807389566860675e-07, "loss": 0.838, "step": 3520 }, { "epoch": 9.17098445595855, "grad_norm": 0.2158203125, "learning_rate": 1.3131550460604242e-07, "loss": 0.8266, "step": 3540 }, { "epoch": 9.22279792746114, "grad_norm": 0.224609375, "learning_rate": 1.1553772595851109e-07, "loss": 0.8412, "step": 3560 }, { "epoch": 9.27461139896373, "grad_norm": 0.2412109375, "learning_rate": 1.0074709157339657e-07, "loss": 0.8361, "step": 3580 }, { "epoch": 9.32642487046632, "grad_norm": 0.2197265625, "learning_rate": 8.694972462992918e-08, "loss": 0.8373, "step": 3600 }, { "epoch": 9.32642487046632, "eval_main_loss": 0.8372175693511963, "eval_main_runtime": 50.62, "eval_main_samples_per_second": 30.344, "eval_main_steps_per_second": 3.793, "step": 3600 }, { "epoch": 9.32642487046632, "eval_anatomy_loss": 2.394951105117798, "eval_anatomy_runtime": 0.2668, "eval_anatomy_samples_per_second": 7.496, "eval_anatomy_steps_per_second": 3.748, "step": 3600 }, { "epoch": 9.32642487046632, "eval_college_mathematics_loss": 1.7458065748214722, "eval_college_mathematics_runtime": 0.2669, "eval_college_mathematics_samples_per_second": 7.494, "eval_college_mathematics_steps_per_second": 3.747, "step": 3600 }, { "epoch": 9.32642487046632, "eval_international_law_loss": 2.639335870742798, "eval_international_law_runtime": 0.2665, "eval_international_law_samples_per_second": 7.505, "eval_international_law_steps_per_second": 3.752, "step": 3600 }, { "epoch": 9.378238341968911, "grad_norm": 0.2431640625, "learning_rate": 7.415133710424794e-08, "loss": 0.8335, "step": 3620 }, { "epoch": 9.430051813471502, "grad_norm": 0.248046875, "learning_rate": 6.235722740469936e-08, "loss": 0.8379, "step": 3640 }, { "epoch": 9.481865284974093, "grad_norm": 0.23046875, "learning_rate": 5.157227817834648e-08, "loss": 0.831, "step": 3660 }, { "epoch": 9.533678756476684, "grad_norm": 0.2138671875, "learning_rate": 4.180095428960168e-08, "loss": 0.8399, "step": 3680 }, { "epoch": 9.585492227979275, "grad_norm": 0.2216796875, "learning_rate": 3.304730097181463e-08, "loss": 0.8428, "step": 3700 }, { "epoch": 9.585492227979275, "eval_main_loss": 0.8372709155082703, "eval_main_runtime": 50.6434, "eval_main_samples_per_second": 30.33, "eval_main_steps_per_second": 3.791, "step": 3700 }, { "epoch": 9.585492227979275, "eval_anatomy_loss": 2.3959925174713135, "eval_anatomy_runtime": 0.2686, "eval_anatomy_samples_per_second": 7.447, "eval_anatomy_steps_per_second": 3.723, "step": 3700 }, { "epoch": 9.585492227979275, "eval_college_mathematics_loss": 1.7489873170852661, "eval_college_mathematics_runtime": 0.2656, "eval_college_mathematics_samples_per_second": 7.529, "eval_college_mathematics_steps_per_second": 3.765, "step": 3700 }, { "epoch": 9.585492227979275, "eval_international_law_loss": 2.6358718872070312, "eval_international_law_runtime": 0.2658, "eval_international_law_samples_per_second": 7.525, "eval_international_law_steps_per_second": 3.763, "step": 3700 }, { "epoch": 9.637305699481866, "grad_norm": 0.2373046875, "learning_rate": 2.5314942152586954e-08, "loss": 0.8366, "step": 3720 }, { "epoch": 9.689119170984457, "grad_norm": 0.208984375, "learning_rate": 1.8607078953498392e-08, "loss": 0.8386, "step": 3740 }, { "epoch": 9.740932642487046, "grad_norm": 0.2421875, "learning_rate": 1.292648836487609e-08, "loss": 0.8321, "step": 3760 }, { "epoch": 9.792746113989637, "grad_norm": 0.2412109375, "learning_rate": 8.275522096146404e-09, "loss": 0.8342, "step": 3780 }, { "epoch": 9.844559585492227, "grad_norm": 0.2275390625, "learning_rate": 4.656105602250382e-09, "loss": 0.8404, "step": 3800 }, { "epoch": 9.844559585492227, "eval_main_loss": 0.8372488021850586, "eval_main_runtime": 50.6539, "eval_main_samples_per_second": 30.323, "eval_main_steps_per_second": 3.79, "step": 3800 }, { "epoch": 9.844559585492227, "eval_anatomy_loss": 2.3990156650543213, "eval_anatomy_runtime": 0.2664, "eval_anatomy_samples_per_second": 7.508, "eval_anatomy_steps_per_second": 3.754, "step": 3800 }, { "epoch": 9.844559585492227, "eval_college_mathematics_loss": 1.7470096349716187, "eval_college_mathematics_runtime": 0.2654, "eval_college_mathematics_samples_per_second": 7.535, "eval_college_mathematics_steps_per_second": 3.767, "step": 3800 }, { "epoch": 9.844559585492227, "eval_international_law_loss": 2.6360206604003906, "eval_international_law_runtime": 0.2672, "eval_international_law_samples_per_second": 7.485, "eval_international_law_steps_per_second": 3.743, "step": 3800 }, { "epoch": 9.896373056994818, "grad_norm": 0.2138671875, "learning_rate": 2.0697372865235986e-09, "loss": 0.8301, "step": 3820 }, { "epoch": 9.94818652849741, "grad_norm": 0.2265625, "learning_rate": 5.174878803720917e-10, "loss": 0.8299, "step": 3840 }, { "epoch": 10.0, "grad_norm": 0.201171875, "learning_rate": 0.0, "loss": 0.832, "step": 3860 } ], "logging_steps": 20, "max_steps": 3860, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.8346399786239263e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }