diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13025 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 11.550545185732766, + "global_step": 500000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "FLOPS loss": 7.404106872854754e-05, + "L0_d": 21035.61, + "MLM loss": 8.70656681060791, + "epoch": 0.01, + "step": 499 + }, + { + "epoch": 0.01, + "learning_rate": 5e-06, + "loss": 9.4402, + "step": 500 + }, + { + "FLOPS loss": 0.0008858467335812747, + "L0_d": 27735.09, + "MLM loss": 7.479305267333984, + "epoch": 0.02, + "step": 999 + }, + { + "epoch": 0.02, + "learning_rate": 1e-05, + "loss": 7.9955, + "step": 1000 + }, + { + "FLOPS loss": 0.002816247520968318, + "L0_d": 28068.22, + "MLM loss": 7.293828010559082, + "epoch": 0.03, + "step": 1499 + }, + { + "epoch": 0.03, + "learning_rate": 1.5e-05, + "loss": 7.3438, + "step": 1500 + }, + { + "FLOPS loss": 0.00617683120071888, + "L0_d": 28429.55, + "MLM loss": 7.137616157531738, + "epoch": 0.05, + "step": 1999 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 7.1243, + "step": 2000 + }, + { + "FLOPS loss": 0.011358148418366909, + "L0_d": 29154.48, + "MLM loss": 6.879582405090332, + "epoch": 0.06, + "step": 2499 + }, + { + "epoch": 0.06, + "learning_rate": 2.5e-05, + "loss": 6.9824, + "step": 2500 + }, + { + "FLOPS loss": 0.018066763877868652, + "L0_d": 29186.34, + "MLM loss": 6.785317420959473, + "epoch": 0.07, + "step": 2999 + }, + { + "epoch": 0.07, + "learning_rate": 3e-05, + "loss": 6.8578, + "step": 3000 + }, + { + "FLOPS loss": 0.02440321072936058, + "L0_d": 28600.52, + "MLM loss": 6.300593376159668, + "epoch": 0.08, + "step": 3499 + }, + { + "epoch": 0.08, + "learning_rate": 3.5e-05, + "loss": 6.5223, + "step": 3500 + }, + { + "FLOPS loss": 0.03402633219957352, + "L0_d": 28263.14, + "MLM loss": 5.7942047119140625, + "epoch": 0.09, + "step": 3999 + }, + { + "epoch": 0.09, + "learning_rate": 4e-05, + "loss": 6.0331, + "step": 4000 + }, + { + "FLOPS loss": 0.04102250933647156, + "L0_d": 27604.8, + "MLM loss": 5.463202953338623, + "epoch": 0.1, + "step": 4499 + }, + { + "epoch": 0.1, + "learning_rate": 4.5e-05, + "loss": 5.6372, + "step": 4500 + }, + { + "FLOPS loss": 0.048070844262838364, + "L0_d": 27568.05, + "MLM loss": 5.396732330322266, + "epoch": 0.12, + "step": 4999 + }, + { + "epoch": 0.12, + "learning_rate": 5e-05, + "loss": 5.346, + "step": 5000 + }, + { + "FLOPS loss": 0.053655628114938736, + "L0_d": 26272.8, + "MLM loss": 4.88920783996582, + "epoch": 0.13, + "step": 5499 + }, + { + "epoch": 0.13, + "learning_rate": 5.500000000000001e-05, + "loss": 5.1141, + "step": 5500 + }, + { + "FLOPS loss": 0.05216597020626068, + "L0_d": 23949.03, + "MLM loss": 4.8225998878479, + "epoch": 0.14, + "step": 5999 + }, + { + "epoch": 0.14, + "learning_rate": 6e-05, + "loss": 4.9298, + "step": 6000 + }, + { + "FLOPS loss": 0.045533325523138046, + "L0_d": 21281.05, + "MLM loss": 4.535486221313477, + "epoch": 0.15, + "step": 6499 + }, + { + "epoch": 0.15, + "learning_rate": 6.499000000000001e-05, + "loss": 4.7614, + "step": 6500 + }, + { + "FLOPS loss": 0.04692893847823143, + "L0_d": 19877.23, + "MLM loss": 4.604552268981934, + "epoch": 0.16, + "step": 6999 + }, + { + "epoch": 0.16, + "learning_rate": 6.999e-05, + "loss": 4.622, + "step": 7000 + }, + { + "FLOPS loss": 0.0333898551762104, + "L0_d": 15760.62, + "MLM loss": 4.219477653503418, + "epoch": 0.17, + "step": 7499 + }, + { + "epoch": 0.17, + "learning_rate": 7.499e-05, + "loss": 4.4779, + "step": 7500 + }, + { + "FLOPS loss": 0.0419338159263134, + "L0_d": 16492.83, + "MLM loss": 4.006763458251953, + "epoch": 0.18, + "step": 7999 + }, + { + "epoch": 0.18, + "learning_rate": 7.999000000000001e-05, + "loss": 4.3399, + "step": 8000 + }, + { + "FLOPS loss": 0.038417913019657135, + "L0_d": 14914.7, + "MLM loss": 4.344199180603027, + "epoch": 0.2, + "step": 8499 + }, + { + "epoch": 0.2, + "learning_rate": 8.499e-05, + "loss": 4.1982, + "step": 8500 + }, + { + "FLOPS loss": 0.0357012003660202, + "L0_d": 11732.44, + "MLM loss": 3.9161806106567383, + "epoch": 0.21, + "step": 8999 + }, + { + "epoch": 0.21, + "learning_rate": 8.999000000000001e-05, + "loss": 4.0719, + "step": 9000 + }, + { + "FLOPS loss": 0.027552934363484383, + "L0_d": 10824.06, + "MLM loss": 3.7745988368988037, + "epoch": 0.22, + "step": 9499 + }, + { + "epoch": 0.22, + "learning_rate": 9.499e-05, + "loss": 3.9603, + "step": 9500 + }, + { + "FLOPS loss": 0.02538445219397545, + "L0_d": 9813.38, + "MLM loss": 3.884188175201416, + "epoch": 0.23, + "step": 9999 + }, + { + "epoch": 0.23, + "learning_rate": 9.999000000000001e-05, + "loss": 3.876, + "step": 10000 + }, + { + "FLOPS loss": 0.025783885270357132, + "L0_d": 8739.94, + "MLM loss": 3.7749428749084473, + "epoch": 0.24, + "step": 10499 + }, + { + "epoch": 0.24, + "learning_rate": 9.989836734693878e-05, + "loss": 3.795, + "step": 10500 + }, + { + "FLOPS loss": 0.030093064531683922, + "L0_d": 8860.89, + "MLM loss": 3.7490487098693848, + "epoch": 0.25, + "step": 10999 + }, + { + "epoch": 0.25, + "learning_rate": 9.979632653061225e-05, + "loss": 3.721, + "step": 11000 + }, + { + "FLOPS loss": 0.028102116659283638, + "L0_d": 8556.38, + "MLM loss": 3.7181925773620605, + "epoch": 0.27, + "step": 11499 + }, + { + "epoch": 0.27, + "learning_rate": 9.969428571428572e-05, + "loss": 3.6627, + "step": 11500 + }, + { + "FLOPS loss": 0.03335424140095711, + "L0_d": 8712.69, + "MLM loss": 3.4150214195251465, + "epoch": 0.28, + "step": 11999 + }, + { + "epoch": 0.28, + "learning_rate": 9.959224489795919e-05, + "loss": 3.5984, + "step": 12000 + }, + { + "FLOPS loss": 0.029852230101823807, + "L0_d": 7392.5, + "MLM loss": 3.384998083114624, + "epoch": 0.29, + "step": 12499 + }, + { + "epoch": 0.29, + "learning_rate": 9.949020408163265e-05, + "loss": 3.5447, + "step": 12500 + }, + { + "FLOPS loss": 0.03067166358232498, + "L0_d": 7107.91, + "MLM loss": 3.4410061836242676, + "epoch": 0.3, + "step": 12999 + }, + { + "epoch": 0.3, + "learning_rate": 9.938836734693879e-05, + "loss": 3.5081, + "step": 13000 + }, + { + "FLOPS loss": 0.030213626101613045, + "L0_d": 6799.55, + "MLM loss": 3.4604804515838623, + "epoch": 0.31, + "step": 13499 + }, + { + "epoch": 0.31, + "learning_rate": 9.928632653061225e-05, + "loss": 3.4544, + "step": 13500 + }, + { + "FLOPS loss": 0.03034152090549469, + "L0_d": 6092.02, + "MLM loss": 3.4961204528808594, + "epoch": 0.32, + "step": 13999 + }, + { + "epoch": 0.32, + "learning_rate": 9.918428571428572e-05, + "loss": 3.4262, + "step": 14000 + }, + { + "FLOPS loss": 0.026655618101358414, + "L0_d": 5715.41, + "MLM loss": 3.321794033050537, + "epoch": 0.33, + "step": 14499 + }, + { + "epoch": 0.33, + "learning_rate": 9.908224489795918e-05, + "loss": 3.3953, + "step": 14500 + }, + { + "FLOPS loss": 0.028114214539527893, + "L0_d": 5818.91, + "MLM loss": 3.3361945152282715, + "epoch": 0.35, + "step": 14999 + }, + { + "epoch": 0.35, + "learning_rate": 9.898040816326532e-05, + "loss": 3.3651, + "step": 15000 + }, + { + "FLOPS loss": 0.02521996572613716, + "L0_d": 4736.66, + "MLM loss": 3.268618106842041, + "epoch": 0.36, + "step": 15499 + }, + { + "epoch": 0.36, + "learning_rate": 9.887836734693878e-05, + "loss": 3.3403, + "step": 15500 + }, + { + "FLOPS loss": 0.029343057423830032, + "L0_d": 4741.28, + "MLM loss": 3.2674968242645264, + "epoch": 0.37, + "step": 15999 + }, + { + "epoch": 0.37, + "learning_rate": 9.877632653061225e-05, + "loss": 3.3179, + "step": 16000 + }, + { + "FLOPS loss": 0.029638290405273438, + "L0_d": 4553.05, + "MLM loss": 3.4543089866638184, + "epoch": 0.38, + "step": 16499 + }, + { + "epoch": 0.38, + "learning_rate": 9.867428571428572e-05, + "loss": 3.2948, + "step": 16500 + }, + { + "FLOPS loss": 0.039912909269332886, + "L0_d": 5712.59, + "MLM loss": 3.241018056869507, + "epoch": 0.39, + "step": 16999 + }, + { + "epoch": 0.39, + "learning_rate": 9.857244897959183e-05, + "loss": 3.2752, + "step": 17000 + }, + { + "FLOPS loss": 0.035952694714069366, + "L0_d": 4726.56, + "MLM loss": 3.2455244064331055, + "epoch": 0.4, + "step": 17499 + }, + { + "epoch": 0.4, + "learning_rate": 9.847040816326531e-05, + "loss": 3.2526, + "step": 17500 + }, + { + "FLOPS loss": 0.027841633185744286, + "L0_d": 3770.58, + "MLM loss": 3.104790449142456, + "epoch": 0.42, + "step": 17999 + }, + { + "epoch": 0.42, + "learning_rate": 9.836836734693879e-05, + "loss": 3.2298, + "step": 18000 + }, + { + "FLOPS loss": 0.042448658496141434, + "L0_d": 4327.95, + "MLM loss": 3.2372331619262695, + "epoch": 0.43, + "step": 18499 + }, + { + "epoch": 0.43, + "learning_rate": 9.826632653061225e-05, + "loss": 3.218, + "step": 18500 + }, + { + "FLOPS loss": 0.03443501517176628, + "L0_d": 3733.66, + "MLM loss": 3.439894676208496, + "epoch": 0.44, + "step": 18999 + }, + { + "epoch": 0.44, + "learning_rate": 9.816448979591837e-05, + "loss": 3.1942, + "step": 19000 + }, + { + "FLOPS loss": 0.03354410454630852, + "L0_d": 3822.12, + "MLM loss": 3.2109413146972656, + "epoch": 0.45, + "step": 19499 + }, + { + "epoch": 0.45, + "learning_rate": 9.806244897959184e-05, + "loss": 3.183, + "step": 19500 + }, + { + "FLOPS loss": 0.0343574658036232, + "L0_d": 3360.17, + "MLM loss": 3.143649101257324, + "epoch": 0.46, + "step": 19999 + }, + { + "epoch": 0.46, + "learning_rate": 9.796040816326532e-05, + "loss": 3.1615, + "step": 20000 + }, + { + "FLOPS loss": 0.032427407801151276, + "L0_d": 3297.11, + "MLM loss": 3.3535146713256836, + "epoch": 0.47, + "step": 20499 + }, + { + "epoch": 0.47, + "learning_rate": 9.785857142857144e-05, + "loss": 3.1547, + "step": 20500 + }, + { + "FLOPS loss": 0.030162997543811798, + "L0_d": 2771.8, + "MLM loss": 3.0056474208831787, + "epoch": 0.49, + "step": 20999 + }, + { + "epoch": 0.49, + "learning_rate": 9.77565306122449e-05, + "loss": 3.1396, + "step": 21000 + }, + { + "FLOPS loss": 0.031429558992385864, + "L0_d": 2561.09, + "MLM loss": 2.9690394401550293, + "epoch": 0.5, + "step": 21499 + }, + { + "epoch": 0.5, + "learning_rate": 9.765448979591837e-05, + "loss": 3.1262, + "step": 21500 + }, + { + "FLOPS loss": 0.033806342631578445, + "L0_d": 2472.33, + "MLM loss": 3.2172610759735107, + "epoch": 0.51, + "step": 21999 + }, + { + "epoch": 0.51, + "learning_rate": 9.755244897959183e-05, + "loss": 3.1159, + "step": 22000 + }, + { + "FLOPS loss": 0.03768898546695709, + "L0_d": 3027.31, + "MLM loss": 3.0621213912963867, + "epoch": 0.52, + "step": 22499 + }, + { + "epoch": 0.52, + "learning_rate": 9.745061224489797e-05, + "loss": 3.1082, + "step": 22500 + }, + { + "FLOPS loss": 0.04951038211584091, + "L0_d": 3470.08, + "MLM loss": 3.0145416259765625, + "epoch": 0.53, + "step": 22999 + }, + { + "epoch": 0.53, + "learning_rate": 9.734857142857143e-05, + "loss": 3.0992, + "step": 23000 + }, + { + "FLOPS loss": 0.032062649726867676, + "L0_d": 1964.75, + "MLM loss": 2.9846644401550293, + "epoch": 0.54, + "step": 23499 + }, + { + "epoch": 0.54, + "learning_rate": 9.724653061224491e-05, + "loss": 3.0815, + "step": 23500 + }, + { + "FLOPS loss": 0.0394759438931942, + "L0_d": 2687.91, + "MLM loss": 3.1492552757263184, + "epoch": 0.55, + "step": 23999 + }, + { + "epoch": 0.55, + "learning_rate": 9.714448979591837e-05, + "loss": 3.0752, + "step": 24000 + }, + { + "FLOPS loss": 0.046340227127075195, + "L0_d": 2852.02, + "MLM loss": 2.789968490600586, + "epoch": 0.57, + "step": 24499 + }, + { + "epoch": 0.57, + "learning_rate": 9.70426530612245e-05, + "loss": 3.0647, + "step": 24500 + }, + { + "FLOPS loss": 0.04598288983106613, + "L0_d": 2861.27, + "MLM loss": 2.9972972869873047, + "epoch": 0.58, + "step": 24999 + }, + { + "epoch": 0.58, + "learning_rate": 9.694061224489797e-05, + "loss": 3.0544, + "step": 25000 + }, + { + "FLOPS loss": 0.04686904326081276, + "L0_d": 2361.75, + "MLM loss": 2.7404046058654785, + "epoch": 0.59, + "step": 25499 + }, + { + "epoch": 0.59, + "learning_rate": 9.683857142857144e-05, + "loss": 3.0444, + "step": 25500 + }, + { + "FLOPS loss": 0.04754837229847908, + "L0_d": 2383.89, + "MLM loss": 2.9416728019714355, + "epoch": 0.6, + "step": 25999 + }, + { + "epoch": 0.6, + "learning_rate": 9.67365306122449e-05, + "loss": 3.0383, + "step": 26000 + }, + { + "FLOPS loss": 0.05846235528588295, + "L0_d": 2928.42, + "MLM loss": 2.9113664627075195, + "epoch": 0.61, + "step": 26499 + }, + { + "epoch": 0.61, + "learning_rate": 9.663448979591837e-05, + "loss": 3.0277, + "step": 26500 + }, + { + "FLOPS loss": 0.04545710235834122, + "L0_d": 2646.41, + "MLM loss": 2.850696563720703, + "epoch": 0.62, + "step": 26999 + }, + { + "epoch": 0.62, + "learning_rate": 9.65326530612245e-05, + "loss": 3.0261, + "step": 27000 + }, + { + "FLOPS loss": 0.04402191564440727, + "L0_d": 1969.67, + "MLM loss": 2.861927032470703, + "epoch": 0.64, + "step": 27499 + }, + { + "epoch": 0.64, + "learning_rate": 9.643061224489796e-05, + "loss": 3.0153, + "step": 27500 + }, + { + "FLOPS loss": 0.04754064232110977, + "L0_d": 1777.67, + "MLM loss": 3.047110080718994, + "epoch": 0.65, + "step": 27999 + }, + { + "epoch": 0.65, + "learning_rate": 9.632857142857143e-05, + "loss": 3.0075, + "step": 28000 + }, + { + "FLOPS loss": 0.042845577001571655, + "L0_d": 1711.33, + "MLM loss": 2.8969533443450928, + "epoch": 0.66, + "step": 28499 + }, + { + "epoch": 0.66, + "learning_rate": 9.622653061224491e-05, + "loss": 3.0014, + "step": 28500 + }, + { + "FLOPS loss": 0.051014166325330734, + "L0_d": 2422.53, + "MLM loss": 3.259045362472534, + "epoch": 0.67, + "step": 28999 + }, + { + "epoch": 0.67, + "learning_rate": 9.612469387755101e-05, + "loss": 2.9962, + "step": 29000 + }, + { + "FLOPS loss": 0.05882557854056358, + "L0_d": 2171.72, + "MLM loss": 3.0199313163757324, + "epoch": 0.68, + "step": 29499 + }, + { + "epoch": 0.68, + "learning_rate": 9.602265306122449e-05, + "loss": 2.9811, + "step": 29500 + }, + { + "FLOPS loss": 0.06353907287120819, + "L0_d": 2784.38, + "MLM loss": 2.922330617904663, + "epoch": 0.69, + "step": 29999 + }, + { + "epoch": 0.69, + "learning_rate": 9.592061224489797e-05, + "loss": 2.9847, + "step": 30000 + }, + { + "FLOPS loss": 0.06058761849999428, + "L0_d": 2210.28, + "MLM loss": 2.9217536449432373, + "epoch": 0.7, + "step": 30499 + }, + { + "epoch": 0.7, + "learning_rate": 9.581857142857144e-05, + "loss": 2.974, + "step": 30500 + }, + { + "FLOPS loss": 0.049193985760211945, + "L0_d": 1656.2, + "MLM loss": 2.7800402641296387, + "epoch": 0.72, + "step": 30999 + }, + { + "epoch": 0.72, + "learning_rate": 9.571673469387756e-05, + "loss": 2.9664, + "step": 31000 + }, + { + "FLOPS loss": 0.06122875213623047, + "L0_d": 1938.58, + "MLM loss": 2.921651840209961, + "epoch": 0.73, + "step": 31499 + }, + { + "epoch": 0.73, + "learning_rate": 9.561469387755102e-05, + "loss": 2.9618, + "step": 31500 + }, + { + "FLOPS loss": 0.061618685722351074, + "L0_d": 2109.42, + "MLM loss": 2.9213171005249023, + "epoch": 0.74, + "step": 31999 + }, + { + "epoch": 0.74, + "learning_rate": 9.55126530612245e-05, + "loss": 2.9624, + "step": 32000 + }, + { + "FLOPS loss": 0.0622367262840271, + "L0_d": 1870.62, + "MLM loss": 2.93810772895813, + "epoch": 0.75, + "step": 32499 + }, + { + "epoch": 0.75, + "learning_rate": 9.541061224489796e-05, + "loss": 2.9546, + "step": 32500 + }, + { + "FLOPS loss": 0.06546282023191452, + "L0_d": 1941.95, + "MLM loss": 2.7924461364746094, + "epoch": 0.76, + "step": 32999 + }, + { + "epoch": 0.76, + "learning_rate": 9.530877551020408e-05, + "loss": 2.9479, + "step": 33000 + }, + { + "FLOPS loss": 0.05119035392999649, + "L0_d": 1321.45, + "MLM loss": 3.0193562507629395, + "epoch": 0.77, + "step": 33499 + }, + { + "epoch": 0.77, + "learning_rate": 9.520673469387755e-05, + "loss": 2.945, + "step": 33500 + }, + { + "FLOPS loss": 0.05955345928668976, + "L0_d": 1532.5, + "MLM loss": 2.9778778553009033, + "epoch": 0.79, + "step": 33999 + }, + { + "epoch": 0.79, + "learning_rate": 9.510469387755101e-05, + "loss": 2.937, + "step": 34000 + }, + { + "FLOPS loss": 0.06750689446926117, + "L0_d": 1965.52, + "MLM loss": 2.970550537109375, + "epoch": 0.8, + "step": 34499 + }, + { + "epoch": 0.8, + "learning_rate": 9.500265306122449e-05, + "loss": 2.9383, + "step": 34500 + }, + { + "FLOPS loss": 0.057339150458574295, + "L0_d": 1509.08, + "MLM loss": 3.053955078125, + "epoch": 0.81, + "step": 34999 + }, + { + "epoch": 0.81, + "learning_rate": 9.490081632653061e-05, + "loss": 2.9327, + "step": 35000 + }, + { + "FLOPS loss": 0.060432739555835724, + "L0_d": 1440.39, + "MLM loss": 2.9338531494140625, + "epoch": 0.82, + "step": 35499 + }, + { + "epoch": 0.82, + "learning_rate": 9.479877551020409e-05, + "loss": 2.9273, + "step": 35500 + }, + { + "FLOPS loss": 0.05953816697001457, + "L0_d": 1626.62, + "MLM loss": 2.8334543704986572, + "epoch": 0.83, + "step": 35999 + }, + { + "epoch": 0.83, + "learning_rate": 9.469673469387756e-05, + "loss": 2.9226, + "step": 36000 + }, + { + "FLOPS loss": 0.0538330040872097, + "L0_d": 1472.11, + "MLM loss": 3.0895233154296875, + "epoch": 0.84, + "step": 36499 + }, + { + "epoch": 0.84, + "learning_rate": 9.459469387755102e-05, + "loss": 2.9109, + "step": 36500 + }, + { + "FLOPS loss": 0.07072566449642181, + "L0_d": 1756.19, + "MLM loss": 2.800241470336914, + "epoch": 0.85, + "step": 36999 + }, + { + "epoch": 0.85, + "learning_rate": 9.449285714285716e-05, + "loss": 2.9108, + "step": 37000 + }, + { + "FLOPS loss": 0.055461227893829346, + "L0_d": 1410.72, + "MLM loss": 3.147261381149292, + "epoch": 0.87, + "step": 37499 + }, + { + "epoch": 0.87, + "learning_rate": 9.439081632653062e-05, + "loss": 2.9103, + "step": 37500 + }, + { + "FLOPS loss": 0.05898945778608322, + "L0_d": 1273.66, + "MLM loss": 2.9772825241088867, + "epoch": 0.88, + "step": 37999 + }, + { + "epoch": 0.88, + "learning_rate": 9.428877551020408e-05, + "loss": 2.9015, + "step": 38000 + }, + { + "FLOPS loss": 0.06924092024564743, + "L0_d": 1601.7, + "MLM loss": 2.562720775604248, + "epoch": 0.89, + "step": 38499 + }, + { + "epoch": 0.89, + "learning_rate": 9.418673469387755e-05, + "loss": 2.9001, + "step": 38500 + }, + { + "FLOPS loss": 0.07076826691627502, + "L0_d": 1320.19, + "MLM loss": 2.9359970092773438, + "epoch": 0.9, + "step": 38999 + }, + { + "epoch": 0.9, + "learning_rate": 9.408489795918368e-05, + "loss": 2.9028, + "step": 39000 + }, + { + "FLOPS loss": 0.06523440033197403, + "L0_d": 1285.02, + "MLM loss": 2.816526412963867, + "epoch": 0.91, + "step": 39499 + }, + { + "epoch": 0.91, + "learning_rate": 9.398285714285715e-05, + "loss": 2.892, + "step": 39500 + }, + { + "FLOPS loss": 0.058553751558065414, + "L0_d": 1458.62, + "MLM loss": 2.819638729095459, + "epoch": 0.92, + "step": 39999 + }, + { + "epoch": 0.92, + "learning_rate": 9.388081632653061e-05, + "loss": 2.8847, + "step": 40000 + }, + { + "FLOPS loss": 0.053785067051649094, + "L0_d": 1089.17, + "MLM loss": 2.7337148189544678, + "epoch": 0.94, + "step": 40499 + }, + { + "epoch": 0.94, + "learning_rate": 9.377877551020409e-05, + "loss": 2.8918, + "step": 40500 + }, + { + "FLOPS loss": 0.06995739042758942, + "L0_d": 1275.78, + "MLM loss": 2.9106178283691406, + "epoch": 0.95, + "step": 40999 + }, + { + "epoch": 0.95, + "learning_rate": 9.367673469387756e-05, + "loss": 2.8765, + "step": 41000 + }, + { + "FLOPS loss": 0.06189718097448349, + "L0_d": 1194.67, + "MLM loss": 2.9347684383392334, + "epoch": 0.96, + "step": 41499 + }, + { + "epoch": 0.96, + "learning_rate": 9.357489795918368e-05, + "loss": 2.8766, + "step": 41500 + }, + { + "FLOPS loss": 0.07404523342847824, + "L0_d": 1580.86, + "MLM loss": 2.7358357906341553, + "epoch": 0.97, + "step": 41999 + }, + { + "epoch": 0.97, + "learning_rate": 9.347285714285715e-05, + "loss": 2.8695, + "step": 42000 + }, + { + "FLOPS loss": 0.07238270342350006, + "L0_d": 1594.48, + "MLM loss": 2.8579506874084473, + "epoch": 0.98, + "step": 42499 + }, + { + "epoch": 0.98, + "learning_rate": 9.337081632653062e-05, + "loss": 2.8693, + "step": 42500 + }, + { + "FLOPS loss": 0.07086482644081116, + "L0_d": 1330.3, + "MLM loss": 2.827727794647217, + "epoch": 0.99, + "step": 42999 + }, + { + "epoch": 0.99, + "learning_rate": 9.326877551020408e-05, + "loss": 2.8691, + "step": 43000 + }, + { + "FLOPS loss": 0.06407759338617325, + "L0_d": 963.44, + "MLM loss": 2.902876377105713, + "epoch": 1.0, + "step": 43499 + }, + { + "epoch": 1.0, + "learning_rate": 9.31669387755102e-05, + "loss": 2.8607, + "step": 43500 + }, + { + "FLOPS loss": 0.07373356819152832, + "L0_d": 1145.33, + "MLM loss": 2.8871655464172363, + "epoch": 1.02, + "step": 43999 + }, + { + "epoch": 1.02, + "learning_rate": 9.306489795918368e-05, + "loss": 2.8582, + "step": 44000 + }, + { + "FLOPS loss": 0.05329473689198494, + "L0_d": 1014.88, + "MLM loss": 2.8173437118530273, + "epoch": 1.03, + "step": 44499 + }, + { + "epoch": 1.03, + "learning_rate": 9.296285714285715e-05, + "loss": 2.8558, + "step": 44500 + }, + { + "FLOPS loss": 0.07186653465032578, + "L0_d": 1229.8, + "MLM loss": 2.6114912033081055, + "epoch": 1.04, + "step": 44999 + }, + { + "epoch": 1.04, + "learning_rate": 9.286081632653063e-05, + "loss": 2.8566, + "step": 45000 + }, + { + "FLOPS loss": 0.06906762719154358, + "L0_d": 1032.41, + "MLM loss": 2.6828746795654297, + "epoch": 1.05, + "step": 45499 + }, + { + "epoch": 1.05, + "learning_rate": 9.275897959183673e-05, + "loss": 2.8458, + "step": 45500 + }, + { + "FLOPS loss": 0.07486148178577423, + "L0_d": 1279.2, + "MLM loss": 2.6118578910827637, + "epoch": 1.06, + "step": 45999 + }, + { + "epoch": 1.06, + "learning_rate": 9.265693877551021e-05, + "loss": 2.8553, + "step": 46000 + }, + { + "FLOPS loss": 0.06602790206670761, + "L0_d": 826.56, + "MLM loss": 2.7278029918670654, + "epoch": 1.07, + "step": 46499 + }, + { + "epoch": 1.07, + "learning_rate": 9.255489795918368e-05, + "loss": 2.8456, + "step": 46500 + }, + { + "FLOPS loss": 0.08286317437887192, + "L0_d": 1138.0, + "MLM loss": 2.9847941398620605, + "epoch": 1.09, + "step": 46999 + }, + { + "epoch": 1.09, + "learning_rate": 9.245285714285715e-05, + "loss": 2.8397, + "step": 47000 + }, + { + "FLOPS loss": 0.07151538878679276, + "L0_d": 1165.0, + "MLM loss": 2.9278883934020996, + "epoch": 1.1, + "step": 47499 + }, + { + "epoch": 1.1, + "learning_rate": 9.235081632653062e-05, + "loss": 2.839, + "step": 47500 + }, + { + "FLOPS loss": 0.06656032800674438, + "L0_d": 1079.83, + "MLM loss": 2.819605827331543, + "epoch": 1.11, + "step": 47999 + }, + { + "epoch": 1.11, + "learning_rate": 9.224897959183674e-05, + "loss": 2.8352, + "step": 48000 + }, + { + "FLOPS loss": 0.056283093988895416, + "L0_d": 749.2, + "MLM loss": 2.7207205295562744, + "epoch": 1.12, + "step": 48499 + }, + { + "epoch": 1.12, + "learning_rate": 9.21469387755102e-05, + "loss": 2.8386, + "step": 48500 + }, + { + "FLOPS loss": 0.0777335837483406, + "L0_d": 1304.86, + "MLM loss": 2.6587204933166504, + "epoch": 1.13, + "step": 48999 + }, + { + "epoch": 1.13, + "learning_rate": 9.204489795918367e-05, + "loss": 2.8318, + "step": 49000 + }, + { + "FLOPS loss": 0.0929664671421051, + "L0_d": 1032.58, + "MLM loss": 2.735736846923828, + "epoch": 1.14, + "step": 49499 + }, + { + "epoch": 1.14, + "learning_rate": 9.194285714285715e-05, + "loss": 2.8318, + "step": 49500 + }, + { + "FLOPS loss": 0.0649925172328949, + "L0_d": 917.53, + "MLM loss": 2.6756234169006348, + "epoch": 1.16, + "step": 49999 + }, + { + "epoch": 1.16, + "learning_rate": 9.184102040816327e-05, + "loss": 2.8333, + "step": 50000 + }, + { + "FLOPS loss": 0.07300473749637604, + "L0_d": 927.86, + "MLM loss": 2.652163505554199, + "epoch": 1.17, + "step": 50499 + }, + { + "epoch": 1.17, + "learning_rate": 9.173897959183673e-05, + "loss": 2.8214, + "step": 50500 + }, + { + "FLOPS loss": 0.07810306549072266, + "L0_d": 933.62, + "MLM loss": 2.746854782104492, + "epoch": 1.18, + "step": 50999 + }, + { + "epoch": 1.18, + "learning_rate": 9.163693877551021e-05, + "loss": 2.8241, + "step": 51000 + }, + { + "FLOPS loss": 0.07342377305030823, + "L0_d": 1304.92, + "MLM loss": 2.8114891052246094, + "epoch": 1.19, + "step": 51499 + }, + { + "epoch": 1.19, + "learning_rate": 9.153489795918368e-05, + "loss": 2.8203, + "step": 51500 + }, + { + "FLOPS loss": 0.07382655888795853, + "L0_d": 852.91, + "MLM loss": 2.8023548126220703, + "epoch": 1.2, + "step": 51999 + }, + { + "epoch": 1.2, + "learning_rate": 9.14330612244898e-05, + "loss": 2.8119, + "step": 52000 + }, + { + "FLOPS loss": 0.07053514569997787, + "L0_d": 921.94, + "MLM loss": 2.863825798034668, + "epoch": 1.21, + "step": 52499 + }, + { + "epoch": 1.21, + "learning_rate": 9.133102040816327e-05, + "loss": 2.812, + "step": 52500 + }, + { + "FLOPS loss": 0.059692882001399994, + "L0_d": 794.02, + "MLM loss": 2.5097005367279053, + "epoch": 1.22, + "step": 52999 + }, + { + "epoch": 1.22, + "learning_rate": 9.122897959183674e-05, + "loss": 2.8098, + "step": 53000 + }, + { + "FLOPS loss": 0.055110666900873184, + "L0_d": 732.67, + "MLM loss": 2.9446611404418945, + "epoch": 1.24, + "step": 53499 + }, + { + "epoch": 1.24, + "learning_rate": 9.11269387755102e-05, + "loss": 2.8038, + "step": 53500 + }, + { + "FLOPS loss": 0.06119615212082863, + "L0_d": 995.5, + "MLM loss": 2.774935007095337, + "epoch": 1.25, + "step": 53999 + }, + { + "epoch": 1.25, + "learning_rate": 9.102489795918367e-05, + "loss": 2.7985, + "step": 54000 + }, + { + "FLOPS loss": 0.07340589165687561, + "L0_d": 1106.77, + "MLM loss": 2.5551958084106445, + "epoch": 1.26, + "step": 54499 + }, + { + "epoch": 1.26, + "learning_rate": 9.09230612244898e-05, + "loss": 2.8004, + "step": 54500 + }, + { + "FLOPS loss": 0.08167064934968948, + "L0_d": 1180.7, + "MLM loss": 2.8482956886291504, + "epoch": 1.27, + "step": 54999 + }, + { + "epoch": 1.27, + "learning_rate": 9.082102040816327e-05, + "loss": 2.7972, + "step": 55000 + }, + { + "FLOPS loss": 0.0676698312163353, + "L0_d": 831.5, + "MLM loss": 2.8186306953430176, + "epoch": 1.28, + "step": 55499 + }, + { + "epoch": 1.28, + "learning_rate": 9.071897959183673e-05, + "loss": 2.7908, + "step": 55500 + }, + { + "FLOPS loss": 0.07911355048418045, + "L0_d": 1112.89, + "MLM loss": 2.6768712997436523, + "epoch": 1.29, + "step": 55999 + }, + { + "epoch": 1.29, + "learning_rate": 9.061693877551021e-05, + "loss": 2.7891, + "step": 56000 + }, + { + "FLOPS loss": 0.08468664437532425, + "L0_d": 1028.75, + "MLM loss": 2.5861382484436035, + "epoch": 1.31, + "step": 56499 + }, + { + "epoch": 1.31, + "learning_rate": 9.051510204081633e-05, + "loss": 2.7854, + "step": 56500 + }, + { + "FLOPS loss": 0.06968092918395996, + "L0_d": 1092.77, + "MLM loss": 2.644059181213379, + "epoch": 1.32, + "step": 56999 + }, + { + "epoch": 1.32, + "learning_rate": 9.041306122448981e-05, + "loss": 2.7848, + "step": 57000 + }, + { + "FLOPS loss": 0.07919490337371826, + "L0_d": 1015.0, + "MLM loss": 2.743734359741211, + "epoch": 1.33, + "step": 57499 + }, + { + "epoch": 1.33, + "learning_rate": 9.031102040816327e-05, + "loss": 2.7847, + "step": 57500 + }, + { + "FLOPS loss": 0.05528505891561508, + "L0_d": 672.12, + "MLM loss": 2.7157461643218994, + "epoch": 1.34, + "step": 57999 + }, + { + "epoch": 1.34, + "learning_rate": 9.020897959183674e-05, + "loss": 2.7756, + "step": 58000 + }, + { + "FLOPS loss": 0.06388474255800247, + "L0_d": 836.95, + "MLM loss": 2.740152359008789, + "epoch": 1.35, + "step": 58499 + }, + { + "epoch": 1.35, + "learning_rate": 9.010714285714286e-05, + "loss": 2.7753, + "step": 58500 + }, + { + "FLOPS loss": 0.05673633888363838, + "L0_d": 743.12, + "MLM loss": 2.7288174629211426, + "epoch": 1.36, + "step": 58999 + }, + { + "epoch": 1.36, + "learning_rate": 9.000510204081634e-05, + "loss": 2.7682, + "step": 59000 + }, + { + "FLOPS loss": 0.060955584049224854, + "L0_d": 723.3, + "MLM loss": 2.4606099128723145, + "epoch": 1.37, + "step": 59499 + }, + { + "epoch": 1.37, + "learning_rate": 8.99030612244898e-05, + "loss": 2.7721, + "step": 59500 + }, + { + "FLOPS loss": 0.07077349722385406, + "L0_d": 859.67, + "MLM loss": 2.699582576751709, + "epoch": 1.39, + "step": 59999 + }, + { + "epoch": 1.39, + "learning_rate": 8.980102040816327e-05, + "loss": 2.7674, + "step": 60000 + }, + { + "FLOPS loss": 0.07387091219425201, + "L0_d": 1039.08, + "MLM loss": 2.389226198196411, + "epoch": 1.4, + "step": 60499 + }, + { + "epoch": 1.4, + "learning_rate": 8.969918367346939e-05, + "loss": 2.7638, + "step": 60500 + }, + { + "FLOPS loss": 0.07885833084583282, + "L0_d": 1065.38, + "MLM loss": 2.8349263668060303, + "epoch": 1.41, + "step": 60999 + }, + { + "epoch": 1.41, + "learning_rate": 8.959714285714285e-05, + "loss": 2.7619, + "step": 61000 + }, + { + "FLOPS loss": 0.053016629070043564, + "L0_d": 637.61, + "MLM loss": 2.779146194458008, + "epoch": 1.42, + "step": 61499 + }, + { + "epoch": 1.42, + "learning_rate": 8.949510204081633e-05, + "loss": 2.7601, + "step": 61500 + }, + { + "FLOPS loss": 0.05506119132041931, + "L0_d": 491.41, + "MLM loss": 2.655466079711914, + "epoch": 1.43, + "step": 61999 + }, + { + "epoch": 1.43, + "learning_rate": 8.939306122448981e-05, + "loss": 2.7567, + "step": 62000 + }, + { + "FLOPS loss": 0.0772499218583107, + "L0_d": 1390.47, + "MLM loss": 2.617006301879883, + "epoch": 1.44, + "step": 62499 + }, + { + "epoch": 1.44, + "learning_rate": 8.929122448979592e-05, + "loss": 2.752, + "step": 62500 + }, + { + "FLOPS loss": 0.07647674530744553, + "L0_d": 849.38, + "MLM loss": 2.6698994636535645, + "epoch": 1.46, + "step": 62999 + }, + { + "epoch": 1.46, + "learning_rate": 8.91891836734694e-05, + "loss": 2.7522, + "step": 63000 + }, + { + "FLOPS loss": 0.0736238956451416, + "L0_d": 982.06, + "MLM loss": 2.5969135761260986, + "epoch": 1.47, + "step": 63499 + }, + { + "epoch": 1.47, + "learning_rate": 8.908714285714286e-05, + "loss": 2.7454, + "step": 63500 + }, + { + "FLOPS loss": 0.08865927159786224, + "L0_d": 1242.19, + "MLM loss": 2.737492084503174, + "epoch": 1.48, + "step": 63999 + }, + { + "epoch": 1.48, + "learning_rate": 8.898510204081632e-05, + "loss": 2.7504, + "step": 64000 + }, + { + "FLOPS loss": 0.062135014683008194, + "L0_d": 840.81, + "MLM loss": 2.648526668548584, + "epoch": 1.49, + "step": 64499 + }, + { + "epoch": 1.49, + "learning_rate": 8.888326530612246e-05, + "loss": 2.7451, + "step": 64500 + }, + { + "FLOPS loss": 0.05925214663147926, + "L0_d": 926.7, + "MLM loss": 2.7958133220672607, + "epoch": 1.5, + "step": 64999 + }, + { + "epoch": 1.5, + "learning_rate": 8.878122448979592e-05, + "loss": 2.7378, + "step": 65000 + }, + { + "FLOPS loss": 0.08276008069515228, + "L0_d": 976.19, + "MLM loss": 2.6551318168640137, + "epoch": 1.51, + "step": 65499 + }, + { + "epoch": 1.51, + "learning_rate": 8.867918367346939e-05, + "loss": 2.7361, + "step": 65500 + }, + { + "FLOPS loss": 0.07621205598115921, + "L0_d": 1072.14, + "MLM loss": 2.7889227867126465, + "epoch": 1.52, + "step": 65999 + }, + { + "epoch": 1.52, + "learning_rate": 8.857714285714285e-05, + "loss": 2.7355, + "step": 66000 + }, + { + "FLOPS loss": 0.07239171862602234, + "L0_d": 914.05, + "MLM loss": 2.578742027282715, + "epoch": 1.54, + "step": 66499 + }, + { + "epoch": 1.54, + "learning_rate": 8.847530612244899e-05, + "loss": 2.7293, + "step": 66500 + }, + { + "FLOPS loss": 0.06520795077085495, + "L0_d": 836.77, + "MLM loss": 2.6456704139709473, + "epoch": 1.55, + "step": 66999 + }, + { + "epoch": 1.55, + "learning_rate": 8.837326530612245e-05, + "loss": 2.7282, + "step": 67000 + }, + { + "FLOPS loss": 0.07171254605054855, + "L0_d": 850.34, + "MLM loss": 2.5980749130249023, + "epoch": 1.56, + "step": 67499 + }, + { + "epoch": 1.56, + "learning_rate": 8.827122448979593e-05, + "loss": 2.7284, + "step": 67500 + }, + { + "FLOPS loss": 0.07132073491811752, + "L0_d": 785.77, + "MLM loss": 2.6635169982910156, + "epoch": 1.57, + "step": 67999 + }, + { + "epoch": 1.57, + "learning_rate": 8.81691836734694e-05, + "loss": 2.7293, + "step": 68000 + }, + { + "FLOPS loss": 0.06449418514966965, + "L0_d": 750.56, + "MLM loss": 2.6216351985931396, + "epoch": 1.58, + "step": 68499 + }, + { + "epoch": 1.58, + "learning_rate": 8.806734693877551e-05, + "loss": 2.7213, + "step": 68500 + }, + { + "FLOPS loss": 0.07830583304166794, + "L0_d": 1105.78, + "MLM loss": 2.6381173133850098, + "epoch": 1.59, + "step": 68999 + }, + { + "epoch": 1.59, + "learning_rate": 8.796530612244899e-05, + "loss": 2.7196, + "step": 69000 + }, + { + "FLOPS loss": 0.06058787554502487, + "L0_d": 774.34, + "MLM loss": 2.590695858001709, + "epoch": 1.61, + "step": 69499 + }, + { + "epoch": 1.61, + "learning_rate": 8.786326530612246e-05, + "loss": 2.7193, + "step": 69500 + }, + { + "FLOPS loss": 0.07099094986915588, + "L0_d": 1099.67, + "MLM loss": 2.466123342514038, + "epoch": 1.62, + "step": 69999 + }, + { + "epoch": 1.62, + "learning_rate": 8.776122448979592e-05, + "loss": 2.7156, + "step": 70000 + }, + { + "FLOPS loss": 0.05859340727329254, + "L0_d": 635.06, + "MLM loss": 2.771111488342285, + "epoch": 1.63, + "step": 70499 + }, + { + "epoch": 1.63, + "learning_rate": 8.765938775510204e-05, + "loss": 2.7133, + "step": 70500 + }, + { + "FLOPS loss": 0.06658073514699936, + "L0_d": 971.06, + "MLM loss": 2.656001091003418, + "epoch": 1.64, + "step": 70999 + }, + { + "epoch": 1.64, + "learning_rate": 8.755734693877552e-05, + "loss": 2.7096, + "step": 71000 + }, + { + "FLOPS loss": 0.10472187399864197, + "L0_d": 1132.7, + "MLM loss": 2.534677267074585, + "epoch": 1.65, + "step": 71499 + }, + { + "epoch": 1.65, + "learning_rate": 8.745530612244899e-05, + "loss": 2.707, + "step": 71500 + }, + { + "FLOPS loss": 0.054848697036504745, + "L0_d": 609.2, + "MLM loss": 2.721672534942627, + "epoch": 1.66, + "step": 71999 + }, + { + "epoch": 1.66, + "learning_rate": 8.735326530612245e-05, + "loss": 2.7124, + "step": 72000 + }, + { + "FLOPS loss": 0.06477601826190948, + "L0_d": 952.28, + "MLM loss": 2.4252777099609375, + "epoch": 1.67, + "step": 72499 + }, + { + "epoch": 1.67, + "learning_rate": 8.725142857142857e-05, + "loss": 2.7049, + "step": 72500 + }, + { + "FLOPS loss": 0.0618317611515522, + "L0_d": 742.14, + "MLM loss": 2.300227642059326, + "epoch": 1.69, + "step": 72999 + }, + { + "epoch": 1.69, + "learning_rate": 8.714938775510204e-05, + "loss": 2.7044, + "step": 73000 + }, + { + "FLOPS loss": 0.06794578582048416, + "L0_d": 947.11, + "MLM loss": 2.5354833602905273, + "epoch": 1.7, + "step": 73499 + }, + { + "epoch": 1.7, + "learning_rate": 8.704734693877551e-05, + "loss": 2.7013, + "step": 73500 + }, + { + "FLOPS loss": 0.05277640372514725, + "L0_d": 623.72, + "MLM loss": 2.622762680053711, + "epoch": 1.71, + "step": 73999 + }, + { + "epoch": 1.71, + "learning_rate": 8.694530612244899e-05, + "loss": 2.6975, + "step": 74000 + }, + { + "FLOPS loss": 0.07353149354457855, + "L0_d": 848.69, + "MLM loss": 2.625680446624756, + "epoch": 1.72, + "step": 74499 + }, + { + "epoch": 1.72, + "learning_rate": 8.68434693877551e-05, + "loss": 2.6965, + "step": 74500 + }, + { + "FLOPS loss": 0.07950403541326523, + "L0_d": 909.52, + "MLM loss": 2.6366257667541504, + "epoch": 1.73, + "step": 74999 + }, + { + "epoch": 1.73, + "learning_rate": 8.674142857142858e-05, + "loss": 2.6986, + "step": 75000 + }, + { + "FLOPS loss": 0.0671720951795578, + "L0_d": 1059.39, + "MLM loss": 2.6872715950012207, + "epoch": 1.74, + "step": 75499 + }, + { + "epoch": 1.74, + "learning_rate": 8.663938775510204e-05, + "loss": 2.694, + "step": 75500 + }, + { + "FLOPS loss": 0.06870570033788681, + "L0_d": 728.41, + "MLM loss": 2.6747617721557617, + "epoch": 1.76, + "step": 75999 + }, + { + "epoch": 1.76, + "learning_rate": 8.653734693877551e-05, + "loss": 2.6882, + "step": 76000 + }, + { + "FLOPS loss": 0.0645112618803978, + "L0_d": 747.95, + "MLM loss": 2.559619426727295, + "epoch": 1.77, + "step": 76499 + }, + { + "epoch": 1.77, + "learning_rate": 8.643551020408164e-05, + "loss": 2.691, + "step": 76500 + }, + { + "FLOPS loss": 0.04983799159526825, + "L0_d": 720.17, + "MLM loss": 2.6016077995300293, + "epoch": 1.78, + "step": 76999 + }, + { + "epoch": 1.78, + "learning_rate": 8.63334693877551e-05, + "loss": 2.6884, + "step": 77000 + }, + { + "FLOPS loss": 0.0694853812456131, + "L0_d": 955.48, + "MLM loss": 2.679753303527832, + "epoch": 1.79, + "step": 77499 + }, + { + "epoch": 1.79, + "learning_rate": 8.623142857142857e-05, + "loss": 2.6839, + "step": 77500 + }, + { + "FLOPS loss": 0.07788027077913284, + "L0_d": 1066.77, + "MLM loss": 2.5656166076660156, + "epoch": 1.8, + "step": 77999 + }, + { + "epoch": 1.8, + "learning_rate": 8.612938775510204e-05, + "loss": 2.6836, + "step": 78000 + }, + { + "FLOPS loss": 0.05925082787871361, + "L0_d": 885.91, + "MLM loss": 2.4231157302856445, + "epoch": 1.81, + "step": 78499 + }, + { + "epoch": 1.81, + "learning_rate": 8.602755102040817e-05, + "loss": 2.6817, + "step": 78500 + }, + { + "FLOPS loss": 0.07589417695999146, + "L0_d": 827.25, + "MLM loss": 2.515610933303833, + "epoch": 1.82, + "step": 78999 + }, + { + "epoch": 1.82, + "learning_rate": 8.592551020408163e-05, + "loss": 2.6793, + "step": 79000 + }, + { + "FLOPS loss": 0.06679127365350723, + "L0_d": 889.84, + "MLM loss": 2.479564905166626, + "epoch": 1.84, + "step": 79499 + }, + { + "epoch": 1.84, + "learning_rate": 8.582346938775511e-05, + "loss": 2.6738, + "step": 79500 + }, + { + "FLOPS loss": 0.06588038057088852, + "L0_d": 816.66, + "MLM loss": 2.779053211212158, + "epoch": 1.85, + "step": 79999 + }, + { + "epoch": 1.85, + "learning_rate": 8.572142857142858e-05, + "loss": 2.6703, + "step": 80000 + }, + { + "FLOPS loss": 0.06606688350439072, + "L0_d": 742.75, + "MLM loss": 2.5370850563049316, + "epoch": 1.86, + "step": 80499 + }, + { + "epoch": 1.86, + "learning_rate": 8.56195918367347e-05, + "loss": 2.673, + "step": 80500 + }, + { + "FLOPS loss": 0.06476680189371109, + "L0_d": 953.19, + "MLM loss": 2.5319418907165527, + "epoch": 1.87, + "step": 80999 + }, + { + "epoch": 1.87, + "learning_rate": 8.551755102040818e-05, + "loss": 2.6687, + "step": 81000 + }, + { + "FLOPS loss": 0.06020694598555565, + "L0_d": 820.14, + "MLM loss": 2.550804615020752, + "epoch": 1.88, + "step": 81499 + }, + { + "epoch": 1.88, + "learning_rate": 8.541551020408164e-05, + "loss": 2.6729, + "step": 81500 + }, + { + "FLOPS loss": 0.06602118909358978, + "L0_d": 819.67, + "MLM loss": 2.5585505962371826, + "epoch": 1.89, + "step": 81999 + }, + { + "epoch": 1.89, + "learning_rate": 8.53134693877551e-05, + "loss": 2.6738, + "step": 82000 + }, + { + "FLOPS loss": 0.06344706565141678, + "L0_d": 836.28, + "MLM loss": 2.728942394256592, + "epoch": 1.91, + "step": 82499 + }, + { + "epoch": 1.91, + "learning_rate": 8.521163265306123e-05, + "loss": 2.6635, + "step": 82500 + }, + { + "FLOPS loss": 0.06816563755273819, + "L0_d": 860.02, + "MLM loss": 2.4916951656341553, + "epoch": 1.92, + "step": 82999 + }, + { + "epoch": 1.92, + "learning_rate": 8.51095918367347e-05, + "loss": 2.6667, + "step": 83000 + }, + { + "FLOPS loss": 0.059945568442344666, + "L0_d": 667.73, + "MLM loss": 2.548002004623413, + "epoch": 1.93, + "step": 83499 + }, + { + "epoch": 1.93, + "learning_rate": 8.500755102040817e-05, + "loss": 2.6663, + "step": 83500 + }, + { + "FLOPS loss": 0.07439987361431122, + "L0_d": 867.16, + "MLM loss": 2.7152485847473145, + "epoch": 1.94, + "step": 83999 + }, + { + "epoch": 1.94, + "learning_rate": 8.490551020408163e-05, + "loss": 2.6637, + "step": 84000 + }, + { + "FLOPS loss": 0.06373114883899689, + "L0_d": 820.31, + "MLM loss": 2.5678906440734863, + "epoch": 1.95, + "step": 84499 + }, + { + "epoch": 1.95, + "learning_rate": 8.480367346938775e-05, + "loss": 2.6615, + "step": 84500 + }, + { + "FLOPS loss": 0.056776098906993866, + "L0_d": 765.92, + "MLM loss": 2.439091682434082, + "epoch": 1.96, + "step": 84999 + }, + { + "epoch": 1.96, + "learning_rate": 8.470163265306122e-05, + "loss": 2.6571, + "step": 85000 + }, + { + "FLOPS loss": 0.06744925677776337, + "L0_d": 710.83, + "MLM loss": 2.693312644958496, + "epoch": 1.98, + "step": 85499 + }, + { + "epoch": 1.98, + "learning_rate": 8.45995918367347e-05, + "loss": 2.6605, + "step": 85500 + }, + { + "FLOPS loss": 0.06582710146903992, + "L0_d": 778.98, + "MLM loss": 2.602993965148926, + "epoch": 1.99, + "step": 85999 + }, + { + "epoch": 1.99, + "learning_rate": 8.449755102040818e-05, + "loss": 2.6552, + "step": 86000 + }, + { + "FLOPS loss": 0.07695449888706207, + "L0_d": 884.67, + "MLM loss": 2.4756698608398438, + "epoch": 2.0, + "step": 86499 + }, + { + "epoch": 2.0, + "learning_rate": 8.439571428571428e-05, + "loss": 2.6529, + "step": 86500 + }, + { + "FLOPS loss": 0.07339202612638474, + "L0_d": 901.58, + "MLM loss": 2.5836970806121826, + "epoch": 2.01, + "step": 86999 + }, + { + "epoch": 2.01, + "learning_rate": 8.429367346938776e-05, + "loss": 2.6459, + "step": 87000 + }, + { + "FLOPS loss": 0.06989730894565582, + "L0_d": 806.12, + "MLM loss": 2.558981418609619, + "epoch": 2.02, + "step": 87499 + }, + { + "epoch": 2.02, + "learning_rate": 8.419163265306123e-05, + "loss": 2.6459, + "step": 87500 + }, + { + "FLOPS loss": 0.06631279736757278, + "L0_d": 830.09, + "MLM loss": 2.4851489067077637, + "epoch": 2.03, + "step": 87999 + }, + { + "epoch": 2.03, + "learning_rate": 8.408959183673469e-05, + "loss": 2.644, + "step": 88000 + }, + { + "FLOPS loss": 0.07449375092983246, + "L0_d": 906.97, + "MLM loss": 2.5570497512817383, + "epoch": 2.04, + "step": 88499 + }, + { + "epoch": 2.04, + "learning_rate": 8.398775510204083e-05, + "loss": 2.6432, + "step": 88500 + }, + { + "FLOPS loss": 0.08294404298067093, + "L0_d": 1214.31, + "MLM loss": 2.543278932571411, + "epoch": 2.06, + "step": 88999 + }, + { + "epoch": 2.06, + "learning_rate": 8.388571428571429e-05, + "loss": 2.6493, + "step": 89000 + }, + { + "FLOPS loss": 0.06517869979143143, + "L0_d": 899.28, + "MLM loss": 2.4547858238220215, + "epoch": 2.07, + "step": 89499 + }, + { + "epoch": 2.07, + "learning_rate": 8.378367346938775e-05, + "loss": 2.6421, + "step": 89500 + }, + { + "FLOPS loss": 0.07665277272462845, + "L0_d": 713.2, + "MLM loss": 2.541884422302246, + "epoch": 2.08, + "step": 89999 + }, + { + "epoch": 2.08, + "learning_rate": 8.368163265306122e-05, + "loss": 2.641, + "step": 90000 + }, + { + "FLOPS loss": 0.08727514743804932, + "L0_d": 1206.22, + "MLM loss": 2.6945085525512695, + "epoch": 2.09, + "step": 90499 + }, + { + "epoch": 2.09, + "learning_rate": 8.357979591836735e-05, + "loss": 2.6428, + "step": 90500 + }, + { + "FLOPS loss": 0.06770230829715729, + "L0_d": 1007.8, + "MLM loss": 2.507075309753418, + "epoch": 2.1, + "step": 90999 + }, + { + "epoch": 2.1, + "learning_rate": 8.347775510204082e-05, + "loss": 2.6341, + "step": 91000 + }, + { + "FLOPS loss": 0.06252799928188324, + "L0_d": 856.61, + "MLM loss": 2.5938777923583984, + "epoch": 2.11, + "step": 91499 + }, + { + "epoch": 2.11, + "learning_rate": 8.33757142857143e-05, + "loss": 2.6381, + "step": 91500 + }, + { + "FLOPS loss": 0.060771603137254715, + "L0_d": 886.33, + "MLM loss": 2.4438791275024414, + "epoch": 2.13, + "step": 91999 + }, + { + "epoch": 2.13, + "learning_rate": 8.327367346938776e-05, + "loss": 2.6346, + "step": 92000 + }, + { + "FLOPS loss": 0.07406602054834366, + "L0_d": 751.48, + "MLM loss": 2.4598939418792725, + "epoch": 2.14, + "step": 92499 + }, + { + "epoch": 2.14, + "learning_rate": 8.317183673469388e-05, + "loss": 2.639, + "step": 92500 + }, + { + "FLOPS loss": 0.0679251030087471, + "L0_d": 731.48, + "MLM loss": 2.6497678756713867, + "epoch": 2.15, + "step": 92999 + }, + { + "epoch": 2.15, + "learning_rate": 8.306979591836736e-05, + "loss": 2.6292, + "step": 93000 + }, + { + "FLOPS loss": 0.05723276361823082, + "L0_d": 706.5, + "MLM loss": 2.60920786857605, + "epoch": 2.16, + "step": 93499 + }, + { + "epoch": 2.16, + "learning_rate": 8.296775510204082e-05, + "loss": 2.6314, + "step": 93500 + }, + { + "FLOPS loss": 0.08006785064935684, + "L0_d": 1262.91, + "MLM loss": 2.6934850215911865, + "epoch": 2.17, + "step": 93999 + }, + { + "epoch": 2.17, + "learning_rate": 8.286571428571429e-05, + "loss": 2.6246, + "step": 94000 + }, + { + "FLOPS loss": 0.07803159207105637, + "L0_d": 1247.95, + "MLM loss": 2.520078659057617, + "epoch": 2.18, + "step": 94499 + }, + { + "epoch": 2.18, + "learning_rate": 8.276387755102041e-05, + "loss": 2.6303, + "step": 94500 + }, + { + "FLOPS loss": 0.06651651114225388, + "L0_d": 725.97, + "MLM loss": 2.3245010375976562, + "epoch": 2.19, + "step": 94999 + }, + { + "epoch": 2.19, + "learning_rate": 8.266183673469387e-05, + "loss": 2.6238, + "step": 95000 + }, + { + "FLOPS loss": 0.052927643060684204, + "L0_d": 549.06, + "MLM loss": 2.350855588912964, + "epoch": 2.21, + "step": 95499 + }, + { + "epoch": 2.21, + "learning_rate": 8.255979591836735e-05, + "loss": 2.6232, + "step": 95500 + }, + { + "FLOPS loss": 0.07090628147125244, + "L0_d": 782.56, + "MLM loss": 2.5654730796813965, + "epoch": 2.22, + "step": 95999 + }, + { + "epoch": 2.22, + "learning_rate": 8.245775510204082e-05, + "loss": 2.622, + "step": 96000 + }, + { + "FLOPS loss": 0.07428272813558578, + "L0_d": 1075.02, + "MLM loss": 2.656454563140869, + "epoch": 2.23, + "step": 96499 + }, + { + "epoch": 2.23, + "learning_rate": 8.235591836734694e-05, + "loss": 2.6238, + "step": 96500 + }, + { + "FLOPS loss": 0.06645943969488144, + "L0_d": 740.31, + "MLM loss": 2.509154796600342, + "epoch": 2.24, + "step": 96999 + }, + { + "epoch": 2.24, + "learning_rate": 8.22538775510204e-05, + "loss": 2.617, + "step": 97000 + }, + { + "FLOPS loss": 0.08122547715902328, + "L0_d": 959.48, + "MLM loss": 2.543363094329834, + "epoch": 2.25, + "step": 97499 + }, + { + "epoch": 2.25, + "learning_rate": 8.215183673469388e-05, + "loss": 2.6142, + "step": 97500 + }, + { + "FLOPS loss": 0.0727510079741478, + "L0_d": 1239.5, + "MLM loss": 2.648973226547241, + "epoch": 2.26, + "step": 97999 + }, + { + "epoch": 2.26, + "learning_rate": 8.204979591836736e-05, + "loss": 2.6215, + "step": 98000 + }, + { + "FLOPS loss": 0.06306217610836029, + "L0_d": 1277.14, + "MLM loss": 2.6636743545532227, + "epoch": 2.28, + "step": 98499 + }, + { + "epoch": 2.28, + "learning_rate": 8.194795918367347e-05, + "loss": 2.6183, + "step": 98500 + }, + { + "FLOPS loss": 0.07064758986234665, + "L0_d": 1065.84, + "MLM loss": 2.3864660263061523, + "epoch": 2.29, + "step": 98999 + }, + { + "epoch": 2.29, + "learning_rate": 8.184591836734695e-05, + "loss": 2.6213, + "step": 99000 + }, + { + "FLOPS loss": 0.05990250036120415, + "L0_d": 717.38, + "MLM loss": 2.5958118438720703, + "epoch": 2.3, + "step": 99499 + }, + { + "epoch": 2.3, + "learning_rate": 8.174387755102041e-05, + "loss": 2.6102, + "step": 99500 + }, + { + "FLOPS loss": 0.09523125737905502, + "L0_d": 1146.02, + "MLM loss": 2.806295394897461, + "epoch": 2.31, + "step": 99999 + }, + { + "epoch": 2.31, + "learning_rate": 8.164204081632653e-05, + "loss": 2.6184, + "step": 100000 + }, + { + "FLOPS loss": 0.07538797706365585, + "L0_d": 1017.2, + "MLM loss": 2.494988441467285, + "epoch": 2.32, + "step": 100499 + }, + { + "epoch": 2.32, + "learning_rate": 8.154000000000001e-05, + "loss": 2.6124, + "step": 100500 + }, + { + "FLOPS loss": 0.07495392858982086, + "L0_d": 927.31, + "MLM loss": 2.38315749168396, + "epoch": 2.33, + "step": 100999 + }, + { + "epoch": 2.33, + "learning_rate": 8.143795918367347e-05, + "loss": 2.6134, + "step": 101000 + }, + { + "FLOPS loss": 0.06713060289621353, + "L0_d": 897.39, + "MLM loss": 2.548612356185913, + "epoch": 2.34, + "step": 101499 + }, + { + "epoch": 2.34, + "learning_rate": 8.133591836734694e-05, + "loss": 2.6139, + "step": 101500 + }, + { + "FLOPS loss": 0.0808817520737648, + "L0_d": 1192.45, + "MLM loss": 2.7223920822143555, + "epoch": 2.36, + "step": 101999 + }, + { + "epoch": 2.36, + "learning_rate": 8.123387755102042e-05, + "loss": 2.6094, + "step": 102000 + }, + { + "FLOPS loss": 0.07406562566757202, + "L0_d": 1115.09, + "MLM loss": 2.659451961517334, + "epoch": 2.37, + "step": 102499 + }, + { + "epoch": 2.37, + "learning_rate": 8.113183673469388e-05, + "loss": 2.6038, + "step": 102500 + }, + { + "FLOPS loss": 0.0725577175617218, + "L0_d": 778.3, + "MLM loss": 2.425353527069092, + "epoch": 2.38, + "step": 102999 + }, + { + "epoch": 2.38, + "learning_rate": 8.102979591836735e-05, + "loss": 2.6091, + "step": 103000 + }, + { + "FLOPS loss": 0.07214351743459702, + "L0_d": 974.78, + "MLM loss": 2.6830689907073975, + "epoch": 2.39, + "step": 103499 + }, + { + "epoch": 2.39, + "learning_rate": 8.092775510204082e-05, + "loss": 2.6073, + "step": 103500 + }, + { + "FLOPS loss": 0.07022420316934586, + "L0_d": 763.67, + "MLM loss": 2.4790377616882324, + "epoch": 2.4, + "step": 103999 + }, + { + "epoch": 2.4, + "learning_rate": 8.082612244897959e-05, + "loss": 2.6051, + "step": 104000 + }, + { + "FLOPS loss": 0.08411877602338791, + "L0_d": 1579.39, + "MLM loss": 2.727245807647705, + "epoch": 2.41, + "step": 104499 + }, + { + "epoch": 2.41, + "learning_rate": 8.072408163265307e-05, + "loss": 2.602, + "step": 104500 + }, + { + "FLOPS loss": 0.07292833179235458, + "L0_d": 985.05, + "MLM loss": 2.465031385421753, + "epoch": 2.43, + "step": 104999 + }, + { + "epoch": 2.43, + "learning_rate": 8.062204081632654e-05, + "loss": 2.6007, + "step": 105000 + }, + { + "FLOPS loss": 0.06477630138397217, + "L0_d": 1046.03, + "MLM loss": 2.657435894012451, + "epoch": 2.44, + "step": 105499 + }, + { + "epoch": 2.44, + "learning_rate": 8.052000000000001e-05, + "loss": 2.6033, + "step": 105500 + }, + { + "FLOPS loss": 0.07536941021680832, + "L0_d": 947.55, + "MLM loss": 2.5360069274902344, + "epoch": 2.45, + "step": 105999 + }, + { + "epoch": 2.45, + "learning_rate": 8.041795918367347e-05, + "loss": 2.5996, + "step": 106000 + }, + { + "FLOPS loss": 0.05159715563058853, + "L0_d": 753.66, + "MLM loss": 2.666386842727661, + "epoch": 2.46, + "step": 106499 + }, + { + "epoch": 2.46, + "learning_rate": 8.031591836734694e-05, + "loss": 2.596, + "step": 106500 + }, + { + "FLOPS loss": 0.05982593074440956, + "L0_d": 697.56, + "MLM loss": 2.532409906387329, + "epoch": 2.47, + "step": 106999 + }, + { + "epoch": 2.47, + "learning_rate": 8.021387755102042e-05, + "loss": 2.5964, + "step": 107000 + }, + { + "FLOPS loss": 0.05745324492454529, + "L0_d": 811.14, + "MLM loss": 2.605365037918091, + "epoch": 2.48, + "step": 107499 + }, + { + "epoch": 2.48, + "learning_rate": 8.011183673469388e-05, + "loss": 2.5926, + "step": 107500 + }, + { + "FLOPS loss": 0.05868089944124222, + "L0_d": 651.72, + "MLM loss": 2.6506195068359375, + "epoch": 2.49, + "step": 107999 + }, + { + "epoch": 2.49, + "learning_rate": 8.001e-05, + "loss": 2.5949, + "step": 108000 + }, + { + "FLOPS loss": 0.07431881874799728, + "L0_d": 1089.89, + "MLM loss": 2.470217704772949, + "epoch": 2.51, + "step": 108499 + }, + { + "epoch": 2.51, + "learning_rate": 7.990795918367348e-05, + "loss": 2.5869, + "step": 108500 + }, + { + "FLOPS loss": 0.0629262775182724, + "L0_d": 616.03, + "MLM loss": 2.6411805152893066, + "epoch": 2.52, + "step": 108999 + }, + { + "epoch": 2.52, + "learning_rate": 7.980591836734694e-05, + "loss": 2.5961, + "step": 109000 + }, + { + "FLOPS loss": 0.06784447282552719, + "L0_d": 863.77, + "MLM loss": 2.5637335777282715, + "epoch": 2.53, + "step": 109499 + }, + { + "epoch": 2.53, + "learning_rate": 7.970387755102041e-05, + "loss": 2.5915, + "step": 109500 + }, + { + "FLOPS loss": 0.07220463454723358, + "L0_d": 896.98, + "MLM loss": 2.5497515201568604, + "epoch": 2.54, + "step": 109999 + }, + { + "epoch": 2.54, + "learning_rate": 7.960204081632654e-05, + "loss": 2.5891, + "step": 110000 + }, + { + "FLOPS loss": 0.06754712760448456, + "L0_d": 956.58, + "MLM loss": 2.623528003692627, + "epoch": 2.55, + "step": 110499 + }, + { + "epoch": 2.55, + "learning_rate": 7.950000000000001e-05, + "loss": 2.5828, + "step": 110500 + }, + { + "FLOPS loss": 0.07457764446735382, + "L0_d": 910.52, + "MLM loss": 2.426210880279541, + "epoch": 2.56, + "step": 110999 + }, + { + "epoch": 2.56, + "learning_rate": 7.939795918367347e-05, + "loss": 2.5891, + "step": 111000 + }, + { + "FLOPS loss": 0.05750475823879242, + "L0_d": 984.28, + "MLM loss": 2.6451473236083984, + "epoch": 2.58, + "step": 111499 + }, + { + "epoch": 2.58, + "learning_rate": 7.929591836734695e-05, + "loss": 2.5836, + "step": 111500 + }, + { + "FLOPS loss": 0.0681430771946907, + "L0_d": 805.7, + "MLM loss": 2.4090609550476074, + "epoch": 2.59, + "step": 111999 + }, + { + "epoch": 2.59, + "learning_rate": 7.919408163265306e-05, + "loss": 2.5878, + "step": 112000 + }, + { + "FLOPS loss": 0.06985077261924744, + "L0_d": 780.66, + "MLM loss": 2.6808598041534424, + "epoch": 2.6, + "step": 112499 + }, + { + "epoch": 2.6, + "learning_rate": 7.909204081632652e-05, + "loss": 2.5829, + "step": 112500 + }, + { + "FLOPS loss": 0.05487761273980141, + "L0_d": 690.89, + "MLM loss": 2.610987424850464, + "epoch": 2.61, + "step": 112999 + }, + { + "epoch": 2.61, + "learning_rate": 7.899000000000001e-05, + "loss": 2.581, + "step": 113000 + }, + { + "FLOPS loss": 0.07488895207643509, + "L0_d": 1201.34, + "MLM loss": 2.655109405517578, + "epoch": 2.62, + "step": 113499 + }, + { + "epoch": 2.62, + "learning_rate": 7.888795918367348e-05, + "loss": 2.5752, + "step": 113500 + }, + { + "FLOPS loss": 0.0709005743265152, + "L0_d": 991.5, + "MLM loss": 2.626437187194824, + "epoch": 2.63, + "step": 113999 + }, + { + "epoch": 2.63, + "learning_rate": 7.87861224489796e-05, + "loss": 2.5814, + "step": 114000 + }, + { + "FLOPS loss": 0.08310511708259583, + "L0_d": 945.17, + "MLM loss": 2.42806077003479, + "epoch": 2.65, + "step": 114499 + }, + { + "epoch": 2.65, + "learning_rate": 7.868408163265306e-05, + "loss": 2.5759, + "step": 114500 + }, + { + "FLOPS loss": 0.06651800125837326, + "L0_d": 752.34, + "MLM loss": 2.439891815185547, + "epoch": 2.66, + "step": 114999 + }, + { + "epoch": 2.66, + "learning_rate": 7.858204081632653e-05, + "loss": 2.5783, + "step": 115000 + }, + { + "FLOPS loss": 0.06491895765066147, + "L0_d": 878.84, + "MLM loss": 2.401111125946045, + "epoch": 2.67, + "step": 115499 + }, + { + "epoch": 2.67, + "learning_rate": 7.848000000000001e-05, + "loss": 2.5778, + "step": 115500 + }, + { + "FLOPS loss": 0.07317697256803513, + "L0_d": 854.97, + "MLM loss": 2.4240808486938477, + "epoch": 2.68, + "step": 115999 + }, + { + "epoch": 2.68, + "learning_rate": 7.837816326530613e-05, + "loss": 2.5771, + "step": 116000 + }, + { + "FLOPS loss": 0.07135210931301117, + "L0_d": 901.19, + "MLM loss": 2.5726559162139893, + "epoch": 2.69, + "step": 116499 + }, + { + "epoch": 2.69, + "learning_rate": 7.827612244897959e-05, + "loss": 2.5728, + "step": 116500 + }, + { + "FLOPS loss": 0.07425196468830109, + "L0_d": 984.05, + "MLM loss": 2.447499990463257, + "epoch": 2.7, + "step": 116999 + }, + { + "epoch": 2.7, + "learning_rate": 7.817408163265306e-05, + "loss": 2.5697, + "step": 117000 + }, + { + "FLOPS loss": 0.057068273425102234, + "L0_d": 810.42, + "MLM loss": 2.51332950592041, + "epoch": 2.71, + "step": 117499 + }, + { + "epoch": 2.71, + "learning_rate": 7.807204081632654e-05, + "loss": 2.5769, + "step": 117500 + }, + { + "FLOPS loss": 0.07609079033136368, + "L0_d": 943.66, + "MLM loss": 2.5173351764678955, + "epoch": 2.73, + "step": 117999 + }, + { + "epoch": 2.73, + "learning_rate": 7.797020408163266e-05, + "loss": 2.5737, + "step": 118000 + }, + { + "FLOPS loss": 0.07194855064153671, + "L0_d": 938.45, + "MLM loss": 2.5522868633270264, + "epoch": 2.74, + "step": 118499 + }, + { + "epoch": 2.74, + "learning_rate": 7.786816326530612e-05, + "loss": 2.5668, + "step": 118500 + }, + { + "FLOPS loss": 0.08181776106357574, + "L0_d": 1071.69, + "MLM loss": 2.3746166229248047, + "epoch": 2.75, + "step": 118999 + }, + { + "epoch": 2.75, + "learning_rate": 7.77661224489796e-05, + "loss": 2.5687, + "step": 119000 + }, + { + "FLOPS loss": 0.07504246383905411, + "L0_d": 901.45, + "MLM loss": 2.3082871437072754, + "epoch": 2.76, + "step": 119499 + }, + { + "epoch": 2.76, + "learning_rate": 7.766408163265306e-05, + "loss": 2.5651, + "step": 119500 + }, + { + "FLOPS loss": 0.07117310911417007, + "L0_d": 729.27, + "MLM loss": 2.4190526008605957, + "epoch": 2.77, + "step": 119999 + }, + { + "epoch": 2.77, + "learning_rate": 7.756224489795918e-05, + "loss": 2.5636, + "step": 120000 + }, + { + "FLOPS loss": 0.057962577790021896, + "L0_d": 845.22, + "MLM loss": 2.624579429626465, + "epoch": 2.78, + "step": 120499 + }, + { + "epoch": 2.78, + "learning_rate": 7.746020408163266e-05, + "loss": 2.5709, + "step": 120500 + }, + { + "FLOPS loss": 0.07766810059547424, + "L0_d": 963.77, + "MLM loss": 2.475480079650879, + "epoch": 2.8, + "step": 120999 + }, + { + "epoch": 2.8, + "learning_rate": 7.735816326530613e-05, + "loss": 2.5629, + "step": 121000 + }, + { + "FLOPS loss": 0.0626688003540039, + "L0_d": 1204.02, + "MLM loss": 2.631937026977539, + "epoch": 2.81, + "step": 121499 + }, + { + "epoch": 2.81, + "learning_rate": 7.725612244897959e-05, + "loss": 2.5651, + "step": 121500 + }, + { + "FLOPS loss": 0.05523882806301117, + "L0_d": 606.53, + "MLM loss": 2.1938161849975586, + "epoch": 2.82, + "step": 121999 + }, + { + "epoch": 2.82, + "learning_rate": 7.715428571428573e-05, + "loss": 2.5658, + "step": 122000 + }, + { + "FLOPS loss": 0.0693681538105011, + "L0_d": 941.92, + "MLM loss": 2.559535503387451, + "epoch": 2.83, + "step": 122499 + }, + { + "epoch": 2.83, + "learning_rate": 7.705224489795919e-05, + "loss": 2.5655, + "step": 122500 + }, + { + "FLOPS loss": 0.059378039091825485, + "L0_d": 838.56, + "MLM loss": 2.5249712467193604, + "epoch": 2.84, + "step": 122999 + }, + { + "epoch": 2.84, + "learning_rate": 7.695020408163266e-05, + "loss": 2.5654, + "step": 123000 + }, + { + "FLOPS loss": 0.060634613037109375, + "L0_d": 911.55, + "MLM loss": 2.4866700172424316, + "epoch": 2.85, + "step": 123499 + }, + { + "epoch": 2.85, + "learning_rate": 7.684816326530613e-05, + "loss": 2.5592, + "step": 123500 + }, + { + "FLOPS loss": 0.07920261472463608, + "L0_d": 879.38, + "MLM loss": 2.4505066871643066, + "epoch": 2.86, + "step": 123999 + }, + { + "epoch": 2.86, + "learning_rate": 7.674653061224491e-05, + "loss": 2.5641, + "step": 124000 + }, + { + "FLOPS loss": 0.07321295887231827, + "L0_d": 1098.55, + "MLM loss": 2.629206657409668, + "epoch": 2.88, + "step": 124499 + }, + { + "epoch": 2.88, + "learning_rate": 7.664448979591838e-05, + "loss": 2.562, + "step": 124500 + }, + { + "FLOPS loss": 0.06171201169490814, + "L0_d": 766.72, + "MLM loss": 2.5414156913757324, + "epoch": 2.89, + "step": 124999 + }, + { + "epoch": 2.89, + "learning_rate": 7.654244897959184e-05, + "loss": 2.5537, + "step": 125000 + }, + { + "FLOPS loss": 0.06980007141828537, + "L0_d": 844.75, + "MLM loss": 2.4977681636810303, + "epoch": 2.9, + "step": 125499 + }, + { + "epoch": 2.9, + "learning_rate": 7.64404081632653e-05, + "loss": 2.5586, + "step": 125500 + }, + { + "FLOPS loss": 0.06336244940757751, + "L0_d": 737.97, + "MLM loss": 2.600965976715088, + "epoch": 2.91, + "step": 125999 + }, + { + "epoch": 2.91, + "learning_rate": 7.633836734693878e-05, + "loss": 2.5583, + "step": 126000 + }, + { + "FLOPS loss": 0.0698259174823761, + "L0_d": 1158.59, + "MLM loss": 2.584460496902466, + "epoch": 2.92, + "step": 126499 + }, + { + "epoch": 2.92, + "learning_rate": 7.623632653061225e-05, + "loss": 2.5576, + "step": 126500 + }, + { + "FLOPS loss": 0.061494987457990646, + "L0_d": 814.61, + "MLM loss": 2.4188783168792725, + "epoch": 2.93, + "step": 126999 + }, + { + "epoch": 2.93, + "learning_rate": 7.613428571428571e-05, + "loss": 2.5567, + "step": 127000 + }, + { + "FLOPS loss": 0.06094374880194664, + "L0_d": 864.25, + "MLM loss": 2.5430715084075928, + "epoch": 2.95, + "step": 127499 + }, + { + "epoch": 2.95, + "learning_rate": 7.603244897959185e-05, + "loss": 2.5541, + "step": 127500 + }, + { + "FLOPS loss": 0.06037312000989914, + "L0_d": 794.36, + "MLM loss": 2.5025737285614014, + "epoch": 2.96, + "step": 127999 + }, + { + "epoch": 2.96, + "learning_rate": 7.593040816326531e-05, + "loss": 2.5528, + "step": 128000 + }, + { + "FLOPS loss": 0.06420066207647324, + "L0_d": 810.97, + "MLM loss": 2.546041488647461, + "epoch": 2.97, + "step": 128499 + }, + { + "epoch": 2.97, + "learning_rate": 7.582836734693878e-05, + "loss": 2.5518, + "step": 128500 + }, + { + "FLOPS loss": 0.058551397174596786, + "L0_d": 718.36, + "MLM loss": 2.5001282691955566, + "epoch": 2.98, + "step": 128999 + }, + { + "epoch": 2.98, + "learning_rate": 7.572632653061224e-05, + "loss": 2.554, + "step": 129000 + }, + { + "FLOPS loss": 0.06057830527424812, + "L0_d": 939.95, + "MLM loss": 2.5786666870117188, + "epoch": 2.99, + "step": 129499 + }, + { + "epoch": 2.99, + "learning_rate": 7.562428571428572e-05, + "loss": 2.5472, + "step": 129500 + }, + { + "FLOPS loss": 0.06409164518117905, + "L0_d": 823.48, + "MLM loss": 2.62949800491333, + "epoch": 3.0, + "step": 129999 + }, + { + "epoch": 3.0, + "learning_rate": 7.552224489795918e-05, + "loss": 2.5478, + "step": 130000 + }, + { + "FLOPS loss": 0.08524499088525772, + "L0_d": 1072.58, + "MLM loss": 2.638741970062256, + "epoch": 3.01, + "step": 130499 + }, + { + "epoch": 3.01, + "learning_rate": 7.54204081632653e-05, + "loss": 2.5418, + "step": 130500 + }, + { + "FLOPS loss": 0.06733393669128418, + "L0_d": 877.78, + "MLM loss": 2.2196545600891113, + "epoch": 3.03, + "step": 130999 + }, + { + "epoch": 3.03, + "learning_rate": 7.531836734693878e-05, + "loss": 2.5425, + "step": 131000 + }, + { + "FLOPS loss": 0.066806860268116, + "L0_d": 858.41, + "MLM loss": 2.543398857116699, + "epoch": 3.04, + "step": 131499 + }, + { + "epoch": 3.04, + "learning_rate": 7.521632653061225e-05, + "loss": 2.5362, + "step": 131500 + }, + { + "FLOPS loss": 0.08286657929420471, + "L0_d": 1106.33, + "MLM loss": 2.3611695766448975, + "epoch": 3.05, + "step": 131999 + }, + { + "epoch": 3.05, + "learning_rate": 7.511428571428571e-05, + "loss": 2.5454, + "step": 132000 + }, + { + "FLOPS loss": 0.0694253146648407, + "L0_d": 733.84, + "MLM loss": 2.368508815765381, + "epoch": 3.06, + "step": 132499 + }, + { + "epoch": 3.06, + "learning_rate": 7.501224489795918e-05, + "loss": 2.5423, + "step": 132500 + }, + { + "FLOPS loss": 0.06028592213988304, + "L0_d": 672.62, + "MLM loss": 2.5725514888763428, + "epoch": 3.07, + "step": 132999 + }, + { + "epoch": 3.07, + "learning_rate": 7.491040816326531e-05, + "loss": 2.5393, + "step": 133000 + }, + { + "FLOPS loss": 0.0720820501446724, + "L0_d": 866.38, + "MLM loss": 2.5013206005096436, + "epoch": 3.08, + "step": 133499 + }, + { + "epoch": 3.08, + "learning_rate": 7.480836734693878e-05, + "loss": 2.5438, + "step": 133500 + }, + { + "FLOPS loss": 0.05794764682650566, + "L0_d": 776.61, + "MLM loss": 2.3504562377929688, + "epoch": 3.1, + "step": 133999 + }, + { + "epoch": 3.1, + "learning_rate": 7.470632653061224e-05, + "loss": 2.5376, + "step": 134000 + }, + { + "FLOPS loss": 0.06785812228918076, + "L0_d": 818.0, + "MLM loss": 2.5081028938293457, + "epoch": 3.11, + "step": 134499 + }, + { + "epoch": 3.11, + "learning_rate": 7.460428571428572e-05, + "loss": 2.5382, + "step": 134500 + }, + { + "FLOPS loss": 0.060359518975019455, + "L0_d": 841.02, + "MLM loss": 2.4709324836730957, + "epoch": 3.12, + "step": 134999 + }, + { + "epoch": 3.12, + "learning_rate": 7.450224489795918e-05, + "loss": 2.5437, + "step": 135000 + }, + { + "FLOPS loss": 0.06685052067041397, + "L0_d": 615.05, + "MLM loss": 2.397055149078369, + "epoch": 3.13, + "step": 135499 + }, + { + "epoch": 3.13, + "learning_rate": 7.440020408163265e-05, + "loss": 2.5368, + "step": 135500 + }, + { + "FLOPS loss": 0.06463826447725296, + "L0_d": 1002.53, + "MLM loss": 2.415133476257324, + "epoch": 3.14, + "step": 135999 + }, + { + "epoch": 3.14, + "learning_rate": 7.429836734693878e-05, + "loss": 2.5313, + "step": 136000 + }, + { + "FLOPS loss": 0.05908415466547012, + "L0_d": 812.0, + "MLM loss": 2.544156789779663, + "epoch": 3.15, + "step": 136499 + }, + { + "epoch": 3.15, + "learning_rate": 7.419632653061225e-05, + "loss": 2.5364, + "step": 136500 + }, + { + "FLOPS loss": 0.05779504403471947, + "L0_d": 698.34, + "MLM loss": 2.5028882026672363, + "epoch": 3.16, + "step": 136999 + }, + { + "epoch": 3.16, + "learning_rate": 7.409428571428571e-05, + "loss": 2.539, + "step": 137000 + }, + { + "FLOPS loss": 0.06667586416006088, + "L0_d": 976.53, + "MLM loss": 2.46866512298584, + "epoch": 3.18, + "step": 137499 + }, + { + "epoch": 3.18, + "learning_rate": 7.399224489795919e-05, + "loss": 2.5344, + "step": 137500 + }, + { + "FLOPS loss": 0.0754542201757431, + "L0_d": 902.7, + "MLM loss": 2.4399266242980957, + "epoch": 3.19, + "step": 137999 + }, + { + "epoch": 3.19, + "learning_rate": 7.389020408163265e-05, + "loss": 2.5295, + "step": 138000 + }, + { + "FLOPS loss": 0.05262158066034317, + "L0_d": 840.47, + "MLM loss": 2.5828146934509277, + "epoch": 3.2, + "step": 138499 + }, + { + "epoch": 3.2, + "learning_rate": 7.378816326530612e-05, + "loss": 2.532, + "step": 138500 + }, + { + "FLOPS loss": 0.06721633672714233, + "L0_d": 794.09, + "MLM loss": 2.4749674797058105, + "epoch": 3.21, + "step": 138999 + }, + { + "epoch": 3.21, + "learning_rate": 7.36861224489796e-05, + "loss": 2.531, + "step": 139000 + }, + { + "FLOPS loss": 0.06448085606098175, + "L0_d": 764.8, + "MLM loss": 2.1966323852539062, + "epoch": 3.22, + "step": 139499 + }, + { + "epoch": 3.22, + "learning_rate": 7.358408163265306e-05, + "loss": 2.5345, + "step": 139500 + }, + { + "FLOPS loss": 0.0996355339884758, + "L0_d": 1485.3, + "MLM loss": 2.5816798210144043, + "epoch": 3.23, + "step": 139999 + }, + { + "epoch": 3.23, + "learning_rate": 7.348224489795918e-05, + "loss": 2.531, + "step": 140000 + }, + { + "FLOPS loss": 0.057376861572265625, + "L0_d": 625.52, + "MLM loss": 2.390540599822998, + "epoch": 3.25, + "step": 140499 + }, + { + "epoch": 3.25, + "learning_rate": 7.338020408163265e-05, + "loss": 2.5293, + "step": 140500 + }, + { + "FLOPS loss": 0.0580020509660244, + "L0_d": 743.03, + "MLM loss": 2.636993885040283, + "epoch": 3.26, + "step": 140999 + }, + { + "epoch": 3.26, + "learning_rate": 7.327816326530613e-05, + "loss": 2.5274, + "step": 141000 + }, + { + "FLOPS loss": 0.06330183893442154, + "L0_d": 684.05, + "MLM loss": 2.500195264816284, + "epoch": 3.27, + "step": 141499 + }, + { + "epoch": 3.27, + "learning_rate": 7.31761224489796e-05, + "loss": 2.522, + "step": 141500 + }, + { + "FLOPS loss": 0.07066886126995087, + "L0_d": 913.33, + "MLM loss": 2.485964298248291, + "epoch": 3.28, + "step": 141999 + }, + { + "epoch": 3.28, + "learning_rate": 7.307408163265307e-05, + "loss": 2.5279, + "step": 142000 + }, + { + "FLOPS loss": 0.06484290212392807, + "L0_d": 708.78, + "MLM loss": 2.480302333831787, + "epoch": 3.29, + "step": 142499 + }, + { + "epoch": 3.29, + "learning_rate": 7.297204081632653e-05, + "loss": 2.5259, + "step": 142500 + }, + { + "FLOPS loss": 0.07047165185213089, + "L0_d": 1122.41, + "MLM loss": 2.410188674926758, + "epoch": 3.3, + "step": 142999 + }, + { + "epoch": 3.3, + "learning_rate": 7.287e-05, + "loss": 2.5247, + "step": 143000 + }, + { + "FLOPS loss": 0.07310927659273148, + "L0_d": 933.52, + "MLM loss": 2.3634700775146484, + "epoch": 3.31, + "step": 143499 + }, + { + "epoch": 3.32, + "learning_rate": 7.276795918367348e-05, + "loss": 2.5253, + "step": 143500 + }, + { + "FLOPS loss": 0.09060615301132202, + "L0_d": 973.42, + "MLM loss": 2.3984951972961426, + "epoch": 3.33, + "step": 143999 + }, + { + "epoch": 3.33, + "learning_rate": 7.266612244897958e-05, + "loss": 2.521, + "step": 144000 + }, + { + "FLOPS loss": 0.07206370681524277, + "L0_d": 1249.56, + "MLM loss": 2.3524210453033447, + "epoch": 3.34, + "step": 144499 + }, + { + "epoch": 3.34, + "learning_rate": 7.256408163265308e-05, + "loss": 2.5234, + "step": 144500 + }, + { + "FLOPS loss": 0.06724249571561813, + "L0_d": 814.05, + "MLM loss": 2.5358521938323975, + "epoch": 3.35, + "step": 144999 + }, + { + "epoch": 3.35, + "learning_rate": 7.246204081632654e-05, + "loss": 2.5236, + "step": 145000 + }, + { + "FLOPS loss": 0.06638313829898834, + "L0_d": 966.23, + "MLM loss": 2.3402137756347656, + "epoch": 3.36, + "step": 145499 + }, + { + "epoch": 3.36, + "learning_rate": 7.236e-05, + "loss": 2.5146, + "step": 145500 + }, + { + "FLOPS loss": 0.05941477045416832, + "L0_d": 764.61, + "MLM loss": 2.6106839179992676, + "epoch": 3.37, + "step": 145999 + }, + { + "epoch": 3.37, + "learning_rate": 7.225816326530613e-05, + "loss": 2.5194, + "step": 146000 + }, + { + "FLOPS loss": 0.08146113157272339, + "L0_d": 932.0, + "MLM loss": 2.321835994720459, + "epoch": 3.38, + "step": 146499 + }, + { + "epoch": 3.38, + "learning_rate": 7.215612244897959e-05, + "loss": 2.5239, + "step": 146500 + }, + { + "FLOPS loss": 0.063787080347538, + "L0_d": 868.5, + "MLM loss": 2.428879976272583, + "epoch": 3.4, + "step": 146999 + }, + { + "epoch": 3.4, + "learning_rate": 7.205408163265307e-05, + "loss": 2.5189, + "step": 147000 + }, + { + "FLOPS loss": 0.0798846185207367, + "L0_d": 1102.03, + "MLM loss": 2.4849820137023926, + "epoch": 3.41, + "step": 147499 + }, + { + "epoch": 3.41, + "learning_rate": 7.195204081632653e-05, + "loss": 2.5158, + "step": 147500 + }, + { + "FLOPS loss": 0.060002878308296204, + "L0_d": 809.48, + "MLM loss": 2.5779366493225098, + "epoch": 3.42, + "step": 147999 + }, + { + "epoch": 3.42, + "learning_rate": 7.185040816326531e-05, + "loss": 2.5199, + "step": 148000 + }, + { + "FLOPS loss": 0.07338414341211319, + "L0_d": 946.84, + "MLM loss": 2.385986089706421, + "epoch": 3.43, + "step": 148499 + }, + { + "epoch": 3.43, + "learning_rate": 7.174836734693879e-05, + "loss": 2.519, + "step": 148500 + }, + { + "FLOPS loss": 0.07006336748600006, + "L0_d": 939.03, + "MLM loss": 2.53456711769104, + "epoch": 3.44, + "step": 148999 + }, + { + "epoch": 3.44, + "learning_rate": 7.164632653061225e-05, + "loss": 2.5185, + "step": 149000 + }, + { + "FLOPS loss": 0.0663251280784607, + "L0_d": 868.73, + "MLM loss": 2.4946329593658447, + "epoch": 3.45, + "step": 149499 + }, + { + "epoch": 3.45, + "learning_rate": 7.154428571428572e-05, + "loss": 2.5158, + "step": 149500 + }, + { + "FLOPS loss": 0.05879246070981026, + "L0_d": 841.36, + "MLM loss": 2.4197614192962646, + "epoch": 3.47, + "step": 149999 + }, + { + "epoch": 3.47, + "learning_rate": 7.144224489795918e-05, + "loss": 2.516, + "step": 150000 + }, + { + "FLOPS loss": 0.06893225014209747, + "L0_d": 1096.61, + "MLM loss": 2.459559202194214, + "epoch": 3.48, + "step": 150499 + }, + { + "epoch": 3.48, + "learning_rate": 7.134020408163266e-05, + "loss": 2.516, + "step": 150500 + }, + { + "FLOPS loss": 0.06730066984891891, + "L0_d": 1061.92, + "MLM loss": 2.5642447471618652, + "epoch": 3.49, + "step": 150999 + }, + { + "epoch": 3.49, + "learning_rate": 7.123836734693878e-05, + "loss": 2.5121, + "step": 151000 + }, + { + "FLOPS loss": 0.0661296620965004, + "L0_d": 755.22, + "MLM loss": 2.353799819946289, + "epoch": 3.5, + "step": 151499 + }, + { + "epoch": 3.5, + "learning_rate": 7.113632653061225e-05, + "loss": 2.5118, + "step": 151500 + }, + { + "FLOPS loss": 0.07206888496875763, + "L0_d": 981.41, + "MLM loss": 2.3166513442993164, + "epoch": 3.51, + "step": 151999 + }, + { + "epoch": 3.51, + "learning_rate": 7.103428571428572e-05, + "loss": 2.5139, + "step": 152000 + }, + { + "FLOPS loss": 0.06955097615718842, + "L0_d": 1074.69, + "MLM loss": 2.5231192111968994, + "epoch": 3.52, + "step": 152499 + }, + { + "epoch": 3.52, + "learning_rate": 7.093224489795919e-05, + "loss": 2.5161, + "step": 152500 + }, + { + "FLOPS loss": 0.06898514181375504, + "L0_d": 981.22, + "MLM loss": 2.3581669330596924, + "epoch": 3.53, + "step": 152999 + }, + { + "epoch": 3.53, + "learning_rate": 7.083040816326531e-05, + "loss": 2.5105, + "step": 153000 + }, + { + "FLOPS loss": 0.061563555151224136, + "L0_d": 751.62, + "MLM loss": 2.4208712577819824, + "epoch": 3.55, + "step": 153499 + }, + { + "epoch": 3.55, + "learning_rate": 7.072836734693879e-05, + "loss": 2.5089, + "step": 153500 + }, + { + "FLOPS loss": 0.0783623531460762, + "L0_d": 980.56, + "MLM loss": 2.6533374786376953, + "epoch": 3.56, + "step": 153999 + }, + { + "epoch": 3.56, + "learning_rate": 7.062632653061225e-05, + "loss": 2.5099, + "step": 154000 + }, + { + "FLOPS loss": 0.06331176310777664, + "L0_d": 770.83, + "MLM loss": 2.3688929080963135, + "epoch": 3.57, + "step": 154499 + }, + { + "epoch": 3.57, + "learning_rate": 7.052428571428572e-05, + "loss": 2.508, + "step": 154500 + }, + { + "FLOPS loss": 0.06306309252977371, + "L0_d": 811.0, + "MLM loss": 2.386118173599243, + "epoch": 3.58, + "step": 154999 + }, + { + "epoch": 3.58, + "learning_rate": 7.042224489795918e-05, + "loss": 2.5072, + "step": 155000 + }, + { + "FLOPS loss": 0.06929371505975723, + "L0_d": 981.34, + "MLM loss": 2.2670648097991943, + "epoch": 3.59, + "step": 155499 + }, + { + "epoch": 3.59, + "learning_rate": 7.032020408163266e-05, + "loss": 2.5088, + "step": 155500 + }, + { + "FLOPS loss": 0.07412055879831314, + "L0_d": 1162.94, + "MLM loss": 2.541748523712158, + "epoch": 3.6, + "step": 155999 + }, + { + "epoch": 3.6, + "learning_rate": 7.021836734693877e-05, + "loss": 2.5025, + "step": 156000 + }, + { + "FLOPS loss": 0.06029690429568291, + "L0_d": 692.28, + "MLM loss": 2.228192090988159, + "epoch": 3.62, + "step": 156499 + }, + { + "epoch": 3.62, + "learning_rate": 7.011632653061226e-05, + "loss": 2.5077, + "step": 156500 + }, + { + "FLOPS loss": 0.07891872525215149, + "L0_d": 1194.77, + "MLM loss": 2.641678810119629, + "epoch": 3.63, + "step": 156999 + }, + { + "epoch": 3.63, + "learning_rate": 7.001428571428572e-05, + "loss": 2.5104, + "step": 157000 + }, + { + "FLOPS loss": 0.07251445204019547, + "L0_d": 1016.83, + "MLM loss": 2.535757303237915, + "epoch": 3.64, + "step": 157499 + }, + { + "epoch": 3.64, + "learning_rate": 6.991224489795919e-05, + "loss": 2.5036, + "step": 157500 + }, + { + "FLOPS loss": 0.06319019943475723, + "L0_d": 923.69, + "MLM loss": 2.5910186767578125, + "epoch": 3.65, + "step": 157999 + }, + { + "epoch": 3.65, + "learning_rate": 6.981020408163265e-05, + "loss": 2.5074, + "step": 158000 + }, + { + "FLOPS loss": 0.0667872279882431, + "L0_d": 888.39, + "MLM loss": 2.5367581844329834, + "epoch": 3.66, + "step": 158499 + }, + { + "epoch": 3.66, + "learning_rate": 6.970816326530613e-05, + "loss": 2.5059, + "step": 158500 + }, + { + "FLOPS loss": 0.07012256979942322, + "L0_d": 811.02, + "MLM loss": 2.6520166397094727, + "epoch": 3.67, + "step": 158999 + }, + { + "epoch": 3.67, + "learning_rate": 6.960632653061225e-05, + "loss": 2.5025, + "step": 159000 + }, + { + "FLOPS loss": 0.06510796397924423, + "L0_d": 698.33, + "MLM loss": 2.357128858566284, + "epoch": 3.68, + "step": 159499 + }, + { + "epoch": 3.68, + "learning_rate": 6.950428571428572e-05, + "loss": 2.5018, + "step": 159500 + }, + { + "FLOPS loss": 0.06573283672332764, + "L0_d": 768.05, + "MLM loss": 2.6148369312286377, + "epoch": 3.7, + "step": 159999 + }, + { + "epoch": 3.7, + "learning_rate": 6.94022448979592e-05, + "loss": 2.503, + "step": 160000 + }, + { + "FLOPS loss": 0.07524820417165756, + "L0_d": 890.48, + "MLM loss": 2.1664788722991943, + "epoch": 3.71, + "step": 160499 + }, + { + "epoch": 3.71, + "learning_rate": 6.930020408163266e-05, + "loss": 2.505, + "step": 160500 + }, + { + "FLOPS loss": 0.08785947412252426, + "L0_d": 948.19, + "MLM loss": 2.360063076019287, + "epoch": 3.72, + "step": 160999 + }, + { + "epoch": 3.72, + "learning_rate": 6.919816326530612e-05, + "loss": 2.4969, + "step": 161000 + }, + { + "FLOPS loss": 0.056488048285245895, + "L0_d": 841.88, + "MLM loss": 2.2539350986480713, + "epoch": 3.73, + "step": 161499 + }, + { + "epoch": 3.73, + "learning_rate": 6.909612244897959e-05, + "loss": 2.4992, + "step": 161500 + }, + { + "FLOPS loss": 0.06215016916394234, + "L0_d": 829.36, + "MLM loss": 2.4596550464630127, + "epoch": 3.74, + "step": 161999 + }, + { + "epoch": 3.74, + "learning_rate": 6.899408163265307e-05, + "loss": 2.499, + "step": 162000 + }, + { + "FLOPS loss": 0.05874626338481903, + "L0_d": 774.12, + "MLM loss": 2.417466640472412, + "epoch": 3.75, + "step": 162499 + }, + { + "epoch": 3.75, + "learning_rate": 6.889224489795919e-05, + "loss": 2.4987, + "step": 162500 + }, + { + "FLOPS loss": 0.06776445358991623, + "L0_d": 798.8, + "MLM loss": 2.3213367462158203, + "epoch": 3.77, + "step": 162999 + }, + { + "epoch": 3.77, + "learning_rate": 6.879020408163265e-05, + "loss": 2.4924, + "step": 163000 + }, + { + "FLOPS loss": 0.06742731481790543, + "L0_d": 967.53, + "MLM loss": 2.482436418533325, + "epoch": 3.78, + "step": 163499 + }, + { + "epoch": 3.78, + "learning_rate": 6.868816326530613e-05, + "loss": 2.4984, + "step": 163500 + }, + { + "FLOPS loss": 0.06478297710418701, + "L0_d": 887.28, + "MLM loss": 2.4176084995269775, + "epoch": 3.79, + "step": 163999 + }, + { + "epoch": 3.79, + "learning_rate": 6.85861224489796e-05, + "loss": 2.4956, + "step": 164000 + }, + { + "FLOPS loss": 0.06537578254938126, + "L0_d": 739.81, + "MLM loss": 2.1967082023620605, + "epoch": 3.8, + "step": 164499 + }, + { + "epoch": 3.8, + "learning_rate": 6.848408163265306e-05, + "loss": 2.4918, + "step": 164500 + }, + { + "FLOPS loss": 0.06188281252980232, + "L0_d": 738.86, + "MLM loss": 2.5005993843078613, + "epoch": 3.81, + "step": 164999 + }, + { + "epoch": 3.81, + "learning_rate": 6.838204081632653e-05, + "loss": 2.4994, + "step": 165000 + }, + { + "FLOPS loss": 0.07663436233997345, + "L0_d": 922.31, + "MLM loss": 2.429055690765381, + "epoch": 3.82, + "step": 165499 + }, + { + "epoch": 3.82, + "learning_rate": 6.828020408163266e-05, + "loss": 2.4945, + "step": 165500 + }, + { + "FLOPS loss": 0.0570794902741909, + "L0_d": 576.69, + "MLM loss": 2.1654653549194336, + "epoch": 3.83, + "step": 165999 + }, + { + "epoch": 3.83, + "learning_rate": 6.817816326530612e-05, + "loss": 2.4976, + "step": 166000 + }, + { + "FLOPS loss": 0.07364285737276077, + "L0_d": 984.09, + "MLM loss": 2.361241579055786, + "epoch": 3.85, + "step": 166499 + }, + { + "epoch": 3.85, + "learning_rate": 6.807612244897959e-05, + "loss": 2.4919, + "step": 166500 + }, + { + "FLOPS loss": 0.06903867423534393, + "L0_d": 704.95, + "MLM loss": 2.1658754348754883, + "epoch": 3.86, + "step": 166999 + }, + { + "epoch": 3.86, + "learning_rate": 6.797408163265307e-05, + "loss": 2.49, + "step": 167000 + }, + { + "FLOPS loss": 0.05657253786921501, + "L0_d": 679.52, + "MLM loss": 2.4498727321624756, + "epoch": 3.87, + "step": 167499 + }, + { + "epoch": 3.87, + "learning_rate": 6.787204081632653e-05, + "loss": 2.4901, + "step": 167500 + }, + { + "FLOPS loss": 0.0767899677157402, + "L0_d": 971.0, + "MLM loss": 2.4667670726776123, + "epoch": 3.88, + "step": 167999 + }, + { + "epoch": 3.88, + "learning_rate": 6.777020408163267e-05, + "loss": 2.4973, + "step": 168000 + }, + { + "FLOPS loss": 0.05883647873997688, + "L0_d": 711.62, + "MLM loss": 2.3792476654052734, + "epoch": 3.89, + "step": 168499 + }, + { + "epoch": 3.89, + "learning_rate": 6.766816326530613e-05, + "loss": 2.4897, + "step": 168500 + }, + { + "FLOPS loss": 0.06558485329151154, + "L0_d": 781.28, + "MLM loss": 2.2171566486358643, + "epoch": 3.9, + "step": 168999 + }, + { + "epoch": 3.9, + "learning_rate": 6.75661224489796e-05, + "loss": 2.4888, + "step": 169000 + }, + { + "FLOPS loss": 0.06537246704101562, + "L0_d": 792.88, + "MLM loss": 2.549509286880493, + "epoch": 3.92, + "step": 169499 + }, + { + "epoch": 3.92, + "learning_rate": 6.746408163265306e-05, + "loss": 2.4841, + "step": 169500 + }, + { + "FLOPS loss": 0.06672216951847076, + "L0_d": 1022.78, + "MLM loss": 2.375237464904785, + "epoch": 3.93, + "step": 169999 + }, + { + "epoch": 3.93, + "learning_rate": 6.736224489795918e-05, + "loss": 2.4857, + "step": 170000 + }, + { + "FLOPS loss": 0.06473018229007721, + "L0_d": 891.12, + "MLM loss": 2.572554349899292, + "epoch": 3.94, + "step": 170499 + }, + { + "epoch": 3.94, + "learning_rate": 6.726020408163266e-05, + "loss": 2.4904, + "step": 170500 + }, + { + "FLOPS loss": 0.06366889923810959, + "L0_d": 627.78, + "MLM loss": 2.4447898864746094, + "epoch": 3.95, + "step": 170999 + }, + { + "epoch": 3.95, + "learning_rate": 6.715816326530612e-05, + "loss": 2.4877, + "step": 171000 + }, + { + "FLOPS loss": 0.06390350311994553, + "L0_d": 1048.25, + "MLM loss": 2.48956561088562, + "epoch": 3.96, + "step": 171499 + }, + { + "epoch": 3.96, + "learning_rate": 6.70561224489796e-05, + "loss": 2.4863, + "step": 171500 + }, + { + "FLOPS loss": 0.07498548924922943, + "L0_d": 1041.67, + "MLM loss": 2.488102674484253, + "epoch": 3.97, + "step": 171999 + }, + { + "epoch": 3.97, + "learning_rate": 6.695428571428571e-05, + "loss": 2.4854, + "step": 172000 + }, + { + "FLOPS loss": 0.08240804076194763, + "L0_d": 914.86, + "MLM loss": 2.6523680686950684, + "epoch": 3.98, + "step": 172499 + }, + { + "epoch": 3.98, + "learning_rate": 6.685224489795919e-05, + "loss": 2.484, + "step": 172500 + }, + { + "FLOPS loss": 0.06099969893693924, + "L0_d": 978.86, + "MLM loss": 2.64540433883667, + "epoch": 4.0, + "step": 172999 + }, + { + "epoch": 4.0, + "learning_rate": 6.675020408163267e-05, + "loss": 2.4841, + "step": 173000 + }, + { + "FLOPS loss": 0.06223977357149124, + "L0_d": 757.42, + "MLM loss": 2.6576099395751953, + "epoch": 4.01, + "step": 173499 + }, + { + "epoch": 4.01, + "learning_rate": 6.664816326530613e-05, + "loss": 2.4863, + "step": 173500 + }, + { + "FLOPS loss": 0.06578174233436584, + "L0_d": 1087.84, + "MLM loss": 2.409296989440918, + "epoch": 4.02, + "step": 173999 + }, + { + "epoch": 4.02, + "learning_rate": 6.65461224489796e-05, + "loss": 2.4807, + "step": 174000 + }, + { + "FLOPS loss": 0.07225192338228226, + "L0_d": 849.7, + "MLM loss": 2.235647678375244, + "epoch": 4.03, + "step": 174499 + }, + { + "epoch": 4.03, + "learning_rate": 6.644408163265306e-05, + "loss": 2.4749, + "step": 174500 + }, + { + "FLOPS loss": 0.06585001945495605, + "L0_d": 951.81, + "MLM loss": 2.3266072273254395, + "epoch": 4.04, + "step": 174999 + }, + { + "epoch": 4.04, + "learning_rate": 6.634224489795918e-05, + "loss": 2.4813, + "step": 175000 + }, + { + "FLOPS loss": 0.06246158108115196, + "L0_d": 697.7, + "MLM loss": 2.5024335384368896, + "epoch": 4.05, + "step": 175499 + }, + { + "epoch": 4.05, + "learning_rate": 6.624020408163265e-05, + "loss": 2.4757, + "step": 175500 + }, + { + "FLOPS loss": 0.071586973965168, + "L0_d": 943.44, + "MLM loss": 2.432471752166748, + "epoch": 4.07, + "step": 175999 + }, + { + "epoch": 4.07, + "learning_rate": 6.613816326530612e-05, + "loss": 2.4804, + "step": 176000 + }, + { + "FLOPS loss": 0.07770819962024689, + "L0_d": 967.03, + "MLM loss": 2.4807729721069336, + "epoch": 4.08, + "step": 176499 + }, + { + "epoch": 4.08, + "learning_rate": 6.60361224489796e-05, + "loss": 2.4792, + "step": 176500 + }, + { + "FLOPS loss": 0.06387147307395935, + "L0_d": 819.58, + "MLM loss": 2.364795207977295, + "epoch": 4.09, + "step": 176999 + }, + { + "epoch": 4.09, + "learning_rate": 6.593408163265307e-05, + "loss": 2.4746, + "step": 177000 + }, + { + "FLOPS loss": 0.061465099453926086, + "L0_d": 921.8, + "MLM loss": 2.4672608375549316, + "epoch": 4.1, + "step": 177499 + }, + { + "epoch": 4.1, + "learning_rate": 6.583224489795919e-05, + "loss": 2.4807, + "step": 177500 + }, + { + "FLOPS loss": 0.06799270212650299, + "L0_d": 828.22, + "MLM loss": 2.5122363567352295, + "epoch": 4.11, + "step": 177999 + }, + { + "epoch": 4.11, + "learning_rate": 6.573020408163265e-05, + "loss": 2.4813, + "step": 178000 + }, + { + "FLOPS loss": 0.06093809753656387, + "L0_d": 871.48, + "MLM loss": 2.2750113010406494, + "epoch": 4.12, + "step": 178499 + }, + { + "epoch": 4.12, + "learning_rate": 6.562816326530613e-05, + "loss": 2.477, + "step": 178500 + }, + { + "FLOPS loss": 0.05775380879640579, + "L0_d": 925.05, + "MLM loss": 2.4610414505004883, + "epoch": 4.14, + "step": 178999 + }, + { + "epoch": 4.14, + "learning_rate": 6.55261224489796e-05, + "loss": 2.478, + "step": 179000 + }, + { + "FLOPS loss": 0.05679761990904808, + "L0_d": 599.97, + "MLM loss": 2.371889114379883, + "epoch": 4.15, + "step": 179499 + }, + { + "epoch": 4.15, + "learning_rate": 6.542408163265307e-05, + "loss": 2.4746, + "step": 179500 + }, + { + "FLOPS loss": 0.07693447172641754, + "L0_d": 914.91, + "MLM loss": 2.4633376598358154, + "epoch": 4.16, + "step": 179999 + }, + { + "epoch": 4.16, + "learning_rate": 6.532204081632654e-05, + "loss": 2.4724, + "step": 180000 + }, + { + "FLOPS loss": 0.086503766477108, + "L0_d": 987.11, + "MLM loss": 2.541353464126587, + "epoch": 4.17, + "step": 180499 + }, + { + "epoch": 4.17, + "learning_rate": 6.522e-05, + "loss": 2.4715, + "step": 180500 + }, + { + "FLOPS loss": 0.07103153318166733, + "L0_d": 972.25, + "MLM loss": 2.412931442260742, + "epoch": 4.18, + "step": 180999 + }, + { + "epoch": 4.18, + "learning_rate": 6.511795918367347e-05, + "loss": 2.4691, + "step": 181000 + }, + { + "FLOPS loss": 0.0597836896777153, + "L0_d": 771.88, + "MLM loss": 2.3725850582122803, + "epoch": 4.19, + "step": 181499 + }, + { + "epoch": 4.19, + "learning_rate": 6.50161224489796e-05, + "loss": 2.4685, + "step": 181500 + }, + { + "FLOPS loss": 0.06964124739170074, + "L0_d": 835.75, + "MLM loss": 2.400667190551758, + "epoch": 4.2, + "step": 181999 + }, + { + "epoch": 4.2, + "learning_rate": 6.491428571428572e-05, + "loss": 2.4715, + "step": 182000 + }, + { + "FLOPS loss": 0.07146003842353821, + "L0_d": 1029.83, + "MLM loss": 2.122744560241699, + "epoch": 4.22, + "step": 182499 + }, + { + "epoch": 4.22, + "learning_rate": 6.481224489795919e-05, + "loss": 2.4713, + "step": 182500 + }, + { + "FLOPS loss": 0.06947391480207443, + "L0_d": 910.61, + "MLM loss": 2.2833240032196045, + "epoch": 4.23, + "step": 182999 + }, + { + "epoch": 4.23, + "learning_rate": 6.471020408163265e-05, + "loss": 2.4702, + "step": 183000 + }, + { + "FLOPS loss": 0.06873425096273422, + "L0_d": 746.22, + "MLM loss": 2.384280204772949, + "epoch": 4.24, + "step": 183499 + }, + { + "epoch": 4.24, + "learning_rate": 6.460816326530612e-05, + "loss": 2.4663, + "step": 183500 + }, + { + "FLOPS loss": 0.07945630699396133, + "L0_d": 962.33, + "MLM loss": 2.3059887886047363, + "epoch": 4.25, + "step": 183999 + }, + { + "epoch": 4.25, + "learning_rate": 6.45061224489796e-05, + "loss": 2.4725, + "step": 184000 + }, + { + "FLOPS loss": 0.07592689990997314, + "L0_d": 860.5, + "MLM loss": 2.4063377380371094, + "epoch": 4.26, + "step": 184499 + }, + { + "epoch": 4.26, + "learning_rate": 6.440408163265307e-05, + "loss": 2.466, + "step": 184500 + }, + { + "FLOPS loss": 0.0667710080742836, + "L0_d": 990.41, + "MLM loss": 2.3277931213378906, + "epoch": 4.27, + "step": 184999 + }, + { + "epoch": 4.27, + "learning_rate": 6.430204081632654e-05, + "loss": 2.4706, + "step": 185000 + }, + { + "FLOPS loss": 0.059676315635442734, + "L0_d": 775.2, + "MLM loss": 2.447890281677246, + "epoch": 4.29, + "step": 185499 + }, + { + "epoch": 4.29, + "learning_rate": 6.42e-05, + "loss": 2.4713, + "step": 185500 + }, + { + "FLOPS loss": 0.04730994626879692, + "L0_d": 562.97, + "MLM loss": 2.41772723197937, + "epoch": 4.3, + "step": 185999 + }, + { + "epoch": 4.3, + "learning_rate": 6.409816326530612e-05, + "loss": 2.4665, + "step": 186000 + }, + { + "FLOPS loss": 0.07394890487194061, + "L0_d": 839.94, + "MLM loss": 2.3134212493896484, + "epoch": 4.31, + "step": 186499 + }, + { + "epoch": 4.31, + "learning_rate": 6.399612244897959e-05, + "loss": 2.466, + "step": 186500 + }, + { + "FLOPS loss": 0.06606470048427582, + "L0_d": 914.36, + "MLM loss": 2.381443500518799, + "epoch": 4.32, + "step": 186999 + }, + { + "epoch": 4.32, + "learning_rate": 6.389428571428572e-05, + "loss": 2.4618, + "step": 187000 + }, + { + "FLOPS loss": 0.05293360352516174, + "L0_d": 687.72, + "MLM loss": 2.465054512023926, + "epoch": 4.33, + "step": 187499 + }, + { + "epoch": 4.33, + "learning_rate": 6.379224489795919e-05, + "loss": 2.4642, + "step": 187500 + }, + { + "FLOPS loss": 0.07550951093435287, + "L0_d": 1098.16, + "MLM loss": 2.3925881385803223, + "epoch": 4.34, + "step": 187999 + }, + { + "epoch": 4.34, + "learning_rate": 6.369020408163265e-05, + "loss": 2.4604, + "step": 188000 + }, + { + "FLOPS loss": 0.07207269966602325, + "L0_d": 883.66, + "MLM loss": 2.3470559120178223, + "epoch": 4.35, + "step": 188499 + }, + { + "epoch": 4.35, + "learning_rate": 6.358816326530612e-05, + "loss": 2.4648, + "step": 188500 + }, + { + "FLOPS loss": 0.06928683072328568, + "L0_d": 1007.05, + "MLM loss": 2.087768077850342, + "epoch": 4.37, + "step": 188999 + }, + { + "epoch": 4.37, + "learning_rate": 6.348632653061225e-05, + "loss": 2.4638, + "step": 189000 + }, + { + "FLOPS loss": 0.07827793061733246, + "L0_d": 1170.62, + "MLM loss": 2.2623305320739746, + "epoch": 4.38, + "step": 189499 + }, + { + "epoch": 4.38, + "learning_rate": 6.338428571428571e-05, + "loss": 2.4648, + "step": 189500 + }, + { + "FLOPS loss": 0.07797204703092575, + "L0_d": 805.08, + "MLM loss": 2.424680233001709, + "epoch": 4.39, + "step": 189999 + }, + { + "epoch": 4.39, + "learning_rate": 6.328224489795919e-05, + "loss": 2.4647, + "step": 190000 + }, + { + "FLOPS loss": 0.07720698416233063, + "L0_d": 1047.55, + "MLM loss": 2.3045454025268555, + "epoch": 4.4, + "step": 190499 + }, + { + "epoch": 4.4, + "learning_rate": 6.318020408163266e-05, + "loss": 2.4631, + "step": 190500 + }, + { + "FLOPS loss": 0.060575131326913834, + "L0_d": 751.7, + "MLM loss": 2.3671631813049316, + "epoch": 4.41, + "step": 190999 + }, + { + "epoch": 4.41, + "learning_rate": 6.307836734693878e-05, + "loss": 2.4619, + "step": 191000 + }, + { + "FLOPS loss": 0.06604286283254623, + "L0_d": 847.69, + "MLM loss": 2.46954083442688, + "epoch": 4.42, + "step": 191499 + }, + { + "epoch": 4.42, + "learning_rate": 6.297632653061226e-05, + "loss": 2.4616, + "step": 191500 + }, + { + "FLOPS loss": 0.04703153297305107, + "L0_d": 675.36, + "MLM loss": 2.5719079971313477, + "epoch": 4.44, + "step": 191999 + }, + { + "epoch": 4.44, + "learning_rate": 6.287428571428572e-05, + "loss": 2.4656, + "step": 192000 + }, + { + "FLOPS loss": 0.060653429478406906, + "L0_d": 787.83, + "MLM loss": 2.260763645172119, + "epoch": 4.45, + "step": 192499 + }, + { + "epoch": 4.45, + "learning_rate": 6.277224489795919e-05, + "loss": 2.4611, + "step": 192500 + }, + { + "FLOPS loss": 0.06292744725942612, + "L0_d": 938.81, + "MLM loss": 2.5848846435546875, + "epoch": 4.46, + "step": 192999 + }, + { + "epoch": 4.46, + "learning_rate": 6.26704081632653e-05, + "loss": 2.4647, + "step": 193000 + }, + { + "FLOPS loss": 0.0530436746776104, + "L0_d": 707.17, + "MLM loss": 2.472729206085205, + "epoch": 4.47, + "step": 193499 + }, + { + "epoch": 4.47, + "learning_rate": 6.256836734693878e-05, + "loss": 2.4558, + "step": 193500 + }, + { + "FLOPS loss": 0.07420609146356583, + "L0_d": 895.44, + "MLM loss": 2.316894292831421, + "epoch": 4.48, + "step": 193999 + }, + { + "epoch": 4.48, + "learning_rate": 6.246632653061225e-05, + "loss": 2.4583, + "step": 194000 + }, + { + "FLOPS loss": 0.0655120238661766, + "L0_d": 853.75, + "MLM loss": 2.3448705673217773, + "epoch": 4.49, + "step": 194499 + }, + { + "epoch": 4.49, + "learning_rate": 6.236428571428571e-05, + "loss": 2.4601, + "step": 194500 + }, + { + "FLOPS loss": 0.07075578719377518, + "L0_d": 1198.17, + "MLM loss": 2.44946551322937, + "epoch": 4.5, + "step": 194999 + }, + { + "epoch": 4.5, + "learning_rate": 6.226224489795919e-05, + "loss": 2.4558, + "step": 195000 + }, + { + "FLOPS loss": 0.07796045392751694, + "L0_d": 940.88, + "MLM loss": 2.329540491104126, + "epoch": 4.52, + "step": 195499 + }, + { + "epoch": 4.52, + "learning_rate": 6.216020408163266e-05, + "loss": 2.4581, + "step": 195500 + }, + { + "FLOPS loss": 0.06250694394111633, + "L0_d": 813.84, + "MLM loss": 2.342804431915283, + "epoch": 4.53, + "step": 195999 + }, + { + "epoch": 4.53, + "learning_rate": 6.205836734693878e-05, + "loss": 2.4541, + "step": 196000 + }, + { + "FLOPS loss": 0.059746935963630676, + "L0_d": 639.66, + "MLM loss": 2.2123336791992188, + "epoch": 4.54, + "step": 196499 + }, + { + "epoch": 4.54, + "learning_rate": 6.195632653061226e-05, + "loss": 2.4565, + "step": 196500 + }, + { + "FLOPS loss": 0.05932118371129036, + "L0_d": 833.75, + "MLM loss": 2.4198126792907715, + "epoch": 4.55, + "step": 196999 + }, + { + "epoch": 4.55, + "learning_rate": 6.185428571428572e-05, + "loss": 2.4549, + "step": 197000 + }, + { + "FLOPS loss": 0.06285431236028671, + "L0_d": 736.86, + "MLM loss": 2.3848378658294678, + "epoch": 4.56, + "step": 197499 + }, + { + "epoch": 4.56, + "learning_rate": 6.175224489795919e-05, + "loss": 2.4597, + "step": 197500 + }, + { + "FLOPS loss": 0.07099881768226624, + "L0_d": 950.34, + "MLM loss": 2.4628701210021973, + "epoch": 4.57, + "step": 197999 + }, + { + "epoch": 4.57, + "learning_rate": 6.16504081632653e-05, + "loss": 2.4559, + "step": 198000 + }, + { + "FLOPS loss": 0.05927359312772751, + "L0_d": 951.58, + "MLM loss": 2.459381103515625, + "epoch": 4.59, + "step": 198499 + }, + { + "epoch": 4.59, + "learning_rate": 6.154836734693877e-05, + "loss": 2.4561, + "step": 198500 + }, + { + "FLOPS loss": 0.07203791290521622, + "L0_d": 927.34, + "MLM loss": 2.253113269805908, + "epoch": 4.6, + "step": 198999 + }, + { + "epoch": 4.6, + "learning_rate": 6.144632653061225e-05, + "loss": 2.4606, + "step": 199000 + }, + { + "FLOPS loss": 0.055465567857027054, + "L0_d": 823.09, + "MLM loss": 2.26115345954895, + "epoch": 4.61, + "step": 199499 + }, + { + "epoch": 4.61, + "learning_rate": 6.134428571428573e-05, + "loss": 2.449, + "step": 199500 + }, + { + "FLOPS loss": 0.07690739631652832, + "L0_d": 980.25, + "MLM loss": 2.428745746612549, + "epoch": 4.62, + "step": 199999 + }, + { + "epoch": 4.62, + "learning_rate": 6.124224489795919e-05, + "loss": 2.4557, + "step": 200000 + }, + { + "FLOPS loss": 0.0735900029540062, + "L0_d": 1076.41, + "MLM loss": 2.3314719200134277, + "epoch": 4.63, + "step": 200499 + }, + { + "epoch": 4.63, + "learning_rate": 6.114020408163266e-05, + "loss": 2.4534, + "step": 200500 + }, + { + "FLOPS loss": 0.06763220578432083, + "L0_d": 796.91, + "MLM loss": 2.3352127075195312, + "epoch": 4.64, + "step": 200999 + }, + { + "epoch": 4.64, + "learning_rate": 6.103836734693878e-05, + "loss": 2.4505, + "step": 201000 + }, + { + "FLOPS loss": 0.05880173668265343, + "L0_d": 761.19, + "MLM loss": 2.3325793743133545, + "epoch": 4.65, + "step": 201499 + }, + { + "epoch": 4.65, + "learning_rate": 6.0936326530612256e-05, + "loss": 2.4529, + "step": 201500 + }, + { + "FLOPS loss": 0.07583904266357422, + "L0_d": 1133.97, + "MLM loss": 2.347179889678955, + "epoch": 4.67, + "step": 201999 + }, + { + "epoch": 4.67, + "learning_rate": 6.083428571428572e-05, + "loss": 2.448, + "step": 202000 + }, + { + "FLOPS loss": 0.05725431814789772, + "L0_d": 652.91, + "MLM loss": 2.454516887664795, + "epoch": 4.68, + "step": 202499 + }, + { + "epoch": 4.68, + "learning_rate": 6.073224489795919e-05, + "loss": 2.4514, + "step": 202500 + }, + { + "FLOPS loss": 0.062322214245796204, + "L0_d": 775.33, + "MLM loss": 2.613377094268799, + "epoch": 4.69, + "step": 202999 + }, + { + "epoch": 4.69, + "learning_rate": 6.0630408163265306e-05, + "loss": 2.4488, + "step": 203000 + }, + { + "FLOPS loss": 0.06807457655668259, + "L0_d": 692.81, + "MLM loss": 2.4366507530212402, + "epoch": 4.7, + "step": 203499 + }, + { + "epoch": 4.7, + "learning_rate": 6.052836734693878e-05, + "loss": 2.4506, + "step": 203500 + }, + { + "FLOPS loss": 0.07732067257165909, + "L0_d": 895.83, + "MLM loss": 2.3486175537109375, + "epoch": 4.71, + "step": 203999 + }, + { + "epoch": 4.71, + "learning_rate": 6.042632653061224e-05, + "loss": 2.4488, + "step": 204000 + }, + { + "FLOPS loss": 0.071961410343647, + "L0_d": 880.2, + "MLM loss": 2.502091407775879, + "epoch": 4.72, + "step": 204499 + }, + { + "epoch": 4.72, + "learning_rate": 6.032428571428572e-05, + "loss": 2.452, + "step": 204500 + }, + { + "FLOPS loss": 0.06560243666172028, + "L0_d": 830.52, + "MLM loss": 2.345327138900757, + "epoch": 4.74, + "step": 204999 + }, + { + "epoch": 4.74, + "learning_rate": 6.022244897959184e-05, + "loss": 2.4487, + "step": 205000 + }, + { + "FLOPS loss": 0.06014092266559601, + "L0_d": 812.0, + "MLM loss": 2.3884024620056152, + "epoch": 4.75, + "step": 205499 + }, + { + "epoch": 4.75, + "learning_rate": 6.0120408163265306e-05, + "loss": 2.4475, + "step": 205500 + }, + { + "FLOPS loss": 0.06797828525304794, + "L0_d": 832.78, + "MLM loss": 2.3767597675323486, + "epoch": 4.76, + "step": 205999 + }, + { + "epoch": 4.76, + "learning_rate": 6.001836734693878e-05, + "loss": 2.4496, + "step": 206000 + }, + { + "FLOPS loss": 0.06602929532527924, + "L0_d": 895.05, + "MLM loss": 2.238253355026245, + "epoch": 4.77, + "step": 206499 + }, + { + "epoch": 4.77, + "learning_rate": 5.991632653061224e-05, + "loss": 2.4493, + "step": 206500 + }, + { + "FLOPS loss": 0.062009185552597046, + "L0_d": 832.91, + "MLM loss": 2.436962604522705, + "epoch": 4.78, + "step": 206999 + }, + { + "epoch": 4.78, + "learning_rate": 5.981428571428572e-05, + "loss": 2.4444, + "step": 207000 + }, + { + "FLOPS loss": 0.07398274540901184, + "L0_d": 1058.25, + "MLM loss": 2.415109157562256, + "epoch": 4.79, + "step": 207499 + }, + { + "epoch": 4.79, + "learning_rate": 5.971244897959184e-05, + "loss": 2.4436, + "step": 207500 + }, + { + "FLOPS loss": 0.06136331707239151, + "L0_d": 744.12, + "MLM loss": 2.3068037033081055, + "epoch": 4.81, + "step": 207999 + }, + { + "epoch": 4.81, + "learning_rate": 5.9610408163265305e-05, + "loss": 2.4464, + "step": 208000 + }, + { + "FLOPS loss": 0.06158547103404999, + "L0_d": 929.0, + "MLM loss": 2.406006336212158, + "epoch": 4.82, + "step": 208499 + }, + { + "epoch": 4.82, + "learning_rate": 5.950836734693878e-05, + "loss": 2.446, + "step": 208500 + }, + { + "FLOPS loss": 0.06520802527666092, + "L0_d": 732.05, + "MLM loss": 2.3168740272521973, + "epoch": 4.83, + "step": 208999 + }, + { + "epoch": 4.83, + "learning_rate": 5.940632653061224e-05, + "loss": 2.4432, + "step": 209000 + }, + { + "FLOPS loss": 0.08313161134719849, + "L0_d": 1113.67, + "MLM loss": 2.4824371337890625, + "epoch": 4.84, + "step": 209499 + }, + { + "epoch": 4.84, + "learning_rate": 5.930448979591837e-05, + "loss": 2.4441, + "step": 209500 + }, + { + "FLOPS loss": 0.06880463659763336, + "L0_d": 1170.94, + "MLM loss": 2.373368978500366, + "epoch": 4.85, + "step": 209999 + }, + { + "epoch": 4.85, + "learning_rate": 5.920244897959184e-05, + "loss": 2.4453, + "step": 210000 + }, + { + "FLOPS loss": 0.06296434253454208, + "L0_d": 1000.72, + "MLM loss": 2.4930760860443115, + "epoch": 4.86, + "step": 210499 + }, + { + "epoch": 4.86, + "learning_rate": 5.9100408163265305e-05, + "loss": 2.4431, + "step": 210500 + }, + { + "FLOPS loss": 0.0654015988111496, + "L0_d": 783.62, + "MLM loss": 2.3720173835754395, + "epoch": 4.87, + "step": 210999 + }, + { + "epoch": 4.87, + "learning_rate": 5.899836734693878e-05, + "loss": 2.4443, + "step": 211000 + }, + { + "FLOPS loss": 0.06743564456701279, + "L0_d": 929.95, + "MLM loss": 2.36538028717041, + "epoch": 4.89, + "step": 211499 + }, + { + "epoch": 4.89, + "learning_rate": 5.8896530612244904e-05, + "loss": 2.4434, + "step": 211500 + }, + { + "FLOPS loss": 0.06421862542629242, + "L0_d": 922.92, + "MLM loss": 2.4841079711914062, + "epoch": 4.9, + "step": 211999 + }, + { + "epoch": 4.9, + "learning_rate": 5.8794489795918376e-05, + "loss": 2.44, + "step": 212000 + }, + { + "FLOPS loss": 0.06765349954366684, + "L0_d": 860.09, + "MLM loss": 2.574281692504883, + "epoch": 4.91, + "step": 212499 + }, + { + "epoch": 4.91, + "learning_rate": 5.869244897959184e-05, + "loss": 2.4354, + "step": 212500 + }, + { + "FLOPS loss": 0.08442234247922897, + "L0_d": 1016.08, + "MLM loss": 2.45365047454834, + "epoch": 4.92, + "step": 212999 + }, + { + "epoch": 4.92, + "learning_rate": 5.859040816326531e-05, + "loss": 2.4416, + "step": 213000 + }, + { + "FLOPS loss": 0.06505122035741806, + "L0_d": 833.3, + "MLM loss": 2.5009758472442627, + "epoch": 4.93, + "step": 213499 + }, + { + "epoch": 4.93, + "learning_rate": 5.8488367346938776e-05, + "loss": 2.438, + "step": 213500 + }, + { + "FLOPS loss": 0.054140783846378326, + "L0_d": 705.83, + "MLM loss": 2.4696202278137207, + "epoch": 4.94, + "step": 213999 + }, + { + "epoch": 4.94, + "learning_rate": 5.8386530612244904e-05, + "loss": 2.4395, + "step": 214000 + }, + { + "FLOPS loss": 0.07031732052564621, + "L0_d": 888.97, + "MLM loss": 2.2769343852996826, + "epoch": 4.96, + "step": 214499 + }, + { + "epoch": 4.96, + "learning_rate": 5.8284489795918375e-05, + "loss": 2.4403, + "step": 214500 + }, + { + "FLOPS loss": 0.06005926430225372, + "L0_d": 835.02, + "MLM loss": 2.513908863067627, + "epoch": 4.97, + "step": 214999 + }, + { + "epoch": 4.97, + "learning_rate": 5.818244897959184e-05, + "loss": 2.4433, + "step": 215000 + }, + { + "FLOPS loss": 0.08554711937904358, + "L0_d": 1060.28, + "MLM loss": 2.2464144229888916, + "epoch": 4.98, + "step": 215499 + }, + { + "epoch": 4.98, + "learning_rate": 5.808040816326531e-05, + "loss": 2.434, + "step": 215500 + }, + { + "FLOPS loss": 0.07612525671720505, + "L0_d": 1262.25, + "MLM loss": 2.3883957862854004, + "epoch": 4.99, + "step": 215999 + }, + { + "epoch": 4.99, + "learning_rate": 5.7978367346938776e-05, + "loss": 2.4383, + "step": 216000 + }, + { + "FLOPS loss": 0.07493530958890915, + "L0_d": 963.84, + "MLM loss": 2.5143649578094482, + "epoch": 5.0, + "step": 216499 + }, + { + "epoch": 5.0, + "learning_rate": 5.7876530612244904e-05, + "loss": 2.4388, + "step": 216500 + }, + { + "FLOPS loss": 0.05979205295443535, + "L0_d": 788.12, + "MLM loss": 2.4131922721862793, + "epoch": 5.01, + "step": 216999 + }, + { + "epoch": 5.01, + "learning_rate": 5.7774489795918375e-05, + "loss": 2.4333, + "step": 217000 + }, + { + "FLOPS loss": 0.0720650851726532, + "L0_d": 978.7, + "MLM loss": 2.429448366165161, + "epoch": 5.02, + "step": 217499 + }, + { + "epoch": 5.02, + "learning_rate": 5.767244897959184e-05, + "loss": 2.4342, + "step": 217500 + }, + { + "FLOPS loss": 0.07064210623502731, + "L0_d": 862.91, + "MLM loss": 2.4350945949554443, + "epoch": 5.04, + "step": 217999 + }, + { + "epoch": 5.04, + "learning_rate": 5.757040816326531e-05, + "loss": 2.4375, + "step": 218000 + }, + { + "FLOPS loss": 0.062107719480991364, + "L0_d": 719.12, + "MLM loss": 2.218329668045044, + "epoch": 5.05, + "step": 218499 + }, + { + "epoch": 5.05, + "learning_rate": 5.7468367346938776e-05, + "loss": 2.4326, + "step": 218500 + }, + { + "FLOPS loss": 0.0665595754981041, + "L0_d": 793.11, + "MLM loss": 2.3749895095825195, + "epoch": 5.06, + "step": 218999 + }, + { + "epoch": 5.06, + "learning_rate": 5.73665306122449e-05, + "loss": 2.4326, + "step": 219000 + }, + { + "FLOPS loss": 0.05444691702723503, + "L0_d": 645.83, + "MLM loss": 2.405651330947876, + "epoch": 5.07, + "step": 219499 + }, + { + "epoch": 5.07, + "learning_rate": 5.7264489795918375e-05, + "loss": 2.4248, + "step": 219500 + }, + { + "FLOPS loss": 0.051724907010793686, + "L0_d": 658.75, + "MLM loss": 2.5104503631591797, + "epoch": 5.08, + "step": 219999 + }, + { + "epoch": 5.08, + "learning_rate": 5.716244897959184e-05, + "loss": 2.4326, + "step": 220000 + }, + { + "FLOPS loss": 0.05557708442211151, + "L0_d": 765.36, + "MLM loss": 2.4535903930664062, + "epoch": 5.09, + "step": 220499 + }, + { + "epoch": 5.09, + "learning_rate": 5.706040816326531e-05, + "loss": 2.4311, + "step": 220500 + }, + { + "FLOPS loss": 0.05966062471270561, + "L0_d": 719.16, + "MLM loss": 2.3905961513519287, + "epoch": 5.11, + "step": 220999 + }, + { + "epoch": 5.11, + "learning_rate": 5.6958571428571425e-05, + "loss": 2.4261, + "step": 221000 + }, + { + "FLOPS loss": 0.0590512789785862, + "L0_d": 834.92, + "MLM loss": 2.4592933654785156, + "epoch": 5.12, + "step": 221499 + }, + { + "epoch": 5.12, + "learning_rate": 5.6856530612244897e-05, + "loss": 2.4283, + "step": 221500 + }, + { + "FLOPS loss": 0.06647661328315735, + "L0_d": 777.38, + "MLM loss": 2.529719829559326, + "epoch": 5.13, + "step": 221999 + }, + { + "epoch": 5.13, + "learning_rate": 5.6754489795918375e-05, + "loss": 2.4262, + "step": 222000 + }, + { + "FLOPS loss": 0.059614311903715134, + "L0_d": 668.86, + "MLM loss": 2.487185001373291, + "epoch": 5.14, + "step": 222499 + }, + { + "epoch": 5.14, + "learning_rate": 5.6652448979591846e-05, + "loss": 2.4292, + "step": 222500 + }, + { + "FLOPS loss": 0.061933524906635284, + "L0_d": 705.98, + "MLM loss": 2.349621295928955, + "epoch": 5.15, + "step": 222999 + }, + { + "epoch": 5.15, + "learning_rate": 5.655040816326531e-05, + "loss": 2.4285, + "step": 223000 + }, + { + "FLOPS loss": 0.07037528604269028, + "L0_d": 1112.05, + "MLM loss": 2.242041826248169, + "epoch": 5.16, + "step": 223499 + }, + { + "epoch": 5.16, + "learning_rate": 5.644857142857143e-05, + "loss": 2.4274, + "step": 223500 + }, + { + "FLOPS loss": 0.06243540346622467, + "L0_d": 933.86, + "MLM loss": 2.3377318382263184, + "epoch": 5.17, + "step": 223999 + }, + { + "epoch": 5.17, + "learning_rate": 5.6346530612244896e-05, + "loss": 2.4293, + "step": 224000 + }, + { + "FLOPS loss": 0.07176888734102249, + "L0_d": 1007.48, + "MLM loss": 2.4397826194763184, + "epoch": 5.19, + "step": 224499 + }, + { + "epoch": 5.19, + "learning_rate": 5.624448979591837e-05, + "loss": 2.4249, + "step": 224500 + }, + { + "FLOPS loss": 0.06889332085847855, + "L0_d": 973.16, + "MLM loss": 2.3823471069335938, + "epoch": 5.2, + "step": 224999 + }, + { + "epoch": 5.2, + "learning_rate": 5.6142448979591846e-05, + "loss": 2.4297, + "step": 225000 + }, + { + "FLOPS loss": 0.06415066123008728, + "L0_d": 1091.34, + "MLM loss": 2.323089361190796, + "epoch": 5.21, + "step": 225499 + }, + { + "epoch": 5.21, + "learning_rate": 5.604061224489796e-05, + "loss": 2.4279, + "step": 225500 + }, + { + "FLOPS loss": 0.0608639270067215, + "L0_d": 789.3, + "MLM loss": 2.265446901321411, + "epoch": 5.22, + "step": 225999 + }, + { + "epoch": 5.22, + "learning_rate": 5.593857142857143e-05, + "loss": 2.4256, + "step": 226000 + }, + { + "FLOPS loss": 0.06591716408729553, + "L0_d": 893.92, + "MLM loss": 2.2293763160705566, + "epoch": 5.23, + "step": 226499 + }, + { + "epoch": 5.23, + "learning_rate": 5.5836530612244896e-05, + "loss": 2.4313, + "step": 226500 + }, + { + "FLOPS loss": 0.05170826241374016, + "L0_d": 763.05, + "MLM loss": 2.5374226570129395, + "epoch": 5.24, + "step": 226999 + }, + { + "epoch": 5.24, + "learning_rate": 5.573448979591837e-05, + "loss": 2.4269, + "step": 227000 + }, + { + "FLOPS loss": 0.059597332030534744, + "L0_d": 982.97, + "MLM loss": 2.4384584426879883, + "epoch": 5.26, + "step": 227499 + }, + { + "epoch": 5.26, + "learning_rate": 5.5632653061224495e-05, + "loss": 2.4275, + "step": 227500 + }, + { + "FLOPS loss": 0.06841108202934265, + "L0_d": 841.12, + "MLM loss": 2.26102876663208, + "epoch": 5.27, + "step": 227999 + }, + { + "epoch": 5.27, + "learning_rate": 5.553061224489796e-05, + "loss": 2.4256, + "step": 228000 + }, + { + "FLOPS loss": 0.06574366986751556, + "L0_d": 772.81, + "MLM loss": 2.215580701828003, + "epoch": 5.28, + "step": 228499 + }, + { + "epoch": 5.28, + "learning_rate": 5.542857142857143e-05, + "loss": 2.4234, + "step": 228500 + }, + { + "FLOPS loss": 0.05697168409824371, + "L0_d": 804.02, + "MLM loss": 2.3969311714172363, + "epoch": 5.29, + "step": 228999 + }, + { + "epoch": 5.29, + "learning_rate": 5.5326530612244896e-05, + "loss": 2.4208, + "step": 229000 + }, + { + "FLOPS loss": 0.07022450119256973, + "L0_d": 812.52, + "MLM loss": 2.4542322158813477, + "epoch": 5.3, + "step": 229499 + }, + { + "epoch": 5.3, + "learning_rate": 5.522448979591837e-05, + "loss": 2.423, + "step": 229500 + }, + { + "FLOPS loss": 0.06159794330596924, + "L0_d": 787.3, + "MLM loss": 2.3856964111328125, + "epoch": 5.31, + "step": 229999 + }, + { + "epoch": 5.31, + "learning_rate": 5.5122653061224495e-05, + "loss": 2.4217, + "step": 230000 + }, + { + "FLOPS loss": 0.06548204272985458, + "L0_d": 681.08, + "MLM loss": 2.3479115962982178, + "epoch": 5.32, + "step": 230499 + }, + { + "epoch": 5.32, + "learning_rate": 5.502061224489796e-05, + "loss": 2.4257, + "step": 230500 + }, + { + "FLOPS loss": 0.06918176263570786, + "L0_d": 956.61, + "MLM loss": 2.3696184158325195, + "epoch": 5.34, + "step": 230999 + }, + { + "epoch": 5.34, + "learning_rate": 5.491857142857143e-05, + "loss": 2.42, + "step": 231000 + }, + { + "FLOPS loss": 0.05903591215610504, + "L0_d": 914.56, + "MLM loss": 2.588008403778076, + "epoch": 5.35, + "step": 231499 + }, + { + "epoch": 5.35, + "learning_rate": 5.4816530612244896e-05, + "loss": 2.421, + "step": 231500 + }, + { + "FLOPS loss": 0.06530603766441345, + "L0_d": 738.28, + "MLM loss": 2.2901949882507324, + "epoch": 5.36, + "step": 231999 + }, + { + "epoch": 5.36, + "learning_rate": 5.471469387755102e-05, + "loss": 2.4197, + "step": 232000 + }, + { + "FLOPS loss": 0.0630945935845375, + "L0_d": 786.75, + "MLM loss": 2.353754758834839, + "epoch": 5.37, + "step": 232499 + }, + { + "epoch": 5.37, + "learning_rate": 5.4612653061224495e-05, + "loss": 2.4212, + "step": 232500 + }, + { + "FLOPS loss": 0.06529512256383896, + "L0_d": 911.2, + "MLM loss": 2.315972328186035, + "epoch": 5.38, + "step": 232999 + }, + { + "epoch": 5.38, + "learning_rate": 5.4510612244897966e-05, + "loss": 2.4228, + "step": 233000 + }, + { + "FLOPS loss": 0.06647701561450958, + "L0_d": 1022.75, + "MLM loss": 2.321000814437866, + "epoch": 5.39, + "step": 233499 + }, + { + "epoch": 5.39, + "learning_rate": 5.440857142857143e-05, + "loss": 2.4218, + "step": 233500 + }, + { + "FLOPS loss": 0.060300182551145554, + "L0_d": 700.78, + "MLM loss": 2.3008556365966797, + "epoch": 5.41, + "step": 233999 + }, + { + "epoch": 5.41, + "learning_rate": 5.430673469387756e-05, + "loss": 2.4196, + "step": 234000 + }, + { + "FLOPS loss": 0.057080261409282684, + "L0_d": 816.3, + "MLM loss": 2.3866772651672363, + "epoch": 5.42, + "step": 234499 + }, + { + "epoch": 5.42, + "learning_rate": 5.420469387755103e-05, + "loss": 2.4166, + "step": 234500 + }, + { + "FLOPS loss": 0.07618782669305801, + "L0_d": 1020.66, + "MLM loss": 2.3357157707214355, + "epoch": 5.43, + "step": 234999 + }, + { + "epoch": 5.43, + "learning_rate": 5.4102653061224495e-05, + "loss": 2.4206, + "step": 235000 + }, + { + "FLOPS loss": 0.0620071217417717, + "L0_d": 738.75, + "MLM loss": 2.4523963928222656, + "epoch": 5.44, + "step": 235499 + }, + { + "epoch": 5.44, + "learning_rate": 5.4000612244897966e-05, + "loss": 2.4208, + "step": 235500 + }, + { + "FLOPS loss": 0.0646725594997406, + "L0_d": 1076.83, + "MLM loss": 2.353867292404175, + "epoch": 5.45, + "step": 235999 + }, + { + "epoch": 5.45, + "learning_rate": 5.389857142857143e-05, + "loss": 2.4229, + "step": 236000 + }, + { + "FLOPS loss": 0.06458298116922379, + "L0_d": 860.25, + "MLM loss": 2.3973288536071777, + "epoch": 5.46, + "step": 236499 + }, + { + "epoch": 5.46, + "learning_rate": 5.37965306122449e-05, + "loss": 2.4211, + "step": 236500 + }, + { + "FLOPS loss": 0.07283961027860641, + "L0_d": 744.12, + "MLM loss": 2.4385931491851807, + "epoch": 5.47, + "step": 236999 + }, + { + "epoch": 5.47, + "learning_rate": 5.369469387755103e-05, + "loss": 2.4195, + "step": 237000 + }, + { + "FLOPS loss": 0.0643695816397667, + "L0_d": 817.92, + "MLM loss": 2.379582643508911, + "epoch": 5.49, + "step": 237499 + }, + { + "epoch": 5.49, + "learning_rate": 5.3592653061224494e-05, + "loss": 2.4165, + "step": 237500 + }, + { + "FLOPS loss": 0.07279926538467407, + "L0_d": 962.19, + "MLM loss": 2.22440767288208, + "epoch": 5.5, + "step": 237999 + }, + { + "epoch": 5.5, + "learning_rate": 5.3490612244897966e-05, + "loss": 2.4156, + "step": 238000 + }, + { + "FLOPS loss": 0.05908575281500816, + "L0_d": 686.64, + "MLM loss": 2.48474383354187, + "epoch": 5.51, + "step": 238499 + }, + { + "epoch": 5.51, + "learning_rate": 5.338857142857143e-05, + "loss": 2.416, + "step": 238500 + }, + { + "FLOPS loss": 0.07741891592741013, + "L0_d": 1033.06, + "MLM loss": 2.2399086952209473, + "epoch": 5.52, + "step": 238999 + }, + { + "epoch": 5.52, + "learning_rate": 5.32865306122449e-05, + "loss": 2.4167, + "step": 239000 + }, + { + "FLOPS loss": 0.05061428248882294, + "L0_d": 519.08, + "MLM loss": 2.3229198455810547, + "epoch": 5.53, + "step": 239499 + }, + { + "epoch": 5.53, + "learning_rate": 5.318448979591837e-05, + "loss": 2.4118, + "step": 239500 + }, + { + "FLOPS loss": 0.07955832034349442, + "L0_d": 1008.81, + "MLM loss": 2.3691508769989014, + "epoch": 5.54, + "step": 239999 + }, + { + "epoch": 5.54, + "learning_rate": 5.3082653061224494e-05, + "loss": 2.4098, + "step": 240000 + }, + { + "FLOPS loss": 0.061550572514534, + "L0_d": 968.66, + "MLM loss": 2.6222047805786133, + "epoch": 5.56, + "step": 240499 + }, + { + "epoch": 5.56, + "learning_rate": 5.2980612244897966e-05, + "loss": 2.4183, + "step": 240500 + }, + { + "FLOPS loss": 0.08182385563850403, + "L0_d": 1048.91, + "MLM loss": 2.4917473793029785, + "epoch": 5.57, + "step": 240999 + }, + { + "epoch": 5.57, + "learning_rate": 5.287857142857143e-05, + "loss": 2.4134, + "step": 241000 + }, + { + "FLOPS loss": 0.058586906641721725, + "L0_d": 719.7, + "MLM loss": 2.3518967628479004, + "epoch": 5.58, + "step": 241499 + }, + { + "epoch": 5.58, + "learning_rate": 5.27765306122449e-05, + "loss": 2.4185, + "step": 241500 + }, + { + "FLOPS loss": 0.08758343011140823, + "L0_d": 1149.27, + "MLM loss": 2.231182336807251, + "epoch": 5.59, + "step": 241999 + }, + { + "epoch": 5.59, + "learning_rate": 5.2674489795918366e-05, + "loss": 2.4137, + "step": 242000 + }, + { + "FLOPS loss": 0.07074569165706635, + "L0_d": 842.36, + "MLM loss": 2.271498441696167, + "epoch": 5.6, + "step": 242499 + }, + { + "epoch": 5.6, + "learning_rate": 5.2572653061224494e-05, + "loss": 2.414, + "step": 242500 + }, + { + "FLOPS loss": 0.06920843571424484, + "L0_d": 959.58, + "MLM loss": 2.2417149543762207, + "epoch": 5.61, + "step": 242999 + }, + { + "epoch": 5.61, + "learning_rate": 5.2470612244897965e-05, + "loss": 2.4152, + "step": 243000 + }, + { + "FLOPS loss": 0.06951159238815308, + "L0_d": 879.59, + "MLM loss": 2.3914523124694824, + "epoch": 5.63, + "step": 243499 + }, + { + "epoch": 5.63, + "learning_rate": 5.236857142857144e-05, + "loss": 2.4165, + "step": 243500 + }, + { + "FLOPS loss": 0.06333808600902557, + "L0_d": 845.09, + "MLM loss": 2.3757832050323486, + "epoch": 5.64, + "step": 243999 + }, + { + "epoch": 5.64, + "learning_rate": 5.22665306122449e-05, + "loss": 2.4089, + "step": 244000 + }, + { + "FLOPS loss": 0.06202933192253113, + "L0_d": 826.56, + "MLM loss": 2.427809476852417, + "epoch": 5.65, + "step": 244499 + }, + { + "epoch": 5.65, + "learning_rate": 5.2164693877551015e-05, + "loss": 2.412, + "step": 244500 + }, + { + "FLOPS loss": 0.06342844665050507, + "L0_d": 1106.88, + "MLM loss": 2.4152002334594727, + "epoch": 5.66, + "step": 244999 + }, + { + "epoch": 5.66, + "learning_rate": 5.20626530612245e-05, + "loss": 2.4117, + "step": 245000 + }, + { + "FLOPS loss": 0.0744723230600357, + "L0_d": 1098.58, + "MLM loss": 2.1651837825775146, + "epoch": 5.67, + "step": 245499 + }, + { + "epoch": 5.67, + "learning_rate": 5.1960612244897965e-05, + "loss": 2.4073, + "step": 245500 + }, + { + "FLOPS loss": 0.058694589883089066, + "L0_d": 815.58, + "MLM loss": 2.2264058589935303, + "epoch": 5.68, + "step": 245999 + }, + { + "epoch": 5.68, + "learning_rate": 5.185857142857144e-05, + "loss": 2.4112, + "step": 246000 + }, + { + "FLOPS loss": 0.0438019260764122, + "L0_d": 610.3, + "MLM loss": 2.3642964363098145, + "epoch": 5.69, + "step": 246499 + }, + { + "epoch": 5.69, + "learning_rate": 5.17565306122449e-05, + "loss": 2.4093, + "step": 246500 + }, + { + "FLOPS loss": 0.06571359187364578, + "L0_d": 906.2, + "MLM loss": 2.217068910598755, + "epoch": 5.71, + "step": 246999 + }, + { + "epoch": 5.71, + "learning_rate": 5.165469387755102e-05, + "loss": 2.4076, + "step": 247000 + }, + { + "FLOPS loss": 0.057677898555994034, + "L0_d": 782.92, + "MLM loss": 2.2437782287597656, + "epoch": 5.72, + "step": 247499 + }, + { + "epoch": 5.72, + "learning_rate": 5.155265306122449e-05, + "loss": 2.4119, + "step": 247500 + }, + { + "FLOPS loss": 0.0648793950676918, + "L0_d": 905.28, + "MLM loss": 2.2714266777038574, + "epoch": 5.73, + "step": 247999 + }, + { + "epoch": 5.73, + "learning_rate": 5.1450612244897965e-05, + "loss": 2.4116, + "step": 248000 + }, + { + "FLOPS loss": 0.08464235067367554, + "L0_d": 992.86, + "MLM loss": 2.3772592544555664, + "epoch": 5.74, + "step": 248499 + }, + { + "epoch": 5.74, + "learning_rate": 5.1348571428571436e-05, + "loss": 2.4087, + "step": 248500 + }, + { + "FLOPS loss": 0.06436370313167572, + "L0_d": 712.66, + "MLM loss": 2.6194450855255127, + "epoch": 5.75, + "step": 248999 + }, + { + "epoch": 5.75, + "learning_rate": 5.12465306122449e-05, + "loss": 2.4129, + "step": 249000 + }, + { + "FLOPS loss": 0.056343041360378265, + "L0_d": 690.97, + "MLM loss": 2.426379680633545, + "epoch": 5.76, + "step": 249499 + }, + { + "epoch": 5.76, + "learning_rate": 5.114469387755102e-05, + "loss": 2.4098, + "step": 249500 + }, + { + "FLOPS loss": 0.0622674897313118, + "L0_d": 1193.97, + "MLM loss": 2.275702476501465, + "epoch": 5.78, + "step": 249999 + }, + { + "epoch": 5.78, + "learning_rate": 5.1042653061224487e-05, + "loss": 2.4088, + "step": 250000 + }, + { + "FLOPS loss": 0.06920263171195984, + "L0_d": 1360.25, + "MLM loss": 2.4793238639831543, + "epoch": 5.79, + "step": 250499 + }, + { + "epoch": 5.79, + "learning_rate": 5.094061224489796e-05, + "loss": 2.4015, + "step": 250500 + }, + { + "FLOPS loss": 0.061475012451410294, + "L0_d": 759.97, + "MLM loss": 2.1970982551574707, + "epoch": 5.8, + "step": 250999 + }, + { + "epoch": 5.8, + "learning_rate": 5.0838571428571436e-05, + "loss": 2.4074, + "step": 251000 + }, + { + "FLOPS loss": 0.0693504810333252, + "L0_d": 747.38, + "MLM loss": 2.315021514892578, + "epoch": 5.81, + "step": 251499 + }, + { + "epoch": 5.81, + "learning_rate": 5.07365306122449e-05, + "loss": 2.411, + "step": 251500 + }, + { + "FLOPS loss": 0.054750021547079086, + "L0_d": 565.3, + "MLM loss": 2.175985336303711, + "epoch": 5.82, + "step": 251999 + }, + { + "epoch": 5.82, + "learning_rate": 5.063469387755102e-05, + "loss": 2.4061, + "step": 252000 + }, + { + "FLOPS loss": 0.05239395797252655, + "L0_d": 543.91, + "MLM loss": 2.3537206649780273, + "epoch": 5.83, + "step": 252499 + }, + { + "epoch": 5.83, + "learning_rate": 5.0532653061224486e-05, + "loss": 2.4028, + "step": 252500 + }, + { + "FLOPS loss": 0.06858605146408081, + "L0_d": 982.17, + "MLM loss": 2.4172556400299072, + "epoch": 5.84, + "step": 252999 + }, + { + "epoch": 5.84, + "learning_rate": 5.043061224489796e-05, + "loss": 2.405, + "step": 253000 + }, + { + "FLOPS loss": 0.06169794127345085, + "L0_d": 818.91, + "MLM loss": 2.324985980987549, + "epoch": 5.86, + "step": 253499 + }, + { + "epoch": 5.86, + "learning_rate": 5.0328571428571436e-05, + "loss": 2.4049, + "step": 253500 + }, + { + "FLOPS loss": 0.057725466787815094, + "L0_d": 805.22, + "MLM loss": 2.414187431335449, + "epoch": 5.87, + "step": 253999 + }, + { + "epoch": 5.87, + "learning_rate": 5.022653061224491e-05, + "loss": 2.4093, + "step": 254000 + }, + { + "FLOPS loss": 0.06406831741333008, + "L0_d": 830.33, + "MLM loss": 2.3183207511901855, + "epoch": 5.88, + "step": 254499 + }, + { + "epoch": 5.88, + "learning_rate": 5.012469387755102e-05, + "loss": 2.4061, + "step": 254500 + }, + { + "FLOPS loss": 0.07106481492519379, + "L0_d": 870.17, + "MLM loss": 2.090304374694824, + "epoch": 5.89, + "step": 254999 + }, + { + "epoch": 5.89, + "learning_rate": 5.0022653061224486e-05, + "loss": 2.3991, + "step": 255000 + }, + { + "FLOPS loss": 0.05545003339648247, + "L0_d": 826.73, + "MLM loss": 2.331218957901001, + "epoch": 5.9, + "step": 255499 + }, + { + "epoch": 5.9, + "learning_rate": 4.9920612244897964e-05, + "loss": 2.3988, + "step": 255500 + }, + { + "FLOPS loss": 0.05424296855926514, + "L0_d": 608.84, + "MLM loss": 2.439082145690918, + "epoch": 5.91, + "step": 255999 + }, + { + "epoch": 5.91, + "learning_rate": 4.981857142857143e-05, + "loss": 2.4051, + "step": 256000 + }, + { + "FLOPS loss": 0.06380561739206314, + "L0_d": 683.2, + "MLM loss": 2.3173537254333496, + "epoch": 5.93, + "step": 256499 + }, + { + "epoch": 5.93, + "learning_rate": 4.97165306122449e-05, + "loss": 2.4055, + "step": 256500 + }, + { + "FLOPS loss": 0.05620687082409859, + "L0_d": 766.11, + "MLM loss": 2.4309253692626953, + "epoch": 5.94, + "step": 256999 + }, + { + "epoch": 5.94, + "learning_rate": 4.961469387755102e-05, + "loss": 2.4048, + "step": 257000 + }, + { + "FLOPS loss": 0.07935933768749237, + "L0_d": 1019.94, + "MLM loss": 2.3071842193603516, + "epoch": 5.95, + "step": 257499 + }, + { + "epoch": 5.95, + "learning_rate": 4.951265306122449e-05, + "loss": 2.406, + "step": 257500 + }, + { + "FLOPS loss": 0.05459493398666382, + "L0_d": 696.48, + "MLM loss": 2.4048876762390137, + "epoch": 5.96, + "step": 257999 + }, + { + "epoch": 5.96, + "learning_rate": 4.9410612244897964e-05, + "loss": 2.403, + "step": 258000 + }, + { + "FLOPS loss": 0.07205364853143692, + "L0_d": 837.02, + "MLM loss": 2.238459825515747, + "epoch": 5.97, + "step": 258499 + }, + { + "epoch": 5.97, + "learning_rate": 4.930857142857143e-05, + "loss": 2.404, + "step": 258500 + }, + { + "FLOPS loss": 0.07362333685159683, + "L0_d": 850.81, + "MLM loss": 2.3778529167175293, + "epoch": 5.98, + "step": 258999 + }, + { + "epoch": 5.98, + "learning_rate": 4.9206734693877556e-05, + "loss": 2.4001, + "step": 259000 + }, + { + "FLOPS loss": 0.058844566345214844, + "L0_d": 1018.61, + "MLM loss": 2.3497064113616943, + "epoch": 5.99, + "step": 259499 + }, + { + "epoch": 5.99, + "learning_rate": 4.910469387755102e-05, + "loss": 2.4037, + "step": 259500 + }, + { + "FLOPS loss": 0.05885095149278641, + "L0_d": 664.66, + "MLM loss": 2.633054494857788, + "epoch": 6.01, + "step": 259999 + }, + { + "epoch": 6.01, + "learning_rate": 4.900265306122449e-05, + "loss": 2.3968, + "step": 260000 + }, + { + "FLOPS loss": 0.06290178000926971, + "L0_d": 697.53, + "MLM loss": 2.379453659057617, + "epoch": 6.02, + "step": 260499 + }, + { + "epoch": 6.02, + "learning_rate": 4.8900612244897964e-05, + "loss": 2.3957, + "step": 260500 + }, + { + "FLOPS loss": 0.08343258500099182, + "L0_d": 978.75, + "MLM loss": 2.326571464538574, + "epoch": 6.03, + "step": 260999 + }, + { + "epoch": 6.03, + "learning_rate": 4.8798571428571435e-05, + "loss": 2.3938, + "step": 261000 + }, + { + "FLOPS loss": 0.06951142847537994, + "L0_d": 843.02, + "MLM loss": 2.1549177169799805, + "epoch": 6.04, + "step": 261499 + }, + { + "epoch": 6.04, + "learning_rate": 4.8696734693877556e-05, + "loss": 2.3976, + "step": 261500 + }, + { + "FLOPS loss": 0.05598827823996544, + "L0_d": 773.36, + "MLM loss": 2.3736531734466553, + "epoch": 6.05, + "step": 261999 + }, + { + "epoch": 6.05, + "learning_rate": 4.859469387755102e-05, + "loss": 2.3956, + "step": 262000 + }, + { + "FLOPS loss": 0.06191984564065933, + "L0_d": 748.77, + "MLM loss": 2.327627182006836, + "epoch": 6.06, + "step": 262499 + }, + { + "epoch": 6.06, + "learning_rate": 4.849265306122449e-05, + "loss": 2.3923, + "step": 262500 + }, + { + "FLOPS loss": 0.06821770966053009, + "L0_d": 875.41, + "MLM loss": 2.2674753665924072, + "epoch": 6.08, + "step": 262999 + }, + { + "epoch": 6.08, + "learning_rate": 4.839061224489796e-05, + "loss": 2.3949, + "step": 263000 + }, + { + "FLOPS loss": 0.06579490751028061, + "L0_d": 807.78, + "MLM loss": 2.183204412460327, + "epoch": 6.09, + "step": 263499 + }, + { + "epoch": 6.09, + "learning_rate": 4.8288775510204084e-05, + "loss": 2.3952, + "step": 263500 + }, + { + "FLOPS loss": 0.07654042541980743, + "L0_d": 1016.47, + "MLM loss": 2.3310086727142334, + "epoch": 6.1, + "step": 263999 + }, + { + "epoch": 6.1, + "learning_rate": 4.8186734693877556e-05, + "loss": 2.3956, + "step": 264000 + }, + { + "FLOPS loss": 0.0669398382306099, + "L0_d": 1012.41, + "MLM loss": 2.40228533744812, + "epoch": 6.11, + "step": 264499 + }, + { + "epoch": 6.11, + "learning_rate": 4.808469387755102e-05, + "loss": 2.3932, + "step": 264500 + }, + { + "FLOPS loss": 0.06311013549566269, + "L0_d": 763.83, + "MLM loss": 2.2010045051574707, + "epoch": 6.12, + "step": 264999 + }, + { + "epoch": 6.12, + "learning_rate": 4.798265306122449e-05, + "loss": 2.3955, + "step": 265000 + }, + { + "FLOPS loss": 0.0659794807434082, + "L0_d": 844.52, + "MLM loss": 2.238668441772461, + "epoch": 6.13, + "step": 265499 + }, + { + "epoch": 6.13, + "learning_rate": 4.788061224489796e-05, + "loss": 2.39, + "step": 265500 + }, + { + "FLOPS loss": 0.06289741396903992, + "L0_d": 979.72, + "MLM loss": 2.186093807220459, + "epoch": 6.14, + "step": 265999 + }, + { + "epoch": 6.14, + "learning_rate": 4.7778775510204084e-05, + "loss": 2.3908, + "step": 266000 + }, + { + "FLOPS loss": 0.07499533146619797, + "L0_d": 857.14, + "MLM loss": 2.23008131980896, + "epoch": 6.16, + "step": 266499 + }, + { + "epoch": 6.16, + "learning_rate": 4.767673469387755e-05, + "loss": 2.3931, + "step": 266500 + }, + { + "FLOPS loss": 0.06307825446128845, + "L0_d": 879.86, + "MLM loss": 2.5647735595703125, + "epoch": 6.17, + "step": 266999 + }, + { + "epoch": 6.17, + "learning_rate": 4.757469387755103e-05, + "loss": 2.3905, + "step": 267000 + }, + { + "FLOPS loss": 0.06346144527196884, + "L0_d": 887.95, + "MLM loss": 2.338149070739746, + "epoch": 6.18, + "step": 267499 + }, + { + "epoch": 6.18, + "learning_rate": 4.747265306122449e-05, + "loss": 2.3893, + "step": 267500 + }, + { + "FLOPS loss": 0.05799337476491928, + "L0_d": 830.47, + "MLM loss": 2.1602370738983154, + "epoch": 6.19, + "step": 267999 + }, + { + "epoch": 6.19, + "learning_rate": 4.737081632653061e-05, + "loss": 2.3964, + "step": 268000 + }, + { + "FLOPS loss": 0.05315548554062843, + "L0_d": 583.66, + "MLM loss": 2.4497416019439697, + "epoch": 6.2, + "step": 268499 + }, + { + "epoch": 6.2, + "learning_rate": 4.7268775510204084e-05, + "loss": 2.3877, + "step": 268500 + }, + { + "FLOPS loss": 0.07155217230319977, + "L0_d": 950.55, + "MLM loss": 2.4541220664978027, + "epoch": 6.21, + "step": 268999 + }, + { + "epoch": 6.21, + "learning_rate": 4.716673469387755e-05, + "loss": 2.3935, + "step": 269000 + }, + { + "FLOPS loss": 0.060407768934965134, + "L0_d": 864.31, + "MLM loss": 2.3954501152038574, + "epoch": 6.23, + "step": 269499 + }, + { + "epoch": 6.23, + "learning_rate": 4.706469387755103e-05, + "loss": 2.387, + "step": 269500 + }, + { + "FLOPS loss": 0.08911103010177612, + "L0_d": 947.52, + "MLM loss": 2.317873477935791, + "epoch": 6.24, + "step": 269999 + }, + { + "epoch": 6.24, + "learning_rate": 4.696265306122449e-05, + "loss": 2.3911, + "step": 270000 + }, + { + "FLOPS loss": 0.06322460621595383, + "L0_d": 747.06, + "MLM loss": 2.447556257247925, + "epoch": 6.25, + "step": 270499 + }, + { + "epoch": 6.25, + "learning_rate": 4.686061224489796e-05, + "loss": 2.3851, + "step": 270500 + }, + { + "FLOPS loss": 0.06245068833231926, + "L0_d": 842.75, + "MLM loss": 2.0686545372009277, + "epoch": 6.26, + "step": 270999 + }, + { + "epoch": 6.26, + "learning_rate": 4.6758775510204084e-05, + "loss": 2.3861, + "step": 271000 + }, + { + "FLOPS loss": 0.06610134243965149, + "L0_d": 788.77, + "MLM loss": 2.152571201324463, + "epoch": 6.27, + "step": 271499 + }, + { + "epoch": 6.27, + "learning_rate": 4.665673469387755e-05, + "loss": 2.3904, + "step": 271500 + }, + { + "FLOPS loss": 0.052447445690631866, + "L0_d": 803.31, + "MLM loss": 2.2232165336608887, + "epoch": 6.28, + "step": 271999 + }, + { + "epoch": 6.28, + "learning_rate": 4.655469387755102e-05, + "loss": 2.3908, + "step": 272000 + }, + { + "FLOPS loss": 0.056923095136880875, + "L0_d": 617.55, + "MLM loss": 2.160806655883789, + "epoch": 6.3, + "step": 272499 + }, + { + "epoch": 6.3, + "learning_rate": 4.645265306122449e-05, + "loss": 2.3868, + "step": 272500 + }, + { + "FLOPS loss": 0.06665085256099701, + "L0_d": 807.03, + "MLM loss": 2.246150493621826, + "epoch": 6.31, + "step": 272999 + }, + { + "epoch": 6.31, + "learning_rate": 4.635081632653062e-05, + "loss": 2.3861, + "step": 273000 + }, + { + "FLOPS loss": 0.07849167287349701, + "L0_d": 1440.08, + "MLM loss": 2.3028604984283447, + "epoch": 6.32, + "step": 273499 + }, + { + "epoch": 6.32, + "learning_rate": 4.6248775510204084e-05, + "loss": 2.3854, + "step": 273500 + }, + { + "FLOPS loss": 0.055762071162462234, + "L0_d": 688.75, + "MLM loss": 2.46593976020813, + "epoch": 6.33, + "step": 273999 + }, + { + "epoch": 6.33, + "learning_rate": 4.6146734693877555e-05, + "loss": 2.3891, + "step": 274000 + }, + { + "FLOPS loss": 0.06459101289510727, + "L0_d": 755.36, + "MLM loss": 2.303806781768799, + "epoch": 6.34, + "step": 274499 + }, + { + "epoch": 6.34, + "learning_rate": 4.604469387755102e-05, + "loss": 2.3882, + "step": 274500 + }, + { + "FLOPS loss": 0.055319491773843765, + "L0_d": 676.47, + "MLM loss": 2.383361577987671, + "epoch": 6.35, + "step": 274999 + }, + { + "epoch": 6.35, + "learning_rate": 4.594265306122449e-05, + "loss": 2.3903, + "step": 275000 + }, + { + "FLOPS loss": 0.057988785207271576, + "L0_d": 867.92, + "MLM loss": 2.260382652282715, + "epoch": 6.36, + "step": 275499 + }, + { + "epoch": 6.36, + "learning_rate": 4.584081632653062e-05, + "loss": 2.3875, + "step": 275500 + }, + { + "FLOPS loss": 0.068933866918087, + "L0_d": 709.81, + "MLM loss": 2.305701494216919, + "epoch": 6.38, + "step": 275999 + }, + { + "epoch": 6.38, + "learning_rate": 4.5738775510204083e-05, + "loss": 2.3868, + "step": 276000 + }, + { + "FLOPS loss": 0.054191600531339645, + "L0_d": 662.23, + "MLM loss": 2.4663784503936768, + "epoch": 6.39, + "step": 276499 + }, + { + "epoch": 6.39, + "learning_rate": 4.5636734693877555e-05, + "loss": 2.3901, + "step": 276500 + }, + { + "FLOPS loss": 0.06299523264169693, + "L0_d": 715.62, + "MLM loss": 2.32785964012146, + "epoch": 6.4, + "step": 276999 + }, + { + "epoch": 6.4, + "learning_rate": 4.553469387755102e-05, + "loss": 2.3904, + "step": 277000 + }, + { + "FLOPS loss": 0.0625939816236496, + "L0_d": 768.23, + "MLM loss": 2.37985897064209, + "epoch": 6.41, + "step": 277499 + }, + { + "epoch": 6.41, + "learning_rate": 4.543265306122449e-05, + "loss": 2.3827, + "step": 277500 + }, + { + "FLOPS loss": 0.07297017425298691, + "L0_d": 933.28, + "MLM loss": 2.275392770767212, + "epoch": 6.42, + "step": 277999 + }, + { + "epoch": 6.42, + "learning_rate": 4.533061224489796e-05, + "loss": 2.3839, + "step": 278000 + }, + { + "FLOPS loss": 0.07424743473529816, + "L0_d": 1015.55, + "MLM loss": 2.41300106048584, + "epoch": 6.43, + "step": 278499 + }, + { + "epoch": 6.43, + "learning_rate": 4.5228571428571434e-05, + "loss": 2.3819, + "step": 278500 + }, + { + "FLOPS loss": 0.06222039461135864, + "L0_d": 778.44, + "MLM loss": 2.3484959602355957, + "epoch": 6.45, + "step": 278999 + }, + { + "epoch": 6.45, + "learning_rate": 4.5126734693877555e-05, + "loss": 2.3801, + "step": 279000 + }, + { + "FLOPS loss": 0.06579253822565079, + "L0_d": 879.73, + "MLM loss": 2.43776273727417, + "epoch": 6.46, + "step": 279499 + }, + { + "epoch": 6.46, + "learning_rate": 4.502469387755102e-05, + "loss": 2.3828, + "step": 279500 + }, + { + "FLOPS loss": 0.061226122081279755, + "L0_d": 738.83, + "MLM loss": 2.400214433670044, + "epoch": 6.47, + "step": 279999 + }, + { + "epoch": 6.47, + "learning_rate": 4.492265306122449e-05, + "loss": 2.3852, + "step": 280000 + }, + { + "FLOPS loss": 0.05953386053442955, + "L0_d": 837.47, + "MLM loss": 2.478689193725586, + "epoch": 6.48, + "step": 280499 + }, + { + "epoch": 6.48, + "learning_rate": 4.482061224489796e-05, + "loss": 2.3826, + "step": 280500 + }, + { + "FLOPS loss": 0.05504816770553589, + "L0_d": 672.55, + "MLM loss": 2.2007439136505127, + "epoch": 6.49, + "step": 280999 + }, + { + "epoch": 6.49, + "learning_rate": 4.471877551020408e-05, + "loss": 2.3818, + "step": 281000 + }, + { + "FLOPS loss": 0.05578654631972313, + "L0_d": 575.17, + "MLM loss": 2.4325294494628906, + "epoch": 6.5, + "step": 281499 + }, + { + "epoch": 6.5, + "learning_rate": 4.4616734693877555e-05, + "loss": 2.3869, + "step": 281500 + }, + { + "FLOPS loss": 0.06520844995975494, + "L0_d": 792.69, + "MLM loss": 2.347710132598877, + "epoch": 6.51, + "step": 281999 + }, + { + "epoch": 6.51, + "learning_rate": 4.451469387755102e-05, + "loss": 2.3808, + "step": 282000 + }, + { + "FLOPS loss": 0.06950800865888596, + "L0_d": 819.52, + "MLM loss": 2.4303078651428223, + "epoch": 6.53, + "step": 282499 + }, + { + "epoch": 6.53, + "learning_rate": 4.441265306122449e-05, + "loss": 2.3811, + "step": 282500 + }, + { + "FLOPS loss": 0.05945185571908951, + "L0_d": 691.12, + "MLM loss": 2.215554714202881, + "epoch": 6.54, + "step": 282999 + }, + { + "epoch": 6.54, + "learning_rate": 4.431061224489796e-05, + "loss": 2.3854, + "step": 283000 + }, + { + "FLOPS loss": 0.0663042813539505, + "L0_d": 842.08, + "MLM loss": 2.335991859436035, + "epoch": 6.55, + "step": 283499 + }, + { + "epoch": 6.55, + "learning_rate": 4.4208571428571434e-05, + "loss": 2.3834, + "step": 283500 + }, + { + "FLOPS loss": 0.05819135159254074, + "L0_d": 824.59, + "MLM loss": 2.4544339179992676, + "epoch": 6.56, + "step": 283999 + }, + { + "epoch": 6.56, + "learning_rate": 4.4106734693877554e-05, + "loss": 2.3796, + "step": 284000 + }, + { + "FLOPS loss": 0.056056853383779526, + "L0_d": 739.22, + "MLM loss": 2.5581278800964355, + "epoch": 6.57, + "step": 284499 + }, + { + "epoch": 6.57, + "learning_rate": 4.4004693877551026e-05, + "loss": 2.3826, + "step": 284500 + }, + { + "FLOPS loss": 0.07030254602432251, + "L0_d": 863.52, + "MLM loss": 2.2112503051757812, + "epoch": 6.58, + "step": 284999 + }, + { + "epoch": 6.58, + "learning_rate": 4.390265306122449e-05, + "loss": 2.3774, + "step": 285000 + }, + { + "FLOPS loss": 0.0722346305847168, + "L0_d": 921.95, + "MLM loss": 2.2126243114471436, + "epoch": 6.6, + "step": 285499 + }, + { + "epoch": 6.6, + "learning_rate": 4.380061224489796e-05, + "loss": 2.3796, + "step": 285500 + }, + { + "FLOPS loss": 0.0708499550819397, + "L0_d": 987.69, + "MLM loss": 2.3058786392211914, + "epoch": 6.61, + "step": 285999 + }, + { + "epoch": 6.61, + "learning_rate": 4.3698571428571433e-05, + "loss": 2.3796, + "step": 286000 + }, + { + "FLOPS loss": 0.0859348475933075, + "L0_d": 1135.59, + "MLM loss": 2.3978452682495117, + "epoch": 6.62, + "step": 286499 + }, + { + "epoch": 6.62, + "learning_rate": 4.3596734693877554e-05, + "loss": 2.3804, + "step": 286500 + }, + { + "FLOPS loss": 0.058340977877378464, + "L0_d": 1036.36, + "MLM loss": 2.3695602416992188, + "epoch": 6.63, + "step": 286999 + }, + { + "epoch": 6.63, + "learning_rate": 4.3494693877551026e-05, + "loss": 2.3782, + "step": 287000 + }, + { + "FLOPS loss": 0.07051306217908859, + "L0_d": 870.98, + "MLM loss": 2.3154542446136475, + "epoch": 6.64, + "step": 287499 + }, + { + "epoch": 6.64, + "learning_rate": 4.339265306122449e-05, + "loss": 2.3804, + "step": 287500 + }, + { + "FLOPS loss": 0.06338206678628922, + "L0_d": 833.42, + "MLM loss": 2.339228868484497, + "epoch": 6.65, + "step": 287999 + }, + { + "epoch": 6.65, + "learning_rate": 4.329061224489796e-05, + "loss": 2.3813, + "step": 288000 + }, + { + "FLOPS loss": 0.0613948330283165, + "L0_d": 653.11, + "MLM loss": 2.249403715133667, + "epoch": 6.66, + "step": 288499 + }, + { + "epoch": 6.66, + "learning_rate": 4.318877551020408e-05, + "loss": 2.3809, + "step": 288500 + }, + { + "FLOPS loss": 0.07161176204681396, + "L0_d": 1040.56, + "MLM loss": 2.434735059738159, + "epoch": 6.68, + "step": 288999 + }, + { + "epoch": 6.68, + "learning_rate": 4.3086734693877554e-05, + "loss": 2.3818, + "step": 289000 + }, + { + "FLOPS loss": 0.0638524666428566, + "L0_d": 653.52, + "MLM loss": 2.3446688652038574, + "epoch": 6.69, + "step": 289499 + }, + { + "epoch": 6.69, + "learning_rate": 4.2984693877551025e-05, + "loss": 2.3795, + "step": 289500 + }, + { + "FLOPS loss": 0.061977677047252655, + "L0_d": 847.8, + "MLM loss": 2.28072190284729, + "epoch": 6.7, + "step": 289999 + }, + { + "epoch": 6.7, + "learning_rate": 4.288265306122449e-05, + "loss": 2.3776, + "step": 290000 + }, + { + "FLOPS loss": 0.07041425257921219, + "L0_d": 862.5, + "MLM loss": 2.322789192199707, + "epoch": 6.71, + "step": 290499 + }, + { + "epoch": 6.71, + "learning_rate": 4.278061224489796e-05, + "loss": 2.3828, + "step": 290500 + }, + { + "FLOPS loss": 0.06982388347387314, + "L0_d": 939.08, + "MLM loss": 2.4443392753601074, + "epoch": 6.72, + "step": 290999 + }, + { + "epoch": 6.72, + "learning_rate": 4.267877551020408e-05, + "loss": 2.3784, + "step": 291000 + }, + { + "FLOPS loss": 0.07364574819803238, + "L0_d": 973.33, + "MLM loss": 2.296755313873291, + "epoch": 6.73, + "step": 291499 + }, + { + "epoch": 6.73, + "learning_rate": 4.2576734693877554e-05, + "loss": 2.3769, + "step": 291500 + }, + { + "FLOPS loss": 0.06386933475732803, + "L0_d": 815.48, + "MLM loss": 2.282402276992798, + "epoch": 6.75, + "step": 291999 + }, + { + "epoch": 6.75, + "learning_rate": 4.2474693877551025e-05, + "loss": 2.3798, + "step": 292000 + }, + { + "FLOPS loss": 0.07132063060998917, + "L0_d": 1084.69, + "MLM loss": 2.1212337017059326, + "epoch": 6.76, + "step": 292499 + }, + { + "epoch": 6.76, + "learning_rate": 4.237265306122449e-05, + "loss": 2.3731, + "step": 292500 + }, + { + "FLOPS loss": 0.06481712311506271, + "L0_d": 1047.08, + "MLM loss": 2.2411792278289795, + "epoch": 6.77, + "step": 292999 + }, + { + "epoch": 6.77, + "learning_rate": 4.227081632653062e-05, + "loss": 2.3806, + "step": 293000 + }, + { + "FLOPS loss": 0.07611627876758575, + "L0_d": 935.75, + "MLM loss": 2.4179129600524902, + "epoch": 6.78, + "step": 293499 + }, + { + "epoch": 6.78, + "learning_rate": 4.216877551020408e-05, + "loss": 2.3761, + "step": 293500 + }, + { + "FLOPS loss": 0.07053264230489731, + "L0_d": 918.11, + "MLM loss": 2.527155876159668, + "epoch": 6.79, + "step": 293999 + }, + { + "epoch": 6.79, + "learning_rate": 4.2066734693877554e-05, + "loss": 2.3755, + "step": 294000 + }, + { + "FLOPS loss": 0.05917475000023842, + "L0_d": 761.69, + "MLM loss": 2.6052803993225098, + "epoch": 6.8, + "step": 294499 + }, + { + "epoch": 6.8, + "learning_rate": 4.196469387755102e-05, + "loss": 2.3729, + "step": 294500 + }, + { + "FLOPS loss": 0.05868987366557121, + "L0_d": 839.08, + "MLM loss": 2.366290330886841, + "epoch": 6.81, + "step": 294999 + }, + { + "epoch": 6.81, + "learning_rate": 4.1862857142857146e-05, + "loss": 2.376, + "step": 295000 + }, + { + "FLOPS loss": 0.05811339616775513, + "L0_d": 758.78, + "MLM loss": 2.499417543411255, + "epoch": 6.83, + "step": 295499 + }, + { + "epoch": 6.83, + "learning_rate": 4.176081632653062e-05, + "loss": 2.3766, + "step": 295500 + }, + { + "FLOPS loss": 0.06532425433397293, + "L0_d": 995.02, + "MLM loss": 2.3556973934173584, + "epoch": 6.84, + "step": 295999 + }, + { + "epoch": 6.84, + "learning_rate": 4.165877551020408e-05, + "loss": 2.3794, + "step": 296000 + }, + { + "FLOPS loss": 0.05370713397860527, + "L0_d": 829.56, + "MLM loss": 2.430586338043213, + "epoch": 6.85, + "step": 296499 + }, + { + "epoch": 6.85, + "learning_rate": 4.155673469387755e-05, + "loss": 2.3737, + "step": 296500 + }, + { + "FLOPS loss": 0.059047866612672806, + "L0_d": 831.78, + "MLM loss": 2.066843271255493, + "epoch": 6.86, + "step": 296999 + }, + { + "epoch": 6.86, + "learning_rate": 4.145469387755102e-05, + "loss": 2.3742, + "step": 297000 + }, + { + "FLOPS loss": 0.059049107134342194, + "L0_d": 662.92, + "MLM loss": 2.134557008743286, + "epoch": 6.87, + "step": 297499 + }, + { + "epoch": 6.87, + "learning_rate": 4.1352857142857146e-05, + "loss": 2.3729, + "step": 297500 + }, + { + "FLOPS loss": 0.05946721136569977, + "L0_d": 851.62, + "MLM loss": 2.1986374855041504, + "epoch": 6.88, + "step": 297999 + }, + { + "epoch": 6.88, + "learning_rate": 4.125081632653062e-05, + "loss": 2.3735, + "step": 298000 + }, + { + "FLOPS loss": 0.0655231922864914, + "L0_d": 763.11, + "MLM loss": 2.2446515560150146, + "epoch": 6.9, + "step": 298499 + }, + { + "epoch": 6.9, + "learning_rate": 4.114877551020408e-05, + "loss": 2.3695, + "step": 298500 + }, + { + "FLOPS loss": 0.0701904147863388, + "L0_d": 867.0, + "MLM loss": 2.29551100730896, + "epoch": 6.91, + "step": 298999 + }, + { + "epoch": 6.91, + "learning_rate": 4.104673469387755e-05, + "loss": 2.3733, + "step": 299000 + }, + { + "FLOPS loss": 0.06681898236274719, + "L0_d": 761.53, + "MLM loss": 2.2125775814056396, + "epoch": 6.92, + "step": 299499 + }, + { + "epoch": 6.92, + "learning_rate": 4.0944693877551025e-05, + "loss": 2.3744, + "step": 299500 + }, + { + "FLOPS loss": 0.08913971483707428, + "L0_d": 1045.88, + "MLM loss": 2.3854105472564697, + "epoch": 6.93, + "step": 299999 + }, + { + "epoch": 6.93, + "learning_rate": 4.084265306122449e-05, + "loss": 2.3712, + "step": 300000 + }, + { + "FLOPS loss": 0.04997643828392029, + "L0_d": 551.39, + "MLM loss": 2.3603034019470215, + "epoch": 6.94, + "step": 300499 + }, + { + "epoch": 6.94, + "learning_rate": 4.074081632653061e-05, + "loss": 2.3726, + "step": 300500 + }, + { + "FLOPS loss": 0.06283554434776306, + "L0_d": 954.12, + "MLM loss": 2.101763963699341, + "epoch": 6.95, + "step": 300999 + }, + { + "epoch": 6.95, + "learning_rate": 4.063877551020409e-05, + "loss": 2.3714, + "step": 301000 + }, + { + "FLOPS loss": 0.0683046281337738, + "L0_d": 769.17, + "MLM loss": 2.2551887035369873, + "epoch": 6.96, + "step": 301499 + }, + { + "epoch": 6.96, + "learning_rate": 4.053673469387755e-05, + "loss": 2.37, + "step": 301500 + }, + { + "FLOPS loss": 0.06603013724088669, + "L0_d": 890.02, + "MLM loss": 2.2235827445983887, + "epoch": 6.98, + "step": 301999 + }, + { + "epoch": 6.98, + "learning_rate": 4.0434693877551024e-05, + "loss": 2.3691, + "step": 302000 + }, + { + "FLOPS loss": 0.06309976428747177, + "L0_d": 1070.56, + "MLM loss": 2.1997439861297607, + "epoch": 6.99, + "step": 302499 + }, + { + "epoch": 6.99, + "learning_rate": 4.033265306122449e-05, + "loss": 2.3694, + "step": 302500 + }, + { + "FLOPS loss": 0.06498320400714874, + "L0_d": 823.11, + "MLM loss": 2.5637640953063965, + "epoch": 7.0, + "step": 302999 + }, + { + "epoch": 7.0, + "learning_rate": 4.023081632653061e-05, + "loss": 2.3683, + "step": 303000 + }, + { + "FLOPS loss": 0.06452923268079758, + "L0_d": 732.8, + "MLM loss": 2.0735771656036377, + "epoch": 7.01, + "step": 303499 + }, + { + "epoch": 7.01, + "learning_rate": 4.012877551020409e-05, + "loss": 2.3687, + "step": 303500 + }, + { + "FLOPS loss": 0.06292728334665298, + "L0_d": 832.94, + "MLM loss": 2.518744945526123, + "epoch": 7.02, + "step": 303999 + }, + { + "epoch": 7.02, + "learning_rate": 4.002673469387755e-05, + "loss": 2.3689, + "step": 304000 + }, + { + "FLOPS loss": 0.0746520385146141, + "L0_d": 937.45, + "MLM loss": 2.2790918350219727, + "epoch": 7.03, + "step": 304499 + }, + { + "epoch": 7.03, + "learning_rate": 3.9924693877551024e-05, + "loss": 2.3638, + "step": 304500 + }, + { + "FLOPS loss": 0.07710359990596771, + "L0_d": 1098.89, + "MLM loss": 2.253662347793579, + "epoch": 7.05, + "step": 304999 + }, + { + "epoch": 7.05, + "learning_rate": 3.982265306122449e-05, + "loss": 2.3666, + "step": 305000 + }, + { + "FLOPS loss": 0.07163328677415848, + "L0_d": 895.14, + "MLM loss": 2.1064743995666504, + "epoch": 7.06, + "step": 305499 + }, + { + "epoch": 7.06, + "learning_rate": 3.972061224489796e-05, + "loss": 2.3634, + "step": 305500 + }, + { + "FLOPS loss": 0.05717089772224426, + "L0_d": 643.52, + "MLM loss": 2.196295976638794, + "epoch": 7.07, + "step": 305999 + }, + { + "epoch": 7.07, + "learning_rate": 3.961857142857143e-05, + "loss": 2.3612, + "step": 306000 + }, + { + "FLOPS loss": 0.07105471938848495, + "L0_d": 981.28, + "MLM loss": 2.2125749588012695, + "epoch": 7.08, + "step": 306499 + }, + { + "epoch": 7.08, + "learning_rate": 3.951673469387755e-05, + "loss": 2.3651, + "step": 306500 + }, + { + "FLOPS loss": 0.07635489851236343, + "L0_d": 1220.39, + "MLM loss": 2.259152412414551, + "epoch": 7.09, + "step": 306999 + }, + { + "epoch": 7.09, + "learning_rate": 3.9414693877551024e-05, + "loss": 2.3585, + "step": 307000 + }, + { + "FLOPS loss": 0.056164611130952835, + "L0_d": 959.52, + "MLM loss": 2.4167141914367676, + "epoch": 7.1, + "step": 307499 + }, + { + "epoch": 7.1, + "learning_rate": 3.931265306122449e-05, + "loss": 2.3656, + "step": 307500 + }, + { + "FLOPS loss": 0.05773361772298813, + "L0_d": 784.94, + "MLM loss": 2.4181439876556396, + "epoch": 7.12, + "step": 307999 + }, + { + "epoch": 7.12, + "learning_rate": 3.921061224489796e-05, + "loss": 2.3638, + "step": 308000 + }, + { + "FLOPS loss": 0.06944171339273453, + "L0_d": 838.94, + "MLM loss": 2.3983397483825684, + "epoch": 7.13, + "step": 308499 + }, + { + "epoch": 7.13, + "learning_rate": 3.910877551020408e-05, + "loss": 2.3641, + "step": 308500 + }, + { + "FLOPS loss": 0.08090784400701523, + "L0_d": 1157.02, + "MLM loss": 2.3439295291900635, + "epoch": 7.14, + "step": 308999 + }, + { + "epoch": 7.14, + "learning_rate": 3.900673469387755e-05, + "loss": 2.3621, + "step": 309000 + }, + { + "FLOPS loss": 0.06606020033359528, + "L0_d": 867.73, + "MLM loss": 2.192032814025879, + "epoch": 7.15, + "step": 309499 + }, + { + "epoch": 7.15, + "learning_rate": 3.8904693877551024e-05, + "loss": 2.364, + "step": 309500 + }, + { + "FLOPS loss": 0.06273338198661804, + "L0_d": 827.27, + "MLM loss": 2.3575186729431152, + "epoch": 7.16, + "step": 309999 + }, + { + "epoch": 7.16, + "learning_rate": 3.8802653061224495e-05, + "loss": 2.3614, + "step": 310000 + }, + { + "FLOPS loss": 0.06386949867010117, + "L0_d": 798.56, + "MLM loss": 2.291250705718994, + "epoch": 7.17, + "step": 310499 + }, + { + "epoch": 7.17, + "learning_rate": 3.8700816326530616e-05, + "loss": 2.3663, + "step": 310500 + }, + { + "FLOPS loss": 0.0672420784831047, + "L0_d": 785.33, + "MLM loss": 2.1985931396484375, + "epoch": 7.18, + "step": 310999 + }, + { + "epoch": 7.18, + "learning_rate": 3.859877551020408e-05, + "loss": 2.3613, + "step": 311000 + }, + { + "FLOPS loss": 0.05241331830620766, + "L0_d": 664.06, + "MLM loss": 2.3520312309265137, + "epoch": 7.2, + "step": 311499 + }, + { + "epoch": 7.2, + "learning_rate": 3.849673469387755e-05, + "loss": 2.3652, + "step": 311500 + }, + { + "FLOPS loss": 0.07527154684066772, + "L0_d": 1110.08, + "MLM loss": 2.1942214965820312, + "epoch": 7.21, + "step": 311999 + }, + { + "epoch": 7.21, + "learning_rate": 3.8394693877551024e-05, + "loss": 2.367, + "step": 312000 + }, + { + "FLOPS loss": 0.07647871971130371, + "L0_d": 1271.92, + "MLM loss": 2.137394428253174, + "epoch": 7.22, + "step": 312499 + }, + { + "epoch": 7.22, + "learning_rate": 3.8292857142857144e-05, + "loss": 2.3624, + "step": 312500 + }, + { + "FLOPS loss": 0.06865822523832321, + "L0_d": 896.69, + "MLM loss": 2.2419652938842773, + "epoch": 7.23, + "step": 312999 + }, + { + "epoch": 7.23, + "learning_rate": 3.8190816326530616e-05, + "loss": 2.3603, + "step": 313000 + }, + { + "FLOPS loss": 0.061193566769361496, + "L0_d": 932.89, + "MLM loss": 2.2621870040893555, + "epoch": 7.24, + "step": 313499 + }, + { + "epoch": 7.24, + "learning_rate": 3.808877551020408e-05, + "loss": 2.363, + "step": 313500 + }, + { + "FLOPS loss": 0.06347547471523285, + "L0_d": 807.72, + "MLM loss": 2.4694080352783203, + "epoch": 7.25, + "step": 313999 + }, + { + "epoch": 7.25, + "learning_rate": 3.798673469387755e-05, + "loss": 2.3612, + "step": 314000 + }, + { + "FLOPS loss": 0.059469565749168396, + "L0_d": 726.23, + "MLM loss": 2.3274638652801514, + "epoch": 7.27, + "step": 314499 + }, + { + "epoch": 7.27, + "learning_rate": 3.788489795918367e-05, + "loss": 2.3589, + "step": 314500 + }, + { + "FLOPS loss": 0.056309543550014496, + "L0_d": 703.62, + "MLM loss": 2.3300890922546387, + "epoch": 7.28, + "step": 314999 + }, + { + "epoch": 7.28, + "learning_rate": 3.7782857142857144e-05, + "loss": 2.3569, + "step": 315000 + }, + { + "FLOPS loss": 0.06213608756661415, + "L0_d": 892.17, + "MLM loss": 2.4156923294067383, + "epoch": 7.29, + "step": 315499 + }, + { + "epoch": 7.29, + "learning_rate": 3.7680816326530616e-05, + "loss": 2.3593, + "step": 315500 + }, + { + "FLOPS loss": 0.05070233717560768, + "L0_d": 730.11, + "MLM loss": 2.285841703414917, + "epoch": 7.3, + "step": 315999 + }, + { + "epoch": 7.3, + "learning_rate": 3.757877551020409e-05, + "loss": 2.3574, + "step": 316000 + }, + { + "FLOPS loss": 0.05257587879896164, + "L0_d": 803.59, + "MLM loss": 2.432795524597168, + "epoch": 7.31, + "step": 316499 + }, + { + "epoch": 7.31, + "learning_rate": 3.747693877551021e-05, + "loss": 2.3581, + "step": 316500 + }, + { + "FLOPS loss": 0.08210153877735138, + "L0_d": 1165.5, + "MLM loss": 2.3357038497924805, + "epoch": 7.32, + "step": 316999 + }, + { + "epoch": 7.32, + "learning_rate": 3.737489795918367e-05, + "loss": 2.362, + "step": 317000 + }, + { + "FLOPS loss": 0.07381358742713928, + "L0_d": 1099.89, + "MLM loss": 2.410074234008789, + "epoch": 7.33, + "step": 317499 + }, + { + "epoch": 7.33, + "learning_rate": 3.7272857142857144e-05, + "loss": 2.3552, + "step": 317500 + }, + { + "FLOPS loss": 0.06650281697511673, + "L0_d": 752.72, + "MLM loss": 2.2369513511657715, + "epoch": 7.35, + "step": 317999 + }, + { + "epoch": 7.35, + "learning_rate": 3.7170816326530615e-05, + "loss": 2.3599, + "step": 318000 + }, + { + "FLOPS loss": 0.05438707023859024, + "L0_d": 769.48, + "MLM loss": 2.160310745239258, + "epoch": 7.36, + "step": 318499 + }, + { + "epoch": 7.36, + "learning_rate": 3.706877551020409e-05, + "loss": 2.36, + "step": 318500 + }, + { + "FLOPS loss": 0.06458336114883423, + "L0_d": 743.03, + "MLM loss": 2.202986240386963, + "epoch": 7.37, + "step": 318999 + }, + { + "epoch": 7.37, + "learning_rate": 3.696693877551021e-05, + "loss": 2.3602, + "step": 319000 + }, + { + "FLOPS loss": 0.08015909790992737, + "L0_d": 981.23, + "MLM loss": 2.1715171337127686, + "epoch": 7.38, + "step": 319499 + }, + { + "epoch": 7.38, + "learning_rate": 3.686489795918367e-05, + "loss": 2.3598, + "step": 319500 + }, + { + "FLOPS loss": 0.0799102857708931, + "L0_d": 1328.16, + "MLM loss": 2.2394657135009766, + "epoch": 7.39, + "step": 319999 + }, + { + "epoch": 7.39, + "learning_rate": 3.6762857142857144e-05, + "loss": 2.3564, + "step": 320000 + }, + { + "FLOPS loss": 0.05967347323894501, + "L0_d": 700.19, + "MLM loss": 2.1987807750701904, + "epoch": 7.4, + "step": 320499 + }, + { + "epoch": 7.4, + "learning_rate": 3.666081632653061e-05, + "loss": 2.3597, + "step": 320500 + }, + { + "FLOPS loss": 0.060638427734375, + "L0_d": 749.98, + "MLM loss": 2.3340020179748535, + "epoch": 7.42, + "step": 320999 + }, + { + "epoch": 7.42, + "learning_rate": 3.655877551020409e-05, + "loss": 2.3565, + "step": 321000 + }, + { + "FLOPS loss": 0.06832794100046158, + "L0_d": 881.05, + "MLM loss": 2.452484369277954, + "epoch": 7.43, + "step": 321499 + }, + { + "epoch": 7.43, + "learning_rate": 3.645693877551021e-05, + "loss": 2.3592, + "step": 321500 + }, + { + "FLOPS loss": 0.06505056470632553, + "L0_d": 859.77, + "MLM loss": 2.3983802795410156, + "epoch": 7.44, + "step": 321999 + }, + { + "epoch": 7.44, + "learning_rate": 3.635489795918368e-05, + "loss": 2.3595, + "step": 322000 + }, + { + "FLOPS loss": 0.06231040507555008, + "L0_d": 796.2, + "MLM loss": 2.3952202796936035, + "epoch": 7.45, + "step": 322499 + }, + { + "epoch": 7.45, + "learning_rate": 3.6252857142857144e-05, + "loss": 2.3584, + "step": 322500 + }, + { + "FLOPS loss": 0.05062004178762436, + "L0_d": 647.19, + "MLM loss": 2.355503559112549, + "epoch": 7.46, + "step": 322999 + }, + { + "epoch": 7.46, + "learning_rate": 3.6150816326530615e-05, + "loss": 2.3588, + "step": 323000 + }, + { + "FLOPS loss": 0.05564975365996361, + "L0_d": 723.34, + "MLM loss": 2.186389207839966, + "epoch": 7.47, + "step": 323499 + }, + { + "epoch": 7.47, + "learning_rate": 3.6048775510204086e-05, + "loss": 2.35, + "step": 323500 + }, + { + "FLOPS loss": 0.06510195136070251, + "L0_d": 705.69, + "MLM loss": 2.2318332195281982, + "epoch": 7.48, + "step": 323999 + }, + { + "epoch": 7.48, + "learning_rate": 3.594693877551021e-05, + "loss": 2.3493, + "step": 324000 + }, + { + "FLOPS loss": 0.07114795595407486, + "L0_d": 1248.36, + "MLM loss": 2.2258198261260986, + "epoch": 7.5, + "step": 324499 + }, + { + "epoch": 7.5, + "learning_rate": 3.584489795918368e-05, + "loss": 2.3566, + "step": 324500 + }, + { + "FLOPS loss": 0.06501613557338715, + "L0_d": 806.12, + "MLM loss": 2.154510021209717, + "epoch": 7.51, + "step": 324999 + }, + { + "epoch": 7.51, + "learning_rate": 3.574285714285714e-05, + "loss": 2.3544, + "step": 325000 + }, + { + "FLOPS loss": 0.06274013221263885, + "L0_d": 906.39, + "MLM loss": 2.1443748474121094, + "epoch": 7.52, + "step": 325499 + }, + { + "epoch": 7.52, + "learning_rate": 3.5640816326530615e-05, + "loss": 2.3583, + "step": 325500 + }, + { + "FLOPS loss": 0.06960269808769226, + "L0_d": 1022.05, + "MLM loss": 2.3154430389404297, + "epoch": 7.53, + "step": 325999 + }, + { + "epoch": 7.53, + "learning_rate": 3.5538979591836735e-05, + "loss": 2.3535, + "step": 326000 + }, + { + "FLOPS loss": 0.05796834081411362, + "L0_d": 738.72, + "MLM loss": 2.11289119720459, + "epoch": 7.54, + "step": 326499 + }, + { + "epoch": 7.54, + "learning_rate": 3.54369387755102e-05, + "loss": 2.3555, + "step": 326500 + }, + { + "FLOPS loss": 0.08279315382242203, + "L0_d": 936.34, + "MLM loss": 2.1696484088897705, + "epoch": 7.55, + "step": 326999 + }, + { + "epoch": 7.55, + "learning_rate": 3.533489795918368e-05, + "loss": 2.3557, + "step": 327000 + }, + { + "FLOPS loss": 0.07448771595954895, + "L0_d": 977.88, + "MLM loss": 2.3000853061676025, + "epoch": 7.57, + "step": 327499 + }, + { + "epoch": 7.57, + "learning_rate": 3.523285714285714e-05, + "loss": 2.3518, + "step": 327500 + }, + { + "FLOPS loss": 0.06067269667983055, + "L0_d": 716.92, + "MLM loss": 2.23938250541687, + "epoch": 7.58, + "step": 327999 + }, + { + "epoch": 7.58, + "learning_rate": 3.5130816326530615e-05, + "loss": 2.3565, + "step": 328000 + }, + { + "FLOPS loss": 0.056932076811790466, + "L0_d": 786.47, + "MLM loss": 2.4538161754608154, + "epoch": 7.59, + "step": 328499 + }, + { + "epoch": 7.59, + "learning_rate": 3.5028979591836735e-05, + "loss": 2.3513, + "step": 328500 + }, + { + "FLOPS loss": 0.0752127468585968, + "L0_d": 935.5, + "MLM loss": 2.1751489639282227, + "epoch": 7.6, + "step": 328999 + }, + { + "epoch": 7.6, + "learning_rate": 3.492693877551021e-05, + "loss": 2.3496, + "step": 329000 + }, + { + "FLOPS loss": 0.055434972047805786, + "L0_d": 711.95, + "MLM loss": 2.378598690032959, + "epoch": 7.61, + "step": 329499 + }, + { + "epoch": 7.61, + "learning_rate": 3.482489795918368e-05, + "loss": 2.3567, + "step": 329500 + }, + { + "FLOPS loss": 0.07803203910589218, + "L0_d": 961.14, + "MLM loss": 2.332076072692871, + "epoch": 7.62, + "step": 329999 + }, + { + "epoch": 7.62, + "learning_rate": 3.472285714285714e-05, + "loss": 2.3558, + "step": 330000 + }, + { + "FLOPS loss": 0.0613698773086071, + "L0_d": 854.08, + "MLM loss": 2.2925326824188232, + "epoch": 7.63, + "step": 330499 + }, + { + "epoch": 7.63, + "learning_rate": 3.4620816326530614e-05, + "loss": 2.3536, + "step": 330500 + }, + { + "FLOPS loss": 0.0583004504442215, + "L0_d": 613.38, + "MLM loss": 2.266010284423828, + "epoch": 7.65, + "step": 330999 + }, + { + "epoch": 7.65, + "learning_rate": 3.4518979591836735e-05, + "loss": 2.3546, + "step": 331000 + }, + { + "FLOPS loss": 0.06375960260629654, + "L0_d": 875.72, + "MLM loss": 2.4512927532196045, + "epoch": 7.66, + "step": 331499 + }, + { + "epoch": 7.66, + "learning_rate": 3.4416938775510207e-05, + "loss": 2.3504, + "step": 331500 + }, + { + "FLOPS loss": 0.06886997818946838, + "L0_d": 880.61, + "MLM loss": 2.2343506813049316, + "epoch": 7.67, + "step": 331999 + }, + { + "epoch": 7.67, + "learning_rate": 3.431489795918367e-05, + "loss": 2.3526, + "step": 332000 + }, + { + "FLOPS loss": 0.06465931236743927, + "L0_d": 1193.62, + "MLM loss": 2.2527096271514893, + "epoch": 7.68, + "step": 332499 + }, + { + "epoch": 7.68, + "learning_rate": 3.421285714285715e-05, + "loss": 2.3474, + "step": 332500 + }, + { + "FLOPS loss": 0.05739612132310867, + "L0_d": 848.39, + "MLM loss": 2.299609422683716, + "epoch": 7.69, + "step": 332999 + }, + { + "epoch": 7.69, + "learning_rate": 3.411102040816327e-05, + "loss": 2.3476, + "step": 333000 + }, + { + "FLOPS loss": 0.05512355640530586, + "L0_d": 688.33, + "MLM loss": 2.35714054107666, + "epoch": 7.7, + "step": 333499 + }, + { + "epoch": 7.7, + "learning_rate": 3.4008979591836735e-05, + "loss": 2.3566, + "step": 333500 + }, + { + "FLOPS loss": 0.07219377160072327, + "L0_d": 1120.45, + "MLM loss": 2.3413336277008057, + "epoch": 7.72, + "step": 333999 + }, + { + "epoch": 7.72, + "learning_rate": 3.3906938775510206e-05, + "loss": 2.3516, + "step": 334000 + }, + { + "FLOPS loss": 0.059764862060546875, + "L0_d": 846.83, + "MLM loss": 2.2598955631256104, + "epoch": 7.73, + "step": 334499 + }, + { + "epoch": 7.73, + "learning_rate": 3.380489795918367e-05, + "loss": 2.3526, + "step": 334500 + }, + { + "FLOPS loss": 0.057846374809741974, + "L0_d": 834.73, + "MLM loss": 2.469571113586426, + "epoch": 7.74, + "step": 334999 + }, + { + "epoch": 7.74, + "learning_rate": 3.37030612244898e-05, + "loss": 2.3504, + "step": 335000 + }, + { + "FLOPS loss": 0.06122131273150444, + "L0_d": 781.16, + "MLM loss": 2.2299857139587402, + "epoch": 7.75, + "step": 335499 + }, + { + "epoch": 7.75, + "learning_rate": 3.360102040816327e-05, + "loss": 2.3489, + "step": 335500 + }, + { + "FLOPS loss": 0.057046886533498764, + "L0_d": 740.44, + "MLM loss": 2.0816891193389893, + "epoch": 7.76, + "step": 335999 + }, + { + "epoch": 7.76, + "learning_rate": 3.3498979591836735e-05, + "loss": 2.3499, + "step": 336000 + }, + { + "FLOPS loss": 0.06181034818291664, + "L0_d": 745.03, + "MLM loss": 2.248274087905884, + "epoch": 7.77, + "step": 336499 + }, + { + "epoch": 7.77, + "learning_rate": 3.3396938775510206e-05, + "loss": 2.3468, + "step": 336500 + }, + { + "FLOPS loss": 0.07484866678714752, + "L0_d": 1056.61, + "MLM loss": 2.207113027572632, + "epoch": 7.79, + "step": 336999 + }, + { + "epoch": 7.79, + "learning_rate": 3.329489795918367e-05, + "loss": 2.3452, + "step": 337000 + }, + { + "FLOPS loss": 0.06287125498056412, + "L0_d": 827.58, + "MLM loss": 2.3059585094451904, + "epoch": 7.8, + "step": 337499 + }, + { + "epoch": 7.8, + "learning_rate": 3.31930612244898e-05, + "loss": 2.3487, + "step": 337500 + }, + { + "FLOPS loss": 0.06856507062911987, + "L0_d": 1210.05, + "MLM loss": 2.2399425506591797, + "epoch": 7.81, + "step": 337999 + }, + { + "epoch": 7.81, + "learning_rate": 3.309102040816326e-05, + "loss": 2.3499, + "step": 338000 + }, + { + "FLOPS loss": 0.06494159996509552, + "L0_d": 690.55, + "MLM loss": 2.3285300731658936, + "epoch": 7.82, + "step": 338499 + }, + { + "epoch": 7.82, + "learning_rate": 3.298897959183674e-05, + "loss": 2.349, + "step": 338500 + }, + { + "FLOPS loss": 0.06229567155241966, + "L0_d": 870.64, + "MLM loss": 2.2471065521240234, + "epoch": 7.83, + "step": 338999 + }, + { + "epoch": 7.83, + "learning_rate": 3.2886938775510206e-05, + "loss": 2.3463, + "step": 339000 + }, + { + "FLOPS loss": 0.07021673768758774, + "L0_d": 757.09, + "MLM loss": 2.137368679046631, + "epoch": 7.84, + "step": 339499 + }, + { + "epoch": 7.84, + "learning_rate": 3.278510204081633e-05, + "loss": 2.3485, + "step": 339500 + }, + { + "FLOPS loss": 0.058832865208387375, + "L0_d": 844.09, + "MLM loss": 2.342620849609375, + "epoch": 7.85, + "step": 339999 + }, + { + "epoch": 7.85, + "learning_rate": 3.26830612244898e-05, + "loss": 2.3447, + "step": 340000 + }, + { + "FLOPS loss": 0.07662883400917053, + "L0_d": 809.62, + "MLM loss": 2.345280408859253, + "epoch": 7.87, + "step": 340499 + }, + { + "epoch": 7.87, + "learning_rate": 3.258102040816326e-05, + "loss": 2.3478, + "step": 340500 + }, + { + "FLOPS loss": 0.06886345148086548, + "L0_d": 936.47, + "MLM loss": 2.2469725608825684, + "epoch": 7.88, + "step": 340999 + }, + { + "epoch": 7.88, + "learning_rate": 3.247918367346939e-05, + "loss": 2.3451, + "step": 341000 + }, + { + "FLOPS loss": 0.07367608696222305, + "L0_d": 989.2, + "MLM loss": 2.325202465057373, + "epoch": 7.89, + "step": 341499 + }, + { + "epoch": 7.89, + "learning_rate": 3.237714285714286e-05, + "loss": 2.3468, + "step": 341500 + }, + { + "FLOPS loss": 0.0521760955452919, + "L0_d": 719.47, + "MLM loss": 2.2234766483306885, + "epoch": 7.9, + "step": 341999 + }, + { + "epoch": 7.9, + "learning_rate": 3.2275102040816326e-05, + "loss": 2.3465, + "step": 342000 + }, + { + "FLOPS loss": 0.04794573411345482, + "L0_d": 775.98, + "MLM loss": 2.3546948432922363, + "epoch": 7.91, + "step": 342499 + }, + { + "epoch": 7.91, + "learning_rate": 3.21730612244898e-05, + "loss": 2.3463, + "step": 342500 + }, + { + "FLOPS loss": 0.07742329686880112, + "L0_d": 1222.39, + "MLM loss": 2.299515962600708, + "epoch": 7.92, + "step": 342999 + }, + { + "epoch": 7.92, + "learning_rate": 3.207102040816326e-05, + "loss": 2.3515, + "step": 343000 + }, + { + "FLOPS loss": 0.06620312482118607, + "L0_d": 818.12, + "MLM loss": 2.2101948261260986, + "epoch": 7.94, + "step": 343499 + }, + { + "epoch": 7.94, + "learning_rate": 3.1968979591836734e-05, + "loss": 2.3442, + "step": 343500 + }, + { + "FLOPS loss": 0.06726676225662231, + "L0_d": 865.52, + "MLM loss": 2.2596335411071777, + "epoch": 7.95, + "step": 343999 + }, + { + "epoch": 7.95, + "learning_rate": 3.1866938775510206e-05, + "loss": 2.343, + "step": 344000 + }, + { + "FLOPS loss": 0.07519533485174179, + "L0_d": 1003.86, + "MLM loss": 2.146808385848999, + "epoch": 7.96, + "step": 344499 + }, + { + "epoch": 7.96, + "learning_rate": 3.176489795918368e-05, + "loss": 2.3454, + "step": 344500 + }, + { + "FLOPS loss": 0.05884729325771332, + "L0_d": 785.52, + "MLM loss": 2.313364267349243, + "epoch": 7.97, + "step": 344999 + }, + { + "epoch": 7.97, + "learning_rate": 3.16630612244898e-05, + "loss": 2.3491, + "step": 345000 + }, + { + "FLOPS loss": 0.06093428656458855, + "L0_d": 809.14, + "MLM loss": 2.34356689453125, + "epoch": 7.98, + "step": 345499 + }, + { + "epoch": 7.98, + "learning_rate": 3.156102040816327e-05, + "loss": 2.3459, + "step": 345500 + }, + { + "FLOPS loss": 0.06816817075014114, + "L0_d": 1047.36, + "MLM loss": 2.244194507598877, + "epoch": 7.99, + "step": 345999 + }, + { + "epoch": 7.99, + "learning_rate": 3.1458979591836734e-05, + "loss": 2.3464, + "step": 346000 + }, + { + "FLOPS loss": 0.05808025971055031, + "L0_d": 676.52, + "MLM loss": 2.2438483238220215, + "epoch": 8.0, + "step": 346499 + }, + { + "epoch": 8.0, + "learning_rate": 3.1356938775510205e-05, + "loss": 2.345, + "step": 346500 + }, + { + "FLOPS loss": 0.06837411969900131, + "L0_d": 1041.8, + "MLM loss": 2.158121347427368, + "epoch": 8.02, + "step": 346999 + }, + { + "epoch": 8.02, + "learning_rate": 3.125489795918368e-05, + "loss": 2.3381, + "step": 347000 + }, + { + "FLOPS loss": 0.0699104368686676, + "L0_d": 1003.94, + "MLM loss": 2.311729907989502, + "epoch": 8.03, + "step": 347499 + }, + { + "epoch": 8.03, + "learning_rate": 3.11530612244898e-05, + "loss": 2.341, + "step": 347500 + }, + { + "FLOPS loss": 0.06548525393009186, + "L0_d": 820.19, + "MLM loss": 2.227398633956909, + "epoch": 8.04, + "step": 347999 + }, + { + "epoch": 8.04, + "learning_rate": 3.105102040816327e-05, + "loss": 2.344, + "step": 348000 + }, + { + "FLOPS loss": 0.054751232266426086, + "L0_d": 688.12, + "MLM loss": 1.976632833480835, + "epoch": 8.05, + "step": 348499 + }, + { + "epoch": 8.05, + "learning_rate": 3.0948979591836734e-05, + "loss": 2.338, + "step": 348500 + }, + { + "FLOPS loss": 0.0631544440984726, + "L0_d": 868.38, + "MLM loss": 2.347715377807617, + "epoch": 8.06, + "step": 348999 + }, + { + "epoch": 8.06, + "learning_rate": 3.0846938775510205e-05, + "loss": 2.3413, + "step": 349000 + }, + { + "FLOPS loss": 0.065920390188694, + "L0_d": 734.41, + "MLM loss": 2.276221990585327, + "epoch": 8.07, + "step": 349499 + }, + { + "epoch": 8.07, + "learning_rate": 3.074489795918368e-05, + "loss": 2.3381, + "step": 349500 + }, + { + "FLOPS loss": 0.06035129353404045, + "L0_d": 884.69, + "MLM loss": 2.314701557159424, + "epoch": 8.09, + "step": 349999 + }, + { + "epoch": 8.09, + "learning_rate": 3.06430612244898e-05, + "loss": 2.3396, + "step": 350000 + }, + { + "FLOPS loss": 0.06962145864963531, + "L0_d": 1142.48, + "MLM loss": 2.310885190963745, + "epoch": 8.1, + "step": 350499 + }, + { + "epoch": 8.1, + "learning_rate": 3.054102040816327e-05, + "loss": 2.3369, + "step": 350500 + }, + { + "FLOPS loss": 0.06199546530842781, + "L0_d": 866.83, + "MLM loss": 2.0542640686035156, + "epoch": 8.11, + "step": 350999 + }, + { + "epoch": 8.11, + "learning_rate": 3.0438979591836737e-05, + "loss": 2.3395, + "step": 351000 + }, + { + "FLOPS loss": 0.0696890726685524, + "L0_d": 1098.45, + "MLM loss": 2.324911117553711, + "epoch": 8.12, + "step": 351499 + }, + { + "epoch": 8.12, + "learning_rate": 3.0336938775510205e-05, + "loss": 2.344, + "step": 351500 + }, + { + "FLOPS loss": 0.0821695551276207, + "L0_d": 939.31, + "MLM loss": 2.1598875522613525, + "epoch": 8.13, + "step": 351999 + }, + { + "epoch": 8.13, + "learning_rate": 3.0235102040816326e-05, + "loss": 2.337, + "step": 352000 + }, + { + "FLOPS loss": 0.07730408757925034, + "L0_d": 957.91, + "MLM loss": 2.1525473594665527, + "epoch": 8.14, + "step": 352499 + }, + { + "epoch": 8.14, + "learning_rate": 3.0133061224489794e-05, + "loss": 2.3394, + "step": 352500 + }, + { + "FLOPS loss": 0.066657654941082, + "L0_d": 874.75, + "MLM loss": 2.327643871307373, + "epoch": 8.15, + "step": 352999 + }, + { + "epoch": 8.15, + "learning_rate": 3.003102040816327e-05, + "loss": 2.331, + "step": 353000 + }, + { + "FLOPS loss": 0.05348766967654228, + "L0_d": 813.14, + "MLM loss": 2.308769702911377, + "epoch": 8.17, + "step": 353499 + }, + { + "epoch": 8.17, + "learning_rate": 2.9928979591836737e-05, + "loss": 2.3411, + "step": 353500 + }, + { + "FLOPS loss": 0.06494653224945068, + "L0_d": 882.59, + "MLM loss": 2.115314245223999, + "epoch": 8.18, + "step": 353999 + }, + { + "epoch": 8.18, + "learning_rate": 2.9826938775510205e-05, + "loss": 2.3423, + "step": 354000 + }, + { + "FLOPS loss": 0.0652025118470192, + "L0_d": 910.12, + "MLM loss": 2.3116602897644043, + "epoch": 8.19, + "step": 354499 + }, + { + "epoch": 8.19, + "learning_rate": 2.972510204081633e-05, + "loss": 2.3361, + "step": 354500 + }, + { + "FLOPS loss": 0.05392623692750931, + "L0_d": 715.48, + "MLM loss": 2.3743042945861816, + "epoch": 8.2, + "step": 354999 + }, + { + "epoch": 8.2, + "learning_rate": 2.9623061224489797e-05, + "loss": 2.3366, + "step": 355000 + }, + { + "FLOPS loss": 0.061027780175209045, + "L0_d": 796.05, + "MLM loss": 2.131939172744751, + "epoch": 8.21, + "step": 355499 + }, + { + "epoch": 8.21, + "learning_rate": 2.9521224489795918e-05, + "loss": 2.3333, + "step": 355500 + }, + { + "FLOPS loss": 0.05769571289420128, + "L0_d": 797.92, + "MLM loss": 2.1019766330718994, + "epoch": 8.22, + "step": 355999 + }, + { + "epoch": 8.22, + "learning_rate": 2.9419183673469392e-05, + "loss": 2.3366, + "step": 356000 + }, + { + "FLOPS loss": 0.0655391588807106, + "L0_d": 1163.17, + "MLM loss": 2.2659382820129395, + "epoch": 8.24, + "step": 356499 + }, + { + "epoch": 8.24, + "learning_rate": 2.931714285714286e-05, + "loss": 2.331, + "step": 356500 + }, + { + "FLOPS loss": 0.06379566341638565, + "L0_d": 839.8, + "MLM loss": 2.1597399711608887, + "epoch": 8.25, + "step": 356999 + }, + { + "epoch": 8.25, + "learning_rate": 2.921510204081633e-05, + "loss": 2.335, + "step": 357000 + }, + { + "FLOPS loss": 0.06905535608530045, + "L0_d": 840.22, + "MLM loss": 2.254056215286255, + "epoch": 8.26, + "step": 357499 + }, + { + "epoch": 8.26, + "learning_rate": 2.9113061224489797e-05, + "loss": 2.3367, + "step": 357500 + }, + { + "FLOPS loss": 0.07997802644968033, + "L0_d": 798.45, + "MLM loss": 2.198784828186035, + "epoch": 8.27, + "step": 357999 + }, + { + "epoch": 8.27, + "learning_rate": 2.9011020408163265e-05, + "loss": 2.3338, + "step": 358000 + }, + { + "FLOPS loss": 0.05796641856431961, + "L0_d": 737.81, + "MLM loss": 2.426778793334961, + "epoch": 8.28, + "step": 358499 + }, + { + "epoch": 8.28, + "learning_rate": 2.8908979591836736e-05, + "loss": 2.3367, + "step": 358500 + }, + { + "FLOPS loss": 0.06763458997011185, + "L0_d": 1093.58, + "MLM loss": 2.2339916229248047, + "epoch": 8.29, + "step": 358999 + }, + { + "epoch": 8.29, + "learning_rate": 2.8806938775510204e-05, + "loss": 2.3366, + "step": 359000 + }, + { + "FLOPS loss": 0.07436629384756088, + "L0_d": 956.61, + "MLM loss": 2.32150936126709, + "epoch": 8.3, + "step": 359499 + }, + { + "epoch": 8.3, + "learning_rate": 2.870510204081633e-05, + "loss": 2.3345, + "step": 359500 + }, + { + "FLOPS loss": 0.06429058313369751, + "L0_d": 748.19, + "MLM loss": 2.3280255794525146, + "epoch": 8.32, + "step": 359999 + }, + { + "epoch": 8.32, + "learning_rate": 2.8603061224489797e-05, + "loss": 2.3306, + "step": 360000 + }, + { + "FLOPS loss": 0.05643421411514282, + "L0_d": 732.22, + "MLM loss": 2.1442980766296387, + "epoch": 8.33, + "step": 360499 + }, + { + "epoch": 8.33, + "learning_rate": 2.8501020408163265e-05, + "loss": 2.3343, + "step": 360500 + }, + { + "FLOPS loss": 0.05485772714018822, + "L0_d": 830.95, + "MLM loss": 2.435837507247925, + "epoch": 8.34, + "step": 360999 + }, + { + "epoch": 8.34, + "learning_rate": 2.839897959183674e-05, + "loss": 2.3321, + "step": 361000 + }, + { + "FLOPS loss": 0.05842866748571396, + "L0_d": 631.92, + "MLM loss": 2.117673397064209, + "epoch": 8.35, + "step": 361499 + }, + { + "epoch": 8.35, + "learning_rate": 2.829714285714286e-05, + "loss": 2.3324, + "step": 361500 + }, + { + "FLOPS loss": 0.05581725761294365, + "L0_d": 723.44, + "MLM loss": 2.2919936180114746, + "epoch": 8.36, + "step": 361999 + }, + { + "epoch": 8.36, + "learning_rate": 2.8195102040816328e-05, + "loss": 2.3353, + "step": 362000 + }, + { + "FLOPS loss": 0.06750074028968811, + "L0_d": 1013.06, + "MLM loss": 2.2367117404937744, + "epoch": 8.37, + "step": 362499 + }, + { + "epoch": 8.37, + "learning_rate": 2.8093061224489796e-05, + "loss": 2.3355, + "step": 362500 + }, + { + "FLOPS loss": 0.06170298904180527, + "L0_d": 883.92, + "MLM loss": 2.2028086185455322, + "epoch": 8.39, + "step": 362999 + }, + { + "epoch": 8.39, + "learning_rate": 2.7991020408163264e-05, + "loss": 2.3373, + "step": 363000 + }, + { + "FLOPS loss": 0.07157032936811447, + "L0_d": 1057.62, + "MLM loss": 2.5820722579956055, + "epoch": 8.4, + "step": 363499 + }, + { + "epoch": 8.4, + "learning_rate": 2.788918367346939e-05, + "loss": 2.3328, + "step": 363500 + }, + { + "FLOPS loss": 0.06726321578025818, + "L0_d": 904.16, + "MLM loss": 2.196951389312744, + "epoch": 8.41, + "step": 363999 + }, + { + "epoch": 8.41, + "learning_rate": 2.7787142857142857e-05, + "loss": 2.3322, + "step": 364000 + }, + { + "FLOPS loss": 0.06428459286689758, + "L0_d": 955.66, + "MLM loss": 2.373371124267578, + "epoch": 8.42, + "step": 364499 + }, + { + "epoch": 8.42, + "learning_rate": 2.7685102040816328e-05, + "loss": 2.3342, + "step": 364500 + }, + { + "FLOPS loss": 0.07280469685792923, + "L0_d": 1063.44, + "MLM loss": 2.370555877685547, + "epoch": 8.43, + "step": 364999 + }, + { + "epoch": 8.43, + "learning_rate": 2.75830612244898e-05, + "loss": 2.3317, + "step": 365000 + }, + { + "FLOPS loss": 0.08263550698757172, + "L0_d": 976.44, + "MLM loss": 2.2560606002807617, + "epoch": 8.44, + "step": 365499 + }, + { + "epoch": 8.44, + "learning_rate": 2.7481020408163268e-05, + "loss": 2.3333, + "step": 365500 + }, + { + "FLOPS loss": 0.06574293226003647, + "L0_d": 815.53, + "MLM loss": 2.3375725746154785, + "epoch": 8.45, + "step": 365999 + }, + { + "epoch": 8.45, + "learning_rate": 2.7379183673469388e-05, + "loss": 2.3365, + "step": 366000 + }, + { + "FLOPS loss": 0.06853322684764862, + "L0_d": 1019.62, + "MLM loss": 2.271479606628418, + "epoch": 8.47, + "step": 366499 + }, + { + "epoch": 8.47, + "learning_rate": 2.7277142857142856e-05, + "loss": 2.3327, + "step": 366500 + }, + { + "FLOPS loss": 0.05954744666814804, + "L0_d": 780.14, + "MLM loss": 2.1975862979888916, + "epoch": 8.48, + "step": 366999 + }, + { + "epoch": 8.48, + "learning_rate": 2.717510204081633e-05, + "loss": 2.3286, + "step": 367000 + }, + { + "FLOPS loss": 0.059766821563243866, + "L0_d": 682.48, + "MLM loss": 2.3481855392456055, + "epoch": 8.49, + "step": 367499 + }, + { + "epoch": 8.49, + "learning_rate": 2.70730612244898e-05, + "loss": 2.3316, + "step": 367500 + }, + { + "FLOPS loss": 0.06330662965774536, + "L0_d": 932.05, + "MLM loss": 2.3544859886169434, + "epoch": 8.5, + "step": 367999 + }, + { + "epoch": 8.5, + "learning_rate": 2.6971020408163267e-05, + "loss": 2.3291, + "step": 368000 + }, + { + "FLOPS loss": 0.06118059530854225, + "L0_d": 1056.3, + "MLM loss": 2.271243095397949, + "epoch": 8.51, + "step": 368499 + }, + { + "epoch": 8.51, + "learning_rate": 2.6869183673469388e-05, + "loss": 2.3322, + "step": 368500 + }, + { + "FLOPS loss": 0.05249141529202461, + "L0_d": 698.84, + "MLM loss": 2.2780275344848633, + "epoch": 8.52, + "step": 368999 + }, + { + "epoch": 8.52, + "learning_rate": 2.6767142857142856e-05, + "loss": 2.3301, + "step": 369000 + }, + { + "FLOPS loss": 0.06862546503543854, + "L0_d": 836.22, + "MLM loss": 2.4001739025115967, + "epoch": 8.54, + "step": 369499 + }, + { + "epoch": 8.54, + "learning_rate": 2.6665102040816324e-05, + "loss": 2.3289, + "step": 369500 + }, + { + "FLOPS loss": 0.05359656736254692, + "L0_d": 714.91, + "MLM loss": 2.456220865249634, + "epoch": 8.55, + "step": 369999 + }, + { + "epoch": 8.55, + "learning_rate": 2.65630612244898e-05, + "loss": 2.3307, + "step": 370000 + }, + { + "FLOPS loss": 0.06717686355113983, + "L0_d": 1018.62, + "MLM loss": 2.279381275177002, + "epoch": 8.56, + "step": 370499 + }, + { + "epoch": 8.56, + "learning_rate": 2.6461020408163267e-05, + "loss": 2.3312, + "step": 370500 + }, + { + "FLOPS loss": 0.059827450662851334, + "L0_d": 758.89, + "MLM loss": 2.482081174850464, + "epoch": 8.57, + "step": 370999 + }, + { + "epoch": 8.57, + "learning_rate": 2.635918367346939e-05, + "loss": 2.3335, + "step": 371000 + }, + { + "FLOPS loss": 0.08209186792373657, + "L0_d": 1095.8, + "MLM loss": 2.0456771850585938, + "epoch": 8.58, + "step": 371499 + }, + { + "epoch": 8.58, + "learning_rate": 2.625714285714286e-05, + "loss": 2.327, + "step": 371500 + }, + { + "FLOPS loss": 0.0643707886338234, + "L0_d": 944.06, + "MLM loss": 2.3206868171691895, + "epoch": 8.59, + "step": 371999 + }, + { + "epoch": 8.59, + "learning_rate": 2.6155102040816327e-05, + "loss": 2.3267, + "step": 372000 + }, + { + "FLOPS loss": 0.06913337111473083, + "L0_d": 1096.0, + "MLM loss": 2.271151065826416, + "epoch": 8.61, + "step": 372499 + }, + { + "epoch": 8.61, + "learning_rate": 2.60530612244898e-05, + "loss": 2.33, + "step": 372500 + }, + { + "FLOPS loss": 0.07182160764932632, + "L0_d": 845.78, + "MLM loss": 2.113877773284912, + "epoch": 8.62, + "step": 372999 + }, + { + "epoch": 8.62, + "learning_rate": 2.5951224489795923e-05, + "loss": 2.3322, + "step": 373000 + }, + { + "FLOPS loss": 0.06277167797088623, + "L0_d": 864.33, + "MLM loss": 2.166372060775757, + "epoch": 8.63, + "step": 373499 + }, + { + "epoch": 8.63, + "learning_rate": 2.584918367346939e-05, + "loss": 2.3296, + "step": 373500 + }, + { + "FLOPS loss": 0.07612694054841995, + "L0_d": 1035.11, + "MLM loss": 2.098829507827759, + "epoch": 8.64, + "step": 373999 + }, + { + "epoch": 8.64, + "learning_rate": 2.574714285714286e-05, + "loss": 2.3319, + "step": 374000 + }, + { + "FLOPS loss": 0.07450399547815323, + "L0_d": 1027.17, + "MLM loss": 2.2074460983276367, + "epoch": 8.65, + "step": 374499 + }, + { + "epoch": 8.65, + "learning_rate": 2.5645102040816327e-05, + "loss": 2.3302, + "step": 374500 + }, + { + "FLOPS loss": 0.05644816905260086, + "L0_d": 777.75, + "MLM loss": 2.3011317253112793, + "epoch": 8.66, + "step": 374999 + }, + { + "epoch": 8.66, + "learning_rate": 2.5543265306122448e-05, + "loss": 2.328, + "step": 375000 + }, + { + "FLOPS loss": 0.07566533237695694, + "L0_d": 1021.97, + "MLM loss": 2.2594408988952637, + "epoch": 8.67, + "step": 375499 + }, + { + "epoch": 8.67, + "learning_rate": 2.5441224489795916e-05, + "loss": 2.3286, + "step": 375500 + }, + { + "FLOPS loss": 0.07324004918336868, + "L0_d": 980.56, + "MLM loss": 2.3295974731445312, + "epoch": 8.69, + "step": 375999 + }, + { + "epoch": 8.69, + "learning_rate": 2.533918367346939e-05, + "loss": 2.329, + "step": 376000 + }, + { + "FLOPS loss": 0.06034684181213379, + "L0_d": 641.38, + "MLM loss": 2.2357232570648193, + "epoch": 8.7, + "step": 376499 + }, + { + "epoch": 8.7, + "learning_rate": 2.523714285714286e-05, + "loss": 2.3332, + "step": 376500 + }, + { + "FLOPS loss": 0.06770546734333038, + "L0_d": 754.91, + "MLM loss": 2.13252329826355, + "epoch": 8.71, + "step": 376999 + }, + { + "epoch": 8.71, + "learning_rate": 2.5135102040816327e-05, + "loss": 2.3253, + "step": 377000 + }, + { + "FLOPS loss": 0.06255783885717392, + "L0_d": 936.42, + "MLM loss": 2.3120524883270264, + "epoch": 8.72, + "step": 377499 + }, + { + "epoch": 8.72, + "learning_rate": 2.5033061224489795e-05, + "loss": 2.3254, + "step": 377500 + }, + { + "FLOPS loss": 0.05889131501317024, + "L0_d": 851.02, + "MLM loss": 2.2504754066467285, + "epoch": 8.73, + "step": 377999 + }, + { + "epoch": 8.73, + "learning_rate": 2.4931020408163267e-05, + "loss": 2.3329, + "step": 378000 + }, + { + "FLOPS loss": 0.07563289999961853, + "L0_d": 977.06, + "MLM loss": 2.1566548347473145, + "epoch": 8.74, + "step": 378499 + }, + { + "epoch": 8.74, + "learning_rate": 2.4828979591836735e-05, + "loss": 2.3283, + "step": 378500 + }, + { + "FLOPS loss": 0.062325071543455124, + "L0_d": 886.58, + "MLM loss": 2.2504539489746094, + "epoch": 8.76, + "step": 378999 + }, + { + "epoch": 8.76, + "learning_rate": 2.4727142857142855e-05, + "loss": 2.3259, + "step": 379000 + }, + { + "FLOPS loss": 0.06517352163791656, + "L0_d": 938.56, + "MLM loss": 2.065889358520508, + "epoch": 8.77, + "step": 379499 + }, + { + "epoch": 8.77, + "learning_rate": 2.4625102040816327e-05, + "loss": 2.3258, + "step": 379500 + }, + { + "FLOPS loss": 0.0534166656434536, + "L0_d": 696.45, + "MLM loss": 2.1012320518493652, + "epoch": 8.78, + "step": 379999 + }, + { + "epoch": 8.78, + "learning_rate": 2.4523061224489795e-05, + "loss": 2.3226, + "step": 380000 + }, + { + "FLOPS loss": 0.07024350017309189, + "L0_d": 941.0, + "MLM loss": 2.2870864868164062, + "epoch": 8.79, + "step": 380499 + }, + { + "epoch": 8.79, + "learning_rate": 2.4421020408163266e-05, + "loss": 2.3299, + "step": 380500 + }, + { + "FLOPS loss": 0.06589783728122711, + "L0_d": 771.78, + "MLM loss": 2.293532371520996, + "epoch": 8.8, + "step": 380999 + }, + { + "epoch": 8.8, + "learning_rate": 2.4318979591836734e-05, + "loss": 2.3282, + "step": 381000 + }, + { + "FLOPS loss": 0.06442509591579437, + "L0_d": 754.53, + "MLM loss": 2.274405002593994, + "epoch": 8.81, + "step": 381499 + }, + { + "epoch": 8.81, + "learning_rate": 2.421714285714286e-05, + "loss": 2.3228, + "step": 381500 + }, + { + "FLOPS loss": 0.061390411108732224, + "L0_d": 938.61, + "MLM loss": 2.2242677211761475, + "epoch": 8.82, + "step": 381999 + }, + { + "epoch": 8.82, + "learning_rate": 2.411510204081633e-05, + "loss": 2.3263, + "step": 382000 + }, + { + "FLOPS loss": 0.053517624735832214, + "L0_d": 791.31, + "MLM loss": 2.1186835765838623, + "epoch": 8.84, + "step": 382499 + }, + { + "epoch": 8.84, + "learning_rate": 2.4013061224489798e-05, + "loss": 2.3184, + "step": 382500 + }, + { + "FLOPS loss": 0.07673989981412888, + "L0_d": 1034.72, + "MLM loss": 2.1242787837982178, + "epoch": 8.85, + "step": 382999 + }, + { + "epoch": 8.85, + "learning_rate": 2.3911020408163266e-05, + "loss": 2.3287, + "step": 383000 + }, + { + "FLOPS loss": 0.05768826603889465, + "L0_d": 748.44, + "MLM loss": 2.2013959884643555, + "epoch": 8.86, + "step": 383499 + }, + { + "epoch": 8.86, + "learning_rate": 2.380918367346939e-05, + "loss": 2.3237, + "step": 383500 + }, + { + "FLOPS loss": 0.07118546217679977, + "L0_d": 847.38, + "MLM loss": 2.2173261642456055, + "epoch": 8.87, + "step": 383999 + }, + { + "epoch": 8.87, + "learning_rate": 2.370714285714286e-05, + "loss": 2.3261, + "step": 384000 + }, + { + "FLOPS loss": 0.07411494851112366, + "L0_d": 881.77, + "MLM loss": 2.3400588035583496, + "epoch": 8.88, + "step": 384499 + }, + { + "epoch": 8.88, + "learning_rate": 2.3605102040816326e-05, + "loss": 2.3276, + "step": 384500 + }, + { + "FLOPS loss": 0.05761401355266571, + "L0_d": 958.17, + "MLM loss": 2.2552809715270996, + "epoch": 8.89, + "step": 384999 + }, + { + "epoch": 8.89, + "learning_rate": 2.3503061224489798e-05, + "loss": 2.3257, + "step": 385000 + }, + { + "FLOPS loss": 0.0631277784705162, + "L0_d": 898.61, + "MLM loss": 2.143202781677246, + "epoch": 8.91, + "step": 385499 + }, + { + "epoch": 8.91, + "learning_rate": 2.340122448979592e-05, + "loss": 2.3234, + "step": 385500 + }, + { + "FLOPS loss": 0.05597534030675888, + "L0_d": 722.88, + "MLM loss": 2.2366416454315186, + "epoch": 8.92, + "step": 385999 + }, + { + "epoch": 8.92, + "learning_rate": 2.3299183673469387e-05, + "loss": 2.3207, + "step": 386000 + }, + { + "FLOPS loss": 0.05632057785987854, + "L0_d": 992.81, + "MLM loss": 2.2561912536621094, + "epoch": 8.93, + "step": 386499 + }, + { + "epoch": 8.93, + "learning_rate": 2.3197142857142858e-05, + "loss": 2.3251, + "step": 386500 + }, + { + "FLOPS loss": 0.061308152973651886, + "L0_d": 797.61, + "MLM loss": 2.1445047855377197, + "epoch": 8.94, + "step": 386999 + }, + { + "epoch": 8.94, + "learning_rate": 2.3095102040816326e-05, + "loss": 2.3227, + "step": 387000 + }, + { + "FLOPS loss": 0.06084592267870903, + "L0_d": 671.08, + "MLM loss": 2.295675754547119, + "epoch": 8.95, + "step": 387499 + }, + { + "epoch": 8.95, + "learning_rate": 2.299326530612245e-05, + "loss": 2.3214, + "step": 387500 + }, + { + "FLOPS loss": 0.06371425837278366, + "L0_d": 816.7, + "MLM loss": 2.254873037338257, + "epoch": 8.96, + "step": 387999 + }, + { + "epoch": 8.96, + "learning_rate": 2.2891224489795922e-05, + "loss": 2.3197, + "step": 388000 + }, + { + "FLOPS loss": 0.054075125604867935, + "L0_d": 661.2, + "MLM loss": 2.2977006435394287, + "epoch": 8.97, + "step": 388499 + }, + { + "epoch": 8.97, + "learning_rate": 2.278918367346939e-05, + "loss": 2.3265, + "step": 388500 + }, + { + "FLOPS loss": 0.06039130687713623, + "L0_d": 934.31, + "MLM loss": 2.118281126022339, + "epoch": 8.99, + "step": 388999 + }, + { + "epoch": 8.99, + "learning_rate": 2.2687142857142858e-05, + "loss": 2.3191, + "step": 389000 + }, + { + "FLOPS loss": 0.060361187905073166, + "L0_d": 1027.16, + "MLM loss": 2.3029537200927734, + "epoch": 9.0, + "step": 389499 + }, + { + "epoch": 9.0, + "learning_rate": 2.2585306122448982e-05, + "loss": 2.3242, + "step": 389500 + }, + { + "FLOPS loss": 0.05790635943412781, + "L0_d": 642.02, + "MLM loss": 2.3230996131896973, + "epoch": 9.01, + "step": 389999 + }, + { + "epoch": 9.01, + "learning_rate": 2.248326530612245e-05, + "loss": 2.3203, + "step": 390000 + }, + { + "FLOPS loss": 0.06403976678848267, + "L0_d": 807.78, + "MLM loss": 2.265406370162964, + "epoch": 9.02, + "step": 390499 + }, + { + "epoch": 9.02, + "learning_rate": 2.2381224489795918e-05, + "loss": 2.3134, + "step": 390500 + }, + { + "FLOPS loss": 0.05303754657506943, + "L0_d": 676.48, + "MLM loss": 2.174380302429199, + "epoch": 9.03, + "step": 390999 + }, + { + "epoch": 9.03, + "learning_rate": 2.227918367346939e-05, + "loss": 2.3196, + "step": 391000 + }, + { + "FLOPS loss": 0.05659378692507744, + "L0_d": 665.75, + "MLM loss": 2.2747392654418945, + "epoch": 9.04, + "step": 391499 + }, + { + "epoch": 9.04, + "learning_rate": 2.2177142857142858e-05, + "loss": 2.315, + "step": 391500 + }, + { + "FLOPS loss": 0.05577780678868294, + "L0_d": 716.7, + "MLM loss": 2.3399062156677246, + "epoch": 9.06, + "step": 391999 + }, + { + "epoch": 9.06, + "learning_rate": 2.207530612244898e-05, + "loss": 2.3206, + "step": 392000 + }, + { + "FLOPS loss": 0.07858218997716904, + "L0_d": 882.84, + "MLM loss": 2.1923105716705322, + "epoch": 9.07, + "step": 392499 + }, + { + "epoch": 9.07, + "learning_rate": 2.197326530612245e-05, + "loss": 2.3193, + "step": 392500 + }, + { + "FLOPS loss": 0.06561288982629776, + "L0_d": 1004.69, + "MLM loss": 2.4301981925964355, + "epoch": 9.08, + "step": 392999 + }, + { + "epoch": 9.08, + "learning_rate": 2.1871224489795918e-05, + "loss": 2.3172, + "step": 393000 + }, + { + "FLOPS loss": 0.05799808353185654, + "L0_d": 736.95, + "MLM loss": 2.291177988052368, + "epoch": 9.09, + "step": 393499 + }, + { + "epoch": 9.09, + "learning_rate": 2.176918367346939e-05, + "loss": 2.3225, + "step": 393500 + }, + { + "FLOPS loss": 0.06807775795459747, + "L0_d": 922.12, + "MLM loss": 2.2046947479248047, + "epoch": 9.1, + "step": 393999 + }, + { + "epoch": 9.1, + "learning_rate": 2.1667142857142858e-05, + "loss": 2.3189, + "step": 394000 + }, + { + "FLOPS loss": 0.07165581732988358, + "L0_d": 943.2, + "MLM loss": 2.2551870346069336, + "epoch": 9.11, + "step": 394499 + }, + { + "epoch": 9.11, + "learning_rate": 2.1565102040816326e-05, + "loss": 2.3178, + "step": 394500 + }, + { + "FLOPS loss": 0.05095088481903076, + "L0_d": 776.98, + "MLM loss": 2.3392550945281982, + "epoch": 9.12, + "step": 394999 + }, + { + "epoch": 9.12, + "learning_rate": 2.146326530612245e-05, + "loss": 2.3171, + "step": 395000 + }, + { + "FLOPS loss": 0.06212861090898514, + "L0_d": 827.08, + "MLM loss": 2.1800572872161865, + "epoch": 9.14, + "step": 395499 + }, + { + "epoch": 9.14, + "learning_rate": 2.136122448979592e-05, + "loss": 2.3185, + "step": 395500 + }, + { + "FLOPS loss": 0.06836603581905365, + "L0_d": 1011.97, + "MLM loss": 2.172955274581909, + "epoch": 9.15, + "step": 395999 + }, + { + "epoch": 9.15, + "learning_rate": 2.125918367346939e-05, + "loss": 2.3153, + "step": 396000 + }, + { + "FLOPS loss": 0.05926735699176788, + "L0_d": 921.53, + "MLM loss": 2.3053805828094482, + "epoch": 9.16, + "step": 396499 + }, + { + "epoch": 9.16, + "learning_rate": 2.1157142857142857e-05, + "loss": 2.3188, + "step": 396500 + }, + { + "FLOPS loss": 0.07315704971551895, + "L0_d": 921.8, + "MLM loss": 2.3621654510498047, + "epoch": 9.17, + "step": 396999 + }, + { + "epoch": 9.17, + "learning_rate": 2.1055102040816325e-05, + "loss": 2.3167, + "step": 397000 + }, + { + "FLOPS loss": 0.062325432896614075, + "L0_d": 829.0, + "MLM loss": 2.2659590244293213, + "epoch": 9.18, + "step": 397499 + }, + { + "epoch": 9.18, + "learning_rate": 2.095326530612245e-05, + "loss": 2.3158, + "step": 397500 + }, + { + "FLOPS loss": 0.061964452266693115, + "L0_d": 868.09, + "MLM loss": 2.215615749359131, + "epoch": 9.19, + "step": 397999 + }, + { + "epoch": 9.19, + "learning_rate": 2.085122448979592e-05, + "loss": 2.315, + "step": 398000 + }, + { + "FLOPS loss": 0.05633864551782608, + "L0_d": 734.2, + "MLM loss": 2.322652816772461, + "epoch": 9.21, + "step": 398499 + }, + { + "epoch": 9.21, + "learning_rate": 2.074918367346939e-05, + "loss": 2.319, + "step": 398500 + }, + { + "FLOPS loss": 0.0727355107665062, + "L0_d": 1047.61, + "MLM loss": 2.278925657272339, + "epoch": 9.22, + "step": 398999 + }, + { + "epoch": 9.22, + "learning_rate": 2.064714285714286e-05, + "loss": 2.3147, + "step": 399000 + }, + { + "FLOPS loss": 0.052611831575632095, + "L0_d": 711.98, + "MLM loss": 2.3901736736297607, + "epoch": 9.23, + "step": 399499 + }, + { + "epoch": 9.23, + "learning_rate": 2.054530612244898e-05, + "loss": 2.3119, + "step": 399500 + }, + { + "FLOPS loss": 0.05829920992255211, + "L0_d": 716.16, + "MLM loss": 2.512298583984375, + "epoch": 9.24, + "step": 399999 + }, + { + "epoch": 9.24, + "learning_rate": 2.044326530612245e-05, + "loss": 2.313, + "step": 400000 + }, + { + "FLOPS loss": 0.059103552252054214, + "L0_d": 747.2, + "MLM loss": 2.4075937271118164, + "epoch": 9.25, + "step": 400499 + }, + { + "epoch": 9.25, + "learning_rate": 2.0341224489795917e-05, + "loss": 2.3155, + "step": 400500 + }, + { + "FLOPS loss": 0.06063724309206009, + "L0_d": 813.27, + "MLM loss": 2.265842914581299, + "epoch": 9.26, + "step": 400999 + }, + { + "epoch": 9.26, + "learning_rate": 2.023918367346939e-05, + "loss": 2.3103, + "step": 401000 + }, + { + "FLOPS loss": 0.0646892711520195, + "L0_d": 869.34, + "MLM loss": 2.1556687355041504, + "epoch": 9.28, + "step": 401499 + }, + { + "epoch": 9.28, + "learning_rate": 2.0137346938775513e-05, + "loss": 2.3162, + "step": 401500 + }, + { + "FLOPS loss": 0.06266539543867111, + "L0_d": 819.53, + "MLM loss": 2.1329922676086426, + "epoch": 9.29, + "step": 401999 + }, + { + "epoch": 9.29, + "learning_rate": 2.0035510204081634e-05, + "loss": 2.3117, + "step": 402000 + }, + { + "FLOPS loss": 0.0540151484310627, + "L0_d": 850.94, + "MLM loss": 2.3327624797821045, + "epoch": 9.3, + "step": 402499 + }, + { + "epoch": 9.3, + "learning_rate": 1.9933469387755102e-05, + "loss": 2.3149, + "step": 402500 + }, + { + "FLOPS loss": 0.061880338937044144, + "L0_d": 796.36, + "MLM loss": 2.212796688079834, + "epoch": 9.31, + "step": 402999 + }, + { + "epoch": 9.31, + "learning_rate": 1.9831428571428573e-05, + "loss": 2.3146, + "step": 403000 + }, + { + "FLOPS loss": 0.060214437544345856, + "L0_d": 981.72, + "MLM loss": 2.1206552982330322, + "epoch": 9.32, + "step": 403499 + }, + { + "epoch": 9.32, + "learning_rate": 1.972938775510204e-05, + "loss": 2.3135, + "step": 403500 + }, + { + "FLOPS loss": 0.06157975271344185, + "L0_d": 777.95, + "MLM loss": 2.2471861839294434, + "epoch": 9.33, + "step": 403999 + }, + { + "epoch": 9.33, + "learning_rate": 1.9627346938775513e-05, + "loss": 2.3132, + "step": 404000 + }, + { + "FLOPS loss": 0.06235690787434578, + "L0_d": 889.75, + "MLM loss": 2.3554129600524902, + "epoch": 9.34, + "step": 404499 + }, + { + "epoch": 9.34, + "learning_rate": 1.952530612244898e-05, + "loss": 2.3109, + "step": 404500 + }, + { + "FLOPS loss": 0.06397629529237747, + "L0_d": 991.97, + "MLM loss": 2.151730537414551, + "epoch": 9.36, + "step": 404999 + }, + { + "epoch": 9.36, + "learning_rate": 1.9423265306122452e-05, + "loss": 2.3159, + "step": 405000 + }, + { + "FLOPS loss": 0.10298942774534225, + "L0_d": 1378.95, + "MLM loss": 2.2610526084899902, + "epoch": 9.37, + "step": 405499 + }, + { + "epoch": 9.37, + "learning_rate": 1.932122448979592e-05, + "loss": 2.3077, + "step": 405500 + }, + { + "FLOPS loss": 0.07055822759866714, + "L0_d": 773.44, + "MLM loss": 2.2387983798980713, + "epoch": 9.38, + "step": 405999 + }, + { + "epoch": 9.38, + "learning_rate": 1.921938775510204e-05, + "loss": 2.3157, + "step": 406000 + }, + { + "FLOPS loss": 0.07199456542730331, + "L0_d": 1106.86, + "MLM loss": 2.1209793090820312, + "epoch": 9.39, + "step": 406499 + }, + { + "epoch": 9.39, + "learning_rate": 1.911734693877551e-05, + "loss": 2.3132, + "step": 406500 + }, + { + "FLOPS loss": 0.0611785389482975, + "L0_d": 818.05, + "MLM loss": 2.2507667541503906, + "epoch": 9.4, + "step": 406999 + }, + { + "epoch": 9.4, + "learning_rate": 1.901530612244898e-05, + "loss": 2.3099, + "step": 407000 + }, + { + "FLOPS loss": 0.07224996387958527, + "L0_d": 808.06, + "MLM loss": 2.213388204574585, + "epoch": 9.41, + "step": 407499 + }, + { + "epoch": 9.41, + "learning_rate": 1.891326530612245e-05, + "loss": 2.3127, + "step": 407500 + }, + { + "FLOPS loss": 0.08076408505439758, + "L0_d": 990.81, + "MLM loss": 2.188027858734131, + "epoch": 9.43, + "step": 407999 + }, + { + "epoch": 9.43, + "learning_rate": 1.881122448979592e-05, + "loss": 2.3148, + "step": 408000 + }, + { + "FLOPS loss": 0.06700234115123749, + "L0_d": 1071.58, + "MLM loss": 2.417141914367676, + "epoch": 9.44, + "step": 408499 + }, + { + "epoch": 9.44, + "learning_rate": 1.870938775510204e-05, + "loss": 2.3142, + "step": 408500 + }, + { + "FLOPS loss": 0.05891914665699005, + "L0_d": 648.22, + "MLM loss": 2.059519052505493, + "epoch": 9.45, + "step": 408999 + }, + { + "epoch": 9.45, + "learning_rate": 1.860734693877551e-05, + "loss": 2.3107, + "step": 409000 + }, + { + "FLOPS loss": 0.060197729617357254, + "L0_d": 740.95, + "MLM loss": 2.3801562786102295, + "epoch": 9.46, + "step": 409499 + }, + { + "epoch": 9.46, + "learning_rate": 1.850530612244898e-05, + "loss": 2.3118, + "step": 409500 + }, + { + "FLOPS loss": 0.07512973248958588, + "L0_d": 730.03, + "MLM loss": 2.2139153480529785, + "epoch": 9.47, + "step": 409999 + }, + { + "epoch": 9.47, + "learning_rate": 1.840326530612245e-05, + "loss": 2.3114, + "step": 410000 + }, + { + "FLOPS loss": 0.061726443469524384, + "L0_d": 945.25, + "MLM loss": 2.4507713317871094, + "epoch": 9.48, + "step": 410499 + }, + { + "epoch": 9.48, + "learning_rate": 1.8301428571428573e-05, + "loss": 2.3125, + "step": 410500 + }, + { + "FLOPS loss": 0.07949730008840561, + "L0_d": 1145.23, + "MLM loss": 2.116482734680176, + "epoch": 9.49, + "step": 410999 + }, + { + "epoch": 9.49, + "learning_rate": 1.819938775510204e-05, + "loss": 2.3099, + "step": 411000 + }, + { + "FLOPS loss": 0.07129678875207901, + "L0_d": 848.55, + "MLM loss": 2.09537672996521, + "epoch": 9.51, + "step": 411499 + }, + { + "epoch": 9.51, + "learning_rate": 1.8097346938775512e-05, + "loss": 2.3094, + "step": 411500 + }, + { + "FLOPS loss": 0.07410188019275665, + "L0_d": 856.41, + "MLM loss": 2.248734712600708, + "epoch": 9.52, + "step": 411999 + }, + { + "epoch": 9.52, + "learning_rate": 1.799530612244898e-05, + "loss": 2.3142, + "step": 412000 + }, + { + "FLOPS loss": 0.06592919677495956, + "L0_d": 784.77, + "MLM loss": 2.303936719894409, + "epoch": 9.53, + "step": 412499 + }, + { + "epoch": 9.53, + "learning_rate": 1.7893265306122452e-05, + "loss": 2.3109, + "step": 412500 + }, + { + "FLOPS loss": 0.06576629728078842, + "L0_d": 809.38, + "MLM loss": 2.3034827709198, + "epoch": 9.54, + "step": 412999 + }, + { + "epoch": 9.54, + "learning_rate": 1.7791428571428572e-05, + "loss": 2.3131, + "step": 413000 + }, + { + "FLOPS loss": 0.06040722504258156, + "L0_d": 905.58, + "MLM loss": 2.0183334350585938, + "epoch": 9.55, + "step": 413499 + }, + { + "epoch": 9.55, + "learning_rate": 1.768938775510204e-05, + "loss": 2.3093, + "step": 413500 + }, + { + "FLOPS loss": 0.069266177713871, + "L0_d": 861.3, + "MLM loss": 2.247541904449463, + "epoch": 9.56, + "step": 413999 + }, + { + "epoch": 9.56, + "learning_rate": 1.7587346938775512e-05, + "loss": 2.3099, + "step": 414000 + }, + { + "FLOPS loss": 0.059506528079509735, + "L0_d": 794.53, + "MLM loss": 2.3322439193725586, + "epoch": 9.58, + "step": 414499 + }, + { + "epoch": 9.58, + "learning_rate": 1.748530612244898e-05, + "loss": 2.3096, + "step": 414500 + }, + { + "FLOPS loss": 0.06200943514704704, + "L0_d": 938.34, + "MLM loss": 2.3402085304260254, + "epoch": 9.59, + "step": 414999 + }, + { + "epoch": 9.59, + "learning_rate": 1.7383469387755104e-05, + "loss": 2.3089, + "step": 415000 + }, + { + "FLOPS loss": 0.06389506906270981, + "L0_d": 851.53, + "MLM loss": 2.2279720306396484, + "epoch": 9.6, + "step": 415499 + }, + { + "epoch": 9.6, + "learning_rate": 1.7281428571428572e-05, + "loss": 2.3068, + "step": 415500 + }, + { + "FLOPS loss": 0.0726497620344162, + "L0_d": 1118.03, + "MLM loss": 2.2254295349121094, + "epoch": 9.61, + "step": 415999 + }, + { + "epoch": 9.61, + "learning_rate": 1.717938775510204e-05, + "loss": 2.3155, + "step": 416000 + }, + { + "FLOPS loss": 0.0513957142829895, + "L0_d": 812.94, + "MLM loss": 2.3883609771728516, + "epoch": 9.62, + "step": 416499 + }, + { + "epoch": 9.62, + "learning_rate": 1.707734693877551e-05, + "loss": 2.3086, + "step": 416500 + }, + { + "FLOPS loss": 0.06384290754795074, + "L0_d": 716.12, + "MLM loss": 2.221311092376709, + "epoch": 9.63, + "step": 416999 + }, + { + "epoch": 9.63, + "learning_rate": 1.6975510204081632e-05, + "loss": 2.3098, + "step": 417000 + }, + { + "FLOPS loss": 0.05616021156311035, + "L0_d": 742.58, + "MLM loss": 2.1164889335632324, + "epoch": 9.64, + "step": 417499 + }, + { + "epoch": 9.64, + "learning_rate": 1.6873469387755104e-05, + "loss": 2.3073, + "step": 417500 + }, + { + "FLOPS loss": 0.06069912016391754, + "L0_d": 945.22, + "MLM loss": 2.2470812797546387, + "epoch": 9.66, + "step": 417999 + }, + { + "epoch": 9.66, + "learning_rate": 1.6771428571428572e-05, + "loss": 2.3097, + "step": 418000 + }, + { + "FLOPS loss": 0.05973891541361809, + "L0_d": 870.66, + "MLM loss": 2.144036054611206, + "epoch": 9.67, + "step": 418499 + }, + { + "epoch": 9.67, + "learning_rate": 1.6669387755102044e-05, + "loss": 2.3065, + "step": 418500 + }, + { + "FLOPS loss": 0.06439421325922012, + "L0_d": 986.64, + "MLM loss": 1.9868640899658203, + "epoch": 9.68, + "step": 418999 + }, + { + "epoch": 9.68, + "learning_rate": 1.656734693877551e-05, + "loss": 2.308, + "step": 419000 + }, + { + "FLOPS loss": 0.06345753371715546, + "L0_d": 728.62, + "MLM loss": 2.2475640773773193, + "epoch": 9.69, + "step": 419499 + }, + { + "epoch": 9.69, + "learning_rate": 1.6465510204081632e-05, + "loss": 2.3098, + "step": 419500 + }, + { + "FLOPS loss": 0.05375149846076965, + "L0_d": 691.36, + "MLM loss": 2.2957427501678467, + "epoch": 9.7, + "step": 419999 + }, + { + "epoch": 9.7, + "learning_rate": 1.6363469387755104e-05, + "loss": 2.3042, + "step": 420000 + }, + { + "FLOPS loss": 0.06595911830663681, + "L0_d": 857.59, + "MLM loss": 2.0994808673858643, + "epoch": 9.71, + "step": 420499 + }, + { + "epoch": 9.71, + "learning_rate": 1.6261428571428572e-05, + "loss": 2.3052, + "step": 420500 + }, + { + "FLOPS loss": 0.062460485845804214, + "L0_d": 906.17, + "MLM loss": 2.2028586864471436, + "epoch": 9.73, + "step": 420999 + }, + { + "epoch": 9.73, + "learning_rate": 1.6159387755102043e-05, + "loss": 2.3014, + "step": 421000 + }, + { + "FLOPS loss": 0.07285495102405548, + "L0_d": 1217.08, + "MLM loss": 2.313466787338257, + "epoch": 9.74, + "step": 421499 + }, + { + "epoch": 9.74, + "learning_rate": 1.6057551020408164e-05, + "loss": 2.3095, + "step": 421500 + }, + { + "FLOPS loss": 0.08623205125331879, + "L0_d": 1016.34, + "MLM loss": 2.080904960632324, + "epoch": 9.75, + "step": 421999 + }, + { + "epoch": 9.75, + "learning_rate": 1.5955510204081632e-05, + "loss": 2.3081, + "step": 422000 + }, + { + "FLOPS loss": 0.06952735781669617, + "L0_d": 831.94, + "MLM loss": 2.1841421127319336, + "epoch": 9.76, + "step": 422499 + }, + { + "epoch": 9.76, + "learning_rate": 1.58534693877551e-05, + "loss": 2.3092, + "step": 422500 + }, + { + "FLOPS loss": 0.06762253493070602, + "L0_d": 919.55, + "MLM loss": 2.146679639816284, + "epoch": 9.77, + "step": 422999 + }, + { + "epoch": 9.77, + "learning_rate": 1.575142857142857e-05, + "loss": 2.3091, + "step": 423000 + }, + { + "FLOPS loss": 0.05603819340467453, + "L0_d": 740.88, + "MLM loss": 2.2936301231384277, + "epoch": 9.78, + "step": 423499 + }, + { + "epoch": 9.78, + "learning_rate": 1.564938775510204e-05, + "loss": 2.3056, + "step": 423500 + }, + { + "FLOPS loss": 0.06984943896532059, + "L0_d": 789.66, + "MLM loss": 2.2071163654327393, + "epoch": 9.79, + "step": 423999 + }, + { + "epoch": 9.79, + "learning_rate": 1.5547551020408164e-05, + "loss": 2.3048, + "step": 424000 + }, + { + "FLOPS loss": 0.05715217813849449, + "L0_d": 874.83, + "MLM loss": 2.1982874870300293, + "epoch": 9.81, + "step": 424499 + }, + { + "epoch": 9.81, + "learning_rate": 1.5445510204081635e-05, + "loss": 2.3063, + "step": 424500 + }, + { + "FLOPS loss": 0.07178690284490585, + "L0_d": 923.66, + "MLM loss": 2.177065849304199, + "epoch": 9.82, + "step": 424999 + }, + { + "epoch": 9.82, + "learning_rate": 1.5343469387755103e-05, + "loss": 2.3086, + "step": 425000 + }, + { + "FLOPS loss": 0.0699949711561203, + "L0_d": 1009.09, + "MLM loss": 2.053196430206299, + "epoch": 9.83, + "step": 425499 + }, + { + "epoch": 9.83, + "learning_rate": 1.5241632653061224e-05, + "loss": 2.3076, + "step": 425500 + }, + { + "FLOPS loss": 0.06477265805006027, + "L0_d": 866.89, + "MLM loss": 2.051081657409668, + "epoch": 9.84, + "step": 425999 + }, + { + "epoch": 9.84, + "learning_rate": 1.5139591836734696e-05, + "loss": 2.305, + "step": 426000 + }, + { + "FLOPS loss": 0.06138918921351433, + "L0_d": 841.31, + "MLM loss": 2.3111472129821777, + "epoch": 9.85, + "step": 426499 + }, + { + "epoch": 9.85, + "learning_rate": 1.5037551020408164e-05, + "loss": 2.3028, + "step": 426500 + }, + { + "FLOPS loss": 0.056554101407527924, + "L0_d": 701.53, + "MLM loss": 2.369089365005493, + "epoch": 9.86, + "step": 426999 + }, + { + "epoch": 9.86, + "learning_rate": 1.4935510204081635e-05, + "loss": 2.3083, + "step": 427000 + }, + { + "FLOPS loss": 0.06950782984495163, + "L0_d": 992.92, + "MLM loss": 2.152656316757202, + "epoch": 9.88, + "step": 427499 + }, + { + "epoch": 9.88, + "learning_rate": 1.4833469387755103e-05, + "loss": 2.3074, + "step": 427500 + }, + { + "FLOPS loss": 0.06892652064561844, + "L0_d": 788.14, + "MLM loss": 2.2031593322753906, + "epoch": 9.89, + "step": 427999 + }, + { + "epoch": 9.89, + "learning_rate": 1.4731428571428571e-05, + "loss": 2.3043, + "step": 428000 + }, + { + "FLOPS loss": 0.06714266538619995, + "L0_d": 781.45, + "MLM loss": 2.1796374320983887, + "epoch": 9.9, + "step": 428499 + }, + { + "epoch": 9.9, + "learning_rate": 1.4629387755102043e-05, + "loss": 2.3016, + "step": 428500 + }, + { + "FLOPS loss": 0.06731442362070084, + "L0_d": 892.73, + "MLM loss": 2.282862663269043, + "epoch": 9.91, + "step": 428999 + }, + { + "epoch": 9.91, + "learning_rate": 1.452734693877551e-05, + "loss": 2.3041, + "step": 429000 + }, + { + "FLOPS loss": 0.06822635233402252, + "L0_d": 978.84, + "MLM loss": 2.432333469390869, + "epoch": 9.92, + "step": 429499 + }, + { + "epoch": 9.92, + "learning_rate": 1.442530612244898e-05, + "loss": 2.3024, + "step": 429500 + }, + { + "FLOPS loss": 0.06567897647619247, + "L0_d": 1059.81, + "MLM loss": 2.2899770736694336, + "epoch": 9.93, + "step": 429999 + }, + { + "epoch": 9.93, + "learning_rate": 1.4323469387755103e-05, + "loss": 2.3042, + "step": 430000 + }, + { + "FLOPS loss": 0.06040837988257408, + "L0_d": 883.02, + "MLM loss": 2.23317289352417, + "epoch": 9.94, + "step": 430499 + }, + { + "epoch": 9.95, + "learning_rate": 1.4221428571428571e-05, + "loss": 2.2991, + "step": 430500 + }, + { + "FLOPS loss": 0.054521579295396805, + "L0_d": 714.81, + "MLM loss": 2.4254093170166016, + "epoch": 9.96, + "step": 430999 + }, + { + "epoch": 9.96, + "learning_rate": 1.4119387755102043e-05, + "loss": 2.302, + "step": 431000 + }, + { + "FLOPS loss": 0.0858311727643013, + "L0_d": 1071.42, + "MLM loss": 2.1953415870666504, + "epoch": 9.97, + "step": 431499 + }, + { + "epoch": 9.97, + "learning_rate": 1.401734693877551e-05, + "loss": 2.3025, + "step": 431500 + }, + { + "FLOPS loss": 0.05570479482412338, + "L0_d": 688.91, + "MLM loss": 2.1967906951904297, + "epoch": 9.98, + "step": 431999 + }, + { + "epoch": 9.98, + "learning_rate": 1.3915510204081633e-05, + "loss": 2.3029, + "step": 432000 + }, + { + "FLOPS loss": 0.0755709856748581, + "L0_d": 966.83, + "MLM loss": 2.218096971511841, + "epoch": 9.99, + "step": 432499 + }, + { + "epoch": 9.99, + "learning_rate": 1.3813469387755101e-05, + "loss": 2.3022, + "step": 432500 + }, + { + "FLOPS loss": 0.06049109250307083, + "L0_d": 864.84, + "MLM loss": 2.365525245666504, + "epoch": 10.0, + "step": 432999 + }, + { + "epoch": 10.0, + "learning_rate": 1.3711428571428573e-05, + "loss": 2.302, + "step": 433000 + }, + { + "FLOPS loss": 0.05764416232705116, + "L0_d": 792.52, + "MLM loss": 2.233534812927246, + "epoch": 10.01, + "step": 433499 + }, + { + "epoch": 10.01, + "learning_rate": 1.360938775510204e-05, + "loss": 2.3035, + "step": 433500 + }, + { + "FLOPS loss": 0.07010982185602188, + "L0_d": 886.73, + "MLM loss": 2.5419046878814697, + "epoch": 10.03, + "step": 433999 + }, + { + "epoch": 10.03, + "learning_rate": 1.3507551020408163e-05, + "loss": 2.3015, + "step": 434000 + }, + { + "FLOPS loss": 0.06383447349071503, + "L0_d": 905.14, + "MLM loss": 2.1592020988464355, + "epoch": 10.04, + "step": 434499 + }, + { + "epoch": 10.04, + "learning_rate": 1.3405510204081634e-05, + "loss": 2.2996, + "step": 434500 + }, + { + "FLOPS loss": 0.06052009016275406, + "L0_d": 726.97, + "MLM loss": 2.085092306137085, + "epoch": 10.05, + "step": 434999 + }, + { + "epoch": 10.05, + "learning_rate": 1.3303469387755103e-05, + "loss": 2.2996, + "step": 435000 + }, + { + "FLOPS loss": 0.06284129619598389, + "L0_d": 836.41, + "MLM loss": 2.291759967803955, + "epoch": 10.06, + "step": 435499 + }, + { + "epoch": 10.06, + "learning_rate": 1.3201632653061225e-05, + "loss": 2.2995, + "step": 435500 + }, + { + "FLOPS loss": 0.07444918155670166, + "L0_d": 872.38, + "MLM loss": 2.4256529808044434, + "epoch": 10.07, + "step": 435999 + }, + { + "epoch": 10.07, + "learning_rate": 1.3099591836734695e-05, + "loss": 2.3009, + "step": 436000 + }, + { + "FLOPS loss": 0.06040747091174126, + "L0_d": 693.17, + "MLM loss": 2.163949966430664, + "epoch": 10.08, + "step": 436499 + }, + { + "epoch": 10.08, + "learning_rate": 1.2997551020408163e-05, + "loss": 2.3036, + "step": 436500 + }, + { + "FLOPS loss": 0.062491338700056076, + "L0_d": 836.11, + "MLM loss": 2.4563846588134766, + "epoch": 10.1, + "step": 436999 + }, + { + "epoch": 10.1, + "learning_rate": 1.2895510204081634e-05, + "loss": 2.2969, + "step": 437000 + }, + { + "FLOPS loss": 0.06521119922399521, + "L0_d": 845.45, + "MLM loss": 2.5408596992492676, + "epoch": 10.11, + "step": 437499 + }, + { + "epoch": 10.11, + "learning_rate": 1.2793469387755102e-05, + "loss": 2.299, + "step": 437500 + }, + { + "FLOPS loss": 0.06396574527025223, + "L0_d": 834.34, + "MLM loss": 2.319390296936035, + "epoch": 10.12, + "step": 437999 + }, + { + "epoch": 10.12, + "learning_rate": 1.269142857142857e-05, + "loss": 2.3018, + "step": 438000 + }, + { + "FLOPS loss": 0.07069003582000732, + "L0_d": 950.38, + "MLM loss": 2.2196543216705322, + "epoch": 10.13, + "step": 438499 + }, + { + "epoch": 10.13, + "learning_rate": 1.2589387755102042e-05, + "loss": 2.3015, + "step": 438500 + }, + { + "FLOPS loss": 0.06846750527620316, + "L0_d": 1224.31, + "MLM loss": 2.2630181312561035, + "epoch": 10.14, + "step": 438999 + }, + { + "epoch": 10.14, + "learning_rate": 1.2487346938775512e-05, + "loss": 2.2973, + "step": 439000 + }, + { + "FLOPS loss": 0.05883738771080971, + "L0_d": 825.19, + "MLM loss": 2.2000060081481934, + "epoch": 10.15, + "step": 439499 + }, + { + "epoch": 10.15, + "learning_rate": 1.2385510204081634e-05, + "loss": 2.2976, + "step": 439500 + }, + { + "FLOPS loss": 0.0656290054321289, + "L0_d": 852.75, + "MLM loss": 2.289665937423706, + "epoch": 10.16, + "step": 439999 + }, + { + "epoch": 10.16, + "learning_rate": 1.2283469387755104e-05, + "loss": 2.2963, + "step": 440000 + }, + { + "FLOPS loss": 0.06831687688827515, + "L0_d": 893.62, + "MLM loss": 2.349213123321533, + "epoch": 10.18, + "step": 440499 + }, + { + "epoch": 10.18, + "learning_rate": 1.2181428571428572e-05, + "loss": 2.2919, + "step": 440500 + }, + { + "FLOPS loss": 0.06690240651369095, + "L0_d": 897.55, + "MLM loss": 2.0150651931762695, + "epoch": 10.19, + "step": 440999 + }, + { + "epoch": 10.19, + "learning_rate": 1.2079591836734694e-05, + "loss": 2.2969, + "step": 441000 + }, + { + "FLOPS loss": 0.06606750190258026, + "L0_d": 1001.64, + "MLM loss": 2.107846975326538, + "epoch": 10.2, + "step": 441499 + }, + { + "epoch": 10.2, + "learning_rate": 1.1977551020408164e-05, + "loss": 2.2954, + "step": 441500 + }, + { + "FLOPS loss": 0.06601283699274063, + "L0_d": 1012.19, + "MLM loss": 2.3590779304504395, + "epoch": 10.21, + "step": 441999 + }, + { + "epoch": 10.21, + "learning_rate": 1.1875510204081632e-05, + "loss": 2.3013, + "step": 442000 + }, + { + "FLOPS loss": 0.056500278413295746, + "L0_d": 760.25, + "MLM loss": 2.1543731689453125, + "epoch": 10.22, + "step": 442499 + }, + { + "epoch": 10.22, + "learning_rate": 1.1773469387755102e-05, + "loss": 2.3026, + "step": 442500 + }, + { + "FLOPS loss": 0.07678142189979553, + "L0_d": 1012.47, + "MLM loss": 2.2874748706817627, + "epoch": 10.23, + "step": 442999 + }, + { + "epoch": 10.23, + "learning_rate": 1.1671428571428572e-05, + "loss": 2.2946, + "step": 443000 + }, + { + "FLOPS loss": 0.0532967634499073, + "L0_d": 725.55, + "MLM loss": 2.112173318862915, + "epoch": 10.25, + "step": 443499 + }, + { + "epoch": 10.25, + "learning_rate": 1.1569387755102042e-05, + "loss": 2.2919, + "step": 443500 + }, + { + "FLOPS loss": 0.07034281641244888, + "L0_d": 948.98, + "MLM loss": 2.24344539642334, + "epoch": 10.26, + "step": 443999 + }, + { + "epoch": 10.26, + "learning_rate": 1.1467346938775511e-05, + "loss": 2.296, + "step": 444000 + }, + { + "FLOPS loss": 0.05562940984964371, + "L0_d": 747.36, + "MLM loss": 2.1856186389923096, + "epoch": 10.27, + "step": 444499 + }, + { + "epoch": 10.27, + "learning_rate": 1.1365306122448981e-05, + "loss": 2.296, + "step": 444500 + }, + { + "FLOPS loss": 0.0713595598936081, + "L0_d": 809.14, + "MLM loss": 2.2879340648651123, + "epoch": 10.28, + "step": 444999 + }, + { + "epoch": 10.28, + "learning_rate": 1.1263469387755103e-05, + "loss": 2.2967, + "step": 445000 + }, + { + "FLOPS loss": 0.062412329018116, + "L0_d": 759.11, + "MLM loss": 2.179746627807617, + "epoch": 10.29, + "step": 445499 + }, + { + "epoch": 10.29, + "learning_rate": 1.1161428571428572e-05, + "loss": 2.2951, + "step": 445500 + }, + { + "FLOPS loss": 0.060674265027046204, + "L0_d": 851.06, + "MLM loss": 2.2669856548309326, + "epoch": 10.3, + "step": 445999 + }, + { + "epoch": 10.3, + "learning_rate": 1.1059387755102041e-05, + "loss": 2.2965, + "step": 446000 + }, + { + "FLOPS loss": 0.06142638996243477, + "L0_d": 834.64, + "MLM loss": 2.347390651702881, + "epoch": 10.31, + "step": 446499 + }, + { + "epoch": 10.31, + "learning_rate": 1.0957346938775511e-05, + "loss": 2.2897, + "step": 446500 + }, + { + "FLOPS loss": 0.06716670095920563, + "L0_d": 921.64, + "MLM loss": 2.276606798171997, + "epoch": 10.33, + "step": 446999 + }, + { + "epoch": 10.33, + "learning_rate": 1.0855510204081633e-05, + "loss": 2.295, + "step": 447000 + }, + { + "FLOPS loss": 0.06482750177383423, + "L0_d": 959.17, + "MLM loss": 2.3427610397338867, + "epoch": 10.34, + "step": 447499 + }, + { + "epoch": 10.34, + "learning_rate": 1.0753469387755102e-05, + "loss": 2.2996, + "step": 447500 + }, + { + "FLOPS loss": 0.06260459870100021, + "L0_d": 833.97, + "MLM loss": 2.090470790863037, + "epoch": 10.35, + "step": 447999 + }, + { + "epoch": 10.35, + "learning_rate": 1.0651428571428571e-05, + "loss": 2.2926, + "step": 448000 + }, + { + "FLOPS loss": 0.06450529396533966, + "L0_d": 851.77, + "MLM loss": 2.002493143081665, + "epoch": 10.36, + "step": 448499 + }, + { + "epoch": 10.36, + "learning_rate": 1.0549387755102041e-05, + "loss": 2.2925, + "step": 448500 + }, + { + "FLOPS loss": 0.07590842247009277, + "L0_d": 1052.16, + "MLM loss": 2.1307899951934814, + "epoch": 10.37, + "step": 448999 + }, + { + "epoch": 10.37, + "learning_rate": 1.0447346938775511e-05, + "loss": 2.2996, + "step": 449000 + }, + { + "FLOPS loss": 0.07303966581821442, + "L0_d": 1016.38, + "MLM loss": 2.1589527130126953, + "epoch": 10.38, + "step": 449499 + }, + { + "epoch": 10.38, + "learning_rate": 1.0345510204081633e-05, + "loss": 2.2974, + "step": 449500 + }, + { + "FLOPS loss": 0.053986258804798126, + "L0_d": 647.03, + "MLM loss": 2.321556568145752, + "epoch": 10.4, + "step": 449999 + }, + { + "epoch": 10.4, + "learning_rate": 1.0243469387755103e-05, + "loss": 2.2924, + "step": 450000 + }, + { + "FLOPS loss": 0.06856471300125122, + "L0_d": 924.59, + "MLM loss": 2.3103318214416504, + "epoch": 10.41, + "step": 450499 + }, + { + "epoch": 10.41, + "learning_rate": 1.0141428571428573e-05, + "loss": 2.2974, + "step": 450500 + }, + { + "FLOPS loss": 0.07135065644979477, + "L0_d": 973.39, + "MLM loss": 2.135502338409424, + "epoch": 10.42, + "step": 450999 + }, + { + "epoch": 10.42, + "learning_rate": 1.0039387755102041e-05, + "loss": 2.2953, + "step": 451000 + }, + { + "FLOPS loss": 0.058795761317014694, + "L0_d": 866.27, + "MLM loss": 2.257445812225342, + "epoch": 10.43, + "step": 451499 + }, + { + "epoch": 10.43, + "learning_rate": 9.937551020408163e-06, + "loss": 2.2953, + "step": 451500 + }, + { + "FLOPS loss": 0.06395602971315384, + "L0_d": 891.98, + "MLM loss": 2.267632246017456, + "epoch": 10.44, + "step": 451999 + }, + { + "epoch": 10.44, + "learning_rate": 9.835510204081633e-06, + "loss": 2.2962, + "step": 452000 + }, + { + "FLOPS loss": 0.05605367198586464, + "L0_d": 919.94, + "MLM loss": 2.185353994369507, + "epoch": 10.45, + "step": 452499 + }, + { + "epoch": 10.45, + "learning_rate": 9.733469387755103e-06, + "loss": 2.291, + "step": 452500 + }, + { + "FLOPS loss": 0.056460727006196976, + "L0_d": 827.66, + "MLM loss": 2.2944674491882324, + "epoch": 10.46, + "step": 452999 + }, + { + "epoch": 10.46, + "learning_rate": 9.631428571428573e-06, + "loss": 2.2941, + "step": 453000 + }, + { + "FLOPS loss": 0.06284444779157639, + "L0_d": 758.91, + "MLM loss": 2.1061954498291016, + "epoch": 10.48, + "step": 453499 + }, + { + "epoch": 10.48, + "learning_rate": 9.52938775510204e-06, + "loss": 2.2939, + "step": 453500 + }, + { + "FLOPS loss": 0.054719168692827225, + "L0_d": 649.95, + "MLM loss": 2.1994714736938477, + "epoch": 10.49, + "step": 453999 + }, + { + "epoch": 10.49, + "learning_rate": 9.427551020408163e-06, + "loss": 2.292, + "step": 454000 + }, + { + "FLOPS loss": 0.06368663907051086, + "L0_d": 844.17, + "MLM loss": 2.1803951263427734, + "epoch": 10.5, + "step": 454499 + }, + { + "epoch": 10.5, + "learning_rate": 9.325510204081633e-06, + "loss": 2.2931, + "step": 454500 + }, + { + "FLOPS loss": 0.08477547019720078, + "L0_d": 1192.67, + "MLM loss": 2.25724458694458, + "epoch": 10.51, + "step": 454999 + }, + { + "epoch": 10.51, + "learning_rate": 9.223469387755103e-06, + "loss": 2.2921, + "step": 455000 + }, + { + "FLOPS loss": 0.05767339468002319, + "L0_d": 670.22, + "MLM loss": 2.289708137512207, + "epoch": 10.52, + "step": 455499 + }, + { + "epoch": 10.52, + "learning_rate": 9.121428571428572e-06, + "loss": 2.2942, + "step": 455500 + }, + { + "FLOPS loss": 0.055055826902389526, + "L0_d": 861.41, + "MLM loss": 2.225193738937378, + "epoch": 10.53, + "step": 455999 + }, + { + "epoch": 10.53, + "learning_rate": 9.019591836734695e-06, + "loss": 2.296, + "step": 456000 + }, + { + "FLOPS loss": 0.06768262386322021, + "L0_d": 778.47, + "MLM loss": 2.0815377235412598, + "epoch": 10.55, + "step": 456499 + }, + { + "epoch": 10.55, + "learning_rate": 8.917551020408163e-06, + "loss": 2.2913, + "step": 456500 + }, + { + "FLOPS loss": 0.06390358507633209, + "L0_d": 767.75, + "MLM loss": 2.110469341278076, + "epoch": 10.56, + "step": 456999 + }, + { + "epoch": 10.56, + "learning_rate": 8.815510204081633e-06, + "loss": 2.2901, + "step": 457000 + }, + { + "FLOPS loss": 0.05460204929113388, + "L0_d": 885.66, + "MLM loss": 2.217503070831299, + "epoch": 10.57, + "step": 457499 + }, + { + "epoch": 10.57, + "learning_rate": 8.713469387755102e-06, + "loss": 2.2921, + "step": 457500 + }, + { + "FLOPS loss": 0.060463305562734604, + "L0_d": 938.72, + "MLM loss": 2.1308157444000244, + "epoch": 10.58, + "step": 457999 + }, + { + "epoch": 10.58, + "learning_rate": 8.611428571428572e-06, + "loss": 2.2944, + "step": 458000 + }, + { + "FLOPS loss": 0.06594925373792648, + "L0_d": 911.19, + "MLM loss": 2.3237662315368652, + "epoch": 10.59, + "step": 458499 + }, + { + "epoch": 10.59, + "learning_rate": 8.509591836734695e-06, + "loss": 2.2917, + "step": 458500 + }, + { + "FLOPS loss": 0.08003830164670944, + "L0_d": 973.66, + "MLM loss": 2.3158817291259766, + "epoch": 10.6, + "step": 458999 + }, + { + "epoch": 10.6, + "learning_rate": 8.407755102040817e-06, + "loss": 2.2932, + "step": 459000 + }, + { + "FLOPS loss": 0.06033896282315254, + "L0_d": 647.02, + "MLM loss": 2.24031400680542, + "epoch": 10.61, + "step": 459499 + }, + { + "epoch": 10.61, + "learning_rate": 8.305714285714287e-06, + "loss": 2.2928, + "step": 459500 + }, + { + "FLOPS loss": 0.0587412565946579, + "L0_d": 741.28, + "MLM loss": 2.185731887817383, + "epoch": 10.63, + "step": 459999 + }, + { + "epoch": 10.63, + "learning_rate": 8.203673469387755e-06, + "loss": 2.2938, + "step": 460000 + }, + { + "FLOPS loss": 0.06728052347898483, + "L0_d": 794.55, + "MLM loss": 2.4783236980438232, + "epoch": 10.64, + "step": 460499 + }, + { + "epoch": 10.64, + "learning_rate": 8.101632653061225e-06, + "loss": 2.2952, + "step": 460500 + }, + { + "FLOPS loss": 0.07311265915632248, + "L0_d": 912.84, + "MLM loss": 2.2954416275024414, + "epoch": 10.65, + "step": 460999 + }, + { + "epoch": 10.65, + "learning_rate": 7.999591836734694e-06, + "loss": 2.2947, + "step": 461000 + }, + { + "FLOPS loss": 0.06787005066871643, + "L0_d": 876.83, + "MLM loss": 2.2662861347198486, + "epoch": 10.66, + "step": 461499 + }, + { + "epoch": 10.66, + "learning_rate": 7.897551020408164e-06, + "loss": 2.2881, + "step": 461500 + }, + { + "FLOPS loss": 0.052038803696632385, + "L0_d": 670.36, + "MLM loss": 2.4589502811431885, + "epoch": 10.67, + "step": 461999 + }, + { + "epoch": 10.67, + "learning_rate": 7.795510204081632e-06, + "loss": 2.2944, + "step": 462000 + }, + { + "FLOPS loss": 0.05376419052481651, + "L0_d": 768.39, + "MLM loss": 2.2146544456481934, + "epoch": 10.68, + "step": 462499 + }, + { + "epoch": 10.68, + "learning_rate": 7.693469387755102e-06, + "loss": 2.2914, + "step": 462500 + }, + { + "FLOPS loss": 0.05922848358750343, + "L0_d": 819.77, + "MLM loss": 2.292515516281128, + "epoch": 10.7, + "step": 462999 + }, + { + "epoch": 10.7, + "learning_rate": 7.591428571428572e-06, + "loss": 2.2913, + "step": 463000 + }, + { + "FLOPS loss": 0.05460599809885025, + "L0_d": 802.72, + "MLM loss": 2.2540738582611084, + "epoch": 10.71, + "step": 463499 + }, + { + "epoch": 10.71, + "learning_rate": 7.489591836734694e-06, + "loss": 2.2922, + "step": 463500 + }, + { + "FLOPS loss": 0.06698250025510788, + "L0_d": 889.16, + "MLM loss": 2.2473649978637695, + "epoch": 10.72, + "step": 463999 + }, + { + "epoch": 10.72, + "learning_rate": 7.387551020408163e-06, + "loss": 2.2892, + "step": 464000 + }, + { + "FLOPS loss": 0.057049185037612915, + "L0_d": 764.62, + "MLM loss": 2.392953395843506, + "epoch": 10.73, + "step": 464499 + }, + { + "epoch": 10.73, + "learning_rate": 7.285510204081633e-06, + "loss": 2.2937, + "step": 464500 + }, + { + "FLOPS loss": 0.0670197457075119, + "L0_d": 987.97, + "MLM loss": 2.170027732849121, + "epoch": 10.74, + "step": 464999 + }, + { + "epoch": 10.74, + "learning_rate": 7.183469387755103e-06, + "loss": 2.2883, + "step": 465000 + }, + { + "FLOPS loss": 0.07646064460277557, + "L0_d": 931.58, + "MLM loss": 2.1344852447509766, + "epoch": 10.75, + "step": 465499 + }, + { + "epoch": 10.75, + "learning_rate": 7.081632653061225e-06, + "loss": 2.2881, + "step": 465500 + }, + { + "FLOPS loss": 0.05379510298371315, + "L0_d": 915.58, + "MLM loss": 2.1789708137512207, + "epoch": 10.77, + "step": 465999 + }, + { + "epoch": 10.77, + "learning_rate": 6.979591836734695e-06, + "loss": 2.2925, + "step": 466000 + }, + { + "FLOPS loss": 0.060537777841091156, + "L0_d": 804.53, + "MLM loss": 2.200517177581787, + "epoch": 10.78, + "step": 466499 + }, + { + "epoch": 10.78, + "learning_rate": 6.877551020408164e-06, + "loss": 2.2956, + "step": 466500 + }, + { + "FLOPS loss": 0.06291097402572632, + "L0_d": 866.05, + "MLM loss": 2.135246515274048, + "epoch": 10.79, + "step": 466999 + }, + { + "epoch": 10.79, + "learning_rate": 6.775510204081633e-06, + "loss": 2.2913, + "step": 467000 + }, + { + "FLOPS loss": 0.05674765631556511, + "L0_d": 680.55, + "MLM loss": 2.1429038047790527, + "epoch": 10.8, + "step": 467499 + }, + { + "epoch": 10.8, + "learning_rate": 6.673673469387755e-06, + "loss": 2.2889, + "step": 467500 + }, + { + "FLOPS loss": 0.07323261350393295, + "L0_d": 945.06, + "MLM loss": 2.293457508087158, + "epoch": 10.81, + "step": 467999 + }, + { + "epoch": 10.81, + "learning_rate": 6.571632653061224e-06, + "loss": 2.2878, + "step": 468000 + }, + { + "FLOPS loss": 0.07292532175779343, + "L0_d": 844.14, + "MLM loss": 2.3283419609069824, + "epoch": 10.82, + "step": 468499 + }, + { + "epoch": 10.82, + "learning_rate": 6.469591836734694e-06, + "loss": 2.2917, + "step": 468500 + }, + { + "FLOPS loss": 0.06475947052240372, + "L0_d": 1061.98, + "MLM loss": 2.2273287773132324, + "epoch": 10.83, + "step": 468999 + }, + { + "epoch": 10.83, + "learning_rate": 6.367551020408164e-06, + "loss": 2.2884, + "step": 469000 + }, + { + "FLOPS loss": 0.06330478191375732, + "L0_d": 812.52, + "MLM loss": 2.358989953994751, + "epoch": 10.85, + "step": 469499 + }, + { + "epoch": 10.85, + "learning_rate": 6.265714285714286e-06, + "loss": 2.2847, + "step": 469500 + }, + { + "FLOPS loss": 0.05554909631609917, + "L0_d": 683.17, + "MLM loss": 2.204073667526245, + "epoch": 10.86, + "step": 469999 + }, + { + "epoch": 10.86, + "learning_rate": 6.163673469387756e-06, + "loss": 2.2977, + "step": 470000 + }, + { + "FLOPS loss": 0.07192515581846237, + "L0_d": 838.58, + "MLM loss": 2.146226406097412, + "epoch": 10.87, + "step": 470499 + }, + { + "epoch": 10.87, + "learning_rate": 6.061632653061225e-06, + "loss": 2.2869, + "step": 470500 + }, + { + "FLOPS loss": 0.06218468397855759, + "L0_d": 894.47, + "MLM loss": 2.3167052268981934, + "epoch": 10.88, + "step": 470999 + }, + { + "epoch": 10.88, + "learning_rate": 5.959591836734694e-06, + "loss": 2.288, + "step": 471000 + }, + { + "FLOPS loss": 0.07072613388299942, + "L0_d": 963.66, + "MLM loss": 2.1581053733825684, + "epoch": 10.89, + "step": 471499 + }, + { + "epoch": 10.89, + "learning_rate": 5.857755102040816e-06, + "loss": 2.2867, + "step": 471500 + }, + { + "FLOPS loss": 0.05788072943687439, + "L0_d": 876.33, + "MLM loss": 2.335477828979492, + "epoch": 10.9, + "step": 471999 + }, + { + "epoch": 10.9, + "learning_rate": 5.755714285714286e-06, + "loss": 2.2861, + "step": 472000 + }, + { + "FLOPS loss": 0.05549454689025879, + "L0_d": 770.08, + "MLM loss": 2.025320291519165, + "epoch": 10.92, + "step": 472499 + }, + { + "epoch": 10.92, + "learning_rate": 5.6536734693877556e-06, + "loss": 2.2834, + "step": 472500 + }, + { + "FLOPS loss": 0.06566669046878815, + "L0_d": 902.3, + "MLM loss": 2.3450229167938232, + "epoch": 10.93, + "step": 472999 + }, + { + "epoch": 10.93, + "learning_rate": 5.5516326530612245e-06, + "loss": 2.287, + "step": 473000 + }, + { + "FLOPS loss": 0.05576305836439133, + "L0_d": 733.09, + "MLM loss": 2.2725815773010254, + "epoch": 10.94, + "step": 473499 + }, + { + "epoch": 10.94, + "learning_rate": 5.449591836734694e-06, + "loss": 2.2889, + "step": 473500 + }, + { + "FLOPS loss": 0.05368854105472565, + "L0_d": 714.02, + "MLM loss": 2.2860021591186523, + "epoch": 10.95, + "step": 473999 + }, + { + "epoch": 10.95, + "learning_rate": 5.347755102040817e-06, + "loss": 2.289, + "step": 474000 + }, + { + "FLOPS loss": 0.058568090200424194, + "L0_d": 872.48, + "MLM loss": 2.3067092895507812, + "epoch": 10.96, + "step": 474499 + }, + { + "epoch": 10.96, + "learning_rate": 5.245714285714286e-06, + "loss": 2.2879, + "step": 474500 + }, + { + "FLOPS loss": 0.06257157772779465, + "L0_d": 826.19, + "MLM loss": 2.350205421447754, + "epoch": 10.97, + "step": 474999 + }, + { + "epoch": 10.97, + "learning_rate": 5.143673469387755e-06, + "loss": 2.2859, + "step": 475000 + }, + { + "FLOPS loss": 0.06970212608575821, + "L0_d": 883.05, + "MLM loss": 2.2712743282318115, + "epoch": 10.98, + "step": 475499 + }, + { + "epoch": 10.98, + "learning_rate": 5.041632653061225e-06, + "loss": 2.2862, + "step": 475500 + }, + { + "FLOPS loss": 0.051716398447752, + "L0_d": 697.44, + "MLM loss": 2.2224364280700684, + "epoch": 11.0, + "step": 475999 + }, + { + "epoch": 11.0, + "learning_rate": 4.9397959183673475e-06, + "loss": 2.2881, + "step": 476000 + }, + { + "FLOPS loss": 0.054959673434495926, + "L0_d": 784.42, + "MLM loss": 2.4748611450195312, + "epoch": 11.01, + "step": 476499 + }, + { + "epoch": 11.01, + "learning_rate": 4.8377551020408165e-06, + "loss": 2.2866, + "step": 476500 + }, + { + "FLOPS loss": 0.05612470582127571, + "L0_d": 797.98, + "MLM loss": 2.285034418106079, + "epoch": 11.02, + "step": 476999 + }, + { + "epoch": 11.02, + "learning_rate": 4.735714285714285e-06, + "loss": 2.2905, + "step": 477000 + }, + { + "FLOPS loss": 0.059794358909130096, + "L0_d": 824.89, + "MLM loss": 2.137261152267456, + "epoch": 11.03, + "step": 477499 + }, + { + "epoch": 11.03, + "learning_rate": 4.633673469387755e-06, + "loss": 2.2846, + "step": 477500 + }, + { + "FLOPS loss": 0.06297947466373444, + "L0_d": 829.75, + "MLM loss": 2.0144095420837402, + "epoch": 11.04, + "step": 477999 + }, + { + "epoch": 11.04, + "learning_rate": 4.5318367346938776e-06, + "loss": 2.2832, + "step": 478000 + }, + { + "FLOPS loss": 0.06466788053512573, + "L0_d": 868.56, + "MLM loss": 2.188913345336914, + "epoch": 11.05, + "step": 478499 + }, + { + "epoch": 11.05, + "learning_rate": 4.429795918367347e-06, + "loss": 2.2858, + "step": 478500 + }, + { + "FLOPS loss": 0.06475504487752914, + "L0_d": 915.92, + "MLM loss": 2.1946072578430176, + "epoch": 11.07, + "step": 478999 + }, + { + "epoch": 11.07, + "learning_rate": 4.327755102040817e-06, + "loss": 2.2838, + "step": 479000 + }, + { + "FLOPS loss": 0.06781303882598877, + "L0_d": 900.3, + "MLM loss": 2.2967867851257324, + "epoch": 11.08, + "step": 479499 + }, + { + "epoch": 11.08, + "learning_rate": 4.225714285714286e-06, + "loss": 2.2855, + "step": 479500 + }, + { + "FLOPS loss": 0.06631471961736679, + "L0_d": 864.19, + "MLM loss": 2.262704372406006, + "epoch": 11.09, + "step": 479999 + }, + { + "epoch": 11.09, + "learning_rate": 4.123877551020408e-06, + "loss": 2.2866, + "step": 480000 + }, + { + "FLOPS loss": 0.0421132929623127, + "L0_d": 671.28, + "MLM loss": 2.4269192218780518, + "epoch": 11.1, + "step": 480499 + }, + { + "epoch": 11.1, + "learning_rate": 4.021836734693877e-06, + "loss": 2.2837, + "step": 480500 + }, + { + "FLOPS loss": 0.06580214202404022, + "L0_d": 907.77, + "MLM loss": 2.096919536590576, + "epoch": 11.11, + "step": 480999 + }, + { + "epoch": 11.11, + "learning_rate": 3.919795918367347e-06, + "loss": 2.2826, + "step": 481000 + }, + { + "FLOPS loss": 0.0580022819340229, + "L0_d": 816.48, + "MLM loss": 2.2413530349731445, + "epoch": 11.12, + "step": 481499 + }, + { + "epoch": 11.12, + "learning_rate": 3.817755102040817e-06, + "loss": 2.2858, + "step": 481500 + }, + { + "FLOPS loss": 0.0691581517457962, + "L0_d": 988.39, + "MLM loss": 2.397145986557007, + "epoch": 11.13, + "step": 481999 + }, + { + "epoch": 11.13, + "learning_rate": 3.7159183673469393e-06, + "loss": 2.2866, + "step": 482000 + }, + { + "FLOPS loss": 0.053739335387945175, + "L0_d": 679.95, + "MLM loss": 2.0748467445373535, + "epoch": 11.15, + "step": 482499 + }, + { + "epoch": 11.15, + "learning_rate": 3.613877551020408e-06, + "loss": 2.2872, + "step": 482500 + }, + { + "FLOPS loss": 0.05533215031027794, + "L0_d": 825.25, + "MLM loss": 2.2202677726745605, + "epoch": 11.16, + "step": 482999 + }, + { + "epoch": 11.16, + "learning_rate": 3.5118367346938776e-06, + "loss": 2.2851, + "step": 483000 + }, + { + "FLOPS loss": 0.058010417968034744, + "L0_d": 726.12, + "MLM loss": 2.253462314605713, + "epoch": 11.17, + "step": 483499 + }, + { + "epoch": 11.17, + "learning_rate": 3.409795918367347e-06, + "loss": 2.2854, + "step": 483500 + }, + { + "FLOPS loss": 0.057092875242233276, + "L0_d": 834.2, + "MLM loss": 2.214354991912842, + "epoch": 11.18, + "step": 483999 + }, + { + "epoch": 11.18, + "learning_rate": 3.3079591836734697e-06, + "loss": 2.2797, + "step": 484000 + }, + { + "FLOPS loss": 0.057993967086076736, + "L0_d": 937.23, + "MLM loss": 2.2875499725341797, + "epoch": 11.19, + "step": 484499 + }, + { + "epoch": 11.19, + "learning_rate": 3.205918367346939e-06, + "loss": 2.2846, + "step": 484500 + }, + { + "FLOPS loss": 0.07085564732551575, + "L0_d": 1015.91, + "MLM loss": 2.227050304412842, + "epoch": 11.2, + "step": 484999 + }, + { + "epoch": 11.2, + "learning_rate": 3.1038775510204084e-06, + "loss": 2.2838, + "step": 485000 + }, + { + "FLOPS loss": 0.054904911667108536, + "L0_d": 781.31, + "MLM loss": 2.2337186336517334, + "epoch": 11.22, + "step": 485499 + }, + { + "epoch": 11.22, + "learning_rate": 3.001836734693878e-06, + "loss": 2.2847, + "step": 485500 + }, + { + "FLOPS loss": 0.07090209424495697, + "L0_d": 930.45, + "MLM loss": 2.343792676925659, + "epoch": 11.23, + "step": 485999 + }, + { + "epoch": 11.23, + "learning_rate": 2.899795918367347e-06, + "loss": 2.2835, + "step": 486000 + }, + { + "FLOPS loss": 0.06975448131561279, + "L0_d": 1004.09, + "MLM loss": 2.3023698329925537, + "epoch": 11.24, + "step": 486499 + }, + { + "epoch": 11.24, + "learning_rate": 2.7979591836734695e-06, + "loss": 2.2819, + "step": 486500 + }, + { + "FLOPS loss": 0.07231532782316208, + "L0_d": 883.98, + "MLM loss": 2.448282241821289, + "epoch": 11.25, + "step": 486999 + }, + { + "epoch": 11.25, + "learning_rate": 2.695918367346939e-06, + "loss": 2.2827, + "step": 487000 + }, + { + "FLOPS loss": 0.06464584916830063, + "L0_d": 878.47, + "MLM loss": 2.4496078491210938, + "epoch": 11.26, + "step": 487499 + }, + { + "epoch": 11.26, + "learning_rate": 2.5938775510204082e-06, + "loss": 2.2875, + "step": 487500 + }, + { + "FLOPS loss": 0.06287199258804321, + "L0_d": 914.27, + "MLM loss": 2.293144702911377, + "epoch": 11.27, + "step": 487999 + }, + { + "epoch": 11.27, + "learning_rate": 2.4918367346938776e-06, + "loss": 2.2859, + "step": 488000 + }, + { + "FLOPS loss": 0.06138462945818901, + "L0_d": 809.25, + "MLM loss": 2.25559139251709, + "epoch": 11.28, + "step": 488499 + }, + { + "epoch": 11.28, + "learning_rate": 2.3900000000000004e-06, + "loss": 2.2864, + "step": 488500 + }, + { + "FLOPS loss": 0.0626230463385582, + "L0_d": 787.83, + "MLM loss": 2.248427391052246, + "epoch": 11.3, + "step": 488999 + }, + { + "epoch": 11.3, + "learning_rate": 2.2879591836734693e-06, + "loss": 2.284, + "step": 489000 + }, + { + "FLOPS loss": 0.06445842236280441, + "L0_d": 913.44, + "MLM loss": 2.2899110317230225, + "epoch": 11.31, + "step": 489499 + }, + { + "epoch": 11.31, + "learning_rate": 2.185918367346939e-06, + "loss": 2.2865, + "step": 489500 + }, + { + "FLOPS loss": 0.06436813622713089, + "L0_d": 843.73, + "MLM loss": 2.0450210571289062, + "epoch": 11.32, + "step": 489999 + }, + { + "epoch": 11.32, + "learning_rate": 2.083877551020408e-06, + "loss": 2.2795, + "step": 490000 + }, + { + "FLOPS loss": 0.05308640003204346, + "L0_d": 662.95, + "MLM loss": 2.297100305557251, + "epoch": 11.33, + "step": 490499 + }, + { + "epoch": 11.33, + "learning_rate": 1.981836734693878e-06, + "loss": 2.2855, + "step": 490500 + }, + { + "FLOPS loss": 0.07051984965801239, + "L0_d": 930.33, + "MLM loss": 2.173715353012085, + "epoch": 11.34, + "step": 490999 + }, + { + "epoch": 11.34, + "learning_rate": 1.8800000000000002e-06, + "loss": 2.2768, + "step": 491000 + }, + { + "FLOPS loss": 0.06497853994369507, + "L0_d": 941.98, + "MLM loss": 2.1293177604675293, + "epoch": 11.35, + "step": 491499 + }, + { + "epoch": 11.35, + "learning_rate": 1.7779591836734694e-06, + "loss": 2.2856, + "step": 491500 + }, + { + "FLOPS loss": 0.05521395802497864, + "L0_d": 848.78, + "MLM loss": 2.2102599143981934, + "epoch": 11.37, + "step": 491999 + }, + { + "epoch": 11.37, + "learning_rate": 1.675918367346939e-06, + "loss": 2.2816, + "step": 492000 + }, + { + "FLOPS loss": 0.07123885303735733, + "L0_d": 958.03, + "MLM loss": 2.224113702774048, + "epoch": 11.38, + "step": 492499 + }, + { + "epoch": 11.38, + "learning_rate": 1.573877551020408e-06, + "loss": 2.2853, + "step": 492500 + }, + { + "FLOPS loss": 0.0590461902320385, + "L0_d": 896.55, + "MLM loss": 2.306436538696289, + "epoch": 11.39, + "step": 492999 + }, + { + "epoch": 11.39, + "learning_rate": 1.4720408163265307e-06, + "loss": 2.2788, + "step": 493000 + }, + { + "FLOPS loss": 0.057050079107284546, + "L0_d": 733.53, + "MLM loss": 2.209432363510132, + "epoch": 11.4, + "step": 493499 + }, + { + "epoch": 11.4, + "learning_rate": 1.37e-06, + "loss": 2.2844, + "step": 493500 + }, + { + "FLOPS loss": 0.047397587448358536, + "L0_d": 655.14, + "MLM loss": 2.215719699859619, + "epoch": 11.41, + "step": 493999 + }, + { + "epoch": 11.41, + "learning_rate": 1.2679591836734696e-06, + "loss": 2.2796, + "step": 494000 + }, + { + "FLOPS loss": 0.0556488037109375, + "L0_d": 752.7, + "MLM loss": 2.3418068885803223, + "epoch": 11.42, + "step": 494499 + }, + { + "epoch": 11.42, + "learning_rate": 1.165918367346939e-06, + "loss": 2.2841, + "step": 494500 + }, + { + "FLOPS loss": 0.0699276253581047, + "L0_d": 918.27, + "MLM loss": 2.2648048400878906, + "epoch": 11.44, + "step": 494999 + }, + { + "epoch": 11.44, + "learning_rate": 1.0638775510204083e-06, + "loss": 2.2847, + "step": 495000 + }, + { + "FLOPS loss": 0.06734742224216461, + "L0_d": 917.17, + "MLM loss": 2.244868516921997, + "epoch": 11.45, + "step": 495499 + }, + { + "epoch": 11.45, + "learning_rate": 9.620408163265307e-07, + "loss": 2.2869, + "step": 495500 + }, + { + "FLOPS loss": 0.06317664682865143, + "L0_d": 786.08, + "MLM loss": 2.1748099327087402, + "epoch": 11.46, + "step": 495999 + }, + { + "epoch": 11.46, + "learning_rate": 8.6e-07, + "loss": 2.2775, + "step": 496000 + }, + { + "FLOPS loss": 0.05801856517791748, + "L0_d": 728.44, + "MLM loss": 2.228121757507324, + "epoch": 11.47, + "step": 496499 + }, + { + "epoch": 11.47, + "learning_rate": 7.579591836734694e-07, + "loss": 2.2885, + "step": 496500 + }, + { + "FLOPS loss": 0.06091460585594177, + "L0_d": 796.03, + "MLM loss": 2.3856430053710938, + "epoch": 11.48, + "step": 496999 + }, + { + "epoch": 11.48, + "learning_rate": 6.559183673469388e-07, + "loss": 2.2859, + "step": 497000 + }, + { + "FLOPS loss": 0.0647670179605484, + "L0_d": 960.53, + "MLM loss": 2.304900646209717, + "epoch": 11.49, + "step": 497499 + }, + { + "epoch": 11.49, + "learning_rate": 5.540816326530612e-07, + "loss": 2.2799, + "step": 497500 + }, + { + "FLOPS loss": 0.06450213491916656, + "L0_d": 1017.69, + "MLM loss": 2.2775943279266357, + "epoch": 11.5, + "step": 497999 + }, + { + "epoch": 11.5, + "learning_rate": 4.5204081632653063e-07, + "loss": 2.2798, + "step": 498000 + }, + { + "FLOPS loss": 0.0563599094748497, + "L0_d": 762.67, + "MLM loss": 2.046053171157837, + "epoch": 11.52, + "step": 498499 + }, + { + "epoch": 11.52, + "learning_rate": 3.5000000000000004e-07, + "loss": 2.2774, + "step": 498500 + }, + { + "FLOPS loss": 0.06709830462932587, + "L0_d": 899.8, + "MLM loss": 2.305211305618286, + "epoch": 11.53, + "step": 498999 + }, + { + "epoch": 11.53, + "learning_rate": 2.479591836734694e-07, + "loss": 2.2826, + "step": 499000 + }, + { + "FLOPS loss": 0.08217326551675797, + "L0_d": 960.14, + "MLM loss": 2.168755054473877, + "epoch": 11.54, + "step": 499499 + }, + { + "epoch": 11.54, + "learning_rate": 1.4612244897959183e-07, + "loss": 2.282, + "step": 499500 + }, + { + "FLOPS loss": 0.06401801854372025, + "L0_d": 961.06, + "MLM loss": 2.2633349895477295, + "epoch": 11.55, + "step": 499999 + }, + { + "epoch": 11.55, + "learning_rate": 4.4081632653061224e-08, + "loss": 2.2808, + "step": 500000 + }, + { + "epoch": 11.55, + "step": 500000, + "total_flos": 8.483549701629542e+18, + "train_loss": 2.541904963623047, + "train_runtime": 186540.8186, + "train_samples_per_second": 686.177, + "train_steps_per_second": 2.68 + } + ], + "max_steps": 500000, + "num_train_epochs": 12, + "total_flos": 8.483549701629542e+18, + "trial_name": null, + "trial_params": null +}