{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 6135,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.008149959250203748,
      "grad_norm": 0.18579324653121165,
      "learning_rate": 0.0007999910290180627,
      "loss": 0.6628,
      "step": 50
    },
    {
      "epoch": 0.016299918500407497,
      "grad_norm": 0.17807681860788138,
      "learning_rate": 0.0007997893323384012,
      "loss": 0.5165,
      "step": 100
    },
    {
      "epoch": 0.02444987775061125,
      "grad_norm": 0.19350751669754768,
      "learning_rate": 0.0007993223754535443,
      "loss": 0.4975,
      "step": 150
    },
    {
      "epoch": 0.032599837000814993,
      "grad_norm": 0.1628741338405071,
      "learning_rate": 0.0007985904681893655,
      "loss": 0.4859,
      "step": 200
    },
    {
      "epoch": 0.040749796251018745,
      "grad_norm": 0.1692654309709468,
      "learning_rate": 0.0007975940961663036,
      "loss": 0.4698,
      "step": 250
    },
    {
      "epoch": 0.0488997555012225,
      "grad_norm": 0.16755101172401649,
      "learning_rate": 0.0007963339204771541,
      "loss": 0.4584,
      "step": 300
    },
    {
      "epoch": 0.05704971475142624,
      "grad_norm": 0.16981406073926134,
      "learning_rate": 0.0007948107772484337,
      "loss": 0.4497,
      "step": 350
    },
    {
      "epoch": 0.06519967400162999,
      "grad_norm": 0.13393142327752477,
      "learning_rate": 0.0007930256770856106,
      "loss": 0.435,
      "step": 400
    },
    {
      "epoch": 0.07334963325183375,
      "grad_norm": 0.14862318790293866,
      "learning_rate": 0.000790979804402568,
      "loss": 0.4379,
      "step": 450
    },
    {
      "epoch": 0.08149959250203749,
      "grad_norm": 0.14157872785652958,
      "learning_rate": 0.0007886745166357449,
      "loss": 0.4389,
      "step": 500
    },
    {
      "epoch": 0.08964955175224124,
      "grad_norm": 0.14597403141832885,
      "learning_rate": 0.0007861113433434774,
      "loss": 0.4248,
      "step": 550
    },
    {
      "epoch": 0.097799511002445,
      "grad_norm": 0.14439776167516288,
      "learning_rate": 0.0007832919851911376,
      "loss": 0.4316,
      "step": 600
    },
    {
      "epoch": 0.10594947025264874,
      "grad_norm": 0.2043224587005451,
      "learning_rate": 0.0007802183128227408,
      "loss": 0.4168,
      "step": 650
    },
    {
      "epoch": 0.11409942950285248,
      "grad_norm": 0.13509782410904447,
      "learning_rate": 0.0007768923656197741,
      "loss": 0.4198,
      "step": 700
    },
    {
      "epoch": 0.12224938875305623,
      "grad_norm": 0.21692882701447652,
      "learning_rate": 0.0007734641630410531,
      "loss": 0.4286,
      "step": 750
    },
    {
      "epoch": 0.13039934800325997,
      "grad_norm": 0.18392281414572664,
      "learning_rate": 0.000769650312579427,
      "loss": 0.4466,
      "step": 800
    },
    {
      "epoch": 0.13854930725346373,
      "grad_norm": 0.17037687429265078,
      "learning_rate": 0.000765591199150948,
      "loss": 0.4376,
      "step": 850
    },
    {
      "epoch": 0.1466992665036675,
      "grad_norm": 0.17552399857385595,
      "learning_rate": 0.0007612895159772056,
      "loss": 0.4264,
      "step": 900
    },
    {
      "epoch": 0.15484922575387122,
      "grad_norm": 0.1438151353983127,
      "learning_rate": 0.0007567481172248043,
      "loss": 0.41,
      "step": 950
    },
    {
      "epoch": 0.16299918500407498,
      "grad_norm": 0.1447417216259079,
      "learning_rate": 0.0007519700161116256,
      "loss": 0.4119,
      "step": 1000
    },
    {
      "epoch": 0.17114914425427874,
      "grad_norm": 0.15199420470790867,
      "learning_rate": 0.000746958382907557,
      "loss": 0.412,
      "step": 1050
    },
    {
      "epoch": 0.17929910350448247,
      "grad_norm": 0.14727418743789575,
      "learning_rate": 0.0007417165428310189,
      "loss": 0.3946,
      "step": 1100
    },
    {
      "epoch": 0.18744906275468623,
      "grad_norm": 0.34678998038165904,
      "learning_rate": 0.000736471024768781,
      "loss": 0.393,
      "step": 1150
    },
    {
      "epoch": 0.19559902200489,
      "grad_norm": 0.1482936724234658,
      "learning_rate": 0.0007307882077545133,
      "loss": 0.3992,
      "step": 1200
    },
    {
      "epoch": 0.20374898125509372,
      "grad_norm": 0.14346039255661802,
      "learning_rate": 0.00072488591277831,
      "loss": 0.39,
      "step": 1250
    },
    {
      "epoch": 0.21189894050529748,
      "grad_norm": 0.14534505858629046,
      "learning_rate": 0.0007187680560126396,
      "loss": 0.3878,
      "step": 1300
    },
    {
      "epoch": 0.2200488997555012,
      "grad_norm": 0.1388631218472249,
      "learning_rate": 0.0007124386966552088,
      "loss": 0.3746,
      "step": 1350
    },
    {
      "epoch": 0.22819885900570497,
      "grad_norm": 0.15086917327804153,
      "learning_rate": 0.0007059020342356855,
      "loss": 0.3859,
      "step": 1400
    },
    {
      "epoch": 0.23634881825590873,
      "grad_norm": 0.15384912931521313,
      "learning_rate": 0.0006991624058293096,
      "loss": 0.3731,
      "step": 1450
    },
    {
      "epoch": 0.24449877750611246,
      "grad_norm": 0.15914151929826,
      "learning_rate": 0.000692224283179246,
      "loss": 0.3754,
      "step": 1500
    },
    {
      "epoch": 0.2526487367563162,
      "grad_norm": 0.11506887001881892,
      "learning_rate": 0.0006850922697295807,
      "loss": 0.3666,
      "step": 1550
    },
    {
      "epoch": 0.26079869600651995,
      "grad_norm": 0.13052208168058513,
      "learning_rate": 0.0006777710975709381,
      "loss": 0.3766,
      "step": 1600
    },
    {
      "epoch": 0.26894865525672373,
      "grad_norm": 0.11941691123999496,
      "learning_rate": 0.0006702656243007372,
      "loss": 0.3602,
      "step": 1650
    },
    {
      "epoch": 0.27709861450692747,
      "grad_norm": 0.11925894836007636,
      "learning_rate": 0.0006625808298001773,
      "loss": 0.3612,
      "step": 1700
    },
    {
      "epoch": 0.2852485737571312,
      "grad_norm": 0.14308144825812302,
      "learning_rate": 0.0006547218129300866,
      "loss": 0.3609,
      "step": 1750
    },
    {
      "epoch": 0.293398533007335,
      "grad_norm": 0.14967189995049074,
      "learning_rate": 0.0006466937881478278,
      "loss": 0.3561,
      "step": 1800
    },
    {
      "epoch": 0.3015484922575387,
      "grad_norm": 0.13228828753052413,
      "learning_rate": 0.0006385020820475062,
      "loss": 0.348,
      "step": 1850
    },
    {
      "epoch": 0.30969845150774244,
      "grad_norm": 0.13322170574643283,
      "learning_rate": 0.000630152129825775,
      "loss": 0.3697,
      "step": 1900
    },
    {
      "epoch": 0.31784841075794623,
      "grad_norm": 0.12720122919651028,
      "learning_rate": 0.0006216494716755822,
      "loss": 0.3448,
      "step": 1950
    },
    {
      "epoch": 0.32599837000814996,
      "grad_norm": 0.13582416923221627,
      "learning_rate": 0.0006129997491102531,
      "loss": 0.3567,
      "step": 2000
    },
    {
      "epoch": 0.3341483292583537,
      "grad_norm": 0.14925459708686192,
      "learning_rate": 0.000604208701220346,
      "loss": 0.3484,
      "step": 2050
    },
    {
      "epoch": 0.3422982885085575,
      "grad_norm": 0.1243721342991605,
      "learning_rate": 0.000595282160865766,
      "loss": 0.3484,
      "step": 2100
    },
    {
      "epoch": 0.3504482477587612,
      "grad_norm": 0.11953241203626794,
      "learning_rate": 0.0005862260508056631,
      "loss": 0.3458,
      "step": 2150
    },
    {
      "epoch": 0.35859820700896494,
      "grad_norm": 0.11268782331878287,
      "learning_rate": 0.0005770463797686815,
      "loss": 0.339,
      "step": 2200
    },
    {
      "epoch": 0.36674816625916873,
      "grad_norm": 0.11687380056427817,
      "learning_rate": 0.0005677492384661679,
      "loss": 0.3337,
      "step": 2250
    },
    {
      "epoch": 0.37489812550937246,
      "grad_norm": 0.12673741702704472,
      "learning_rate": 0.0005583407955509861,
      "loss": 0.3346,
      "step": 2300
    },
    {
      "epoch": 0.3830480847595762,
      "grad_norm": 0.11915610120268014,
      "learning_rate": 0.0005488272935246143,
      "loss": 0.333,
      "step": 2350
    },
    {
      "epoch": 0.39119804400978,
      "grad_norm": 0.13402357429565842,
      "learning_rate": 0.0005392150445952471,
      "loss": 0.3305,
      "step": 2400
    },
    {
      "epoch": 0.3993480032599837,
      "grad_norm": 0.1160573866470615,
      "learning_rate": 0.0005295104264896449,
      "loss": 0.34,
      "step": 2450
    },
    {
      "epoch": 0.40749796251018744,
      "grad_norm": 0.11766125729550878,
      "learning_rate": 0.0005197198782215126,
      "loss": 0.3282,
      "step": 2500
    },
    {
      "epoch": 0.4156479217603912,
      "grad_norm": 0.10590606446287697,
      "learning_rate": 0.0005098498958192145,
      "loss": 0.3299,
      "step": 2550
    },
    {
      "epoch": 0.42379788101059496,
      "grad_norm": 0.11241439393785868,
      "learning_rate": 0.0004999070280156597,
      "loss": 0.3298,
      "step": 2600
    },
    {
      "epoch": 0.4319478402607987,
      "grad_norm": 0.11125810401522715,
      "learning_rate": 0.0004898978719032175,
      "loss": 0.3215,
      "step": 2650
    },
    {
      "epoch": 0.4400977995110024,
      "grad_norm": 0.11191318960071489,
      "learning_rate": 0.0004798290685565476,
      "loss": 0.3249,
      "step": 2700
    },
    {
      "epoch": 0.4482477587612062,
      "grad_norm": 0.1287365840788498,
      "learning_rate": 0.0004697072986262474,
      "loss": 0.3161,
      "step": 2750
    },
    {
      "epoch": 0.45639771801140994,
      "grad_norm": 0.11552326003349313,
      "learning_rate": 0.00045953927790623976,
      "loss": 0.3124,
      "step": 2800
    },
    {
      "epoch": 0.46454767726161367,
      "grad_norm": 0.11748617144150195,
      "learning_rate": 0.0004493317528778449,
      "loss": 0.3218,
      "step": 2850
    },
    {
      "epoch": 0.47269763651181745,
      "grad_norm": 0.10367103487291035,
      "learning_rate": 0.00043909149623349,
      "loss": 0.3038,
      "step": 2900
    },
    {
      "epoch": 0.4808475957620212,
      "grad_norm": 0.13997935598004313,
      "learning_rate": 0.00042882530238302793,
      "loss": 0.3079,
      "step": 2950
    },
    {
      "epoch": 0.4889975550122249,
      "grad_norm": 0.11405505781771122,
      "learning_rate": 0.000418539982945647,
      "loss": 0.3161,
      "step": 3000
    },
    {
      "epoch": 0.4971475142624287,
      "grad_norm": 0.10244350957803788,
      "learning_rate": 0.000408242362230361,
      "loss": 0.3121,
      "step": 3050
    },
    {
      "epoch": 0.5052974735126324,
      "grad_norm": 0.11487103054153286,
      "learning_rate": 0.0003979392727080819,
      "loss": 0.3048,
      "step": 3100
    },
    {
      "epoch": 0.5134474327628362,
      "grad_norm": 0.1105131381250625,
      "learning_rate": 0.0003876375504782742,
      "loss": 0.2951,
      "step": 3150
    },
    {
      "epoch": 0.5215973920130399,
      "grad_norm": 0.1114339713919424,
      "learning_rate": 0.00037734403073320455,
      "loss": 0.2978,
      "step": 3200
    },
    {
      "epoch": 0.5297473512632437,
      "grad_norm": 0.12439423946252776,
      "learning_rate": 0.0003670655432227906,
      "loss": 0.2977,
      "step": 3250
    },
    {
      "epoch": 0.5378973105134475,
      "grad_norm": 0.10946864392870362,
      "learning_rate": 0.0003568089077230634,
      "loss": 0.2966,
      "step": 3300
    },
    {
      "epoch": 0.5460472697636511,
      "grad_norm": 0.09312964606321276,
      "learning_rate": 0.00034658092951124573,
      "loss": 0.2877,
      "step": 3350
    },
    {
      "epoch": 0.5541972290138549,
      "grad_norm": 0.1251440088268339,
      "learning_rate": 0.00033638839485045124,
      "loss": 0.2953,
      "step": 3400
    },
    {
      "epoch": 0.5623471882640587,
      "grad_norm": 0.11260490319865255,
      "learning_rate": 0.00032623806648699865,
      "loss": 0.2836,
      "step": 3450
    },
    {
      "epoch": 0.5704971475142624,
      "grad_norm": 0.10260196893419725,
      "learning_rate": 0.00031613667916333013,
      "loss": 0.2883,
      "step": 3500
    },
    {
      "epoch": 0.5786471067644662,
      "grad_norm": 0.11067247135798225,
      "learning_rate": 0.0003060909351495104,
      "loss": 0.2919,
      "step": 3550
    },
    {
      "epoch": 0.58679706601467,
      "grad_norm": 0.13049869044328746,
      "learning_rate": 0.00029610749979627,
      "loss": 0.2801,
      "step": 3600
    },
    {
      "epoch": 0.5949470252648736,
      "grad_norm": 0.0994668923515135,
      "learning_rate": 0.0002861929971125462,
      "loss": 0.2764,
      "step": 3650
    },
    {
      "epoch": 0.6030969845150774,
      "grad_norm": 0.12787979565602525,
      "learning_rate": 0.0002763540053704528,
      "loss": 0.2828,
      "step": 3700
    },
    {
      "epoch": 0.6112469437652812,
      "grad_norm": 0.11702233580318924,
      "learning_rate": 0.0002665970527405966,
      "loss": 0.275,
      "step": 3750
    },
    {
      "epoch": 0.6193969030154849,
      "grad_norm": 0.12216378448453033,
      "learning_rate": 0.0002569286129606376,
      "loss": 0.2781,
      "step": 3800
    },
    {
      "epoch": 0.6275468622656887,
      "grad_norm": 0.11991224657683039,
      "learning_rate": 0.00024735510103996296,
      "loss": 0.2779,
      "step": 3850
    },
    {
      "epoch": 0.6356968215158925,
      "grad_norm": 0.11355699073704052,
      "learning_rate": 0.00023788286900332977,
      "loss": 0.278,
      "step": 3900
    },
    {
      "epoch": 0.6438467807660961,
      "grad_norm": 0.10548043571718951,
      "learning_rate": 0.00022851820167629582,
      "loss": 0.2737,
      "step": 3950
    },
    {
      "epoch": 0.6519967400162999,
      "grad_norm": 0.10359945452094396,
      "learning_rate": 0.0002192673125152389,
      "loss": 0.2724,
      "step": 4000
    },
    {
      "epoch": 0.6601466992665037,
      "grad_norm": 0.11760044543664713,
      "learning_rate": 0.0002101363394847284,
      "loss": 0.2629,
      "step": 4050
    },
    {
      "epoch": 0.6682966585167074,
      "grad_norm": 0.1061394336514801,
      "learning_rate": 0.00020113134098498586,
      "loss": 0.2686,
      "step": 4100
    },
    {
      "epoch": 0.6764466177669112,
      "grad_norm": 0.12584575464846712,
      "learning_rate": 0.00019225829183213756,
      "loss": 0.2699,
      "step": 4150
    },
    {
      "epoch": 0.684596577017115,
      "grad_norm": 0.11274050205692296,
      "learning_rate": 0.00018352307929392337,
      "loss": 0.26,
      "step": 4200
    },
    {
      "epoch": 0.6927465362673186,
      "grad_norm": 0.10624178207114862,
      "learning_rate": 0.0001749314991834945,
      "loss": 0.2676,
      "step": 4250
    },
    {
      "epoch": 0.7008964955175224,
      "grad_norm": 0.10585200267430328,
      "learning_rate": 0.00016648925201389348,
      "loss": 0.2699,
      "step": 4300
    },
    {
      "epoch": 0.7090464547677262,
      "grad_norm": 0.1036684915414757,
      "learning_rate": 0.00015820193921576214,
      "loss": 0.266,
      "step": 4350
    },
    {
      "epoch": 0.7171964140179299,
      "grad_norm": 0.11470877255663611,
      "learning_rate": 0.00015007505942079362,
      "loss": 0.2574,
      "step": 4400
    },
    {
      "epoch": 0.7253463732681337,
      "grad_norm": 0.11442215173910815,
      "learning_rate": 0.00014211400481339013,
      "loss": 0.2536,
      "step": 4450
    },
    {
      "epoch": 0.7334963325183375,
      "grad_norm": 0.10579448760201741,
      "learning_rate": 0.00013432405755294893,
      "loss": 0.2518,
      "step": 4500
    },
    {
      "epoch": 0.7416462917685411,
      "grad_norm": 0.11019413291456029,
      "learning_rate": 0.0001267103862691497,
      "loss": 0.2528,
      "step": 4550
    },
    {
      "epoch": 0.7497962510187449,
      "grad_norm": 0.1237941013535732,
      "learning_rate": 0.00011927804263256903,
      "loss": 0.2506,
      "step": 4600
    },
    {
      "epoch": 0.7579462102689487,
      "grad_norm": 0.11747893253572085,
      "learning_rate": 0.0001120319580028975,
      "loss": 0.2432,
      "step": 4650
    },
    {
      "epoch": 0.7660961695191524,
      "grad_norm": 0.13243835140024088,
      "learning_rate": 0.00010497694015698214,
      "loss": 0.2502,
      "step": 4700
    },
    {
      "epoch": 0.7742461287693562,
      "grad_norm": 0.12096359907685643,
      "learning_rate": 9.811767009886681e-05,
      "loss": 0.2515,
      "step": 4750
    },
    {
      "epoch": 0.78239608801956,
      "grad_norm": 0.1010867422514499,
      "learning_rate": 9.145869895394685e-05,
      "loss": 0.2471,
      "step": 4800
    },
    {
      "epoch": 0.7905460472697636,
      "grad_norm": 0.12108612955883465,
      "learning_rate": 8.500444494929692e-05,
      "loss": 0.2508,
      "step": 4850
    },
    {
      "epoch": 0.7986960065199674,
      "grad_norm": 0.11827129819048823,
      "learning_rate": 7.875919048217753e-05,
      "loss": 0.2421,
      "step": 4900
    },
    {
      "epoch": 0.8068459657701712,
      "grad_norm": 0.10889141696060357,
      "learning_rate": 7.272707927866531e-05,
      "loss": 0.2444,
      "step": 4950
    },
    {
      "epoch": 0.8149959250203749,
      "grad_norm": 0.11478478031330713,
      "learning_rate": 6.691211364428989e-05,
      "loss": 0.239,
      "step": 5000
    },
    {
      "epoch": 0.8231458842705787,
      "grad_norm": 0.12806490540385113,
      "learning_rate": 6.131815180850508e-05,
      "loss": 0.2429,
      "step": 5050
    },
    {
      "epoch": 0.8312958435207825,
      "grad_norm": 0.10357727349126965,
      "learning_rate": 5.5948905364753945e-05,
      "loss": 0.2467,
      "step": 5100
    },
    {
      "epoch": 0.8394458027709861,
      "grad_norm": 0.1101292515864723,
      "learning_rate": 5.080793680782607e-05,
      "loss": 0.2405,
      "step": 5150
    },
    {
      "epoch": 0.8475957620211899,
      "grad_norm": 0.10144980764898691,
      "learning_rate": 4.5898657170142746e-05,
      "loss": 0.235,
      "step": 5200
    },
    {
      "epoch": 0.8557457212713936,
      "grad_norm": 0.11277819168557472,
      "learning_rate": 4.1224323758537155e-05,
      "loss": 0.2341,
      "step": 5250
    },
    {
      "epoch": 0.8638956805215974,
      "grad_norm": 0.10800888686256803,
      "learning_rate": 3.678803799303134e-05,
      "loss": 0.24,
      "step": 5300
    },
    {
      "epoch": 0.8720456397718012,
      "grad_norm": 0.11901575566402685,
      "learning_rate": 3.2592743349044186e-05,
      "loss": 0.2341,
      "step": 5350
    },
    {
      "epoch": 0.8801955990220048,
      "grad_norm": 0.12444346139158177,
      "learning_rate": 2.8641223404395524e-05,
      "loss": 0.2291,
      "step": 5400
    },
    {
      "epoch": 0.8883455582722086,
      "grad_norm": 0.10818161109308624,
      "learning_rate": 2.4936099992402606e-05,
      "loss": 0.2357,
      "step": 5450
    },
    {
      "epoch": 0.8964955175224124,
      "grad_norm": 0.12545021889499927,
      "learning_rate": 2.1479831462293265e-05,
      "loss": 0.2349,
      "step": 5500
    },
    {
      "epoch": 0.9046454767726161,
      "grad_norm": 0.14021615830916254,
      "learning_rate": 1.8274711048092084e-05,
      "loss": 0.2306,
      "step": 5550
    },
    {
      "epoch": 0.9127954360228199,
      "grad_norm": 0.10349748867815296,
      "learning_rate": 1.5322865347059044e-05,
      "loss": 0.2243,
      "step": 5600
    },
    {
      "epoch": 0.9209453952730237,
      "grad_norm": 0.11963300064608776,
      "learning_rate": 1.2626252908692638e-05,
      "loss": 0.2333,
      "step": 5650
    },
    {
      "epoch": 0.9290953545232273,
      "grad_norm": 0.10953396833548994,
      "learning_rate": 1.0186662935232384e-05,
      "loss": 0.2339,
      "step": 5700
    },
    {
      "epoch": 0.9372453137734311,
      "grad_norm": 0.1224263724970897,
      "learning_rate": 8.00571409452302e-06,
      "loss": 0.2313,
      "step": 5750
    },
    {
      "epoch": 0.9453952730236349,
      "grad_norm": 0.12352858513849092,
      "learning_rate": 6.084853446028671e-06,
      "loss": 0.2296,
      "step": 5800
    },
    {
      "epoch": 0.9535452322738386,
      "grad_norm": 0.12633679586576274,
      "learning_rate": 4.425355480708859e-06,
      "loss": 0.2354,
      "step": 5850
    },
    {
      "epoch": 0.9616951915240424,
      "grad_norm": 0.10627066652464928,
      "learning_rate": 3.028321275393786e-06,
      "loss": 0.2332,
      "step": 5900
    },
    {
      "epoch": 0.9698451507742462,
      "grad_norm": 0.12495472546410136,
      "learning_rate": 1.8946777622199652e-06,
      "loss": 0.2297,
      "step": 5950
    },
    {
      "epoch": 0.9779951100244498,
      "grad_norm": 0.11479937267870474,
      "learning_rate": 1.0251771136106314e-06,
      "loss": 0.2372,
      "step": 6000
    },
    {
      "epoch": 0.9861450692746536,
      "grad_norm": 0.1274989460799885,
      "learning_rate": 4.203962432096642e-07,
      "loss": 0.2316,
      "step": 6050
    },
    {
      "epoch": 0.9942950285248574,
      "grad_norm": 0.10997701182931913,
      "learning_rate": 8.073642309907036e-08,
      "loss": 0.2289,
      "step": 6100
    },
    {
      "epoch": 1.0,
      "step": 6135,
      "total_flos": 307270109495296.0,
      "train_loss": 0.320441021565606,
      "train_runtime": 141453.5974,
      "train_samples_per_second": 6.245,
      "train_steps_per_second": 0.043
    }
  ],
  "logging_steps": 50,
  "max_steps": 6135,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 307270109495296.0,
  "train_batch_size": 18,
  "trial_name": null,
  "trial_params": null
}