|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 9.857035364936042, |
|
"eval_steps": 500, |
|
"global_step": 39300, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 98.3414306640625, |
|
"learning_rate": 1.4082317531978931e-05, |
|
"loss": 1.674, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 21.889772415161133, |
|
"learning_rate": 1.4064635063957864e-05, |
|
"loss": 1.7321, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 78.81497955322266, |
|
"learning_rate": 1.4046952595936794e-05, |
|
"loss": 1.3246, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 117.79057312011719, |
|
"learning_rate": 1.4029270127915727e-05, |
|
"loss": 1.8399, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 89.93197631835938, |
|
"learning_rate": 1.4011587659894659e-05, |
|
"loss": 1.7021, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 5.327052116394043, |
|
"learning_rate": 1.399390519187359e-05, |
|
"loss": 1.3229, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 104.67691802978516, |
|
"learning_rate": 1.397622272385252e-05, |
|
"loss": 1.0449, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 62.50383377075195, |
|
"learning_rate": 1.3958540255831453e-05, |
|
"loss": 1.2135, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 30.1390380859375, |
|
"learning_rate": 1.3940857787810384e-05, |
|
"loss": 1.1312, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 172.32058715820312, |
|
"learning_rate": 1.3923175319789316e-05, |
|
"loss": 1.1339, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 149.6029052734375, |
|
"learning_rate": 1.3905492851768248e-05, |
|
"loss": 0.9226, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 104.08654022216797, |
|
"learning_rate": 1.3887810383747179e-05, |
|
"loss": 0.9141, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 28.90251350402832, |
|
"learning_rate": 1.387012791572611e-05, |
|
"loss": 0.7194, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 78.85499572753906, |
|
"learning_rate": 1.3852445447705042e-05, |
|
"loss": 1.051, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 59.84476089477539, |
|
"learning_rate": 1.3834762979683973e-05, |
|
"loss": 0.8815, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 47.683658599853516, |
|
"learning_rate": 1.3817080511662905e-05, |
|
"loss": 1.1052, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 73.24783325195312, |
|
"learning_rate": 1.3799398043641836e-05, |
|
"loss": 0.6957, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 121.98059844970703, |
|
"learning_rate": 1.3781715575620768e-05, |
|
"loss": 1.1512, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 115.57231140136719, |
|
"learning_rate": 1.3764033107599699e-05, |
|
"loss": 0.8512, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 40.25959014892578, |
|
"learning_rate": 1.374635063957863e-05, |
|
"loss": 0.873, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 10.869709014892578, |
|
"learning_rate": 1.3728668171557562e-05, |
|
"loss": 0.7834, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 128.24893188476562, |
|
"learning_rate": 1.3710985703536495e-05, |
|
"loss": 0.8042, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 60.73322677612305, |
|
"learning_rate": 1.3693303235515425e-05, |
|
"loss": 1.0092, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 19.39624786376953, |
|
"learning_rate": 1.3675620767494358e-05, |
|
"loss": 0.662, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.13774849474430084, |
|
"learning_rate": 1.3657938299473288e-05, |
|
"loss": 0.98, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 79.46333312988281, |
|
"learning_rate": 1.3640255831452219e-05, |
|
"loss": 0.7967, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 13.158239364624023, |
|
"learning_rate": 1.3622573363431151e-05, |
|
"loss": 1.0218, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 17.267330169677734, |
|
"learning_rate": 1.3604890895410084e-05, |
|
"loss": 0.8711, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 174.72537231445312, |
|
"learning_rate": 1.3587208427389015e-05, |
|
"loss": 0.8711, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 77.13172149658203, |
|
"learning_rate": 1.3569525959367947e-05, |
|
"loss": 1.0233, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 48.417015075683594, |
|
"learning_rate": 1.3551843491346878e-05, |
|
"loss": 0.7682, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 6.1959638595581055, |
|
"learning_rate": 1.3534161023325808e-05, |
|
"loss": 0.8792, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 49.98043441772461, |
|
"learning_rate": 1.351647855530474e-05, |
|
"loss": 0.9868, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 45.13309860229492, |
|
"learning_rate": 1.3498796087283673e-05, |
|
"loss": 0.5272, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 8.423553466796875, |
|
"learning_rate": 1.3481113619262604e-05, |
|
"loss": 1.1983, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 17.5786190032959, |
|
"learning_rate": 1.3463431151241536e-05, |
|
"loss": 0.7065, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 5.939927577972412, |
|
"learning_rate": 1.3445748683220467e-05, |
|
"loss": 0.6674, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 23.781694412231445, |
|
"learning_rate": 1.3428066215199398e-05, |
|
"loss": 0.7267, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.4960607886314392, |
|
"learning_rate": 1.341038374717833e-05, |
|
"loss": 1.0549, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 83.99737548828125, |
|
"learning_rate": 1.3392701279157262e-05, |
|
"loss": 0.786, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 20.65607261657715, |
|
"learning_rate": 1.3375018811136193e-05, |
|
"loss": 0.9709, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.0673532485961914, |
|
"learning_rate": 1.3357336343115126e-05, |
|
"loss": 0.8208, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 10.350920677185059, |
|
"learning_rate": 1.3339653875094056e-05, |
|
"loss": 1.1503, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.7176612019538879, |
|
"learning_rate": 1.3321971407072987e-05, |
|
"loss": 0.5841, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 16.532655715942383, |
|
"learning_rate": 1.330428893905192e-05, |
|
"loss": 1.1618, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.24398092925548553, |
|
"learning_rate": 1.3286606471030852e-05, |
|
"loss": 0.6052, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 45.761695861816406, |
|
"learning_rate": 1.3268924003009782e-05, |
|
"loss": 1.0618, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.3656911849975586, |
|
"learning_rate": 1.3251241534988713e-05, |
|
"loss": 0.8395, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 56.36614227294922, |
|
"learning_rate": 1.3233559066967646e-05, |
|
"loss": 0.6547, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 50.591705322265625, |
|
"learning_rate": 1.3215876598946576e-05, |
|
"loss": 0.9528, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 11.290885925292969, |
|
"learning_rate": 1.3198194130925507e-05, |
|
"loss": 0.6811, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.10668418556451797, |
|
"learning_rate": 1.3180511662904441e-05, |
|
"loss": 0.7421, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.0529690980911255, |
|
"learning_rate": 1.3162829194883372e-05, |
|
"loss": 0.7665, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 35.5570068359375, |
|
"learning_rate": 1.3145146726862302e-05, |
|
"loss": 0.6587, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 47.973697662353516, |
|
"learning_rate": 1.3127464258841235e-05, |
|
"loss": 0.8273, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 42.45454788208008, |
|
"learning_rate": 1.3109781790820166e-05, |
|
"loss": 0.8512, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 52.255821228027344, |
|
"learning_rate": 1.3092099322799096e-05, |
|
"loss": 0.5748, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 55.622413635253906, |
|
"learning_rate": 1.307441685477803e-05, |
|
"loss": 0.6585, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 6.804417610168457, |
|
"learning_rate": 1.3056734386756961e-05, |
|
"loss": 0.9276, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 8.9085054397583, |
|
"learning_rate": 1.3039051918735892e-05, |
|
"loss": 0.9573, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 3.399890422821045, |
|
"learning_rate": 1.3021369450714824e-05, |
|
"loss": 0.815, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 9.621098518371582, |
|
"learning_rate": 1.3003686982693755e-05, |
|
"loss": 0.6272, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 34.52663803100586, |
|
"learning_rate": 1.2986004514672686e-05, |
|
"loss": 0.7548, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 38.8935661315918, |
|
"learning_rate": 1.296832204665162e-05, |
|
"loss": 1.0272, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 53.31705093383789, |
|
"learning_rate": 1.295063957863055e-05, |
|
"loss": 0.8594, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 24.726455688476562, |
|
"learning_rate": 1.2932957110609481e-05, |
|
"loss": 0.7025, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 35.29804992675781, |
|
"learning_rate": 1.2915274642588413e-05, |
|
"loss": 0.8359, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 15.382336616516113, |
|
"learning_rate": 1.2897592174567344e-05, |
|
"loss": 0.7358, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 2.9050614833831787, |
|
"learning_rate": 1.2879909706546275e-05, |
|
"loss": 0.8021, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 44.734962463378906, |
|
"learning_rate": 1.2862227238525209e-05, |
|
"loss": 0.844, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 14.811912536621094, |
|
"learning_rate": 1.284454477050414e-05, |
|
"loss": 0.7822, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 44.70045471191406, |
|
"learning_rate": 1.282686230248307e-05, |
|
"loss": 1.0654, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 48.43465805053711, |
|
"learning_rate": 1.2809179834462003e-05, |
|
"loss": 0.6354, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 47.798423767089844, |
|
"learning_rate": 1.2791497366440933e-05, |
|
"loss": 0.8125, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 42.33122634887695, |
|
"learning_rate": 1.2773814898419864e-05, |
|
"loss": 1.1325, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.14906466007232666, |
|
"learning_rate": 1.2756132430398797e-05, |
|
"loss": 0.5325, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 10.49329662322998, |
|
"learning_rate": 1.2738449962377729e-05, |
|
"loss": 0.7013, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 21.828550338745117, |
|
"learning_rate": 1.272076749435666e-05, |
|
"loss": 0.5134, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 1.0481252670288086, |
|
"learning_rate": 1.270308502633559e-05, |
|
"loss": 1.4255, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.075194001197815, |
|
"learning_rate": 1.2685402558314523e-05, |
|
"loss": 0.7727, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 17.64851188659668, |
|
"learning_rate": 1.2667720090293453e-05, |
|
"loss": 0.4984, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 49.92161178588867, |
|
"learning_rate": 1.2650037622272386e-05, |
|
"loss": 0.9065, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 11.019123077392578, |
|
"learning_rate": 1.2632355154251318e-05, |
|
"loss": 0.8184, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 12.537881851196289, |
|
"learning_rate": 1.2614672686230249e-05, |
|
"loss": 0.6989, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 0.5771467089653015, |
|
"learning_rate": 1.259699021820918e-05, |
|
"loss": 0.7282, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 60.68583297729492, |
|
"learning_rate": 1.2579307750188112e-05, |
|
"loss": 0.695, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 0.7341581583023071, |
|
"learning_rate": 1.2561625282167043e-05, |
|
"loss": 0.7021, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 0.020291157066822052, |
|
"learning_rate": 1.2543942814145975e-05, |
|
"loss": 0.8563, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 1.3924442529678345, |
|
"learning_rate": 1.2526260346124907e-05, |
|
"loss": 0.7378, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 31.691173553466797, |
|
"learning_rate": 1.2508577878103838e-05, |
|
"loss": 0.5887, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 1.1823307275772095, |
|
"learning_rate": 1.2490895410082769e-05, |
|
"loss": 0.8132, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 0.08653511106967926, |
|
"learning_rate": 1.2473212942061701e-05, |
|
"loss": 0.8374, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 2.169903039932251, |
|
"learning_rate": 1.2455530474040632e-05, |
|
"loss": 0.588, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 56.76768112182617, |
|
"learning_rate": 1.2437848006019564e-05, |
|
"loss": 0.7869, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 0.05390803515911102, |
|
"learning_rate": 1.2420165537998497e-05, |
|
"loss": 0.6243, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 5.537655830383301, |
|
"learning_rate": 1.2402483069977427e-05, |
|
"loss": 0.737, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 69.44229125976562, |
|
"learning_rate": 1.2384800601956358e-05, |
|
"loss": 1.0479, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 45.22208023071289, |
|
"learning_rate": 1.236711813393529e-05, |
|
"loss": 0.8327, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 22.553054809570312, |
|
"learning_rate": 1.2349435665914221e-05, |
|
"loss": 0.6587, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 2.1869142055511475, |
|
"learning_rate": 1.2331753197893154e-05, |
|
"loss": 0.5913, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 2.483933210372925, |
|
"learning_rate": 1.2314070729872086e-05, |
|
"loss": 0.8163, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 18.768310546875, |
|
"learning_rate": 1.2296388261851017e-05, |
|
"loss": 0.6273, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 56.0864372253418, |
|
"learning_rate": 1.2278705793829947e-05, |
|
"loss": 0.8787, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 51.98051834106445, |
|
"learning_rate": 1.226102332580888e-05, |
|
"loss": 0.4302, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 17.60165023803711, |
|
"learning_rate": 1.224334085778781e-05, |
|
"loss": 0.7238, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 48.4942626953125, |
|
"learning_rate": 1.2225658389766743e-05, |
|
"loss": 0.8018, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 14.206453323364258, |
|
"learning_rate": 1.2207975921745674e-05, |
|
"loss": 0.5428, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 65.64610290527344, |
|
"learning_rate": 1.2190293453724606e-05, |
|
"loss": 0.7923, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 9.786343574523926, |
|
"learning_rate": 1.2172610985703537e-05, |
|
"loss": 0.7779, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 3.1632120609283447, |
|
"learning_rate": 1.2154928517682467e-05, |
|
"loss": 0.6474, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 15.631272315979004, |
|
"learning_rate": 1.21372460496614e-05, |
|
"loss": 0.6736, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.025490593165159225, |
|
"learning_rate": 1.2119563581640332e-05, |
|
"loss": 0.6371, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 84.42486572265625, |
|
"learning_rate": 1.2101881113619263e-05, |
|
"loss": 0.9348, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 0.32389989495277405, |
|
"learning_rate": 1.2084198645598195e-05, |
|
"loss": 0.8304, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 49.16242599487305, |
|
"learning_rate": 1.2066516177577126e-05, |
|
"loss": 0.6624, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 119.3700942993164, |
|
"learning_rate": 1.2048833709556057e-05, |
|
"loss": 1.1135, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 0.15834768116474152, |
|
"learning_rate": 1.2031151241534989e-05, |
|
"loss": 0.6358, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 54.722652435302734, |
|
"learning_rate": 1.2013468773513922e-05, |
|
"loss": 0.5639, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 0.0872531533241272, |
|
"learning_rate": 1.1995786305492852e-05, |
|
"loss": 0.7912, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 2.5009591579437256, |
|
"learning_rate": 1.1978103837471785e-05, |
|
"loss": 0.6478, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 1.5101827383041382, |
|
"learning_rate": 1.1960421369450715e-05, |
|
"loss": 0.8577, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 3.4737539291381836, |
|
"learning_rate": 1.1942738901429646e-05, |
|
"loss": 0.9474, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 92.57341003417969, |
|
"learning_rate": 1.1925056433408578e-05, |
|
"loss": 0.8665, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 38.56670379638672, |
|
"learning_rate": 1.1907373965387509e-05, |
|
"loss": 0.7833, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 29.12518310546875, |
|
"learning_rate": 1.1889691497366442e-05, |
|
"loss": 0.7454, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 69.91959381103516, |
|
"learning_rate": 1.1872009029345374e-05, |
|
"loss": 0.7843, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 56.20566177368164, |
|
"learning_rate": 1.1854326561324305e-05, |
|
"loss": 0.841, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 66.2998275756836, |
|
"learning_rate": 1.1836644093303235e-05, |
|
"loss": 0.723, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 1.9407018423080444, |
|
"learning_rate": 1.1818961625282168e-05, |
|
"loss": 0.7235, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 61.69858932495117, |
|
"learning_rate": 1.1801279157261098e-05, |
|
"loss": 0.8241, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 8.412137985229492, |
|
"learning_rate": 1.178359668924003e-05, |
|
"loss": 0.564, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 9.307317733764648, |
|
"learning_rate": 1.1765914221218962e-05, |
|
"loss": 0.8438, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 41.45466995239258, |
|
"learning_rate": 1.1748231753197894e-05, |
|
"loss": 0.7763, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 2.8245513439178467, |
|
"learning_rate": 1.1730549285176825e-05, |
|
"loss": 0.7476, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 76.77831268310547, |
|
"learning_rate": 1.1712866817155757e-05, |
|
"loss": 0.9578, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 0.004409218207001686, |
|
"learning_rate": 1.1695184349134688e-05, |
|
"loss": 0.8765, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 46.58176803588867, |
|
"learning_rate": 1.167750188111362e-05, |
|
"loss": 0.5402, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 5.006879806518555, |
|
"learning_rate": 1.165981941309255e-05, |
|
"loss": 0.4722, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 2.194460153579712, |
|
"learning_rate": 1.1642136945071483e-05, |
|
"loss": 0.858, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 0.012106262147426605, |
|
"learning_rate": 1.1624454477050414e-05, |
|
"loss": 0.6607, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 6.08723258972168, |
|
"learning_rate": 1.1606772009029345e-05, |
|
"loss": 0.866, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 51.338478088378906, |
|
"learning_rate": 1.1589089541008277e-05, |
|
"loss": 0.7508, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 18.472858428955078, |
|
"learning_rate": 1.157140707298721e-05, |
|
"loss": 0.8686, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 4.837900638580322, |
|
"learning_rate": 1.155372460496614e-05, |
|
"loss": 0.5302, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 41.74524688720703, |
|
"learning_rate": 1.1536042136945072e-05, |
|
"loss": 0.7681, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 30.557188034057617, |
|
"learning_rate": 1.1518359668924003e-05, |
|
"loss": 0.9107, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 14.001880645751953, |
|
"learning_rate": 1.1500677200902934e-05, |
|
"loss": 0.5387, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 0.1815216839313507, |
|
"learning_rate": 1.1482994732881866e-05, |
|
"loss": 0.8152, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 36.915061950683594, |
|
"learning_rate": 1.1465312264860799e-05, |
|
"loss": 0.6313, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 0.20334406197071075, |
|
"learning_rate": 1.144762979683973e-05, |
|
"loss": 0.8265, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 0.0018741831881925464, |
|
"learning_rate": 1.1429947328818662e-05, |
|
"loss": 0.7202, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 0.000707630708348006, |
|
"learning_rate": 1.1412264860797592e-05, |
|
"loss": 0.6488, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 0.4616662561893463, |
|
"learning_rate": 1.1394582392776523e-05, |
|
"loss": 0.9402, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 43.170814514160156, |
|
"learning_rate": 1.1376899924755456e-05, |
|
"loss": 0.763, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 3.035790205001831, |
|
"learning_rate": 1.1359217456734388e-05, |
|
"loss": 0.5681, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 45.11912536621094, |
|
"learning_rate": 1.1341534988713319e-05, |
|
"loss": 0.9795, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 2.006427049636841, |
|
"learning_rate": 1.1323852520692251e-05, |
|
"loss": 0.4772, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 69.13399505615234, |
|
"learning_rate": 1.1306170052671182e-05, |
|
"loss": 0.8649, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 43.80717468261719, |
|
"learning_rate": 1.1288487584650112e-05, |
|
"loss": 0.6051, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 1.3676908016204834, |
|
"learning_rate": 1.1270805116629045e-05, |
|
"loss": 0.4737, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 18.533445358276367, |
|
"learning_rate": 1.1253122648607977e-05, |
|
"loss": 0.4353, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 0.649580717086792, |
|
"learning_rate": 1.1235440180586908e-05, |
|
"loss": 0.9283, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 37.0181999206543, |
|
"learning_rate": 1.1217757712565839e-05, |
|
"loss": 0.8631, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 1.1191781759262085, |
|
"learning_rate": 1.1200075244544771e-05, |
|
"loss": 0.7166, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 46.35097885131836, |
|
"learning_rate": 1.1182392776523702e-05, |
|
"loss": 0.6263, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 9.393693923950195, |
|
"learning_rate": 1.1164710308502632e-05, |
|
"loss": 0.7146, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"grad_norm": 49.04343032836914, |
|
"learning_rate": 1.1147027840481567e-05, |
|
"loss": 0.5924, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"grad_norm": 2.917092800140381, |
|
"learning_rate": 1.1129345372460497e-05, |
|
"loss": 0.815, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 6.2741618156433105, |
|
"learning_rate": 1.1111662904439428e-05, |
|
"loss": 0.8852, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"grad_norm": 0.026425007730722427, |
|
"learning_rate": 1.109398043641836e-05, |
|
"loss": 0.609, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"grad_norm": 8.229249954223633, |
|
"learning_rate": 1.1076297968397291e-05, |
|
"loss": 0.5546, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 3.257112979888916, |
|
"learning_rate": 1.1058615500376222e-05, |
|
"loss": 0.6084, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 44.147640228271484, |
|
"learning_rate": 1.1040933032355156e-05, |
|
"loss": 0.4687, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"grad_norm": 93.26548767089844, |
|
"learning_rate": 1.1023250564334087e-05, |
|
"loss": 0.6323, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 83.17293548583984, |
|
"learning_rate": 1.1005568096313017e-05, |
|
"loss": 0.8759, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"grad_norm": 51.27419662475586, |
|
"learning_rate": 1.098788562829195e-05, |
|
"loss": 0.69, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"grad_norm": 0.0010558576323091984, |
|
"learning_rate": 1.097020316027088e-05, |
|
"loss": 0.5279, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 73.43231201171875, |
|
"learning_rate": 1.0952520692249811e-05, |
|
"loss": 0.9285, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 6.488553047180176, |
|
"learning_rate": 1.0934838224228745e-05, |
|
"loss": 0.6137, |
|
"step": 8950 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 53.465972900390625, |
|
"learning_rate": 1.0917155756207676e-05, |
|
"loss": 0.4718, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 1.405421495437622, |
|
"learning_rate": 1.0899473288186607e-05, |
|
"loss": 0.7248, |
|
"step": 9050 |
|
}, |
|
{ |
|
"epoch": 2.28, |
|
"grad_norm": 58.552490234375, |
|
"learning_rate": 1.0881790820165539e-05, |
|
"loss": 0.6312, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 85.75029754638672, |
|
"learning_rate": 1.086410835214447e-05, |
|
"loss": 1.1383, |
|
"step": 9150 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"grad_norm": 1.4940392971038818, |
|
"learning_rate": 1.08464258841234e-05, |
|
"loss": 0.6104, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 1.2434502840042114, |
|
"learning_rate": 1.0828743416102334e-05, |
|
"loss": 0.5124, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"grad_norm": 0.002772190608084202, |
|
"learning_rate": 1.0811060948081265e-05, |
|
"loss": 0.8389, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 27.42812156677246, |
|
"learning_rate": 1.0793378480060196e-05, |
|
"loss": 0.6571, |
|
"step": 9350 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 70.63783264160156, |
|
"learning_rate": 1.0775696012039128e-05, |
|
"loss": 0.5234, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 0.873970627784729, |
|
"learning_rate": 1.0758013544018059e-05, |
|
"loss": 0.7862, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"grad_norm": 0.0001105390620068647, |
|
"learning_rate": 1.074033107599699e-05, |
|
"loss": 0.9885, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 2.0316097736358643, |
|
"learning_rate": 1.0722648607975922e-05, |
|
"loss": 0.6648, |
|
"step": 9550 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"grad_norm": 33.791568756103516, |
|
"learning_rate": 1.0704966139954854e-05, |
|
"loss": 0.6746, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 112.26337432861328, |
|
"learning_rate": 1.0687283671933785e-05, |
|
"loss": 0.787, |
|
"step": 9650 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"grad_norm": 53.35863494873047, |
|
"learning_rate": 1.0669601203912716e-05, |
|
"loss": 0.5922, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"grad_norm": 0.0027942871674895287, |
|
"learning_rate": 1.0651918735891648e-05, |
|
"loss": 0.6236, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 0.00036070370697416365, |
|
"learning_rate": 1.0634236267870579e-05, |
|
"loss": 0.6559, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"grad_norm": 2.5188686847686768, |
|
"learning_rate": 1.0616553799849511e-05, |
|
"loss": 1.002, |
|
"step": 9850 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 42.79086685180664, |
|
"learning_rate": 1.0598871331828444e-05, |
|
"loss": 1.001, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.06492776423692703, |
|
"learning_rate": 1.0581188863807374e-05, |
|
"loss": 0.9975, |
|
"step": 9950 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"grad_norm": 12.079846382141113, |
|
"learning_rate": 1.0563506395786305e-05, |
|
"loss": 0.6417, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 98.72542572021484, |
|
"learning_rate": 1.0545823927765237e-05, |
|
"loss": 0.9242, |
|
"step": 10050 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 0.15632659196853638, |
|
"learning_rate": 1.0528141459744168e-05, |
|
"loss": 0.4118, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 3.5314505100250244, |
|
"learning_rate": 1.05104589917231e-05, |
|
"loss": 0.6486, |
|
"step": 10150 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 0.06171553581953049, |
|
"learning_rate": 1.0492776523702033e-05, |
|
"loss": 0.7782, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 2.57, |
|
"grad_norm": 69.53456115722656, |
|
"learning_rate": 1.0475094055680964e-05, |
|
"loss": 0.5421, |
|
"step": 10250 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 27.149484634399414, |
|
"learning_rate": 1.0457411587659894e-05, |
|
"loss": 0.7476, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 3.7423877716064453, |
|
"learning_rate": 1.0439729119638827e-05, |
|
"loss": 0.7429, |
|
"step": 10350 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"grad_norm": 0.6006436944007874, |
|
"learning_rate": 1.0422046651617757e-05, |
|
"loss": 0.4376, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 0.2609996497631073, |
|
"learning_rate": 1.040436418359669e-05, |
|
"loss": 0.8938, |
|
"step": 10450 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 73.91007232666016, |
|
"learning_rate": 1.0386681715575622e-05, |
|
"loss": 0.7273, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 0.010080622509121895, |
|
"learning_rate": 1.0368999247554553e-05, |
|
"loss": 0.7709, |
|
"step": 10550 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"grad_norm": 5.206912994384766, |
|
"learning_rate": 1.0351316779533484e-05, |
|
"loss": 0.696, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 94.36717987060547, |
|
"learning_rate": 1.0333634311512416e-05, |
|
"loss": 0.6964, |
|
"step": 10650 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 0.6438612341880798, |
|
"learning_rate": 1.0315951843491347e-05, |
|
"loss": 0.6461, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 0.02532346546649933, |
|
"learning_rate": 1.029826937547028e-05, |
|
"loss": 0.8581, |
|
"step": 10750 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 1.5096291303634644, |
|
"learning_rate": 1.0280586907449212e-05, |
|
"loss": 0.4629, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"grad_norm": 81.77324676513672, |
|
"learning_rate": 1.0262904439428142e-05, |
|
"loss": 0.8681, |
|
"step": 10850 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 1.1398659944534302, |
|
"learning_rate": 1.0245221971407073e-05, |
|
"loss": 0.5162, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 0.4226570725440979, |
|
"learning_rate": 1.0227539503386005e-05, |
|
"loss": 0.4572, |
|
"step": 10950 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 0.02047480270266533, |
|
"learning_rate": 1.0209857035364936e-05, |
|
"loss": 0.8946, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 124.79954528808594, |
|
"learning_rate": 1.0192174567343868e-05, |
|
"loss": 0.8325, |
|
"step": 11050 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 7.112376624718308e-05, |
|
"learning_rate": 1.01744920993228e-05, |
|
"loss": 0.5664, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 78.66365051269531, |
|
"learning_rate": 1.0156809631301732e-05, |
|
"loss": 0.9426, |
|
"step": 11150 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 9.567934466758743e-05, |
|
"learning_rate": 1.0139127163280662e-05, |
|
"loss": 0.4818, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 0.003907013684511185, |
|
"learning_rate": 1.0121444695259593e-05, |
|
"loss": 0.743, |
|
"step": 11250 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 84.53366088867188, |
|
"learning_rate": 1.0103762227238525e-05, |
|
"loss": 0.8544, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 3.4674291610717773, |
|
"learning_rate": 1.0086079759217458e-05, |
|
"loss": 0.5553, |
|
"step": 11350 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 125.62838745117188, |
|
"learning_rate": 1.0068397291196388e-05, |
|
"loss": 0.6168, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 99.19140625, |
|
"learning_rate": 1.005071482317532e-05, |
|
"loss": 1.1238, |
|
"step": 11450 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 38.717559814453125, |
|
"learning_rate": 1.0033032355154252e-05, |
|
"loss": 1.0667, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 28.915889739990234, |
|
"learning_rate": 1.0015349887133182e-05, |
|
"loss": 0.5045, |
|
"step": 11550 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 48.31145477294922, |
|
"learning_rate": 9.997667419112115e-06, |
|
"loss": 0.754, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 0.06709738075733185, |
|
"learning_rate": 9.979984951091047e-06, |
|
"loss": 0.6229, |
|
"step": 11650 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 1.2689626216888428, |
|
"learning_rate": 9.962302483069978e-06, |
|
"loss": 0.7818, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 35.311134338378906, |
|
"learning_rate": 9.94462001504891e-06, |
|
"loss": 1.0477, |
|
"step": 11750 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 88.91561889648438, |
|
"learning_rate": 9.92693754702784e-06, |
|
"loss": 0.6488, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"grad_norm": 70.55093383789062, |
|
"learning_rate": 9.909255079006772e-06, |
|
"loss": 0.5951, |
|
"step": 11850 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 89.51988983154297, |
|
"learning_rate": 9.891572610985704e-06, |
|
"loss": 0.6867, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.40069764852523804, |
|
"learning_rate": 9.873890142964636e-06, |
|
"loss": 0.7094, |
|
"step": 11950 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"grad_norm": 2.006258964538574, |
|
"learning_rate": 9.856207674943567e-06, |
|
"loss": 0.5428, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 3.02, |
|
"grad_norm": 51.34798049926758, |
|
"learning_rate": 9.8385252069225e-06, |
|
"loss": 0.573, |
|
"step": 12050 |
|
}, |
|
{ |
|
"epoch": 3.03, |
|
"grad_norm": 95.47881317138672, |
|
"learning_rate": 9.82084273890143e-06, |
|
"loss": 0.4226, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 3.05, |
|
"grad_norm": 0.07185523957014084, |
|
"learning_rate": 9.80316027088036e-06, |
|
"loss": 0.6424, |
|
"step": 12150 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"grad_norm": 109.8128662109375, |
|
"learning_rate": 9.785477802859293e-06, |
|
"loss": 0.5279, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 3.07, |
|
"grad_norm": 44.56191635131836, |
|
"learning_rate": 9.767795334838224e-06, |
|
"loss": 0.3463, |
|
"step": 12250 |
|
}, |
|
{ |
|
"epoch": 3.09, |
|
"grad_norm": 0.45552492141723633, |
|
"learning_rate": 9.750112866817156e-06, |
|
"loss": 0.6696, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"grad_norm": 0.0008902169647626579, |
|
"learning_rate": 9.732430398796089e-06, |
|
"loss": 0.3845, |
|
"step": 12350 |
|
}, |
|
{ |
|
"epoch": 3.11, |
|
"grad_norm": 134.49839782714844, |
|
"learning_rate": 9.71474793077502e-06, |
|
"loss": 0.8803, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 0.21923835575580597, |
|
"learning_rate": 9.69706546275395e-06, |
|
"loss": 0.741, |
|
"step": 12450 |
|
}, |
|
{ |
|
"epoch": 3.14, |
|
"grad_norm": 0.2331884801387787, |
|
"learning_rate": 9.679382994732883e-06, |
|
"loss": 0.7015, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"grad_norm": 0.4663000702857971, |
|
"learning_rate": 9.661700526711813e-06, |
|
"loss": 0.7605, |
|
"step": 12550 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"grad_norm": 59.55733871459961, |
|
"learning_rate": 9.644018058690746e-06, |
|
"loss": 0.5855, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"grad_norm": 0.8377301096916199, |
|
"learning_rate": 9.626335590669676e-06, |
|
"loss": 0.4117, |
|
"step": 12650 |
|
}, |
|
{ |
|
"epoch": 3.19, |
|
"grad_norm": 64.69242095947266, |
|
"learning_rate": 9.608653122648609e-06, |
|
"loss": 0.5216, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 0.8485704660415649, |
|
"learning_rate": 9.59097065462754e-06, |
|
"loss": 0.6882, |
|
"step": 12750 |
|
}, |
|
{ |
|
"epoch": 3.21, |
|
"grad_norm": 143.98147583007812, |
|
"learning_rate": 9.57328818660647e-06, |
|
"loss": 0.6463, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 3.22, |
|
"grad_norm": 132.84567260742188, |
|
"learning_rate": 9.555605718585403e-06, |
|
"loss": 0.7474, |
|
"step": 12850 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 8.179304122924805, |
|
"learning_rate": 9.537923250564335e-06, |
|
"loss": 0.375, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 10.138591766357422, |
|
"learning_rate": 9.520240782543266e-06, |
|
"loss": 0.7204, |
|
"step": 12950 |
|
}, |
|
{ |
|
"epoch": 3.26, |
|
"grad_norm": 0.00011070028267567977, |
|
"learning_rate": 9.502558314522198e-06, |
|
"loss": 0.3631, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 3.27, |
|
"grad_norm": 1.0425533056259155, |
|
"learning_rate": 9.484875846501129e-06, |
|
"loss": 0.6752, |
|
"step": 13050 |
|
}, |
|
{ |
|
"epoch": 3.29, |
|
"grad_norm": 19.544971466064453, |
|
"learning_rate": 9.46719337848006e-06, |
|
"loss": 0.4082, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 3.3, |
|
"grad_norm": 3.29071121996094e-06, |
|
"learning_rate": 9.449510910458992e-06, |
|
"loss": 0.752, |
|
"step": 13150 |
|
}, |
|
{ |
|
"epoch": 3.31, |
|
"grad_norm": 1.4096872806549072, |
|
"learning_rate": 9.431828442437924e-06, |
|
"loss": 0.739, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 3.32, |
|
"grad_norm": 0.1742667555809021, |
|
"learning_rate": 9.414145974416855e-06, |
|
"loss": 0.5783, |
|
"step": 13250 |
|
}, |
|
{ |
|
"epoch": 3.34, |
|
"grad_norm": 0.8604665398597717, |
|
"learning_rate": 9.396463506395787e-06, |
|
"loss": 0.8603, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 3.35, |
|
"grad_norm": 6.3410016082343645e-06, |
|
"learning_rate": 9.378781038374718e-06, |
|
"loss": 0.4481, |
|
"step": 13350 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"grad_norm": 157.0394744873047, |
|
"learning_rate": 9.361098570353649e-06, |
|
"loss": 0.6242, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 3.37, |
|
"grad_norm": 0.00026235656696371734, |
|
"learning_rate": 9.343416102332581e-06, |
|
"loss": 0.7734, |
|
"step": 13450 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"grad_norm": 0.48436620831489563, |
|
"learning_rate": 9.325733634311513e-06, |
|
"loss": 0.4109, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 136.50823974609375, |
|
"learning_rate": 9.308051166290444e-06, |
|
"loss": 0.6214, |
|
"step": 13550 |
|
}, |
|
{ |
|
"epoch": 3.41, |
|
"grad_norm": 0.14412285387516022, |
|
"learning_rate": 9.290368698269377e-06, |
|
"loss": 0.2606, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 3.42, |
|
"grad_norm": 11.025894165039062, |
|
"learning_rate": 9.272686230248307e-06, |
|
"loss": 0.7337, |
|
"step": 13650 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 121.1470718383789, |
|
"learning_rate": 9.255003762227238e-06, |
|
"loss": 0.7108, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 3.45, |
|
"grad_norm": 0.08408990502357483, |
|
"learning_rate": 9.23732129420617e-06, |
|
"loss": 0.4979, |
|
"step": 13750 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"grad_norm": 0.05547923222184181, |
|
"learning_rate": 9.219638826185103e-06, |
|
"loss": 0.299, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 3.47, |
|
"grad_norm": 131.8295135498047, |
|
"learning_rate": 9.201956358164033e-06, |
|
"loss": 1.0513, |
|
"step": 13850 |
|
}, |
|
{ |
|
"epoch": 3.49, |
|
"grad_norm": 40.073734283447266, |
|
"learning_rate": 9.184273890142966e-06, |
|
"loss": 0.4599, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 18.33232879638672, |
|
"learning_rate": 9.166591422121897e-06, |
|
"loss": 0.6182, |
|
"step": 13950 |
|
}, |
|
{ |
|
"epoch": 3.51, |
|
"grad_norm": 0.02969328872859478, |
|
"learning_rate": 9.148908954100827e-06, |
|
"loss": 0.4793, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"grad_norm": 0.36942940950393677, |
|
"learning_rate": 9.13122648607976e-06, |
|
"loss": 0.3778, |
|
"step": 14050 |
|
}, |
|
{ |
|
"epoch": 3.54, |
|
"grad_norm": 0.076649971306324, |
|
"learning_rate": 9.113544018058692e-06, |
|
"loss": 0.6148, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"grad_norm": 282.6568298339844, |
|
"learning_rate": 9.095861550037623e-06, |
|
"loss": 0.6784, |
|
"step": 14150 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"grad_norm": 0.14636385440826416, |
|
"learning_rate": 9.078179082016553e-06, |
|
"loss": 0.9237, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 3.57, |
|
"grad_norm": 0.014414280652999878, |
|
"learning_rate": 9.060496613995486e-06, |
|
"loss": 0.7111, |
|
"step": 14250 |
|
}, |
|
{ |
|
"epoch": 3.59, |
|
"grad_norm": 0.10564962774515152, |
|
"learning_rate": 9.042814145974417e-06, |
|
"loss": 0.4485, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 0.10087831318378448, |
|
"learning_rate": 9.025131677953347e-06, |
|
"loss": 0.7537, |
|
"step": 14350 |
|
}, |
|
{ |
|
"epoch": 3.61, |
|
"grad_norm": 75.64422607421875, |
|
"learning_rate": 9.007449209932281e-06, |
|
"loss": 0.4629, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"grad_norm": 87.81208801269531, |
|
"learning_rate": 8.989766741911212e-06, |
|
"loss": 0.5313, |
|
"step": 14450 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 0.0018619262846186757, |
|
"learning_rate": 8.972084273890143e-06, |
|
"loss": 0.7642, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 3.65, |
|
"grad_norm": 110.11195373535156, |
|
"learning_rate": 8.954401805869075e-06, |
|
"loss": 0.6499, |
|
"step": 14550 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"grad_norm": 0.008621015585958958, |
|
"learning_rate": 8.936719337848006e-06, |
|
"loss": 0.3583, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 3.67, |
|
"grad_norm": 0.022055380046367645, |
|
"learning_rate": 8.919036869826937e-06, |
|
"loss": 0.5497, |
|
"step": 14650 |
|
}, |
|
{ |
|
"epoch": 3.69, |
|
"grad_norm": 67.4389419555664, |
|
"learning_rate": 8.90135440180587e-06, |
|
"loss": 0.5981, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 3.7, |
|
"grad_norm": 0.000478647300042212, |
|
"learning_rate": 8.883671933784801e-06, |
|
"loss": 0.259, |
|
"step": 14750 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"grad_norm": 1.5297553539276123, |
|
"learning_rate": 8.865989465763732e-06, |
|
"loss": 0.6259, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 3.72, |
|
"grad_norm": 36.321128845214844, |
|
"learning_rate": 8.848306997742664e-06, |
|
"loss": 0.6844, |
|
"step": 14850 |
|
}, |
|
{ |
|
"epoch": 3.74, |
|
"grad_norm": 175.9180450439453, |
|
"learning_rate": 8.830624529721595e-06, |
|
"loss": 0.5772, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 178.33462524414062, |
|
"learning_rate": 8.812942061700526e-06, |
|
"loss": 0.5891, |
|
"step": 14950 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 0.00013845643843524158, |
|
"learning_rate": 8.79525959367946e-06, |
|
"loss": 0.464, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 3.77, |
|
"grad_norm": 127.49348449707031, |
|
"learning_rate": 8.77757712565839e-06, |
|
"loss": 0.5844, |
|
"step": 15050 |
|
}, |
|
{ |
|
"epoch": 3.79, |
|
"grad_norm": 1.6402578353881836, |
|
"learning_rate": 8.759894657637321e-06, |
|
"loss": 0.7526, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 0.008880015462636948, |
|
"learning_rate": 8.742212189616254e-06, |
|
"loss": 0.9234, |
|
"step": 15150 |
|
}, |
|
{ |
|
"epoch": 3.81, |
|
"grad_norm": 0.4811843931674957, |
|
"learning_rate": 8.724529721595184e-06, |
|
"loss": 0.848, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"grad_norm": 0.0008742750505916774, |
|
"learning_rate": 8.706847253574115e-06, |
|
"loss": 0.4136, |
|
"step": 15250 |
|
}, |
|
{ |
|
"epoch": 3.84, |
|
"grad_norm": 45.28816604614258, |
|
"learning_rate": 8.68916478555305e-06, |
|
"loss": 0.6978, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"grad_norm": 0.014465034939348698, |
|
"learning_rate": 8.67148231753198e-06, |
|
"loss": 0.6124, |
|
"step": 15350 |
|
}, |
|
{ |
|
"epoch": 3.86, |
|
"grad_norm": 0.01468442752957344, |
|
"learning_rate": 8.65379984951091e-06, |
|
"loss": 0.925, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 1.076714283954061e-06, |
|
"learning_rate": 8.636117381489843e-06, |
|
"loss": 0.5271, |
|
"step": 15450 |
|
}, |
|
{ |
|
"epoch": 3.89, |
|
"grad_norm": 4.781663847097661e-07, |
|
"learning_rate": 8.618434913468774e-06, |
|
"loss": 0.4686, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"grad_norm": 1.0695022344589233, |
|
"learning_rate": 8.600752445447704e-06, |
|
"loss": 1.076, |
|
"step": 15550 |
|
}, |
|
{ |
|
"epoch": 3.91, |
|
"grad_norm": 0.3064178228378296, |
|
"learning_rate": 8.583069977426637e-06, |
|
"loss": 0.4409, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 3.93, |
|
"grad_norm": 95.81256103515625, |
|
"learning_rate": 8.56538750940557e-06, |
|
"loss": 0.628, |
|
"step": 15650 |
|
}, |
|
{ |
|
"epoch": 3.94, |
|
"grad_norm": 0.011423008516430855, |
|
"learning_rate": 8.5477050413845e-06, |
|
"loss": 0.4738, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 3.95, |
|
"grad_norm": 68.7823257446289, |
|
"learning_rate": 8.53002257336343e-06, |
|
"loss": 0.5614, |
|
"step": 15750 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"grad_norm": 0.0003278045041952282, |
|
"learning_rate": 8.512340105342363e-06, |
|
"loss": 0.451, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 3.98, |
|
"grad_norm": 5.685105293196102e-07, |
|
"learning_rate": 8.494657637321294e-06, |
|
"loss": 0.6919, |
|
"step": 15850 |
|
}, |
|
{ |
|
"epoch": 3.99, |
|
"grad_norm": 0.006908051203936338, |
|
"learning_rate": 8.476975169300226e-06, |
|
"loss": 0.7209, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.14153322577476501, |
|
"learning_rate": 8.459292701279158e-06, |
|
"loss": 0.8544, |
|
"step": 15950 |
|
}, |
|
{ |
|
"epoch": 4.01, |
|
"grad_norm": 0.01233228575438261, |
|
"learning_rate": 8.44161023325809e-06, |
|
"loss": 0.1127, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 4.03, |
|
"grad_norm": 0.02049972675740719, |
|
"learning_rate": 8.42392776523702e-06, |
|
"loss": 0.2392, |
|
"step": 16050 |
|
}, |
|
{ |
|
"epoch": 4.04, |
|
"grad_norm": 0.6001113653182983, |
|
"learning_rate": 8.406245297215952e-06, |
|
"loss": 0.2408, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 4.05, |
|
"grad_norm": 0.7253586649894714, |
|
"learning_rate": 8.388562829194883e-06, |
|
"loss": 0.679, |
|
"step": 16150 |
|
}, |
|
{ |
|
"epoch": 4.06, |
|
"grad_norm": 0.20070885121822357, |
|
"learning_rate": 8.370880361173815e-06, |
|
"loss": 0.5534, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 4.08, |
|
"grad_norm": 0.004428381100296974, |
|
"learning_rate": 8.353197893152748e-06, |
|
"loss": 0.3753, |
|
"step": 16250 |
|
}, |
|
{ |
|
"epoch": 4.09, |
|
"grad_norm": 0.1646382063627243, |
|
"learning_rate": 8.335515425131678e-06, |
|
"loss": 0.7136, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 4.1, |
|
"grad_norm": 4.304123401641846, |
|
"learning_rate": 8.31783295711061e-06, |
|
"loss": 0.6533, |
|
"step": 16350 |
|
}, |
|
{ |
|
"epoch": 4.11, |
|
"grad_norm": 0.0014060864923521876, |
|
"learning_rate": 8.300150489089542e-06, |
|
"loss": 0.2931, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 4.13, |
|
"grad_norm": 10.760331153869629, |
|
"learning_rate": 8.282468021068472e-06, |
|
"loss": 0.2996, |
|
"step": 16450 |
|
}, |
|
{ |
|
"epoch": 4.14, |
|
"grad_norm": 151.8526611328125, |
|
"learning_rate": 8.264785553047405e-06, |
|
"loss": 0.2342, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 4.15, |
|
"grad_norm": 0.2262250781059265, |
|
"learning_rate": 8.247103085026337e-06, |
|
"loss": 0.2152, |
|
"step": 16550 |
|
}, |
|
{ |
|
"epoch": 4.16, |
|
"grad_norm": 0.028175359591841698, |
|
"learning_rate": 8.229420617005268e-06, |
|
"loss": 0.2108, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 4.18, |
|
"grad_norm": 1.2244036197662354, |
|
"learning_rate": 8.211738148984198e-06, |
|
"loss": 0.4471, |
|
"step": 16650 |
|
}, |
|
{ |
|
"epoch": 4.19, |
|
"grad_norm": 0.12875045835971832, |
|
"learning_rate": 8.194055680963131e-06, |
|
"loss": 0.5662, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 4.2, |
|
"grad_norm": 3.702627420425415, |
|
"learning_rate": 8.176373212942062e-06, |
|
"loss": 0.3945, |
|
"step": 16750 |
|
}, |
|
{ |
|
"epoch": 4.21, |
|
"grad_norm": 50.61404800415039, |
|
"learning_rate": 8.158690744920994e-06, |
|
"loss": 0.2347, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 4.23, |
|
"grad_norm": 0.736967146396637, |
|
"learning_rate": 8.141008276899926e-06, |
|
"loss": 0.2615, |
|
"step": 16850 |
|
}, |
|
{ |
|
"epoch": 4.24, |
|
"grad_norm": 0.00011446132702985778, |
|
"learning_rate": 8.123325808878857e-06, |
|
"loss": 0.5149, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"grad_norm": 0.0010398293379694223, |
|
"learning_rate": 8.105643340857788e-06, |
|
"loss": 0.2957, |
|
"step": 16950 |
|
}, |
|
{ |
|
"epoch": 4.26, |
|
"grad_norm": 0.26418277621269226, |
|
"learning_rate": 8.08796087283672e-06, |
|
"loss": 0.2704, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 4.28, |
|
"grad_norm": 0.8061837553977966, |
|
"learning_rate": 8.070278404815651e-06, |
|
"loss": 0.367, |
|
"step": 17050 |
|
}, |
|
{ |
|
"epoch": 4.29, |
|
"grad_norm": 0.010115943849086761, |
|
"learning_rate": 8.052595936794583e-06, |
|
"loss": 0.3768, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 4.3, |
|
"grad_norm": 27.51811981201172, |
|
"learning_rate": 8.034913468773514e-06, |
|
"loss": 0.3892, |
|
"step": 17150 |
|
}, |
|
{ |
|
"epoch": 4.31, |
|
"grad_norm": 0.000684226572047919, |
|
"learning_rate": 8.017231000752446e-06, |
|
"loss": 0.1805, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 4.33, |
|
"grad_norm": 0.08357678353786469, |
|
"learning_rate": 7.999548532731377e-06, |
|
"loss": 0.2773, |
|
"step": 17250 |
|
}, |
|
{ |
|
"epoch": 4.34, |
|
"grad_norm": 292.503662109375, |
|
"learning_rate": 7.981866064710308e-06, |
|
"loss": 0.6283, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 4.35, |
|
"grad_norm": 0.1264430582523346, |
|
"learning_rate": 7.96418359668924e-06, |
|
"loss": 0.4072, |
|
"step": 17350 |
|
}, |
|
{ |
|
"epoch": 4.36, |
|
"grad_norm": 1.3433716958388686e-05, |
|
"learning_rate": 7.946501128668173e-06, |
|
"loss": 0.8405, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 4.38, |
|
"grad_norm": 27.759994506835938, |
|
"learning_rate": 7.928818660647103e-06, |
|
"loss": 0.4456, |
|
"step": 17450 |
|
}, |
|
{ |
|
"epoch": 4.39, |
|
"grad_norm": 369.9099426269531, |
|
"learning_rate": 7.911136192626036e-06, |
|
"loss": 0.3382, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"grad_norm": 6.0055251121521, |
|
"learning_rate": 7.893453724604966e-06, |
|
"loss": 0.329, |
|
"step": 17550 |
|
}, |
|
{ |
|
"epoch": 4.41, |
|
"grad_norm": 0.17973710596561432, |
|
"learning_rate": 7.875771256583897e-06, |
|
"loss": 0.6193, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 4.43, |
|
"grad_norm": 0.03942597284913063, |
|
"learning_rate": 7.85808878856283e-06, |
|
"loss": 0.3823, |
|
"step": 17650 |
|
}, |
|
{ |
|
"epoch": 4.44, |
|
"grad_norm": 0.0010533991735428572, |
|
"learning_rate": 7.840406320541762e-06, |
|
"loss": 0.6641, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 4.45, |
|
"grad_norm": 3.6850650531050633e-07, |
|
"learning_rate": 7.822723852520693e-06, |
|
"loss": 0.4148, |
|
"step": 17750 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"grad_norm": 5.283959399093874e-05, |
|
"learning_rate": 7.805041384499625e-06, |
|
"loss": 0.799, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 4.48, |
|
"grad_norm": 0.01196613721549511, |
|
"learning_rate": 7.787358916478556e-06, |
|
"loss": 0.5424, |
|
"step": 17850 |
|
}, |
|
{ |
|
"epoch": 4.49, |
|
"grad_norm": 211.05799865722656, |
|
"learning_rate": 7.769676448457486e-06, |
|
"loss": 0.2341, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"grad_norm": 8.655371743770957e-07, |
|
"learning_rate": 7.751993980436419e-06, |
|
"loss": 0.5349, |
|
"step": 17950 |
|
}, |
|
{ |
|
"epoch": 4.51, |
|
"grad_norm": 1.5644945408621602e-11, |
|
"learning_rate": 7.734311512415351e-06, |
|
"loss": 0.0804, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 4.53, |
|
"grad_norm": 0.00036508633638732135, |
|
"learning_rate": 7.716629044394282e-06, |
|
"loss": 0.3295, |
|
"step": 18050 |
|
}, |
|
{ |
|
"epoch": 4.54, |
|
"grad_norm": 1.3209816270357e-13, |
|
"learning_rate": 7.698946576373214e-06, |
|
"loss": 0.3606, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 4.55, |
|
"grad_norm": 314.8194885253906, |
|
"learning_rate": 7.681264108352145e-06, |
|
"loss": 0.3064, |
|
"step": 18150 |
|
}, |
|
{ |
|
"epoch": 4.56, |
|
"grad_norm": 1.250010797093637e-07, |
|
"learning_rate": 7.663581640331076e-06, |
|
"loss": 0.2967, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 4.58, |
|
"grad_norm": 1.0573174953460693, |
|
"learning_rate": 7.645899172310008e-06, |
|
"loss": 0.4857, |
|
"step": 18250 |
|
}, |
|
{ |
|
"epoch": 4.59, |
|
"grad_norm": 204.4314727783203, |
|
"learning_rate": 7.628216704288939e-06, |
|
"loss": 0.358, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 4.6, |
|
"grad_norm": 0.02004345878958702, |
|
"learning_rate": 7.610534236267871e-06, |
|
"loss": 0.753, |
|
"step": 18350 |
|
}, |
|
{ |
|
"epoch": 4.61, |
|
"grad_norm": 302.43280029296875, |
|
"learning_rate": 7.592851768246803e-06, |
|
"loss": 0.3723, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 4.63, |
|
"grad_norm": 0.0004978284705430269, |
|
"learning_rate": 7.575169300225734e-06, |
|
"loss": 0.4034, |
|
"step": 18450 |
|
}, |
|
{ |
|
"epoch": 4.64, |
|
"grad_norm": 93.66849517822266, |
|
"learning_rate": 7.557486832204665e-06, |
|
"loss": 0.4249, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 4.65, |
|
"grad_norm": 0.001678618835285306, |
|
"learning_rate": 7.5398043641835965e-06, |
|
"loss": 0.7051, |
|
"step": 18550 |
|
}, |
|
{ |
|
"epoch": 4.67, |
|
"grad_norm": 0.37766626477241516, |
|
"learning_rate": 7.522121896162528e-06, |
|
"loss": 0.2375, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 4.68, |
|
"grad_norm": 268.7151184082031, |
|
"learning_rate": 7.50443942814146e-06, |
|
"loss": 0.9723, |
|
"step": 18650 |
|
}, |
|
{ |
|
"epoch": 4.69, |
|
"grad_norm": 10.93520450592041, |
|
"learning_rate": 7.486756960120392e-06, |
|
"loss": 0.4446, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 4.7, |
|
"grad_norm": 0.0002736333408392966, |
|
"learning_rate": 7.4690744920993235e-06, |
|
"loss": 0.5695, |
|
"step": 18750 |
|
}, |
|
{ |
|
"epoch": 4.72, |
|
"grad_norm": 0.006334410980343819, |
|
"learning_rate": 7.451392024078254e-06, |
|
"loss": 0.4816, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 4.73, |
|
"grad_norm": 2.021748046754368e-10, |
|
"learning_rate": 7.433709556057186e-06, |
|
"loss": 0.6205, |
|
"step": 18850 |
|
}, |
|
{ |
|
"epoch": 4.74, |
|
"grad_norm": 77.61640930175781, |
|
"learning_rate": 7.416027088036117e-06, |
|
"loss": 0.1743, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"grad_norm": 0.24281173944473267, |
|
"learning_rate": 7.39834462001505e-06, |
|
"loss": 0.3958, |
|
"step": 18950 |
|
}, |
|
{ |
|
"epoch": 4.77, |
|
"grad_norm": 0.0005730040138587356, |
|
"learning_rate": 7.380662151993981e-06, |
|
"loss": 0.2709, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 4.78, |
|
"grad_norm": 25.074310302734375, |
|
"learning_rate": 7.362979683972912e-06, |
|
"loss": 0.3811, |
|
"step": 19050 |
|
}, |
|
{ |
|
"epoch": 4.79, |
|
"grad_norm": 0.0002688245731405914, |
|
"learning_rate": 7.3452972159518435e-06, |
|
"loss": 0.2937, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 4.8, |
|
"grad_norm": 6.246182601898909e-05, |
|
"learning_rate": 7.327614747930775e-06, |
|
"loss": 0.2862, |
|
"step": 19150 |
|
}, |
|
{ |
|
"epoch": 4.82, |
|
"grad_norm": 0.000318751554004848, |
|
"learning_rate": 7.309932279909706e-06, |
|
"loss": 0.132, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 4.83, |
|
"grad_norm": 285.48297119140625, |
|
"learning_rate": 7.292249811888639e-06, |
|
"loss": 0.3126, |
|
"step": 19250 |
|
}, |
|
{ |
|
"epoch": 4.84, |
|
"grad_norm": 214.23065185546875, |
|
"learning_rate": 7.2745673438675705e-06, |
|
"loss": 0.6408, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 4.85, |
|
"grad_norm": 305.9626159667969, |
|
"learning_rate": 7.256884875846501e-06, |
|
"loss": 0.4605, |
|
"step": 19350 |
|
}, |
|
{ |
|
"epoch": 4.87, |
|
"grad_norm": 4.2915186782011006e-07, |
|
"learning_rate": 7.239202407825433e-06, |
|
"loss": 0.2248, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 4.88, |
|
"grad_norm": 265.24072265625, |
|
"learning_rate": 7.221519939804364e-06, |
|
"loss": 0.6776, |
|
"step": 19450 |
|
}, |
|
{ |
|
"epoch": 4.89, |
|
"grad_norm": 250.4654083251953, |
|
"learning_rate": 7.203837471783295e-06, |
|
"loss": 0.3709, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 4.9, |
|
"grad_norm": 0.0005780484061688185, |
|
"learning_rate": 7.186155003762228e-06, |
|
"loss": 0.3521, |
|
"step": 19550 |
|
}, |
|
{ |
|
"epoch": 4.92, |
|
"grad_norm": 8.780172348022461, |
|
"learning_rate": 7.168472535741159e-06, |
|
"loss": 0.3998, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 4.93, |
|
"grad_norm": 7.643636703491211, |
|
"learning_rate": 7.1507900677200905e-06, |
|
"loss": 0.5537, |
|
"step": 19650 |
|
}, |
|
{ |
|
"epoch": 4.94, |
|
"grad_norm": 0.0002484459837432951, |
|
"learning_rate": 7.133107599699022e-06, |
|
"loss": 0.4808, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 4.95, |
|
"grad_norm": 0.2631732225418091, |
|
"learning_rate": 7.115425131677953e-06, |
|
"loss": 0.6781, |
|
"step": 19750 |
|
}, |
|
{ |
|
"epoch": 4.97, |
|
"grad_norm": 0.0346391536295414, |
|
"learning_rate": 7.097742663656884e-06, |
|
"loss": 0.1673, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 4.98, |
|
"grad_norm": 0.006426098290830851, |
|
"learning_rate": 7.0800601956358176e-06, |
|
"loss": 0.2001, |
|
"step": 19850 |
|
}, |
|
{ |
|
"epoch": 4.99, |
|
"grad_norm": 0.070701465010643, |
|
"learning_rate": 7.062377727614748e-06, |
|
"loss": 0.6119, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 9.641678479965776e-05, |
|
"learning_rate": 7.04469525959368e-06, |
|
"loss": 0.1432, |
|
"step": 19950 |
|
}, |
|
{ |
|
"epoch": 5.02, |
|
"grad_norm": 80.3648681640625, |
|
"learning_rate": 7.027012791572611e-06, |
|
"loss": 0.2837, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 5.03, |
|
"grad_norm": 7.515856123063713e-05, |
|
"learning_rate": 7.009330323551543e-06, |
|
"loss": 0.0325, |
|
"step": 20050 |
|
}, |
|
{ |
|
"epoch": 5.04, |
|
"grad_norm": 9.76786541286856e-05, |
|
"learning_rate": 6.9916478555304745e-06, |
|
"loss": 0.28, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 5.05, |
|
"grad_norm": 0.058834467083215714, |
|
"learning_rate": 6.973965387509406e-06, |
|
"loss": 0.119, |
|
"step": 20150 |
|
}, |
|
{ |
|
"epoch": 5.07, |
|
"grad_norm": 3.0734496116638184, |
|
"learning_rate": 6.9562829194883376e-06, |
|
"loss": 0.1121, |
|
"step": 20200 |
|
}, |
|
{ |
|
"epoch": 5.08, |
|
"grad_norm": 173.53060913085938, |
|
"learning_rate": 6.938600451467269e-06, |
|
"loss": 0.4994, |
|
"step": 20250 |
|
}, |
|
{ |
|
"epoch": 5.09, |
|
"grad_norm": 1.482841071265284e-06, |
|
"learning_rate": 6.920917983446201e-06, |
|
"loss": 0.4273, |
|
"step": 20300 |
|
}, |
|
{ |
|
"epoch": 5.1, |
|
"grad_norm": 0.06339254975318909, |
|
"learning_rate": 6.903235515425132e-06, |
|
"loss": 0.0653, |
|
"step": 20350 |
|
}, |
|
{ |
|
"epoch": 5.12, |
|
"grad_norm": 29.73435401916504, |
|
"learning_rate": 6.885553047404064e-06, |
|
"loss": 0.0064, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 5.13, |
|
"grad_norm": 0.0535583458840847, |
|
"learning_rate": 6.8678705793829944e-06, |
|
"loss": 0.1328, |
|
"step": 20450 |
|
}, |
|
{ |
|
"epoch": 5.14, |
|
"grad_norm": 0.016700129956007004, |
|
"learning_rate": 6.850188111361927e-06, |
|
"loss": 0.3879, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 5.15, |
|
"grad_norm": 3.702952017192729e-05, |
|
"learning_rate": 6.832505643340858e-06, |
|
"loss": 0.1604, |
|
"step": 20550 |
|
}, |
|
{ |
|
"epoch": 5.17, |
|
"grad_norm": 0.03472837060689926, |
|
"learning_rate": 6.814823175319789e-06, |
|
"loss": 0.2436, |
|
"step": 20600 |
|
}, |
|
{ |
|
"epoch": 5.18, |
|
"grad_norm": 3.1909748940961435e-05, |
|
"learning_rate": 6.7971407072987215e-06, |
|
"loss": 0.1352, |
|
"step": 20650 |
|
}, |
|
{ |
|
"epoch": 5.19, |
|
"grad_norm": 0.3979862630367279, |
|
"learning_rate": 6.779458239277653e-06, |
|
"loss": 0.1159, |
|
"step": 20700 |
|
}, |
|
{ |
|
"epoch": 5.2, |
|
"grad_norm": 0.0028309274930506945, |
|
"learning_rate": 6.761775771256584e-06, |
|
"loss": 0.2612, |
|
"step": 20750 |
|
}, |
|
{ |
|
"epoch": 5.22, |
|
"grad_norm": 0.7586016654968262, |
|
"learning_rate": 6.744093303235516e-06, |
|
"loss": 0.4589, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 5.23, |
|
"grad_norm": 0.0062132058665156364, |
|
"learning_rate": 6.726410835214448e-06, |
|
"loss": 0.0843, |
|
"step": 20850 |
|
}, |
|
{ |
|
"epoch": 5.24, |
|
"grad_norm": 0.01292335707694292, |
|
"learning_rate": 6.708728367193378e-06, |
|
"loss": 0.092, |
|
"step": 20900 |
|
}, |
|
{ |
|
"epoch": 5.25, |
|
"grad_norm": 0.0012096440186724067, |
|
"learning_rate": 6.691045899172311e-06, |
|
"loss": 0.1515, |
|
"step": 20950 |
|
}, |
|
{ |
|
"epoch": 5.27, |
|
"grad_norm": 0.003023844677954912, |
|
"learning_rate": 6.673363431151242e-06, |
|
"loss": 0.3177, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 5.28, |
|
"grad_norm": 106.2956771850586, |
|
"learning_rate": 6.655680963130173e-06, |
|
"loss": 0.0315, |
|
"step": 21050 |
|
}, |
|
{ |
|
"epoch": 5.29, |
|
"grad_norm": 0.0011365425307303667, |
|
"learning_rate": 6.637998495109105e-06, |
|
"loss": 0.0159, |
|
"step": 21100 |
|
}, |
|
{ |
|
"epoch": 5.3, |
|
"grad_norm": 39.502681732177734, |
|
"learning_rate": 6.620316027088036e-06, |
|
"loss": 0.3804, |
|
"step": 21150 |
|
}, |
|
{ |
|
"epoch": 5.32, |
|
"grad_norm": 0.017230931669473648, |
|
"learning_rate": 6.602633559066968e-06, |
|
"loss": 0.0453, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 5.33, |
|
"grad_norm": 6.043082976248115e-06, |
|
"learning_rate": 6.584951091045899e-06, |
|
"loss": 0.3094, |
|
"step": 21250 |
|
}, |
|
{ |
|
"epoch": 5.34, |
|
"grad_norm": 6.83969769710302e-10, |
|
"learning_rate": 6.567268623024831e-06, |
|
"loss": 0.3382, |
|
"step": 21300 |
|
}, |
|
{ |
|
"epoch": 5.35, |
|
"grad_norm": 1.2151496714234156e-13, |
|
"learning_rate": 6.549586155003762e-06, |
|
"loss": 0.0469, |
|
"step": 21350 |
|
}, |
|
{ |
|
"epoch": 5.37, |
|
"grad_norm": 129.63966369628906, |
|
"learning_rate": 6.531903686982694e-06, |
|
"loss": 0.1542, |
|
"step": 21400 |
|
}, |
|
{ |
|
"epoch": 5.38, |
|
"grad_norm": 8.008062764019996e-07, |
|
"learning_rate": 6.514221218961625e-06, |
|
"loss": 0.1121, |
|
"step": 21450 |
|
}, |
|
{ |
|
"epoch": 5.39, |
|
"grad_norm": 195.3101043701172, |
|
"learning_rate": 6.496538750940557e-06, |
|
"loss": 0.134, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 5.41, |
|
"grad_norm": 0.44227921962738037, |
|
"learning_rate": 6.4788562829194885e-06, |
|
"loss": 0.1614, |
|
"step": 21550 |
|
}, |
|
{ |
|
"epoch": 5.42, |
|
"grad_norm": 389.9450988769531, |
|
"learning_rate": 6.46117381489842e-06, |
|
"loss": 0.2223, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 5.43, |
|
"grad_norm": 2.417748987681989e-07, |
|
"learning_rate": 6.443491346877352e-06, |
|
"loss": 0.2297, |
|
"step": 21650 |
|
}, |
|
{ |
|
"epoch": 5.44, |
|
"grad_norm": 0.0011466313153505325, |
|
"learning_rate": 6.425808878856283e-06, |
|
"loss": 0.0367, |
|
"step": 21700 |
|
}, |
|
{ |
|
"epoch": 5.46, |
|
"grad_norm": 0.4562750458717346, |
|
"learning_rate": 6.408126410835215e-06, |
|
"loss": 0.3361, |
|
"step": 21750 |
|
}, |
|
{ |
|
"epoch": 5.47, |
|
"grad_norm": 3.822188591584563e-05, |
|
"learning_rate": 6.390443942814146e-06, |
|
"loss": 0.0979, |
|
"step": 21800 |
|
}, |
|
{ |
|
"epoch": 5.48, |
|
"grad_norm": 100.44294738769531, |
|
"learning_rate": 6.372761474793078e-06, |
|
"loss": 0.0573, |
|
"step": 21850 |
|
}, |
|
{ |
|
"epoch": 5.49, |
|
"grad_norm": 1.8141976397600956e-05, |
|
"learning_rate": 6.355079006772009e-06, |
|
"loss": 0.6044, |
|
"step": 21900 |
|
}, |
|
{ |
|
"epoch": 5.51, |
|
"grad_norm": 2.5538651055034833e-12, |
|
"learning_rate": 6.337396538750941e-06, |
|
"loss": 0.2549, |
|
"step": 21950 |
|
}, |
|
{ |
|
"epoch": 5.52, |
|
"grad_norm": 7.968230164578927e-08, |
|
"learning_rate": 6.319714070729872e-06, |
|
"loss": 0.36, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 5.53, |
|
"grad_norm": 0.001464845146983862, |
|
"learning_rate": 6.302031602708804e-06, |
|
"loss": 0.0043, |
|
"step": 22050 |
|
}, |
|
{ |
|
"epoch": 5.54, |
|
"grad_norm": 0.5217474102973938, |
|
"learning_rate": 6.2843491346877355e-06, |
|
"loss": 0.3358, |
|
"step": 22100 |
|
}, |
|
{ |
|
"epoch": 5.56, |
|
"grad_norm": 2.1627647583954968e-05, |
|
"learning_rate": 6.266666666666666e-06, |
|
"loss": 0.4962, |
|
"step": 22150 |
|
}, |
|
{ |
|
"epoch": 5.57, |
|
"grad_norm": 0.0039770700968801975, |
|
"learning_rate": 6.248984198645599e-06, |
|
"loss": 0.0886, |
|
"step": 22200 |
|
}, |
|
{ |
|
"epoch": 5.58, |
|
"grad_norm": 0.028452860191464424, |
|
"learning_rate": 6.23130173062453e-06, |
|
"loss": 0.2919, |
|
"step": 22250 |
|
}, |
|
{ |
|
"epoch": 5.59, |
|
"grad_norm": 1.0354268550872803, |
|
"learning_rate": 6.213619262603461e-06, |
|
"loss": 0.1583, |
|
"step": 22300 |
|
}, |
|
{ |
|
"epoch": 5.61, |
|
"grad_norm": 0.0001276719121960923, |
|
"learning_rate": 6.195936794582393e-06, |
|
"loss": 0.1062, |
|
"step": 22350 |
|
}, |
|
{ |
|
"epoch": 5.62, |
|
"grad_norm": 213.48941040039062, |
|
"learning_rate": 6.178254326561325e-06, |
|
"loss": 0.377, |
|
"step": 22400 |
|
}, |
|
{ |
|
"epoch": 5.63, |
|
"grad_norm": 8.587969205109403e-06, |
|
"learning_rate": 6.1605718585402555e-06, |
|
"loss": 0.2043, |
|
"step": 22450 |
|
}, |
|
{ |
|
"epoch": 5.64, |
|
"grad_norm": 0.011805477552115917, |
|
"learning_rate": 6.142889390519188e-06, |
|
"loss": 0.2744, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 5.66, |
|
"grad_norm": 1.4445524776363072e-08, |
|
"learning_rate": 6.125206922498119e-06, |
|
"loss": 0.0145, |
|
"step": 22550 |
|
}, |
|
{ |
|
"epoch": 5.67, |
|
"grad_norm": 136.72720336914062, |
|
"learning_rate": 6.10752445447705e-06, |
|
"loss": 0.1608, |
|
"step": 22600 |
|
}, |
|
{ |
|
"epoch": 5.68, |
|
"grad_norm": 8.377895937883295e-06, |
|
"learning_rate": 6.0898419864559826e-06, |
|
"loss": 0.1146, |
|
"step": 22650 |
|
}, |
|
{ |
|
"epoch": 5.69, |
|
"grad_norm": 0.0005771568394266069, |
|
"learning_rate": 6.072159518434913e-06, |
|
"loss": 0.3716, |
|
"step": 22700 |
|
}, |
|
{ |
|
"epoch": 5.71, |
|
"grad_norm": 0.0033020416740328074, |
|
"learning_rate": 6.054477050413845e-06, |
|
"loss": 0.1609, |
|
"step": 22750 |
|
}, |
|
{ |
|
"epoch": 5.72, |
|
"grad_norm": 0.014289168640971184, |
|
"learning_rate": 6.036794582392777e-06, |
|
"loss": 0.2873, |
|
"step": 22800 |
|
}, |
|
{ |
|
"epoch": 5.73, |
|
"grad_norm": 433.4857482910156, |
|
"learning_rate": 6.019112114371708e-06, |
|
"loss": 0.2766, |
|
"step": 22850 |
|
}, |
|
{ |
|
"epoch": 5.74, |
|
"grad_norm": 51.506011962890625, |
|
"learning_rate": 6.0014296463506395e-06, |
|
"loss": 0.2557, |
|
"step": 22900 |
|
}, |
|
{ |
|
"epoch": 5.76, |
|
"grad_norm": 2.9865319106647803e-07, |
|
"learning_rate": 5.983747178329572e-06, |
|
"loss": 0.052, |
|
"step": 22950 |
|
}, |
|
{ |
|
"epoch": 5.77, |
|
"grad_norm": 0.0004749756189994514, |
|
"learning_rate": 5.9660647103085026e-06, |
|
"loss": 0.048, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 5.78, |
|
"grad_norm": 296.063720703125, |
|
"learning_rate": 5.948382242287434e-06, |
|
"loss": 0.1432, |
|
"step": 23050 |
|
}, |
|
{ |
|
"epoch": 5.79, |
|
"grad_norm": 0.002446663100272417, |
|
"learning_rate": 5.9306997742663665e-06, |
|
"loss": 0.3151, |
|
"step": 23100 |
|
}, |
|
{ |
|
"epoch": 5.81, |
|
"grad_norm": 0.012231925502419472, |
|
"learning_rate": 5.913017306245297e-06, |
|
"loss": 0.0295, |
|
"step": 23150 |
|
}, |
|
{ |
|
"epoch": 5.82, |
|
"grad_norm": 0.006459045223891735, |
|
"learning_rate": 5.895334838224229e-06, |
|
"loss": 0.0319, |
|
"step": 23200 |
|
}, |
|
{ |
|
"epoch": 5.83, |
|
"grad_norm": 5.6175377238787405e-08, |
|
"learning_rate": 5.87765237020316e-06, |
|
"loss": 0.1096, |
|
"step": 23250 |
|
}, |
|
{ |
|
"epoch": 5.84, |
|
"grad_norm": 9.727654060043278e-07, |
|
"learning_rate": 5.859969902182092e-06, |
|
"loss": 0.365, |
|
"step": 23300 |
|
}, |
|
{ |
|
"epoch": 5.86, |
|
"grad_norm": 167.01791381835938, |
|
"learning_rate": 5.842287434161023e-06, |
|
"loss": 0.0494, |
|
"step": 23350 |
|
}, |
|
{ |
|
"epoch": 5.87, |
|
"grad_norm": 0.05854243040084839, |
|
"learning_rate": 5.824604966139955e-06, |
|
"loss": 0.0218, |
|
"step": 23400 |
|
}, |
|
{ |
|
"epoch": 5.88, |
|
"grad_norm": 2.8002886676148364e-09, |
|
"learning_rate": 5.8069224981188865e-06, |
|
"loss": 0.0119, |
|
"step": 23450 |
|
}, |
|
{ |
|
"epoch": 5.89, |
|
"grad_norm": 455.8995361328125, |
|
"learning_rate": 5.789240030097818e-06, |
|
"loss": 0.3402, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 5.91, |
|
"grad_norm": 0.0034980960190296173, |
|
"learning_rate": 5.77155756207675e-06, |
|
"loss": 0.1623, |
|
"step": 23550 |
|
}, |
|
{ |
|
"epoch": 5.92, |
|
"grad_norm": 0.048077382147312164, |
|
"learning_rate": 5.753875094055681e-06, |
|
"loss": 0.5028, |
|
"step": 23600 |
|
}, |
|
{ |
|
"epoch": 5.93, |
|
"grad_norm": 1.1395950317382812, |
|
"learning_rate": 5.736192626034613e-06, |
|
"loss": 0.1841, |
|
"step": 23650 |
|
}, |
|
{ |
|
"epoch": 5.94, |
|
"grad_norm": 3.0090935979387723e-05, |
|
"learning_rate": 5.718510158013544e-06, |
|
"loss": 0.5312, |
|
"step": 23700 |
|
}, |
|
{ |
|
"epoch": 5.96, |
|
"grad_norm": 4.985315626981901e-08, |
|
"learning_rate": 5.700827689992476e-06, |
|
"loss": 0.0867, |
|
"step": 23750 |
|
}, |
|
{ |
|
"epoch": 5.97, |
|
"grad_norm": 0.7515669465065002, |
|
"learning_rate": 5.683145221971407e-06, |
|
"loss": 0.3645, |
|
"step": 23800 |
|
}, |
|
{ |
|
"epoch": 5.98, |
|
"grad_norm": 14.448786735534668, |
|
"learning_rate": 5.665462753950339e-06, |
|
"loss": 0.0975, |
|
"step": 23850 |
|
}, |
|
{ |
|
"epoch": 5.99, |
|
"grad_norm": 0.58511883020401, |
|
"learning_rate": 5.6477802859292704e-06, |
|
"loss": 0.0981, |
|
"step": 23900 |
|
}, |
|
{ |
|
"epoch": 6.01, |
|
"grad_norm": 5.8292873291065916e-05, |
|
"learning_rate": 5.630097817908202e-06, |
|
"loss": 0.2598, |
|
"step": 23950 |
|
}, |
|
{ |
|
"epoch": 6.02, |
|
"grad_norm": 0.03704287111759186, |
|
"learning_rate": 5.6124153498871335e-06, |
|
"loss": 0.1594, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 6.03, |
|
"grad_norm": 0.0010854690335690975, |
|
"learning_rate": 5.594732881866065e-06, |
|
"loss": 0.2415, |
|
"step": 24050 |
|
}, |
|
{ |
|
"epoch": 6.04, |
|
"grad_norm": 381.2314147949219, |
|
"learning_rate": 5.577050413844996e-06, |
|
"loss": 0.0477, |
|
"step": 24100 |
|
}, |
|
{ |
|
"epoch": 6.06, |
|
"grad_norm": 8.66334667080082e-05, |
|
"learning_rate": 5.559367945823928e-06, |
|
"loss": 0.0424, |
|
"step": 24150 |
|
}, |
|
{ |
|
"epoch": 6.07, |
|
"grad_norm": 0.019515322521328926, |
|
"learning_rate": 5.54168547780286e-06, |
|
"loss": 0.3617, |
|
"step": 24200 |
|
}, |
|
{ |
|
"epoch": 6.08, |
|
"grad_norm": 0.00011614364484557882, |
|
"learning_rate": 5.52400300978179e-06, |
|
"loss": 0.1944, |
|
"step": 24250 |
|
}, |
|
{ |
|
"epoch": 6.09, |
|
"grad_norm": 0.00019373864051885903, |
|
"learning_rate": 5.506320541760723e-06, |
|
"loss": 0.0011, |
|
"step": 24300 |
|
}, |
|
{ |
|
"epoch": 6.11, |
|
"grad_norm": 1.0937032612901021e-08, |
|
"learning_rate": 5.488638073739654e-06, |
|
"loss": 0.0014, |
|
"step": 24350 |
|
}, |
|
{ |
|
"epoch": 6.12, |
|
"grad_norm": 2.1784097691945198e-13, |
|
"learning_rate": 5.470955605718585e-06, |
|
"loss": 0.0055, |
|
"step": 24400 |
|
}, |
|
{ |
|
"epoch": 6.13, |
|
"grad_norm": 0.01839843951165676, |
|
"learning_rate": 5.4532731376975175e-06, |
|
"loss": 0.0042, |
|
"step": 24450 |
|
}, |
|
{ |
|
"epoch": 6.14, |
|
"grad_norm": 4.981990930907898e-10, |
|
"learning_rate": 5.435590669676449e-06, |
|
"loss": 0.103, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 6.16, |
|
"grad_norm": 0.0047708419151604176, |
|
"learning_rate": 5.41790820165538e-06, |
|
"loss": 0.0022, |
|
"step": 24550 |
|
}, |
|
{ |
|
"epoch": 6.17, |
|
"grad_norm": 0.003085497999563813, |
|
"learning_rate": 5.400225733634312e-06, |
|
"loss": 0.0021, |
|
"step": 24600 |
|
}, |
|
{ |
|
"epoch": 6.18, |
|
"grad_norm": 6.570710642250788e-11, |
|
"learning_rate": 5.382543265613244e-06, |
|
"loss": 0.2051, |
|
"step": 24650 |
|
}, |
|
{ |
|
"epoch": 6.2, |
|
"grad_norm": 0.0029285515192896128, |
|
"learning_rate": 5.364860797592174e-06, |
|
"loss": 0.0012, |
|
"step": 24700 |
|
}, |
|
{ |
|
"epoch": 6.21, |
|
"grad_norm": 3.4288578376617806e-07, |
|
"learning_rate": 5.347178329571107e-06, |
|
"loss": 0.0001, |
|
"step": 24750 |
|
}, |
|
{ |
|
"epoch": 6.22, |
|
"grad_norm": 0.00539399404078722, |
|
"learning_rate": 5.3294958615500375e-06, |
|
"loss": 0.2899, |
|
"step": 24800 |
|
}, |
|
{ |
|
"epoch": 6.23, |
|
"grad_norm": 2.6356909188507416e-07, |
|
"learning_rate": 5.311813393528969e-06, |
|
"loss": 0.0019, |
|
"step": 24850 |
|
}, |
|
{ |
|
"epoch": 6.25, |
|
"grad_norm": 0.019658172503113747, |
|
"learning_rate": 5.294130925507901e-06, |
|
"loss": 0.1133, |
|
"step": 24900 |
|
}, |
|
{ |
|
"epoch": 6.26, |
|
"grad_norm": 4.7282670834203344e-11, |
|
"learning_rate": 5.276448457486832e-06, |
|
"loss": 0.0001, |
|
"step": 24950 |
|
}, |
|
{ |
|
"epoch": 6.27, |
|
"grad_norm": 1.2473710739868693e-06, |
|
"learning_rate": 5.258765989465764e-06, |
|
"loss": 0.1143, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 6.28, |
|
"grad_norm": 0.38085153698921204, |
|
"learning_rate": 5.241083521444696e-06, |
|
"loss": 0.059, |
|
"step": 25050 |
|
}, |
|
{ |
|
"epoch": 6.3, |
|
"grad_norm": 5.584224224090576, |
|
"learning_rate": 5.223401053423627e-06, |
|
"loss": 0.0833, |
|
"step": 25100 |
|
}, |
|
{ |
|
"epoch": 6.31, |
|
"grad_norm": 9.337106348539237e-06, |
|
"learning_rate": 5.205718585402558e-06, |
|
"loss": 0.0876, |
|
"step": 25150 |
|
}, |
|
{ |
|
"epoch": 6.32, |
|
"grad_norm": 4.118080099146937e-08, |
|
"learning_rate": 5.188036117381491e-06, |
|
"loss": 0.0703, |
|
"step": 25200 |
|
}, |
|
{ |
|
"epoch": 6.33, |
|
"grad_norm": 1.8987177554663504e-06, |
|
"learning_rate": 5.170353649360421e-06, |
|
"loss": 0.0625, |
|
"step": 25250 |
|
}, |
|
{ |
|
"epoch": 6.35, |
|
"grad_norm": 4.3221673462490173e-10, |
|
"learning_rate": 5.152671181339353e-06, |
|
"loss": 0.0284, |
|
"step": 25300 |
|
}, |
|
{ |
|
"epoch": 6.36, |
|
"grad_norm": 0.000691065622959286, |
|
"learning_rate": 5.134988713318285e-06, |
|
"loss": 0.0422, |
|
"step": 25350 |
|
}, |
|
{ |
|
"epoch": 6.37, |
|
"grad_norm": 0.00046700576785951853, |
|
"learning_rate": 5.117306245297216e-06, |
|
"loss": 0.0001, |
|
"step": 25400 |
|
}, |
|
{ |
|
"epoch": 6.38, |
|
"grad_norm": 0.008938438259065151, |
|
"learning_rate": 5.099623777276148e-06, |
|
"loss": 0.0141, |
|
"step": 25450 |
|
}, |
|
{ |
|
"epoch": 6.4, |
|
"grad_norm": 0.16503383219242096, |
|
"learning_rate": 5.081941309255079e-06, |
|
"loss": 0.0646, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 6.41, |
|
"grad_norm": 8.952581993071362e-06, |
|
"learning_rate": 5.064258841234011e-06, |
|
"loss": 0.036, |
|
"step": 25550 |
|
}, |
|
{ |
|
"epoch": 6.42, |
|
"grad_norm": 0.014195716008543968, |
|
"learning_rate": 5.046576373212942e-06, |
|
"loss": 0.0005, |
|
"step": 25600 |
|
}, |
|
{ |
|
"epoch": 6.43, |
|
"grad_norm": 0.00028850819217041135, |
|
"learning_rate": 5.028893905191874e-06, |
|
"loss": 0.0754, |
|
"step": 25650 |
|
}, |
|
{ |
|
"epoch": 6.45, |
|
"grad_norm": 0.00020963407587260008, |
|
"learning_rate": 5.011211437170805e-06, |
|
"loss": 0.0003, |
|
"step": 25700 |
|
}, |
|
{ |
|
"epoch": 6.46, |
|
"grad_norm": 0.0010497659677639604, |
|
"learning_rate": 4.993528969149737e-06, |
|
"loss": 0.6013, |
|
"step": 25750 |
|
}, |
|
{ |
|
"epoch": 6.47, |
|
"grad_norm": 1.387237716699019e-06, |
|
"learning_rate": 4.975846501128668e-06, |
|
"loss": 0.005, |
|
"step": 25800 |
|
}, |
|
{ |
|
"epoch": 6.48, |
|
"grad_norm": 1.8294354958925396e-05, |
|
"learning_rate": 4.9581640331076e-06, |
|
"loss": 0.0, |
|
"step": 25850 |
|
}, |
|
{ |
|
"epoch": 6.5, |
|
"grad_norm": 4.903622539131902e-06, |
|
"learning_rate": 4.9404815650865315e-06, |
|
"loss": 0.0003, |
|
"step": 25900 |
|
}, |
|
{ |
|
"epoch": 6.51, |
|
"grad_norm": 0.000930552021600306, |
|
"learning_rate": 4.922799097065463e-06, |
|
"loss": 0.0464, |
|
"step": 25950 |
|
}, |
|
{ |
|
"epoch": 6.52, |
|
"grad_norm": 2.9821951102348976e-05, |
|
"learning_rate": 4.905116629044395e-06, |
|
"loss": 0.0854, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 6.53, |
|
"grad_norm": 0.19266781210899353, |
|
"learning_rate": 4.887434161023326e-06, |
|
"loss": 0.1578, |
|
"step": 26050 |
|
}, |
|
{ |
|
"epoch": 6.55, |
|
"grad_norm": 6.610630862269318e-06, |
|
"learning_rate": 4.869751693002258e-06, |
|
"loss": 0.0004, |
|
"step": 26100 |
|
}, |
|
{ |
|
"epoch": 6.56, |
|
"grad_norm": 6.910874503773812e-07, |
|
"learning_rate": 4.852069224981189e-06, |
|
"loss": 0.0013, |
|
"step": 26150 |
|
}, |
|
{ |
|
"epoch": 6.57, |
|
"grad_norm": 0.00030907560721971095, |
|
"learning_rate": 4.834386756960121e-06, |
|
"loss": 0.0005, |
|
"step": 26200 |
|
}, |
|
{ |
|
"epoch": 6.58, |
|
"grad_norm": 1.2135699112292286e-09, |
|
"learning_rate": 4.816704288939052e-06, |
|
"loss": 0.1581, |
|
"step": 26250 |
|
}, |
|
{ |
|
"epoch": 6.6, |
|
"grad_norm": 8.979808626463637e-06, |
|
"learning_rate": 4.799021820917984e-06, |
|
"loss": 0.1339, |
|
"step": 26300 |
|
}, |
|
{ |
|
"epoch": 6.61, |
|
"grad_norm": 8.109305053949356e-05, |
|
"learning_rate": 4.781339352896915e-06, |
|
"loss": 0.1607, |
|
"step": 26350 |
|
}, |
|
{ |
|
"epoch": 6.62, |
|
"grad_norm": 0.11362000554800034, |
|
"learning_rate": 4.763656884875847e-06, |
|
"loss": 0.0152, |
|
"step": 26400 |
|
}, |
|
{ |
|
"epoch": 6.63, |
|
"grad_norm": 3.168620969518088e-05, |
|
"learning_rate": 4.7459744168547785e-06, |
|
"loss": 0.062, |
|
"step": 26450 |
|
}, |
|
{ |
|
"epoch": 6.65, |
|
"grad_norm": 2.37572979927063, |
|
"learning_rate": 4.728291948833709e-06, |
|
"loss": 0.0001, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 6.66, |
|
"grad_norm": 1.1477128509795875e-06, |
|
"learning_rate": 4.710609480812642e-06, |
|
"loss": 0.2304, |
|
"step": 26550 |
|
}, |
|
{ |
|
"epoch": 6.67, |
|
"grad_norm": 3.561492079029449e-08, |
|
"learning_rate": 4.692927012791573e-06, |
|
"loss": 0.1046, |
|
"step": 26600 |
|
}, |
|
{ |
|
"epoch": 6.68, |
|
"grad_norm": 1.6958483457565308, |
|
"learning_rate": 4.675244544770504e-06, |
|
"loss": 0.0273, |
|
"step": 26650 |
|
}, |
|
{ |
|
"epoch": 6.7, |
|
"grad_norm": 6.609186675632372e-05, |
|
"learning_rate": 4.657562076749436e-06, |
|
"loss": 0.0504, |
|
"step": 26700 |
|
}, |
|
{ |
|
"epoch": 6.71, |
|
"grad_norm": 0.02066265046596527, |
|
"learning_rate": 4.639879608728368e-06, |
|
"loss": 0.0845, |
|
"step": 26750 |
|
}, |
|
{ |
|
"epoch": 6.72, |
|
"grad_norm": 0.6868598461151123, |
|
"learning_rate": 4.6221971407072985e-06, |
|
"loss": 0.064, |
|
"step": 26800 |
|
}, |
|
{ |
|
"epoch": 6.73, |
|
"grad_norm": 4.525861463378078e-09, |
|
"learning_rate": 4.604514672686231e-06, |
|
"loss": 0.0372, |
|
"step": 26850 |
|
}, |
|
{ |
|
"epoch": 6.75, |
|
"grad_norm": 0.0018904170719906688, |
|
"learning_rate": 4.5868322046651625e-06, |
|
"loss": 0.171, |
|
"step": 26900 |
|
}, |
|
{ |
|
"epoch": 6.76, |
|
"grad_norm": 0.06831281632184982, |
|
"learning_rate": 4.569149736644093e-06, |
|
"loss": 0.0005, |
|
"step": 26950 |
|
}, |
|
{ |
|
"epoch": 6.77, |
|
"grad_norm": 2.7328371288604103e-05, |
|
"learning_rate": 4.5514672686230256e-06, |
|
"loss": 0.0834, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 6.78, |
|
"grad_norm": 1.312251782792373e-07, |
|
"learning_rate": 4.533784800601956e-06, |
|
"loss": 0.0009, |
|
"step": 27050 |
|
}, |
|
{ |
|
"epoch": 6.8, |
|
"grad_norm": 0.006464004050940275, |
|
"learning_rate": 4.516102332580888e-06, |
|
"loss": 0.1302, |
|
"step": 27100 |
|
}, |
|
{ |
|
"epoch": 6.81, |
|
"grad_norm": 4.0537888601477334e-09, |
|
"learning_rate": 4.49841986455982e-06, |
|
"loss": 0.1255, |
|
"step": 27150 |
|
}, |
|
{ |
|
"epoch": 6.82, |
|
"grad_norm": 0.0004817073349840939, |
|
"learning_rate": 4.480737396538751e-06, |
|
"loss": 0.001, |
|
"step": 27200 |
|
}, |
|
{ |
|
"epoch": 6.83, |
|
"grad_norm": 0.014918695203959942, |
|
"learning_rate": 4.4630549285176825e-06, |
|
"loss": 0.0019, |
|
"step": 27250 |
|
}, |
|
{ |
|
"epoch": 6.85, |
|
"grad_norm": 6.75780752420712e-17, |
|
"learning_rate": 4.445372460496614e-06, |
|
"loss": 0.0179, |
|
"step": 27300 |
|
}, |
|
{ |
|
"epoch": 6.86, |
|
"grad_norm": 382.1897888183594, |
|
"learning_rate": 4.4276899924755456e-06, |
|
"loss": 0.0396, |
|
"step": 27350 |
|
}, |
|
{ |
|
"epoch": 6.87, |
|
"grad_norm": 0.30687054991722107, |
|
"learning_rate": 4.410007524454477e-06, |
|
"loss": 0.0576, |
|
"step": 27400 |
|
}, |
|
{ |
|
"epoch": 6.88, |
|
"grad_norm": 1.2169127785455203e-06, |
|
"learning_rate": 4.392325056433409e-06, |
|
"loss": 0.0002, |
|
"step": 27450 |
|
}, |
|
{ |
|
"epoch": 6.9, |
|
"grad_norm": 6.928129077377054e-12, |
|
"learning_rate": 4.37464258841234e-06, |
|
"loss": 0.0989, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 6.91, |
|
"grad_norm": 7.992535522305388e-10, |
|
"learning_rate": 4.356960120391272e-06, |
|
"loss": 0.0014, |
|
"step": 27550 |
|
}, |
|
{ |
|
"epoch": 6.92, |
|
"grad_norm": 0.001016330672428012, |
|
"learning_rate": 4.339277652370203e-06, |
|
"loss": 0.0796, |
|
"step": 27600 |
|
}, |
|
{ |
|
"epoch": 6.94, |
|
"grad_norm": 9.33817503323553e-08, |
|
"learning_rate": 4.321595184349135e-06, |
|
"loss": 0.0031, |
|
"step": 27650 |
|
}, |
|
{ |
|
"epoch": 6.95, |
|
"grad_norm": 3.0769423120524664e-10, |
|
"learning_rate": 4.303912716328066e-06, |
|
"loss": 0.0482, |
|
"step": 27700 |
|
}, |
|
{ |
|
"epoch": 6.96, |
|
"grad_norm": 5.2930868577050205e-09, |
|
"learning_rate": 4.286230248306998e-06, |
|
"loss": 0.0241, |
|
"step": 27750 |
|
}, |
|
{ |
|
"epoch": 6.97, |
|
"grad_norm": 2.738467628660146e-05, |
|
"learning_rate": 4.2685477802859295e-06, |
|
"loss": 0.0094, |
|
"step": 27800 |
|
}, |
|
{ |
|
"epoch": 6.99, |
|
"grad_norm": 1.259439272871532e-06, |
|
"learning_rate": 4.250865312264861e-06, |
|
"loss": 0.0011, |
|
"step": 27850 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 433.7135925292969, |
|
"learning_rate": 4.233182844243792e-06, |
|
"loss": 0.4265, |
|
"step": 27900 |
|
}, |
|
{ |
|
"epoch": 7.01, |
|
"grad_norm": 0.000105952778540086, |
|
"learning_rate": 4.215500376222724e-06, |
|
"loss": 0.0048, |
|
"step": 27950 |
|
}, |
|
{ |
|
"epoch": 7.02, |
|
"grad_norm": 0.2630611062049866, |
|
"learning_rate": 4.197817908201656e-06, |
|
"loss": 0.083, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 7.04, |
|
"grad_norm": 1.2784289252221193e-11, |
|
"learning_rate": 4.180135440180586e-06, |
|
"loss": 0.0003, |
|
"step": 28050 |
|
}, |
|
{ |
|
"epoch": 7.05, |
|
"grad_norm": 4.8076164577137703e-11, |
|
"learning_rate": 4.162452972159519e-06, |
|
"loss": 0.0, |
|
"step": 28100 |
|
}, |
|
{ |
|
"epoch": 7.06, |
|
"grad_norm": 2.940306558230077e-07, |
|
"learning_rate": 4.14477050413845e-06, |
|
"loss": 0.0001, |
|
"step": 28150 |
|
}, |
|
{ |
|
"epoch": 7.07, |
|
"grad_norm": 4.1964653064496815e-05, |
|
"learning_rate": 4.127088036117381e-06, |
|
"loss": 0.0005, |
|
"step": 28200 |
|
}, |
|
{ |
|
"epoch": 7.09, |
|
"grad_norm": 0.005852025002241135, |
|
"learning_rate": 4.1094055680963134e-06, |
|
"loss": 0.0049, |
|
"step": 28250 |
|
}, |
|
{ |
|
"epoch": 7.1, |
|
"grad_norm": 0.05330043286085129, |
|
"learning_rate": 4.091723100075245e-06, |
|
"loss": 0.0, |
|
"step": 28300 |
|
}, |
|
{ |
|
"epoch": 7.11, |
|
"grad_norm": 2.5323606323013337e-08, |
|
"learning_rate": 4.074040632054176e-06, |
|
"loss": 0.0001, |
|
"step": 28350 |
|
}, |
|
{ |
|
"epoch": 7.12, |
|
"grad_norm": 0.004866173956543207, |
|
"learning_rate": 4.056358164033108e-06, |
|
"loss": 0.0002, |
|
"step": 28400 |
|
}, |
|
{ |
|
"epoch": 7.14, |
|
"grad_norm": 1.1348839645819453e-09, |
|
"learning_rate": 4.038675696012039e-06, |
|
"loss": 0.0361, |
|
"step": 28450 |
|
}, |
|
{ |
|
"epoch": 7.15, |
|
"grad_norm": 4.0626005102240015e-06, |
|
"learning_rate": 4.02099322799097e-06, |
|
"loss": 0.0006, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 7.16, |
|
"grad_norm": 1.4158376870909706e-07, |
|
"learning_rate": 4.003310759969903e-06, |
|
"loss": 0.0, |
|
"step": 28550 |
|
}, |
|
{ |
|
"epoch": 7.17, |
|
"grad_norm": 3.5035823202633765e-06, |
|
"learning_rate": 3.9856282919488334e-06, |
|
"loss": 0.0, |
|
"step": 28600 |
|
}, |
|
{ |
|
"epoch": 7.19, |
|
"grad_norm": 7.668052421649918e-05, |
|
"learning_rate": 3.967945823927765e-06, |
|
"loss": 0.1484, |
|
"step": 28650 |
|
}, |
|
{ |
|
"epoch": 7.2, |
|
"grad_norm": 0.0006498922011815012, |
|
"learning_rate": 3.950263355906697e-06, |
|
"loss": 0.0, |
|
"step": 28700 |
|
}, |
|
{ |
|
"epoch": 7.21, |
|
"grad_norm": 1.2344708920863923e-05, |
|
"learning_rate": 3.932580887885628e-06, |
|
"loss": 0.0001, |
|
"step": 28750 |
|
}, |
|
{ |
|
"epoch": 7.22, |
|
"grad_norm": 4.231491038808599e-05, |
|
"learning_rate": 3.91489841986456e-06, |
|
"loss": 0.0001, |
|
"step": 28800 |
|
}, |
|
{ |
|
"epoch": 7.24, |
|
"grad_norm": 0.008648673072457314, |
|
"learning_rate": 3.897215951843492e-06, |
|
"loss": 0.0, |
|
"step": 28850 |
|
}, |
|
{ |
|
"epoch": 7.25, |
|
"grad_norm": 0.0010539034847170115, |
|
"learning_rate": 3.879533483822423e-06, |
|
"loss": 0.0, |
|
"step": 28900 |
|
}, |
|
{ |
|
"epoch": 7.26, |
|
"grad_norm": 5.991931902826764e-05, |
|
"learning_rate": 3.861851015801354e-06, |
|
"loss": 0.0001, |
|
"step": 28950 |
|
}, |
|
{ |
|
"epoch": 7.27, |
|
"grad_norm": 0.017336919903755188, |
|
"learning_rate": 3.844168547780287e-06, |
|
"loss": 0.0206, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 7.29, |
|
"grad_norm": 0.0004083296225871891, |
|
"learning_rate": 3.826486079759217e-06, |
|
"loss": 0.0002, |
|
"step": 29050 |
|
}, |
|
{ |
|
"epoch": 7.3, |
|
"grad_norm": 9.027652740478516, |
|
"learning_rate": 3.808803611738149e-06, |
|
"loss": 0.0067, |
|
"step": 29100 |
|
}, |
|
{ |
|
"epoch": 7.31, |
|
"grad_norm": 0.0003242001694161445, |
|
"learning_rate": 3.791121143717081e-06, |
|
"loss": 0.0, |
|
"step": 29150 |
|
}, |
|
{ |
|
"epoch": 7.32, |
|
"grad_norm": 2.259884604427498e-05, |
|
"learning_rate": 3.773438675696012e-06, |
|
"loss": 0.0002, |
|
"step": 29200 |
|
}, |
|
{ |
|
"epoch": 7.34, |
|
"grad_norm": 9.495877265930176, |
|
"learning_rate": 3.7557562076749436e-06, |
|
"loss": 0.0002, |
|
"step": 29250 |
|
}, |
|
{ |
|
"epoch": 7.35, |
|
"grad_norm": 0.0059493957087397575, |
|
"learning_rate": 3.7380737396538755e-06, |
|
"loss": 0.0, |
|
"step": 29300 |
|
}, |
|
{ |
|
"epoch": 7.36, |
|
"grad_norm": 0.004485088866204023, |
|
"learning_rate": 3.7203912716328067e-06, |
|
"loss": 0.0, |
|
"step": 29350 |
|
}, |
|
{ |
|
"epoch": 7.37, |
|
"grad_norm": 8.322012309412881e-15, |
|
"learning_rate": 3.702708803611738e-06, |
|
"loss": 0.0018, |
|
"step": 29400 |
|
}, |
|
{ |
|
"epoch": 7.39, |
|
"grad_norm": 0.0009153097053058445, |
|
"learning_rate": 3.68502633559067e-06, |
|
"loss": 0.0001, |
|
"step": 29450 |
|
}, |
|
{ |
|
"epoch": 7.4, |
|
"grad_norm": 2.3616248654434457e-05, |
|
"learning_rate": 3.6673438675696013e-06, |
|
"loss": 0.1638, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 7.41, |
|
"grad_norm": 0.0017722542397677898, |
|
"learning_rate": 3.6496613995485324e-06, |
|
"loss": 0.0123, |
|
"step": 29550 |
|
}, |
|
{ |
|
"epoch": 7.42, |
|
"grad_norm": 0.06969759613275528, |
|
"learning_rate": 3.631978931527465e-06, |
|
"loss": 0.0155, |
|
"step": 29600 |
|
}, |
|
{ |
|
"epoch": 7.44, |
|
"grad_norm": 1.5746809367556125e-06, |
|
"learning_rate": 3.614296463506396e-06, |
|
"loss": 0.0001, |
|
"step": 29650 |
|
}, |
|
{ |
|
"epoch": 7.45, |
|
"grad_norm": 3.0426802744010217e-10, |
|
"learning_rate": 3.596613995485327e-06, |
|
"loss": 0.0, |
|
"step": 29700 |
|
}, |
|
{ |
|
"epoch": 7.46, |
|
"grad_norm": 0.5712952017784119, |
|
"learning_rate": 3.578931527464259e-06, |
|
"loss": 0.1067, |
|
"step": 29750 |
|
}, |
|
{ |
|
"epoch": 7.47, |
|
"grad_norm": 0.766385555267334, |
|
"learning_rate": 3.5612490594431906e-06, |
|
"loss": 0.0107, |
|
"step": 29800 |
|
}, |
|
{ |
|
"epoch": 7.49, |
|
"grad_norm": 0.05696748197078705, |
|
"learning_rate": 3.5435665914221217e-06, |
|
"loss": 0.0013, |
|
"step": 29850 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"grad_norm": 7.25884137864341e-06, |
|
"learning_rate": 3.5258841234010537e-06, |
|
"loss": 0.0, |
|
"step": 29900 |
|
}, |
|
{ |
|
"epoch": 7.51, |
|
"grad_norm": 1.7060403479263186e-05, |
|
"learning_rate": 3.5082016553799852e-06, |
|
"loss": 0.0002, |
|
"step": 29950 |
|
}, |
|
{ |
|
"epoch": 7.52, |
|
"grad_norm": 0.012671858072280884, |
|
"learning_rate": 3.4905191873589168e-06, |
|
"loss": 0.0, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 7.54, |
|
"grad_norm": 2.8193007928223324e-09, |
|
"learning_rate": 3.472836719337848e-06, |
|
"loss": 0.0001, |
|
"step": 30050 |
|
}, |
|
{ |
|
"epoch": 7.55, |
|
"grad_norm": 0.019155049696564674, |
|
"learning_rate": 3.4551542513167795e-06, |
|
"loss": 0.0, |
|
"step": 30100 |
|
}, |
|
{ |
|
"epoch": 7.56, |
|
"grad_norm": 0.0020516354124993086, |
|
"learning_rate": 3.4374717832957114e-06, |
|
"loss": 0.0002, |
|
"step": 30150 |
|
}, |
|
{ |
|
"epoch": 7.57, |
|
"grad_norm": 2.4088294594548643e-05, |
|
"learning_rate": 3.4197893152746425e-06, |
|
"loss": 0.0492, |
|
"step": 30200 |
|
}, |
|
{ |
|
"epoch": 7.59, |
|
"grad_norm": 1.9164204786648043e-05, |
|
"learning_rate": 3.402106847253574e-06, |
|
"loss": 0.0312, |
|
"step": 30250 |
|
}, |
|
{ |
|
"epoch": 7.6, |
|
"grad_norm": 0.0160346832126379, |
|
"learning_rate": 3.384424379232506e-06, |
|
"loss": 0.0007, |
|
"step": 30300 |
|
}, |
|
{ |
|
"epoch": 7.61, |
|
"grad_norm": 7.57160614739405e-06, |
|
"learning_rate": 3.366741911211437e-06, |
|
"loss": 0.0005, |
|
"step": 30350 |
|
}, |
|
{ |
|
"epoch": 7.62, |
|
"grad_norm": 1.1699286504851525e-11, |
|
"learning_rate": 3.3490594431903687e-06, |
|
"loss": 0.0027, |
|
"step": 30400 |
|
}, |
|
{ |
|
"epoch": 7.64, |
|
"grad_norm": 1.0412069286758197e-06, |
|
"learning_rate": 3.3313769751693003e-06, |
|
"loss": 0.0011, |
|
"step": 30450 |
|
}, |
|
{ |
|
"epoch": 7.65, |
|
"grad_norm": 1.0678839998945477e-06, |
|
"learning_rate": 3.313694507148232e-06, |
|
"loss": 0.0, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 7.66, |
|
"grad_norm": 2.537229315535683e-09, |
|
"learning_rate": 3.2960120391271634e-06, |
|
"loss": 0.0, |
|
"step": 30550 |
|
}, |
|
{ |
|
"epoch": 7.67, |
|
"grad_norm": 8.23991967990878e-07, |
|
"learning_rate": 3.278329571106095e-06, |
|
"loss": 0.0001, |
|
"step": 30600 |
|
}, |
|
{ |
|
"epoch": 7.69, |
|
"grad_norm": 0.0006322423578239977, |
|
"learning_rate": 3.2606471030850265e-06, |
|
"loss": 0.0001, |
|
"step": 30650 |
|
}, |
|
{ |
|
"epoch": 7.7, |
|
"grad_norm": 1.3688865863059618e-07, |
|
"learning_rate": 3.242964635063958e-06, |
|
"loss": 0.062, |
|
"step": 30700 |
|
}, |
|
{ |
|
"epoch": 7.71, |
|
"grad_norm": 2.41971292780363e-06, |
|
"learning_rate": 3.2252821670428896e-06, |
|
"loss": 0.1235, |
|
"step": 30750 |
|
}, |
|
{ |
|
"epoch": 7.73, |
|
"grad_norm": 2.4634087480990274e-07, |
|
"learning_rate": 3.207599699021821e-06, |
|
"loss": 0.0035, |
|
"step": 30800 |
|
}, |
|
{ |
|
"epoch": 7.74, |
|
"grad_norm": 3.1068152566149365e-06, |
|
"learning_rate": 3.1899172310007527e-06, |
|
"loss": 0.0205, |
|
"step": 30850 |
|
}, |
|
{ |
|
"epoch": 7.75, |
|
"grad_norm": 5.763430177552209e-09, |
|
"learning_rate": 3.1722347629796842e-06, |
|
"loss": 0.0, |
|
"step": 30900 |
|
}, |
|
{ |
|
"epoch": 7.76, |
|
"grad_norm": 0.008364195004105568, |
|
"learning_rate": 3.1545522949586153e-06, |
|
"loss": 0.0009, |
|
"step": 30950 |
|
}, |
|
{ |
|
"epoch": 7.78, |
|
"grad_norm": 0.00012845598394051194, |
|
"learning_rate": 3.1368698269375473e-06, |
|
"loss": 0.0008, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 7.79, |
|
"grad_norm": 0.001842482597567141, |
|
"learning_rate": 3.119187358916479e-06, |
|
"loss": 0.0007, |
|
"step": 31050 |
|
}, |
|
{ |
|
"epoch": 7.8, |
|
"grad_norm": 1.2641396263113336e-10, |
|
"learning_rate": 3.10150489089541e-06, |
|
"loss": 0.0019, |
|
"step": 31100 |
|
}, |
|
{ |
|
"epoch": 7.81, |
|
"grad_norm": 0.00033131783129647374, |
|
"learning_rate": 3.083822422874342e-06, |
|
"loss": 0.0002, |
|
"step": 31150 |
|
}, |
|
{ |
|
"epoch": 7.83, |
|
"grad_norm": 1.851675369834993e-05, |
|
"learning_rate": 3.0661399548532735e-06, |
|
"loss": 0.0009, |
|
"step": 31200 |
|
}, |
|
{ |
|
"epoch": 7.84, |
|
"grad_norm": 0.00795644149184227, |
|
"learning_rate": 3.0484574868322046e-06, |
|
"loss": 0.077, |
|
"step": 31250 |
|
}, |
|
{ |
|
"epoch": 7.85, |
|
"grad_norm": 0.07745194435119629, |
|
"learning_rate": 3.030775018811136e-06, |
|
"loss": 0.0001, |
|
"step": 31300 |
|
}, |
|
{ |
|
"epoch": 7.86, |
|
"grad_norm": 0.10175588726997375, |
|
"learning_rate": 3.013092550790068e-06, |
|
"loss": 0.0307, |
|
"step": 31350 |
|
}, |
|
{ |
|
"epoch": 7.88, |
|
"grad_norm": 8.556443935958669e-05, |
|
"learning_rate": 2.9954100827689993e-06, |
|
"loss": 0.0, |
|
"step": 31400 |
|
}, |
|
{ |
|
"epoch": 7.89, |
|
"grad_norm": 0.9275371432304382, |
|
"learning_rate": 2.977727614747931e-06, |
|
"loss": 0.1478, |
|
"step": 31450 |
|
}, |
|
{ |
|
"epoch": 7.9, |
|
"grad_norm": 5.1567803360796916e-09, |
|
"learning_rate": 2.960045146726863e-06, |
|
"loss": 0.0598, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 7.91, |
|
"grad_norm": 6.67710139623523e-07, |
|
"learning_rate": 2.942362678705794e-06, |
|
"loss": 0.0094, |
|
"step": 31550 |
|
}, |
|
{ |
|
"epoch": 7.93, |
|
"grad_norm": 1.458290155298414e-10, |
|
"learning_rate": 2.9246802106847255e-06, |
|
"loss": 0.0009, |
|
"step": 31600 |
|
}, |
|
{ |
|
"epoch": 7.94, |
|
"grad_norm": 5.1869348681066185e-05, |
|
"learning_rate": 2.906997742663657e-06, |
|
"loss": 0.0007, |
|
"step": 31650 |
|
}, |
|
{ |
|
"epoch": 7.95, |
|
"grad_norm": 0.00036754223401658237, |
|
"learning_rate": 2.8893152746425886e-06, |
|
"loss": 0.1282, |
|
"step": 31700 |
|
}, |
|
{ |
|
"epoch": 7.96, |
|
"grad_norm": 0.0028616636991500854, |
|
"learning_rate": 2.87163280662152e-06, |
|
"loss": 0.1516, |
|
"step": 31750 |
|
}, |
|
{ |
|
"epoch": 7.98, |
|
"grad_norm": 0.0008008142467588186, |
|
"learning_rate": 2.8539503386004512e-06, |
|
"loss": 0.0004, |
|
"step": 31800 |
|
}, |
|
{ |
|
"epoch": 7.99, |
|
"grad_norm": 1.0718519405372717e-07, |
|
"learning_rate": 2.8362678705793832e-06, |
|
"loss": 0.0, |
|
"step": 31850 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 0.0009103859774768353, |
|
"learning_rate": 2.8185854025583148e-06, |
|
"loss": 0.0001, |
|
"step": 31900 |
|
}, |
|
{ |
|
"epoch": 8.01, |
|
"grad_norm": 0.0001856798044173047, |
|
"learning_rate": 2.800902934537246e-06, |
|
"loss": 0.0, |
|
"step": 31950 |
|
}, |
|
{ |
|
"epoch": 8.03, |
|
"grad_norm": 0.00011591133807087317, |
|
"learning_rate": 2.7832204665161774e-06, |
|
"loss": 0.0001, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 8.04, |
|
"grad_norm": 0.00040982267819345, |
|
"learning_rate": 2.7655379984951094e-06, |
|
"loss": 0.0, |
|
"step": 32050 |
|
}, |
|
{ |
|
"epoch": 8.05, |
|
"grad_norm": 2.265534648770995e-09, |
|
"learning_rate": 2.7478555304740405e-06, |
|
"loss": 0.0001, |
|
"step": 32100 |
|
}, |
|
{ |
|
"epoch": 8.06, |
|
"grad_norm": 5.858885425424898e-13, |
|
"learning_rate": 2.730173062452972e-06, |
|
"loss": 0.0001, |
|
"step": 32150 |
|
}, |
|
{ |
|
"epoch": 8.08, |
|
"grad_norm": 2.8236866932730136e-18, |
|
"learning_rate": 2.712490594431904e-06, |
|
"loss": 0.0, |
|
"step": 32200 |
|
}, |
|
{ |
|
"epoch": 8.09, |
|
"grad_norm": 0.0001981940004043281, |
|
"learning_rate": 2.694808126410835e-06, |
|
"loss": 0.0, |
|
"step": 32250 |
|
}, |
|
{ |
|
"epoch": 8.1, |
|
"grad_norm": 3.2661256511856696e-12, |
|
"learning_rate": 2.6771256583897667e-06, |
|
"loss": 0.0, |
|
"step": 32300 |
|
}, |
|
{ |
|
"epoch": 8.11, |
|
"grad_norm": 1.1293546776869334e-05, |
|
"learning_rate": 2.6594431903686983e-06, |
|
"loss": 0.0, |
|
"step": 32350 |
|
}, |
|
{ |
|
"epoch": 8.13, |
|
"grad_norm": 0.0003391726640984416, |
|
"learning_rate": 2.64176072234763e-06, |
|
"loss": 0.0, |
|
"step": 32400 |
|
}, |
|
{ |
|
"epoch": 8.14, |
|
"grad_norm": 6.486132042482495e-05, |
|
"learning_rate": 2.6240782543265614e-06, |
|
"loss": 0.0, |
|
"step": 32450 |
|
}, |
|
{ |
|
"epoch": 8.15, |
|
"grad_norm": 2.1309777366695926e-05, |
|
"learning_rate": 2.606395786305493e-06, |
|
"loss": 0.0, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 8.16, |
|
"grad_norm": 4.4795211806558655e-07, |
|
"learning_rate": 2.5887133182844245e-06, |
|
"loss": 0.0007, |
|
"step": 32550 |
|
}, |
|
{ |
|
"epoch": 8.18, |
|
"grad_norm": 2.0528705402256264e-09, |
|
"learning_rate": 2.571030850263356e-06, |
|
"loss": 0.0, |
|
"step": 32600 |
|
}, |
|
{ |
|
"epoch": 8.19, |
|
"grad_norm": 4.783522308571264e-05, |
|
"learning_rate": 2.5533483822422876e-06, |
|
"loss": 0.0, |
|
"step": 32650 |
|
}, |
|
{ |
|
"epoch": 8.2, |
|
"grad_norm": 1.7800081408836377e-08, |
|
"learning_rate": 2.535665914221219e-06, |
|
"loss": 0.0, |
|
"step": 32700 |
|
}, |
|
{ |
|
"epoch": 8.21, |
|
"grad_norm": 0.0003143524518236518, |
|
"learning_rate": 2.5179834462001507e-06, |
|
"loss": 0.0003, |
|
"step": 32750 |
|
}, |
|
{ |
|
"epoch": 8.23, |
|
"grad_norm": 9.409014455741271e-05, |
|
"learning_rate": 2.500300978179082e-06, |
|
"loss": 0.0, |
|
"step": 32800 |
|
}, |
|
{ |
|
"epoch": 8.24, |
|
"grad_norm": 3.097814449404268e-09, |
|
"learning_rate": 2.4826185101580133e-06, |
|
"loss": 0.0, |
|
"step": 32850 |
|
}, |
|
{ |
|
"epoch": 8.25, |
|
"grad_norm": 6.660656595158798e-07, |
|
"learning_rate": 2.4649360421369453e-06, |
|
"loss": 0.0, |
|
"step": 32900 |
|
}, |
|
{ |
|
"epoch": 8.26, |
|
"grad_norm": 0.04804990068078041, |
|
"learning_rate": 2.447253574115877e-06, |
|
"loss": 0.0, |
|
"step": 32950 |
|
}, |
|
{ |
|
"epoch": 8.28, |
|
"grad_norm": 3.926641234386352e-09, |
|
"learning_rate": 2.429571106094808e-06, |
|
"loss": 0.0, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 8.29, |
|
"grad_norm": 1.5834859022183257e-20, |
|
"learning_rate": 2.4118886380737395e-06, |
|
"loss": 0.0001, |
|
"step": 33050 |
|
}, |
|
{ |
|
"epoch": 8.3, |
|
"grad_norm": 0.20250500738620758, |
|
"learning_rate": 2.3942061700526715e-06, |
|
"loss": 0.0004, |
|
"step": 33100 |
|
}, |
|
{ |
|
"epoch": 8.31, |
|
"grad_norm": 5.932114959250612e-07, |
|
"learning_rate": 2.3765237020316026e-06, |
|
"loss": 0.0001, |
|
"step": 33150 |
|
}, |
|
{ |
|
"epoch": 8.33, |
|
"grad_norm": 1.5223192498248217e-11, |
|
"learning_rate": 2.358841234010534e-06, |
|
"loss": 0.0, |
|
"step": 33200 |
|
}, |
|
{ |
|
"epoch": 8.34, |
|
"grad_norm": 1.2739813826101454e-07, |
|
"learning_rate": 2.341158765989466e-06, |
|
"loss": 0.0, |
|
"step": 33250 |
|
}, |
|
{ |
|
"epoch": 8.35, |
|
"grad_norm": 1.2789546310898459e-08, |
|
"learning_rate": 2.3234762979683973e-06, |
|
"loss": 0.0001, |
|
"step": 33300 |
|
}, |
|
{ |
|
"epoch": 8.36, |
|
"grad_norm": 1.4692803233629093e-05, |
|
"learning_rate": 2.305793829947329e-06, |
|
"loss": 0.0, |
|
"step": 33350 |
|
}, |
|
{ |
|
"epoch": 8.38, |
|
"grad_norm": 0.00019242956477683038, |
|
"learning_rate": 2.2881113619262604e-06, |
|
"loss": 0.0403, |
|
"step": 33400 |
|
}, |
|
{ |
|
"epoch": 8.39, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.270428893905192e-06, |
|
"loss": 0.0, |
|
"step": 33450 |
|
}, |
|
{ |
|
"epoch": 8.4, |
|
"grad_norm": 0.002393543953076005, |
|
"learning_rate": 2.2527464258841235e-06, |
|
"loss": 0.0399, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 8.41, |
|
"grad_norm": 1.7551202802223997e-07, |
|
"learning_rate": 2.235063957863055e-06, |
|
"loss": 0.0, |
|
"step": 33550 |
|
}, |
|
{ |
|
"epoch": 8.43, |
|
"grad_norm": 2.735872639547665e-11, |
|
"learning_rate": 2.2173814898419866e-06, |
|
"loss": 0.0004, |
|
"step": 33600 |
|
}, |
|
{ |
|
"epoch": 8.44, |
|
"grad_norm": 0.0003994428552687168, |
|
"learning_rate": 2.199699021820918e-06, |
|
"loss": 0.0, |
|
"step": 33650 |
|
}, |
|
{ |
|
"epoch": 8.45, |
|
"grad_norm": 2.7801218032836914, |
|
"learning_rate": 2.1820165537998497e-06, |
|
"loss": 0.0, |
|
"step": 33700 |
|
}, |
|
{ |
|
"epoch": 8.47, |
|
"grad_norm": 1.100529516406823e-06, |
|
"learning_rate": 2.164334085778781e-06, |
|
"loss": 0.0, |
|
"step": 33750 |
|
}, |
|
{ |
|
"epoch": 8.48, |
|
"grad_norm": 0.02319416031241417, |
|
"learning_rate": 2.1466516177577128e-06, |
|
"loss": 0.0, |
|
"step": 33800 |
|
}, |
|
{ |
|
"epoch": 8.49, |
|
"grad_norm": 0.0017326247179880738, |
|
"learning_rate": 2.1289691497366443e-06, |
|
"loss": 0.0, |
|
"step": 33850 |
|
}, |
|
{ |
|
"epoch": 8.5, |
|
"grad_norm": 1.8130524859216735e-10, |
|
"learning_rate": 2.1112866817155754e-06, |
|
"loss": 0.0001, |
|
"step": 33900 |
|
}, |
|
{ |
|
"epoch": 8.52, |
|
"grad_norm": 0.0017156396061182022, |
|
"learning_rate": 2.0936042136945074e-06, |
|
"loss": 0.0004, |
|
"step": 33950 |
|
}, |
|
{ |
|
"epoch": 8.53, |
|
"grad_norm": 0.0018524077022448182, |
|
"learning_rate": 2.075921745673439e-06, |
|
"loss": 0.0, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 8.54, |
|
"grad_norm": 1.1194772923772689e-05, |
|
"learning_rate": 2.05823927765237e-06, |
|
"loss": 0.0, |
|
"step": 34050 |
|
}, |
|
{ |
|
"epoch": 8.55, |
|
"grad_norm": 7.453370471921517e-06, |
|
"learning_rate": 2.040556809631302e-06, |
|
"loss": 0.0164, |
|
"step": 34100 |
|
}, |
|
{ |
|
"epoch": 8.57, |
|
"grad_norm": 6.412294029090049e-10, |
|
"learning_rate": 2.0228743416102336e-06, |
|
"loss": 0.0, |
|
"step": 34150 |
|
}, |
|
{ |
|
"epoch": 8.58, |
|
"grad_norm": 1.4134855689861236e-17, |
|
"learning_rate": 2.0051918735891647e-06, |
|
"loss": 0.0, |
|
"step": 34200 |
|
}, |
|
{ |
|
"epoch": 8.59, |
|
"grad_norm": 0.00038817909080535173, |
|
"learning_rate": 1.9875094055680963e-06, |
|
"loss": 0.0365, |
|
"step": 34250 |
|
}, |
|
{ |
|
"epoch": 8.6, |
|
"grad_norm": 2.2248328605201095e-05, |
|
"learning_rate": 1.9698269375470282e-06, |
|
"loss": 0.0, |
|
"step": 34300 |
|
}, |
|
{ |
|
"epoch": 8.62, |
|
"grad_norm": 0.010381842032074928, |
|
"learning_rate": 1.9521444695259594e-06, |
|
"loss": 0.0, |
|
"step": 34350 |
|
}, |
|
{ |
|
"epoch": 8.63, |
|
"grad_norm": 0.001246288768015802, |
|
"learning_rate": 1.934462001504891e-06, |
|
"loss": 0.0, |
|
"step": 34400 |
|
}, |
|
{ |
|
"epoch": 8.64, |
|
"grad_norm": 1.8006402254104614, |
|
"learning_rate": 1.916779533483823e-06, |
|
"loss": 0.0007, |
|
"step": 34450 |
|
}, |
|
{ |
|
"epoch": 8.65, |
|
"grad_norm": 1.644072100681626e-10, |
|
"learning_rate": 1.899097065462754e-06, |
|
"loss": 0.0, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 8.67, |
|
"grad_norm": 5.652666779099036e-09, |
|
"learning_rate": 1.8814145974416856e-06, |
|
"loss": 0.0, |
|
"step": 34550 |
|
}, |
|
{ |
|
"epoch": 8.68, |
|
"grad_norm": 0.003141549648717046, |
|
"learning_rate": 1.8637321294206173e-06, |
|
"loss": 0.0, |
|
"step": 34600 |
|
}, |
|
{ |
|
"epoch": 8.69, |
|
"grad_norm": 1.1486420135042863e-06, |
|
"learning_rate": 1.8460496613995484e-06, |
|
"loss": 0.0003, |
|
"step": 34650 |
|
}, |
|
{ |
|
"epoch": 8.7, |
|
"grad_norm": 1.1713603271346074e-05, |
|
"learning_rate": 1.8283671933784802e-06, |
|
"loss": 0.0, |
|
"step": 34700 |
|
}, |
|
{ |
|
"epoch": 8.72, |
|
"grad_norm": 1.2204428685436142e-06, |
|
"learning_rate": 1.8106847253574115e-06, |
|
"loss": 0.0, |
|
"step": 34750 |
|
}, |
|
{ |
|
"epoch": 8.73, |
|
"grad_norm": 0.0014657212886959314, |
|
"learning_rate": 1.793002257336343e-06, |
|
"loss": 0.0, |
|
"step": 34800 |
|
}, |
|
{ |
|
"epoch": 8.74, |
|
"grad_norm": 0.017868679016828537, |
|
"learning_rate": 1.7753197893152748e-06, |
|
"loss": 0.0, |
|
"step": 34850 |
|
}, |
|
{ |
|
"epoch": 8.75, |
|
"grad_norm": 3.3499613891763147e-06, |
|
"learning_rate": 1.7576373212942062e-06, |
|
"loss": 0.0001, |
|
"step": 34900 |
|
}, |
|
{ |
|
"epoch": 8.77, |
|
"grad_norm": 6.1278524476904295e-09, |
|
"learning_rate": 1.7399548532731377e-06, |
|
"loss": 0.0001, |
|
"step": 34950 |
|
}, |
|
{ |
|
"epoch": 8.78, |
|
"grad_norm": 1.445396605959104e-06, |
|
"learning_rate": 1.7222723852520693e-06, |
|
"loss": 0.0001, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 8.79, |
|
"grad_norm": 0.0017798148328438401, |
|
"learning_rate": 1.7045899172310008e-06, |
|
"loss": 0.1651, |
|
"step": 35050 |
|
}, |
|
{ |
|
"epoch": 8.8, |
|
"grad_norm": 3.6833380789857983e-08, |
|
"learning_rate": 1.6869074492099324e-06, |
|
"loss": 0.0, |
|
"step": 35100 |
|
}, |
|
{ |
|
"epoch": 8.82, |
|
"grad_norm": 9.361156988463293e-11, |
|
"learning_rate": 1.669224981188864e-06, |
|
"loss": 0.0, |
|
"step": 35150 |
|
}, |
|
{ |
|
"epoch": 8.83, |
|
"grad_norm": 7.828115933250501e-09, |
|
"learning_rate": 1.6515425131677955e-06, |
|
"loss": 0.0, |
|
"step": 35200 |
|
}, |
|
{ |
|
"epoch": 8.84, |
|
"grad_norm": 1.020300643972405e-08, |
|
"learning_rate": 1.6338600451467268e-06, |
|
"loss": 0.0, |
|
"step": 35250 |
|
}, |
|
{ |
|
"epoch": 8.85, |
|
"grad_norm": 0.000530413759406656, |
|
"learning_rate": 1.6161775771256586e-06, |
|
"loss": 0.0, |
|
"step": 35300 |
|
}, |
|
{ |
|
"epoch": 8.87, |
|
"grad_norm": 8.391751182834639e-10, |
|
"learning_rate": 1.59849510910459e-06, |
|
"loss": 0.0, |
|
"step": 35350 |
|
}, |
|
{ |
|
"epoch": 8.88, |
|
"grad_norm": 0.003899802453815937, |
|
"learning_rate": 1.5808126410835214e-06, |
|
"loss": 0.0, |
|
"step": 35400 |
|
}, |
|
{ |
|
"epoch": 8.89, |
|
"grad_norm": 2.3727285224595107e-05, |
|
"learning_rate": 1.5631301730624532e-06, |
|
"loss": 0.0, |
|
"step": 35450 |
|
}, |
|
{ |
|
"epoch": 8.9, |
|
"grad_norm": 4.068557245773263e-06, |
|
"learning_rate": 1.5454477050413845e-06, |
|
"loss": 0.0, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 8.92, |
|
"grad_norm": 0.007758264895528555, |
|
"learning_rate": 1.527765237020316e-06, |
|
"loss": 0.0, |
|
"step": 35550 |
|
}, |
|
{ |
|
"epoch": 8.93, |
|
"grad_norm": 0.00016868404054548591, |
|
"learning_rate": 1.5100827689992474e-06, |
|
"loss": 0.0, |
|
"step": 35600 |
|
}, |
|
{ |
|
"epoch": 8.94, |
|
"grad_norm": 0.00011449763405835256, |
|
"learning_rate": 1.4924003009781792e-06, |
|
"loss": 0.0, |
|
"step": 35650 |
|
}, |
|
{ |
|
"epoch": 8.95, |
|
"grad_norm": 4.054548298881855e-06, |
|
"learning_rate": 1.4747178329571107e-06, |
|
"loss": 0.0, |
|
"step": 35700 |
|
}, |
|
{ |
|
"epoch": 8.97, |
|
"grad_norm": 0.0010476693278178573, |
|
"learning_rate": 1.457035364936042e-06, |
|
"loss": 0.0001, |
|
"step": 35750 |
|
}, |
|
{ |
|
"epoch": 8.98, |
|
"grad_norm": 0.06502784043550491, |
|
"learning_rate": 1.4393528969149738e-06, |
|
"loss": 0.0, |
|
"step": 35800 |
|
}, |
|
{ |
|
"epoch": 8.99, |
|
"grad_norm": 2.2866407789479126e-07, |
|
"learning_rate": 1.4216704288939052e-06, |
|
"loss": 0.0, |
|
"step": 35850 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"grad_norm": 0.00021389636094681919, |
|
"learning_rate": 1.4039879608728367e-06, |
|
"loss": 0.0002, |
|
"step": 35900 |
|
}, |
|
{ |
|
"epoch": 9.02, |
|
"grad_norm": 1.3870979032049036e-08, |
|
"learning_rate": 1.3863054928517683e-06, |
|
"loss": 0.0, |
|
"step": 35950 |
|
}, |
|
{ |
|
"epoch": 9.03, |
|
"grad_norm": 6.817894586674811e-07, |
|
"learning_rate": 1.3686230248306998e-06, |
|
"loss": 0.0, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 9.04, |
|
"grad_norm": 6.899564031215277e-09, |
|
"learning_rate": 1.3509405568096314e-06, |
|
"loss": 0.0, |
|
"step": 36050 |
|
}, |
|
{ |
|
"epoch": 9.05, |
|
"grad_norm": 5.222953859629342e-06, |
|
"learning_rate": 1.333258088788563e-06, |
|
"loss": 0.0, |
|
"step": 36100 |
|
}, |
|
{ |
|
"epoch": 9.07, |
|
"grad_norm": 1.425233087104516e-08, |
|
"learning_rate": 1.3155756207674945e-06, |
|
"loss": 0.0, |
|
"step": 36150 |
|
}, |
|
{ |
|
"epoch": 9.08, |
|
"grad_norm": 0.001089599565602839, |
|
"learning_rate": 1.2978931527464258e-06, |
|
"loss": 0.0, |
|
"step": 36200 |
|
}, |
|
{ |
|
"epoch": 9.09, |
|
"grad_norm": 5.556981932386407e-07, |
|
"learning_rate": 1.2802106847253576e-06, |
|
"loss": 0.0, |
|
"step": 36250 |
|
}, |
|
{ |
|
"epoch": 9.1, |
|
"grad_norm": 3.3812255423981696e-05, |
|
"learning_rate": 1.2625282167042889e-06, |
|
"loss": 0.0, |
|
"step": 36300 |
|
}, |
|
{ |
|
"epoch": 9.12, |
|
"grad_norm": 0.008249111473560333, |
|
"learning_rate": 1.2448457486832204e-06, |
|
"loss": 0.0, |
|
"step": 36350 |
|
}, |
|
{ |
|
"epoch": 9.13, |
|
"grad_norm": 1.3336196388991084e-05, |
|
"learning_rate": 1.2271632806621522e-06, |
|
"loss": 0.0, |
|
"step": 36400 |
|
}, |
|
{ |
|
"epoch": 9.14, |
|
"grad_norm": 7.238022403655009e-10, |
|
"learning_rate": 1.2094808126410835e-06, |
|
"loss": 0.0, |
|
"step": 36450 |
|
}, |
|
{ |
|
"epoch": 9.15, |
|
"grad_norm": 1.9307069831775436e-11, |
|
"learning_rate": 1.191798344620015e-06, |
|
"loss": 0.0, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 9.17, |
|
"grad_norm": 2.09102075932055e-11, |
|
"learning_rate": 1.1741158765989466e-06, |
|
"loss": 0.0, |
|
"step": 36550 |
|
}, |
|
{ |
|
"epoch": 9.18, |
|
"grad_norm": 0.005663714837282896, |
|
"learning_rate": 1.1564334085778782e-06, |
|
"loss": 0.0, |
|
"step": 36600 |
|
}, |
|
{ |
|
"epoch": 9.19, |
|
"grad_norm": 0.0010162381222471595, |
|
"learning_rate": 1.1387509405568097e-06, |
|
"loss": 0.0, |
|
"step": 36650 |
|
}, |
|
{ |
|
"epoch": 9.2, |
|
"grad_norm": 6.629519339185208e-05, |
|
"learning_rate": 1.1210684725357413e-06, |
|
"loss": 0.0, |
|
"step": 36700 |
|
}, |
|
{ |
|
"epoch": 9.22, |
|
"grad_norm": 3.708991016537766e-06, |
|
"learning_rate": 1.1033860045146728e-06, |
|
"loss": 0.0, |
|
"step": 36750 |
|
}, |
|
{ |
|
"epoch": 9.23, |
|
"grad_norm": 1.2199451703054365e-05, |
|
"learning_rate": 1.0857035364936042e-06, |
|
"loss": 0.0, |
|
"step": 36800 |
|
}, |
|
{ |
|
"epoch": 9.24, |
|
"grad_norm": 3.44480326930352e-07, |
|
"learning_rate": 1.068021068472536e-06, |
|
"loss": 0.0, |
|
"step": 36850 |
|
}, |
|
{ |
|
"epoch": 9.26, |
|
"grad_norm": 4.88109819229976e-09, |
|
"learning_rate": 1.0503386004514673e-06, |
|
"loss": 0.0, |
|
"step": 36900 |
|
}, |
|
{ |
|
"epoch": 9.27, |
|
"grad_norm": 1.233081690088511e-07, |
|
"learning_rate": 1.0326561324303988e-06, |
|
"loss": 0.0, |
|
"step": 36950 |
|
}, |
|
{ |
|
"epoch": 9.28, |
|
"grad_norm": 4.49614967479306e-09, |
|
"learning_rate": 1.0149736644093304e-06, |
|
"loss": 0.0, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 9.29, |
|
"grad_norm": 1.2748416793328943e-06, |
|
"learning_rate": 9.97291196388262e-07, |
|
"loss": 0.0, |
|
"step": 37050 |
|
}, |
|
{ |
|
"epoch": 9.31, |
|
"grad_norm": 2.0055947869265442e-14, |
|
"learning_rate": 9.796087283671935e-07, |
|
"loss": 0.0, |
|
"step": 37100 |
|
}, |
|
{ |
|
"epoch": 9.32, |
|
"grad_norm": 1.1864563075153988e-15, |
|
"learning_rate": 9.619262603461248e-07, |
|
"loss": 0.0, |
|
"step": 37150 |
|
}, |
|
{ |
|
"epoch": 9.33, |
|
"grad_norm": 3.7789734051330015e-05, |
|
"learning_rate": 9.442437923250566e-07, |
|
"loss": 0.0, |
|
"step": 37200 |
|
}, |
|
{ |
|
"epoch": 9.34, |
|
"grad_norm": 0.2207726538181305, |
|
"learning_rate": 9.26561324303988e-07, |
|
"loss": 0.0, |
|
"step": 37250 |
|
}, |
|
{ |
|
"epoch": 9.36, |
|
"grad_norm": 0.0002287498500663787, |
|
"learning_rate": 9.088788562829194e-07, |
|
"loss": 0.0, |
|
"step": 37300 |
|
}, |
|
{ |
|
"epoch": 9.37, |
|
"grad_norm": 1.4228673350658028e-08, |
|
"learning_rate": 8.911963882618511e-07, |
|
"loss": 0.0, |
|
"step": 37350 |
|
}, |
|
{ |
|
"epoch": 9.38, |
|
"grad_norm": 2.1739325584408233e-16, |
|
"learning_rate": 8.735139202407825e-07, |
|
"loss": 0.0, |
|
"step": 37400 |
|
}, |
|
{ |
|
"epoch": 9.39, |
|
"grad_norm": 1.1177444037002715e-07, |
|
"learning_rate": 8.558314522197141e-07, |
|
"loss": 0.0, |
|
"step": 37450 |
|
}, |
|
{ |
|
"epoch": 9.41, |
|
"grad_norm": 2.286371909576701e-06, |
|
"learning_rate": 8.381489841986456e-07, |
|
"loss": 0.0, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 9.42, |
|
"grad_norm": 0.0007677926332689822, |
|
"learning_rate": 8.204665161775772e-07, |
|
"loss": 0.0, |
|
"step": 37550 |
|
}, |
|
{ |
|
"epoch": 9.43, |
|
"grad_norm": 7.146362435150877e-08, |
|
"learning_rate": 8.027840481565087e-07, |
|
"loss": 0.0, |
|
"step": 37600 |
|
}, |
|
{ |
|
"epoch": 9.44, |
|
"grad_norm": 7.620369160576956e-06, |
|
"learning_rate": 7.851015801354402e-07, |
|
"loss": 0.0, |
|
"step": 37650 |
|
}, |
|
{ |
|
"epoch": 9.46, |
|
"grad_norm": 1.6175413009023032e-07, |
|
"learning_rate": 7.674191121143717e-07, |
|
"loss": 0.0, |
|
"step": 37700 |
|
}, |
|
{ |
|
"epoch": 9.47, |
|
"grad_norm": 1.4307224773801863e-06, |
|
"learning_rate": 7.497366440933033e-07, |
|
"loss": 0.0, |
|
"step": 37750 |
|
}, |
|
{ |
|
"epoch": 9.48, |
|
"grad_norm": 1.4143168414193497e-07, |
|
"learning_rate": 7.320541760722348e-07, |
|
"loss": 0.0, |
|
"step": 37800 |
|
}, |
|
{ |
|
"epoch": 9.49, |
|
"grad_norm": 5.554405676719276e-13, |
|
"learning_rate": 7.143717080511664e-07, |
|
"loss": 0.0, |
|
"step": 37850 |
|
}, |
|
{ |
|
"epoch": 9.51, |
|
"grad_norm": 3.3866279225414075e-10, |
|
"learning_rate": 6.966892400300979e-07, |
|
"loss": 0.0941, |
|
"step": 37900 |
|
}, |
|
{ |
|
"epoch": 9.52, |
|
"grad_norm": 6.048647804846041e-08, |
|
"learning_rate": 6.790067720090294e-07, |
|
"loss": 0.0, |
|
"step": 37950 |
|
}, |
|
{ |
|
"epoch": 9.53, |
|
"grad_norm": 2.0648124632316467e-07, |
|
"learning_rate": 6.613243039879609e-07, |
|
"loss": 0.0, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 9.54, |
|
"grad_norm": 0.0016063437797129154, |
|
"learning_rate": 6.436418359668924e-07, |
|
"loss": 0.0, |
|
"step": 38050 |
|
}, |
|
{ |
|
"epoch": 9.56, |
|
"grad_norm": 1.247152141559127e-07, |
|
"learning_rate": 6.259593679458239e-07, |
|
"loss": 0.0, |
|
"step": 38100 |
|
}, |
|
{ |
|
"epoch": 9.57, |
|
"grad_norm": 8.07224120880079e-10, |
|
"learning_rate": 6.082768999247555e-07, |
|
"loss": 0.0, |
|
"step": 38150 |
|
}, |
|
{ |
|
"epoch": 9.58, |
|
"grad_norm": 7.635571320184498e-13, |
|
"learning_rate": 5.90594431903687e-07, |
|
"loss": 0.0, |
|
"step": 38200 |
|
}, |
|
{ |
|
"epoch": 9.59, |
|
"grad_norm": 4.792551688836966e-09, |
|
"learning_rate": 5.729119638826185e-07, |
|
"loss": 0.0, |
|
"step": 38250 |
|
}, |
|
{ |
|
"epoch": 9.61, |
|
"grad_norm": 3.3811686535045737e-06, |
|
"learning_rate": 5.552294958615501e-07, |
|
"loss": 0.0, |
|
"step": 38300 |
|
}, |
|
{ |
|
"epoch": 9.62, |
|
"grad_norm": 1.0496427338413383e-10, |
|
"learning_rate": 5.375470278404815e-07, |
|
"loss": 0.0, |
|
"step": 38350 |
|
}, |
|
{ |
|
"epoch": 9.63, |
|
"grad_norm": 0.000780309725087136, |
|
"learning_rate": 5.198645598194131e-07, |
|
"loss": 0.0, |
|
"step": 38400 |
|
}, |
|
{ |
|
"epoch": 9.64, |
|
"grad_norm": 4.170356078248005e-06, |
|
"learning_rate": 5.021820917983446e-07, |
|
"loss": 0.0, |
|
"step": 38450 |
|
}, |
|
{ |
|
"epoch": 9.66, |
|
"grad_norm": 0.004391836933791637, |
|
"learning_rate": 4.844996237772762e-07, |
|
"loss": 0.0, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 9.67, |
|
"grad_norm": 6.341772859741468e-06, |
|
"learning_rate": 4.668171557562077e-07, |
|
"loss": 0.0, |
|
"step": 38550 |
|
}, |
|
{ |
|
"epoch": 9.68, |
|
"grad_norm": 0.006547071970999241, |
|
"learning_rate": 4.4913468773513927e-07, |
|
"loss": 0.0, |
|
"step": 38600 |
|
}, |
|
{ |
|
"epoch": 9.69, |
|
"grad_norm": 4.025184352940414e-06, |
|
"learning_rate": 4.3145221971407076e-07, |
|
"loss": 0.0, |
|
"step": 38650 |
|
}, |
|
{ |
|
"epoch": 9.71, |
|
"grad_norm": 0.007137050852179527, |
|
"learning_rate": 4.1376975169300226e-07, |
|
"loss": 0.0, |
|
"step": 38700 |
|
}, |
|
{ |
|
"epoch": 9.72, |
|
"grad_norm": 2.464033421745171e-10, |
|
"learning_rate": 3.960872836719338e-07, |
|
"loss": 0.0, |
|
"step": 38750 |
|
}, |
|
{ |
|
"epoch": 9.73, |
|
"grad_norm": 1.651005368330516e-05, |
|
"learning_rate": 3.784048156508653e-07, |
|
"loss": 0.0, |
|
"step": 38800 |
|
}, |
|
{ |
|
"epoch": 9.74, |
|
"grad_norm": 8.712972184021783e-12, |
|
"learning_rate": 3.6072234762979685e-07, |
|
"loss": 0.0, |
|
"step": 38850 |
|
}, |
|
{ |
|
"epoch": 9.76, |
|
"grad_norm": 0.00013669347390532494, |
|
"learning_rate": 3.430398796087284e-07, |
|
"loss": 0.0, |
|
"step": 38900 |
|
}, |
|
{ |
|
"epoch": 9.77, |
|
"grad_norm": 2.920106635428965e-05, |
|
"learning_rate": 3.253574115876599e-07, |
|
"loss": 0.0, |
|
"step": 38950 |
|
}, |
|
{ |
|
"epoch": 9.78, |
|
"grad_norm": 3.793598768453421e-09, |
|
"learning_rate": 3.0767494356659144e-07, |
|
"loss": 0.0, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 9.79, |
|
"grad_norm": 1.0695604402144454e-09, |
|
"learning_rate": 2.89992475545523e-07, |
|
"loss": 0.0, |
|
"step": 39050 |
|
}, |
|
{ |
|
"epoch": 9.81, |
|
"grad_norm": 1.2106751764691062e-16, |
|
"learning_rate": 2.723100075244545e-07, |
|
"loss": 0.0, |
|
"step": 39100 |
|
}, |
|
{ |
|
"epoch": 9.82, |
|
"grad_norm": 3.6123870472692943e-07, |
|
"learning_rate": 2.54627539503386e-07, |
|
"loss": 0.0, |
|
"step": 39150 |
|
}, |
|
{ |
|
"epoch": 9.83, |
|
"grad_norm": 3.495557336918864e-07, |
|
"learning_rate": 2.3694507148231756e-07, |
|
"loss": 0.0, |
|
"step": 39200 |
|
}, |
|
{ |
|
"epoch": 9.84, |
|
"grad_norm": 4.76532950415276e-05, |
|
"learning_rate": 2.1926260346124908e-07, |
|
"loss": 0.0, |
|
"step": 39250 |
|
}, |
|
{ |
|
"epoch": 9.86, |
|
"grad_norm": 8.871047612046823e-05, |
|
"learning_rate": 2.015801354401806e-07, |
|
"loss": 0.0, |
|
"step": 39300 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 39870, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 50, |
|
"total_flos": 0.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|