{ "best_metric": 0.5480290651321411, "best_model_checkpoint": "E:/000_Tesis/test_executions/pretrain_utg4java\\checkpoint-15052", "epoch": 4.0, "eval_steps": 500, "global_step": 15052, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05, "grad_norm": 4.428943157196045, "learning_rate": 0.00019936921218628373, "loss": 1.6003, "step": 200 }, { "epoch": 0.11, "grad_norm": 3.2689905166625977, "learning_rate": 0.00019668500872366127, "loss": 1.0391, "step": 400 }, { "epoch": 0.16, "grad_norm": 3.4730663299560547, "learning_rate": 0.00019400080526103878, "loss": 0.9586, "step": 600 }, { "epoch": 0.21, "grad_norm": 3.3503410816192627, "learning_rate": 0.00019131660179841633, "loss": 0.9416, "step": 800 }, { "epoch": 0.27, "grad_norm": 3.9026317596435547, "learning_rate": 0.00018863239833579387, "loss": 0.9352, "step": 1000 }, { "epoch": 0.32, "grad_norm": 3.4323599338531494, "learning_rate": 0.00018594819487317138, "loss": 0.9018, "step": 1200 }, { "epoch": 0.37, "grad_norm": 2.90022349357605, "learning_rate": 0.00018326399141054892, "loss": 0.8959, "step": 1400 }, { "epoch": 0.43, "grad_norm": 3.3339474201202393, "learning_rate": 0.00018057978794792646, "loss": 0.8777, "step": 1600 }, { "epoch": 0.48, "grad_norm": 5.839554309844971, "learning_rate": 0.00017789558448530398, "loss": 0.8719, "step": 1800 }, { "epoch": 0.53, "grad_norm": 3.7872190475463867, "learning_rate": 0.00017521138102268152, "loss": 0.8763, "step": 2000 }, { "epoch": 0.58, "grad_norm": 3.5567479133605957, "learning_rate": 0.00017252717756005906, "loss": 0.8601, "step": 2200 }, { "epoch": 0.64, "grad_norm": 2.728114604949951, "learning_rate": 0.0001698429740974366, "loss": 0.8522, "step": 2400 }, { "epoch": 0.69, "grad_norm": 2.7635385990142822, "learning_rate": 0.00016717219165212724, "loss": 0.852, "step": 2600 }, { "epoch": 0.74, "grad_norm": 2.332137107849121, "learning_rate": 0.00016448798818950478, "loss": 0.8622, "step": 2800 }, { "epoch": 0.8, "grad_norm": 2.689222812652588, "learning_rate": 0.00016180378472688232, "loss": 0.8814, "step": 3000 }, { "epoch": 0.85, "grad_norm": 3.2268168926239014, "learning_rate": 0.00015911958126425983, "loss": 0.8223, "step": 3200 }, { "epoch": 0.9, "grad_norm": 2.7432162761688232, "learning_rate": 0.00015643537780163738, "loss": 0.8538, "step": 3400 }, { "epoch": 0.96, "grad_norm": 3.417742967605591, "learning_rate": 0.00015375117433901492, "loss": 0.8118, "step": 3600 }, { "epoch": 1.0, "eval_loss": 0.7114242315292358, "eval_runtime": 144.1269, "eval_samples_per_second": 13.051, "eval_steps_per_second": 1.637, "step": 3763 }, { "epoch": 1.01, "grad_norm": 2.479877233505249, "learning_rate": 0.00015106697087639243, "loss": 0.7983, "step": 3800 }, { "epoch": 1.06, "grad_norm": 2.73120379447937, "learning_rate": 0.00014838276741376997, "loss": 0.7892, "step": 4000 }, { "epoch": 1.12, "grad_norm": 2.924459934234619, "learning_rate": 0.00014569856395114749, "loss": 0.7747, "step": 4200 }, { "epoch": 1.17, "grad_norm": 3.1980695724487305, "learning_rate": 0.00014301436048852503, "loss": 0.7758, "step": 4400 }, { "epoch": 1.22, "grad_norm": 2.564530849456787, "learning_rate": 0.00014033015702590257, "loss": 0.7814, "step": 4600 }, { "epoch": 1.28, "grad_norm": 2.7397029399871826, "learning_rate": 0.0001376593745805932, "loss": 0.7501, "step": 4800 }, { "epoch": 1.33, "grad_norm": 2.7164580821990967, "learning_rate": 0.00013497517111797075, "loss": 0.7712, "step": 5000 }, { "epoch": 1.38, "grad_norm": 
2.4152510166168213, "learning_rate": 0.0001322909676553483, "loss": 0.7999, "step": 5200 }, { "epoch": 1.44, "grad_norm": 2.9376044273376465, "learning_rate": 0.00012960676419272583, "loss": 0.7516, "step": 5400 }, { "epoch": 1.49, "grad_norm": 3.0849993228912354, "learning_rate": 0.00012692256073010337, "loss": 0.7663, "step": 5600 }, { "epoch": 1.54, "grad_norm": 2.0552046298980713, "learning_rate": 0.00012423835726748088, "loss": 0.7725, "step": 5800 }, { "epoch": 1.59, "grad_norm": 4.300168991088867, "learning_rate": 0.00012155415380485841, "loss": 0.748, "step": 6000 }, { "epoch": 1.65, "grad_norm": 2.172149658203125, "learning_rate": 0.00011886995034223594, "loss": 0.7219, "step": 6200 }, { "epoch": 1.7, "grad_norm": 4.83528470993042, "learning_rate": 0.00011618574687961348, "loss": 0.7565, "step": 6400 }, { "epoch": 1.75, "grad_norm": 2.4364535808563232, "learning_rate": 0.00011350154341699102, "loss": 0.7433, "step": 6600 }, { "epoch": 1.81, "grad_norm": 3.630072593688965, "learning_rate": 0.00011081733995436854, "loss": 0.729, "step": 6800 }, { "epoch": 1.86, "grad_norm": 3.485239267349243, "learning_rate": 0.00010813313649174608, "loss": 0.7178, "step": 7000 }, { "epoch": 1.91, "grad_norm": 2.265608787536621, "learning_rate": 0.00010544893302912362, "loss": 0.7507, "step": 7200 }, { "epoch": 1.97, "grad_norm": 2.48197340965271, "learning_rate": 0.00010276472956650113, "loss": 0.763, "step": 7400 }, { "epoch": 2.0, "eval_loss": 0.6458454728126526, "eval_runtime": 138.1168, "eval_samples_per_second": 13.619, "eval_steps_per_second": 1.709, "step": 7526 }, { "epoch": 2.02, "grad_norm": 3.2982826232910156, "learning_rate": 0.00010008052610387867, "loss": 0.7165, "step": 7600 }, { "epoch": 2.07, "grad_norm": 2.9913341999053955, "learning_rate": 9.739632264125621e-05, "loss": 0.7093, "step": 7800 }, { "epoch": 2.13, "grad_norm": 3.782381057739258, "learning_rate": 9.471211917863374e-05, "loss": 0.7007, "step": 8000 }, { "epoch": 2.18, "grad_norm": 1.392814040184021, "learning_rate": 9.202791571601128e-05, "loss": 0.6961, "step": 8200 }, { "epoch": 2.23, "grad_norm": 1.976282000541687, "learning_rate": 8.934371225338881e-05, "loss": 0.7014, "step": 8400 }, { "epoch": 2.29, "grad_norm": 1.8080275058746338, "learning_rate": 8.665950879076634e-05, "loss": 0.6845, "step": 8600 }, { "epoch": 2.34, "grad_norm": 2.2179641723632812, "learning_rate": 8.397530532814388e-05, "loss": 0.6766, "step": 8800 }, { "epoch": 2.39, "grad_norm": 4.8770551681518555, "learning_rate": 8.129110186552141e-05, "loss": 0.6716, "step": 9000 }, { "epoch": 2.44, "grad_norm": 2.4720568656921387, "learning_rate": 7.860689840289895e-05, "loss": 0.663, "step": 9200 }, { "epoch": 2.5, "grad_norm": 2.546283006668091, "learning_rate": 7.592269494027648e-05, "loss": 0.6629, "step": 9400 }, { "epoch": 2.55, "grad_norm": 2.2792224884033203, "learning_rate": 7.323849147765402e-05, "loss": 0.6677, "step": 9600 }, { "epoch": 2.6, "grad_norm": 2.87917160987854, "learning_rate": 7.055428801503155e-05, "loss": 0.6697, "step": 9800 }, { "epoch": 2.66, "grad_norm": 4.642680644989014, "learning_rate": 6.787008455240907e-05, "loss": 0.661, "step": 10000 }, { "epoch": 2.71, "grad_norm": 2.9181325435638428, "learning_rate": 6.51858810897866e-05, "loss": 0.6447, "step": 10200 }, { "epoch": 2.76, "grad_norm": 1.9983739852905273, "learning_rate": 6.250167762716414e-05, "loss": 0.6668, "step": 10400 }, { "epoch": 2.82, "grad_norm": 3.3339133262634277, "learning_rate": 5.9817474164541676e-05, "loss": 0.6494, "step": 10600 }, { "epoch": 2.87, 
"grad_norm": 2.7515461444854736, "learning_rate": 5.7133270701919204e-05, "loss": 0.6471, "step": 10800 }, { "epoch": 2.92, "grad_norm": 2.502366781234741, "learning_rate": 5.4462488256609854e-05, "loss": 0.6513, "step": 11000 }, { "epoch": 2.98, "grad_norm": 2.680753707885742, "learning_rate": 5.177828479398739e-05, "loss": 0.6701, "step": 11200 }, { "epoch": 3.0, "eval_loss": 0.5908769369125366, "eval_runtime": 267.1069, "eval_samples_per_second": 7.042, "eval_steps_per_second": 0.884, "step": 11289 }, { "epoch": 3.03, "grad_norm": 1.714341640472412, "learning_rate": 4.909408133136492e-05, "loss": 0.6571, "step": 11400 }, { "epoch": 3.08, "grad_norm": 2.996971607208252, "learning_rate": 4.640987786874245e-05, "loss": 0.6627, "step": 11600 }, { "epoch": 3.14, "grad_norm": 2.6136629581451416, "learning_rate": 4.3725674406119985e-05, "loss": 0.6486, "step": 11800 }, { "epoch": 3.19, "grad_norm": 1.8732506036758423, "learning_rate": 4.104147094349752e-05, "loss": 0.5942, "step": 12000 }, { "epoch": 3.24, "grad_norm": 3.316340923309326, "learning_rate": 3.8357267480875054e-05, "loss": 0.6228, "step": 12200 }, { "epoch": 3.3, "grad_norm": 2.4904277324676514, "learning_rate": 3.567306401825258e-05, "loss": 0.6313, "step": 12400 }, { "epoch": 3.35, "grad_norm": 2.3936824798583984, "learning_rate": 3.2988860555630116e-05, "loss": 0.6132, "step": 12600 }, { "epoch": 3.4, "grad_norm": 2.5523078441619873, "learning_rate": 3.030465709300765e-05, "loss": 0.6108, "step": 12800 }, { "epoch": 3.45, "grad_norm": 2.3019254207611084, "learning_rate": 2.7620453630385185e-05, "loss": 0.6133, "step": 13000 }, { "epoch": 3.51, "grad_norm": 2.492788791656494, "learning_rate": 2.493625016776272e-05, "loss": 0.6241, "step": 13200 }, { "epoch": 3.56, "grad_norm": 2.1467068195343018, "learning_rate": 2.2265467722453363e-05, "loss": 0.6023, "step": 13400 }, { "epoch": 3.61, "grad_norm": 1.78839910030365, "learning_rate": 1.9581264259830894e-05, "loss": 0.6059, "step": 13600 }, { "epoch": 3.67, "grad_norm": 2.343613624572754, "learning_rate": 1.689706079720843e-05, "loss": 0.5922, "step": 13800 }, { "epoch": 3.72, "grad_norm": 2.2175772190093994, "learning_rate": 1.4212857334585964e-05, "loss": 0.6092, "step": 14000 }, { "epoch": 3.77, "grad_norm": 1.8630731105804443, "learning_rate": 1.1528653871963495e-05, "loss": 0.6055, "step": 14200 }, { "epoch": 3.83, "grad_norm": 1.4924801588058472, "learning_rate": 8.84445040934103e-06, "loss": 0.6053, "step": 14400 }, { "epoch": 3.88, "grad_norm": 3.014918088912964, "learning_rate": 6.1602469467185615e-06, "loss": 0.5846, "step": 14600 }, { "epoch": 3.93, "grad_norm": 1.5459446907043457, "learning_rate": 3.4760434840960946e-06, "loss": 0.5826, "step": 14800 }, { "epoch": 3.99, "grad_norm": 3.936332941055298, "learning_rate": 7.918400214736277e-07, "loss": 0.608, "step": 15000 }, { "epoch": 4.0, "eval_loss": 0.5480290651321411, "eval_runtime": 221.6429, "eval_samples_per_second": 8.487, "eval_steps_per_second": 1.065, "step": 15052 }, { "epoch": 4.0, "step": 15052, "total_flos": 8148659183026176.0, "train_loss": 0.7414002821479045, "train_runtime": 7813.3494, "train_samples_per_second": 7.706, "train_steps_per_second": 1.926 } ], "logging_steps": 200, "max_steps": 15052, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "total_flos": 8148659183026176.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }