|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.7199999999999998, |
|
"eval_steps": 3, |
|
"global_step": 48, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.1656520962715149, |
|
"learning_rate": 2e-05, |
|
"loss": 1.4615, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 1.4899382591247559, |
|
"eval_runtime": 29.186, |
|
"eval_samples_per_second": 3.426, |
|
"eval_steps_per_second": 1.713, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.1882542371749878, |
|
"learning_rate": 4e-05, |
|
"loss": 1.4241, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.15945176780223846, |
|
"learning_rate": 6e-05, |
|
"loss": 1.3849, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"eval_loss": 1.485183835029602, |
|
"eval_runtime": 29.3484, |
|
"eval_samples_per_second": 3.407, |
|
"eval_steps_per_second": 1.704, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.13675835728645325, |
|
"learning_rate": 8e-05, |
|
"loss": 1.2212, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.1532098948955536, |
|
"learning_rate": 0.0001, |
|
"loss": 1.3626, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.16159594058990479, |
|
"learning_rate": 0.00012, |
|
"loss": 1.3665, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"eval_loss": 1.441084623336792, |
|
"eval_runtime": 29.4785, |
|
"eval_samples_per_second": 3.392, |
|
"eval_steps_per_second": 1.696, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.1462002545595169, |
|
"learning_rate": 0.00014, |
|
"loss": 1.3003, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.13418763875961304, |
|
"learning_rate": 0.00016, |
|
"loss": 1.3331, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.10984567552804947, |
|
"learning_rate": 0.00018, |
|
"loss": 1.2689, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"eval_loss": 1.3380621671676636, |
|
"eval_runtime": 29.5305, |
|
"eval_samples_per_second": 3.386, |
|
"eval_steps_per_second": 1.693, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.10075916349887848, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2942, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.11774784326553345, |
|
"learning_rate": 0.000199658449300667, |
|
"loss": 1.2936, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.10691336542367935, |
|
"learning_rate": 0.00019863613034027224, |
|
"loss": 1.2258, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"eval_loss": 1.296021580696106, |
|
"eval_runtime": 29.6291, |
|
"eval_samples_per_second": 3.375, |
|
"eval_steps_per_second": 1.688, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 0.10832437872886658, |
|
"learning_rate": 0.00019694002659393305, |
|
"loss": 1.2647, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 0.11240752041339874, |
|
"learning_rate": 0.00019458172417006347, |
|
"loss": 1.2595, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 0.10769112408161163, |
|
"learning_rate": 0.00019157733266550575, |
|
"loss": 1.2518, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"eval_loss": 1.279650092124939, |
|
"eval_runtime": 29.6168, |
|
"eval_samples_per_second": 3.376, |
|
"eval_steps_per_second": 1.688, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 0.09908384829759598, |
|
"learning_rate": 0.0001879473751206489, |
|
"loss": 1.1644, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 0.09107685834169388, |
|
"learning_rate": 0.00018371664782625287, |
|
"loss": 1.1601, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.09361294656991959, |
|
"learning_rate": 0.00017891405093963938, |
|
"loss": 1.2263, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"eval_loss": 1.2533847093582153, |
|
"eval_runtime": 29.5676, |
|
"eval_samples_per_second": 3.382, |
|
"eval_steps_per_second": 1.691, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 0.0980026125907898, |
|
"learning_rate": 0.00017357239106731317, |
|
"loss": 1.2272, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 0.07957063615322113, |
|
"learning_rate": 0.00016772815716257412, |
|
"loss": 1.1913, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 1.6400000000000001, |
|
"grad_norm": 0.07226990163326263, |
|
"learning_rate": 0.0001614212712689668, |
|
"loss": 1.1343, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 1.6400000000000001, |
|
"eval_loss": 1.2354038953781128, |
|
"eval_runtime": 29.5859, |
|
"eval_samples_per_second": 3.38, |
|
"eval_steps_per_second": 1.69, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 0.0797078013420105, |
|
"learning_rate": 0.00015469481581224272, |
|
"loss": 1.202, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.0746772438287735, |
|
"learning_rate": 0.00014759473930370736, |
|
"loss": 1.2479, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 0.07190073281526566, |
|
"learning_rate": 0.00014016954246529696, |
|
"loss": 1.2699, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"eval_loss": 1.2254745960235596, |
|
"eval_runtime": 29.6412, |
|
"eval_samples_per_second": 3.374, |
|
"eval_steps_per_second": 1.687, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 0.06926661729812622, |
|
"learning_rate": 0.00013246994692046836, |
|
"loss": 1.2042, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 0.07788683474063873, |
|
"learning_rate": 0.00012454854871407994, |
|
"loss": 1.1925, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 0.06513918191194534, |
|
"learning_rate": 0.00011645945902807341, |
|
"loss": 1.1493, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"eval_loss": 1.2227890491485596, |
|
"eval_runtime": 29.6808, |
|
"eval_samples_per_second": 3.369, |
|
"eval_steps_per_second": 1.685, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 0.07514671981334686, |
|
"learning_rate": 0.00010825793454723325, |
|
"loss": 1.1685, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 0.06782150268554688, |
|
"learning_rate": 0.0001, |
|
"loss": 1.2049, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 0.06837104260921478, |
|
"learning_rate": 9.174206545276677e-05, |
|
"loss": 1.153, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"eval_loss": 1.2187583446502686, |
|
"eval_runtime": 29.5599, |
|
"eval_samples_per_second": 3.383, |
|
"eval_steps_per_second": 1.691, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 0.0675550326704979, |
|
"learning_rate": 8.35405409719266e-05, |
|
"loss": 1.1826, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 0.06812074780464172, |
|
"learning_rate": 7.54514512859201e-05, |
|
"loss": 1.2106, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 0.06854978948831558, |
|
"learning_rate": 6.753005307953167e-05, |
|
"loss": 1.1947, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"eval_loss": 1.2183395624160767, |
|
"eval_runtime": 29.5611, |
|
"eval_samples_per_second": 3.383, |
|
"eval_steps_per_second": 1.691, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 0.06954147666692734, |
|
"learning_rate": 5.983045753470308e-05, |
|
"loss": 1.1887, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 2.7199999999999998, |
|
"grad_norm": 0.07219364494085312, |
|
"learning_rate": 5.240526069629265e-05, |
|
"loss": 1.1784, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 0.07003732025623322, |
|
"learning_rate": 4.530518418775733e-05, |
|
"loss": 1.1125, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"eval_loss": 1.2157045602798462, |
|
"eval_runtime": 29.5711, |
|
"eval_samples_per_second": 3.382, |
|
"eval_steps_per_second": 1.691, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 0.06867608428001404, |
|
"learning_rate": 3.857872873103322e-05, |
|
"loss": 1.1847, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 0.06787115335464478, |
|
"learning_rate": 3.227184283742591e-05, |
|
"loss": 1.1651, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"grad_norm": 0.06784378737211227, |
|
"learning_rate": 2.6427608932686843e-05, |
|
"loss": 1.1512, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"eval_loss": 1.2122877836227417, |
|
"eval_runtime": 29.5849, |
|
"eval_samples_per_second": 3.38, |
|
"eval_steps_per_second": 1.69, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 0.064244844019413, |
|
"learning_rate": 2.1085949060360654e-05, |
|
"loss": 1.154, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"grad_norm": 0.0654948428273201, |
|
"learning_rate": 1.6283352173747145e-05, |
|
"loss": 1.1454, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 0.06797634065151215, |
|
"learning_rate": 1.2052624879351104e-05, |
|
"loss": 1.1883, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"eval_loss": 1.209986686706543, |
|
"eval_runtime": 29.54, |
|
"eval_samples_per_second": 3.385, |
|
"eval_steps_per_second": 1.693, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 3.32, |
|
"grad_norm": 0.06889563798904419, |
|
"learning_rate": 8.422667334494249e-06, |
|
"loss": 1.174, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 0.06667731702327728, |
|
"learning_rate": 5.418275829936537e-06, |
|
"loss": 1.0968, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"grad_norm": 0.0668441653251648, |
|
"learning_rate": 3.059973406066963e-06, |
|
"loss": 1.1012, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"eval_loss": 1.2118505239486694, |
|
"eval_runtime": 29.5358, |
|
"eval_samples_per_second": 3.386, |
|
"eval_steps_per_second": 1.693, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"grad_norm": 0.06772288680076599, |
|
"learning_rate": 1.3638696597277679e-06, |
|
"loss": 1.1264, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 0.06901554763317108, |
|
"learning_rate": 3.415506993330153e-07, |
|
"loss": 1.1457, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 3.7199999999999998, |
|
"grad_norm": 0.06979726254940033, |
|
"learning_rate": 0.0, |
|
"loss": 1.1891, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 3.7199999999999998, |
|
"eval_loss": 1.2122316360473633, |
|
"eval_runtime": 29.5331, |
|
"eval_samples_per_second": 3.386, |
|
"eval_steps_per_second": 1.693, |
|
"step": 48 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 48, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 12, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9974952442724352.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|