End of training
Browse files- README.md +20 -2
- all_results.json +12 -12
- eval_results.json +7 -7
- runs/Jul17_23-18-43_fe084eaf0329/events.out.tfevents.1721264859.fe084eaf0329.1588.1 +3 -0
- train_results.json +6 -6
- trainer_state.json +365 -6
README.md
CHANGED
@@ -1,9 +1,24 @@
|
|
1 |
---
|
2 |
tags:
|
3 |
- generated_from_trainer
|
|
|
|
|
|
|
|
|
4 |
model-index:
|
5 |
- name: roberta-javanese
|
6 |
-
results:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
---
|
8 |
|
9 |
<!-- This model card has been generated automatically according to the information the Trainer had access to. You
|
@@ -11,7 +26,10 @@ should probably proofread and complete it, then remove this comment. -->
|
|
11 |
|
12 |
# roberta-javanese
|
13 |
|
14 |
-
This model is a fine-tuned version of [](https://huggingface.co/) on
|
|
|
|
|
|
|
15 |
|
16 |
## Model description
|
17 |
|
|
|
1 |
---
|
2 |
tags:
|
3 |
- generated_from_trainer
|
4 |
+
datasets:
|
5 |
+
- akahana/GlotCC-V1-jav-Latn
|
6 |
+
metrics:
|
7 |
+
- accuracy
|
8 |
model-index:
|
9 |
- name: roberta-javanese
|
10 |
+
results:
|
11 |
+
- task:
|
12 |
+
name: Masked Language Modeling
|
13 |
+
type: fill-mask
|
14 |
+
dataset:
|
15 |
+
name: akahana/GlotCC-V1-jav-Latn default
|
16 |
+
type: akahana/GlotCC-V1-jav-Latn
|
17 |
+
args: default
|
18 |
+
metrics:
|
19 |
+
- name: Accuracy
|
20 |
+
type: accuracy
|
21 |
+
value: 0.5302221081011683
|
22 |
---
|
23 |
|
24 |
<!-- This model card has been generated automatically according to the information the Trainer had access to. You
|
|
|
26 |
|
27 |
# roberta-javanese
|
28 |
|
29 |
+
This model is a fine-tuned version of [](https://huggingface.co/) on the akahana/GlotCC-V1-jav-Latn default dataset.
|
30 |
+
It achieves the following results on the evaluation set:
|
31 |
+
- Loss: 2.9194
|
32 |
+
- Accuracy: 0.5302
|
33 |
|
34 |
## Model description
|
35 |
|
all_results.json
CHANGED
@@ -1,16 +1,16 @@
|
|
1 |
{
|
2 |
-
"epoch":
|
3 |
-
"eval_accuracy": 0.
|
4 |
-
"eval_loss": 2.
|
5 |
-
"eval_runtime":
|
6 |
"eval_samples": 4053,
|
7 |
-
"eval_samples_per_second":
|
8 |
-
"eval_steps_per_second":
|
9 |
-
"perplexity":
|
10 |
-
"total_flos": 1.
|
11 |
-
"train_loss": 0.
|
12 |
-
"train_runtime":
|
13 |
"train_samples": 80219,
|
14 |
-
"train_samples_per_second":
|
15 |
-
"train_steps_per_second":
|
16 |
}
|
|
|
1 |
{
|
2 |
+
"epoch": 35.0,
|
3 |
+
"eval_accuracy": 0.5302221081011683,
|
4 |
+
"eval_loss": 2.9193999767303467,
|
5 |
+
"eval_runtime": 31.3487,
|
6 |
"eval_samples": 4053,
|
7 |
+
"eval_samples_per_second": 129.287,
|
8 |
+
"eval_steps_per_second": 32.346,
|
9 |
+
"perplexity": 18.530165592844845,
|
10 |
+
"total_flos": 1.8479030675124096e+17,
|
11 |
+
"train_loss": 0.37831091759340585,
|
12 |
+
"train_runtime": 6392.496,
|
13 |
"train_samples": 80219,
|
14 |
+
"train_samples_per_second": 439.213,
|
15 |
+
"train_steps_per_second": 27.453
|
16 |
}
|
eval_results.json
CHANGED
@@ -1,10 +1,10 @@
|
|
1 |
{
|
2 |
-
"epoch":
|
3 |
-
"eval_accuracy": 0.
|
4 |
-
"eval_loss": 2.
|
5 |
-
"eval_runtime":
|
6 |
"eval_samples": 4053,
|
7 |
-
"eval_samples_per_second":
|
8 |
-
"eval_steps_per_second":
|
9 |
-
"perplexity":
|
10 |
}
|
|
|
1 |
{
|
2 |
+
"epoch": 35.0,
|
3 |
+
"eval_accuracy": 0.5302221081011683,
|
4 |
+
"eval_loss": 2.9193999767303467,
|
5 |
+
"eval_runtime": 31.3487,
|
6 |
"eval_samples": 4053,
|
7 |
+
"eval_samples_per_second": 129.287,
|
8 |
+
"eval_steps_per_second": 32.346,
|
9 |
+
"perplexity": 18.530165592844845
|
10 |
}
|
runs/Jul17_23-18-43_fe084eaf0329/events.out.tfevents.1721264859.fe084eaf0329.1588.1
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:359c054406426308763fd4c7fcc18b3e809ebf6ab4e0ade30f1f237069aa4e55
|
3 |
+
size 417
|
train_results.json
CHANGED
@@ -1,9 +1,9 @@
|
|
1 |
{
|
2 |
-
"epoch":
|
3 |
-
"total_flos": 1.
|
4 |
-
"train_loss": 0.
|
5 |
-
"train_runtime":
|
6 |
"train_samples": 80219,
|
7 |
-
"train_samples_per_second":
|
8 |
-
"train_steps_per_second":
|
9 |
}
|
|
|
1 |
{
|
2 |
+
"epoch": 35.0,
|
3 |
+
"total_flos": 1.8479030675124096e+17,
|
4 |
+
"train_loss": 0.37831091759340585,
|
5 |
+
"train_runtime": 6392.496,
|
6 |
"train_samples": 80219,
|
7 |
+
"train_samples_per_second": 439.213,
|
8 |
+
"train_steps_per_second": 27.453
|
9 |
}
|
trainer_state.json
CHANGED
@@ -1,9 +1,9 @@
|
|
1 |
{
|
2 |
"best_metric": null,
|
3 |
"best_model_checkpoint": null,
|
4 |
-
"epoch":
|
5 |
"eval_steps": 500,
|
6 |
-
"global_step":
|
7 |
"is_hyper_param_search": false,
|
8 |
"is_local_process_zero": true,
|
9 |
"is_world_process_zero": true,
|
@@ -2161,12 +2161,371 @@
|
|
2161 |
"train_runtime": 6930.0607,
|
2162 |
"train_samples_per_second": 347.265,
|
2163 |
"train_steps_per_second": 21.705
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2164 |
}
|
2165 |
],
|
2166 |
"logging_steps": 500,
|
2167 |
-
"max_steps":
|
2168 |
"num_input_tokens_seen": 0,
|
2169 |
-
"num_train_epochs":
|
2170 |
"save_steps": 500,
|
2171 |
"stateful_callbacks": {
|
2172 |
"TrainerControl": {
|
@@ -2175,12 +2534,12 @@
|
|
2175 |
"should_evaluate": false,
|
2176 |
"should_log": false,
|
2177 |
"should_save": true,
|
2178 |
-
"should_training_stop":
|
2179 |
},
|
2180 |
"attributes": {}
|
2181 |
}
|
2182 |
},
|
2183 |
-
"total_flos": 1.
|
2184 |
"train_batch_size": 16,
|
2185 |
"trial_name": null,
|
2186 |
"trial_params": null
|
|
|
1 |
{
|
2 |
"best_metric": null,
|
3 |
"best_model_checkpoint": null,
|
4 |
+
"epoch": 35.0,
|
5 |
"eval_steps": 500,
|
6 |
+
"global_step": 175490,
|
7 |
"is_hyper_param_search": false,
|
8 |
"is_local_process_zero": true,
|
9 |
"is_world_process_zero": true,
|
|
|
2161 |
"train_runtime": 6930.0607,
|
2162 |
"train_samples_per_second": 347.265,
|
2163 |
"train_steps_per_second": 21.705
|
2164 |
+
},
|
2165 |
+
{
|
2166 |
+
"epoch": 30.01595532508975,
|
2167 |
+
"grad_norm": 6.077478885650635,
|
2168 |
+
"learning_rate": 4.997720667844322e-05,
|
2169 |
+
"loss": 2.7286,
|
2170 |
+
"step": 150500
|
2171 |
+
},
|
2172 |
+
{
|
2173 |
+
"epoch": 30.11567610690068,
|
2174 |
+
"grad_norm": 6.566033363342285,
|
2175 |
+
"learning_rate": 4.983474841871332e-05,
|
2176 |
+
"loss": 2.7319,
|
2177 |
+
"step": 151000
|
2178 |
+
},
|
2179 |
+
{
|
2180 |
+
"epoch": 30.215396888711606,
|
2181 |
+
"grad_norm": 7.486234188079834,
|
2182 |
+
"learning_rate": 4.969229015898342e-05,
|
2183 |
+
"loss": 2.7899,
|
2184 |
+
"step": 151500
|
2185 |
+
},
|
2186 |
+
{
|
2187 |
+
"epoch": 30.315117670522536,
|
2188 |
+
"grad_norm": 7.640929222106934,
|
2189 |
+
"learning_rate": 4.954983189925352e-05,
|
2190 |
+
"loss": 2.7598,
|
2191 |
+
"step": 152000
|
2192 |
+
},
|
2193 |
+
{
|
2194 |
+
"epoch": 30.414838452333466,
|
2195 |
+
"grad_norm": 7.036547660827637,
|
2196 |
+
"learning_rate": 4.940737363952362e-05,
|
2197 |
+
"loss": 2.754,
|
2198 |
+
"step": 152500
|
2199 |
+
},
|
2200 |
+
{
|
2201 |
+
"epoch": 30.514559234144397,
|
2202 |
+
"grad_norm": 7.128058910369873,
|
2203 |
+
"learning_rate": 4.926491537979372e-05,
|
2204 |
+
"loss": 2.7888,
|
2205 |
+
"step": 153000
|
2206 |
+
},
|
2207 |
+
{
|
2208 |
+
"epoch": 30.614280015955327,
|
2209 |
+
"grad_norm": 7.1788249015808105,
|
2210 |
+
"learning_rate": 4.912245712006382e-05,
|
2211 |
+
"loss": 2.7662,
|
2212 |
+
"step": 153500
|
2213 |
+
},
|
2214 |
+
{
|
2215 |
+
"epoch": 30.714000797766253,
|
2216 |
+
"grad_norm": 7.081215858459473,
|
2217 |
+
"learning_rate": 4.897999886033392e-05,
|
2218 |
+
"loss": 2.7722,
|
2219 |
+
"step": 154000
|
2220 |
+
},
|
2221 |
+
{
|
2222 |
+
"epoch": 30.813721579577184,
|
2223 |
+
"grad_norm": 6.131695747375488,
|
2224 |
+
"learning_rate": 4.883754060060402e-05,
|
2225 |
+
"loss": 2.7464,
|
2226 |
+
"step": 154500
|
2227 |
+
},
|
2228 |
+
{
|
2229 |
+
"epoch": 30.913442361388114,
|
2230 |
+
"grad_norm": 6.66817569732666,
|
2231 |
+
"learning_rate": 4.869508234087412e-05,
|
2232 |
+
"loss": 2.7352,
|
2233 |
+
"step": 155000
|
2234 |
+
},
|
2235 |
+
{
|
2236 |
+
"epoch": 31.013163143199044,
|
2237 |
+
"grad_norm": 7.4430952072143555,
|
2238 |
+
"learning_rate": 4.8552908997663685e-05,
|
2239 |
+
"loss": 2.7503,
|
2240 |
+
"step": 155500
|
2241 |
+
},
|
2242 |
+
{
|
2243 |
+
"epoch": 31.11288392500997,
|
2244 |
+
"grad_norm": 7.984841346740723,
|
2245 |
+
"learning_rate": 4.8410450737933786e-05,
|
2246 |
+
"loss": 2.6821,
|
2247 |
+
"step": 156000
|
2248 |
+
},
|
2249 |
+
{
|
2250 |
+
"epoch": 31.2126047068209,
|
2251 |
+
"grad_norm": 7.386984348297119,
|
2252 |
+
"learning_rate": 4.8267992478203886e-05,
|
2253 |
+
"loss": 2.6916,
|
2254 |
+
"step": 156500
|
2255 |
+
},
|
2256 |
+
{
|
2257 |
+
"epoch": 31.31232548863183,
|
2258 |
+
"grad_norm": 6.3857951164245605,
|
2259 |
+
"learning_rate": 4.8125534218473987e-05,
|
2260 |
+
"loss": 2.6826,
|
2261 |
+
"step": 157000
|
2262 |
+
},
|
2263 |
+
{
|
2264 |
+
"epoch": 31.41204627044276,
|
2265 |
+
"grad_norm": 7.394888401031494,
|
2266 |
+
"learning_rate": 4.798307595874409e-05,
|
2267 |
+
"loss": 2.7099,
|
2268 |
+
"step": 157500
|
2269 |
+
},
|
2270 |
+
{
|
2271 |
+
"epoch": 31.51176705225369,
|
2272 |
+
"grad_norm": 7.39955997467041,
|
2273 |
+
"learning_rate": 4.784061769901419e-05,
|
2274 |
+
"loss": 2.7056,
|
2275 |
+
"step": 158000
|
2276 |
+
},
|
2277 |
+
{
|
2278 |
+
"epoch": 31.61148783406462,
|
2279 |
+
"grad_norm": 6.624033451080322,
|
2280 |
+
"learning_rate": 4.769844435580375e-05,
|
2281 |
+
"loss": 2.6903,
|
2282 |
+
"step": 158500
|
2283 |
+
},
|
2284 |
+
{
|
2285 |
+
"epoch": 31.71120861587555,
|
2286 |
+
"grad_norm": 6.656693458557129,
|
2287 |
+
"learning_rate": 4.755627101259331e-05,
|
2288 |
+
"loss": 2.6877,
|
2289 |
+
"step": 159000
|
2290 |
+
},
|
2291 |
+
{
|
2292 |
+
"epoch": 31.81092939768648,
|
2293 |
+
"grad_norm": 7.474542140960693,
|
2294 |
+
"learning_rate": 4.741381275286341e-05,
|
2295 |
+
"loss": 2.6965,
|
2296 |
+
"step": 159500
|
2297 |
+
},
|
2298 |
+
{
|
2299 |
+
"epoch": 31.910650179497406,
|
2300 |
+
"grad_norm": 7.388774394989014,
|
2301 |
+
"learning_rate": 4.727135449313351e-05,
|
2302 |
+
"loss": 2.7145,
|
2303 |
+
"step": 160000
|
2304 |
+
},
|
2305 |
+
{
|
2306 |
+
"epoch": 32.01037096130834,
|
2307 |
+
"grad_norm": 7.423541069030762,
|
2308 |
+
"learning_rate": 4.712889623340361e-05,
|
2309 |
+
"loss": 2.6943,
|
2310 |
+
"step": 160500
|
2311 |
+
},
|
2312 |
+
{
|
2313 |
+
"epoch": 32.11009174311926,
|
2314 |
+
"grad_norm": 6.063508033752441,
|
2315 |
+
"learning_rate": 4.698643797367371e-05,
|
2316 |
+
"loss": 2.6214,
|
2317 |
+
"step": 161000
|
2318 |
+
},
|
2319 |
+
{
|
2320 |
+
"epoch": 32.20981252493019,
|
2321 |
+
"grad_norm": 7.619082450866699,
|
2322 |
+
"learning_rate": 4.6843979713943814e-05,
|
2323 |
+
"loss": 2.6318,
|
2324 |
+
"step": 161500
|
2325 |
+
},
|
2326 |
+
{
|
2327 |
+
"epoch": 32.30953330674112,
|
2328 |
+
"grad_norm": 6.978066921234131,
|
2329 |
+
"learning_rate": 4.670152145421392e-05,
|
2330 |
+
"loss": 2.6327,
|
2331 |
+
"step": 162000
|
2332 |
+
},
|
2333 |
+
{
|
2334 |
+
"epoch": 32.40925408855205,
|
2335 |
+
"grad_norm": 6.166346073150635,
|
2336 |
+
"learning_rate": 4.655906319448402e-05,
|
2337 |
+
"loss": 2.6419,
|
2338 |
+
"step": 162500
|
2339 |
+
},
|
2340 |
+
{
|
2341 |
+
"epoch": 32.508974870362984,
|
2342 |
+
"grad_norm": 7.364738464355469,
|
2343 |
+
"learning_rate": 4.641660493475412e-05,
|
2344 |
+
"loss": 2.6356,
|
2345 |
+
"step": 163000
|
2346 |
+
},
|
2347 |
+
{
|
2348 |
+
"epoch": 32.608695652173914,
|
2349 |
+
"grad_norm": 7.476531982421875,
|
2350 |
+
"learning_rate": 4.627414667502422e-05,
|
2351 |
+
"loss": 2.6344,
|
2352 |
+
"step": 163500
|
2353 |
+
},
|
2354 |
+
{
|
2355 |
+
"epoch": 32.708416433984844,
|
2356 |
+
"grad_norm": 7.627068042755127,
|
2357 |
+
"learning_rate": 4.613168841529432e-05,
|
2358 |
+
"loss": 2.6434,
|
2359 |
+
"step": 164000
|
2360 |
+
},
|
2361 |
+
{
|
2362 |
+
"epoch": 32.808137215795774,
|
2363 |
+
"grad_norm": 7.334908962249756,
|
2364 |
+
"learning_rate": 4.598923015556442e-05,
|
2365 |
+
"loss": 2.663,
|
2366 |
+
"step": 164500
|
2367 |
+
},
|
2368 |
+
{
|
2369 |
+
"epoch": 32.907857997606705,
|
2370 |
+
"grad_norm": 6.580120086669922,
|
2371 |
+
"learning_rate": 4.5847341728873446e-05,
|
2372 |
+
"loss": 2.6406,
|
2373 |
+
"step": 165000
|
2374 |
+
},
|
2375 |
+
{
|
2376 |
+
"epoch": 33.00757877941763,
|
2377 |
+
"grad_norm": 6.953055381774902,
|
2378 |
+
"learning_rate": 4.570488346914355e-05,
|
2379 |
+
"loss": 2.6517,
|
2380 |
+
"step": 165500
|
2381 |
+
},
|
2382 |
+
{
|
2383 |
+
"epoch": 33.10729956122856,
|
2384 |
+
"grad_norm": 6.980926036834717,
|
2385 |
+
"learning_rate": 4.556242520941365e-05,
|
2386 |
+
"loss": 2.589,
|
2387 |
+
"step": 166000
|
2388 |
+
},
|
2389 |
+
{
|
2390 |
+
"epoch": 33.20702034303949,
|
2391 |
+
"grad_norm": 7.215412616729736,
|
2392 |
+
"learning_rate": 4.541996694968375e-05,
|
2393 |
+
"loss": 2.5831,
|
2394 |
+
"step": 166500
|
2395 |
+
},
|
2396 |
+
{
|
2397 |
+
"epoch": 33.30674112485042,
|
2398 |
+
"grad_norm": 7.203444004058838,
|
2399 |
+
"learning_rate": 4.527750868995385e-05,
|
2400 |
+
"loss": 2.5739,
|
2401 |
+
"step": 167000
|
2402 |
+
},
|
2403 |
+
{
|
2404 |
+
"epoch": 33.40646190666135,
|
2405 |
+
"grad_norm": 5.696502685546875,
|
2406 |
+
"learning_rate": 4.513505043022395e-05,
|
2407 |
+
"loss": 2.604,
|
2408 |
+
"step": 167500
|
2409 |
+
},
|
2410 |
+
{
|
2411 |
+
"epoch": 33.50618268847228,
|
2412 |
+
"grad_norm": 6.160342216491699,
|
2413 |
+
"learning_rate": 4.499259217049405e-05,
|
2414 |
+
"loss": 2.5848,
|
2415 |
+
"step": 168000
|
2416 |
+
},
|
2417 |
+
{
|
2418 |
+
"epoch": 33.60590347028321,
|
2419 |
+
"grad_norm": 6.758869171142578,
|
2420 |
+
"learning_rate": 4.485013391076415e-05,
|
2421 |
+
"loss": 2.6157,
|
2422 |
+
"step": 168500
|
2423 |
+
},
|
2424 |
+
{
|
2425 |
+
"epoch": 33.70562425209414,
|
2426 |
+
"grad_norm": 7.064002513885498,
|
2427 |
+
"learning_rate": 4.4708245484073166e-05,
|
2428 |
+
"loss": 2.5765,
|
2429 |
+
"step": 169000
|
2430 |
+
},
|
2431 |
+
{
|
2432 |
+
"epoch": 33.80534503390506,
|
2433 |
+
"grad_norm": 7.993391513824463,
|
2434 |
+
"learning_rate": 4.4565787224343267e-05,
|
2435 |
+
"loss": 2.6115,
|
2436 |
+
"step": 169500
|
2437 |
+
},
|
2438 |
+
{
|
2439 |
+
"epoch": 33.90506581571599,
|
2440 |
+
"grad_norm": 7.196022033691406,
|
2441 |
+
"learning_rate": 4.442332896461337e-05,
|
2442 |
+
"loss": 2.591,
|
2443 |
+
"step": 170000
|
2444 |
+
},
|
2445 |
+
{
|
2446 |
+
"epoch": 34.00478659752692,
|
2447 |
+
"grad_norm": 8.118667602539062,
|
2448 |
+
"learning_rate": 4.428115562140293e-05,
|
2449 |
+
"loss": 2.5833,
|
2450 |
+
"step": 170500
|
2451 |
+
},
|
2452 |
+
{
|
2453 |
+
"epoch": 34.10450737933785,
|
2454 |
+
"grad_norm": 7.465199947357178,
|
2455 |
+
"learning_rate": 4.413869736167303e-05,
|
2456 |
+
"loss": 2.5509,
|
2457 |
+
"step": 171000
|
2458 |
+
},
|
2459 |
+
{
|
2460 |
+
"epoch": 34.204228161148784,
|
2461 |
+
"grad_norm": 6.739304542541504,
|
2462 |
+
"learning_rate": 4.399623910194313e-05,
|
2463 |
+
"loss": 2.5357,
|
2464 |
+
"step": 171500
|
2465 |
+
},
|
2466 |
+
{
|
2467 |
+
"epoch": 34.303948942959714,
|
2468 |
+
"grad_norm": 6.758444786071777,
|
2469 |
+
"learning_rate": 4.385378084221323e-05,
|
2470 |
+
"loss": 2.567,
|
2471 |
+
"step": 172000
|
2472 |
+
},
|
2473 |
+
{
|
2474 |
+
"epoch": 34.403669724770644,
|
2475 |
+
"grad_norm": 6.511049270629883,
|
2476 |
+
"learning_rate": 4.371132258248333e-05,
|
2477 |
+
"loss": 2.5759,
|
2478 |
+
"step": 172500
|
2479 |
+
},
|
2480 |
+
{
|
2481 |
+
"epoch": 34.503390506581574,
|
2482 |
+
"grad_norm": 7.730967044830322,
|
2483 |
+
"learning_rate": 4.356886432275343e-05,
|
2484 |
+
"loss": 2.5494,
|
2485 |
+
"step": 173000
|
2486 |
+
},
|
2487 |
+
{
|
2488 |
+
"epoch": 34.6031112883925,
|
2489 |
+
"grad_norm": 6.543623924255371,
|
2490 |
+
"learning_rate": 4.342640606302353e-05,
|
2491 |
+
"loss": 2.5482,
|
2492 |
+
"step": 173500
|
2493 |
+
},
|
2494 |
+
{
|
2495 |
+
"epoch": 34.70283207020343,
|
2496 |
+
"grad_norm": 7.216828346252441,
|
2497 |
+
"learning_rate": 4.328394780329364e-05,
|
2498 |
+
"loss": 2.5593,
|
2499 |
+
"step": 174000
|
2500 |
+
},
|
2501 |
+
{
|
2502 |
+
"epoch": 34.80255285201436,
|
2503 |
+
"grad_norm": 6.891706943511963,
|
2504 |
+
"learning_rate": 4.3141774460083194e-05,
|
2505 |
+
"loss": 2.5409,
|
2506 |
+
"step": 174500
|
2507 |
+
},
|
2508 |
+
{
|
2509 |
+
"epoch": 34.90227363382529,
|
2510 |
+
"grad_norm": 7.4927778244018555,
|
2511 |
+
"learning_rate": 4.29993162003533e-05,
|
2512 |
+
"loss": 2.5673,
|
2513 |
+
"step": 175000
|
2514 |
+
},
|
2515 |
+
{
|
2516 |
+
"epoch": 35.0,
|
2517 |
+
"step": 175490,
|
2518 |
+
"total_flos": 1.8479030675124096e+17,
|
2519 |
+
"train_loss": 0.37831091759340585,
|
2520 |
+
"train_runtime": 6392.496,
|
2521 |
+
"train_samples_per_second": 439.213,
|
2522 |
+
"train_steps_per_second": 27.453
|
2523 |
}
|
2524 |
],
|
2525 |
"logging_steps": 500,
|
2526 |
+
"max_steps": 175490,
|
2527 |
"num_input_tokens_seen": 0,
|
2528 |
+
"num_train_epochs": 35,
|
2529 |
"save_steps": 500,
|
2530 |
"stateful_callbacks": {
|
2531 |
"TrainerControl": {
|
|
|
2534 |
"should_evaluate": false,
|
2535 |
"should_log": false,
|
2536 |
"should_save": true,
|
2537 |
+
"should_training_stop": true
|
2538 |
},
|
2539 |
"attributes": {}
|
2540 |
}
|
2541 |
},
|
2542 |
+
"total_flos": 1.8479030675124096e+17,
|
2543 |
"train_batch_size": 16,
|
2544 |
"trial_name": null,
|
2545 |
"trial_params": null
|