fats-fme committed (verified)
Commit 0ac1fbe · 1 Parent(s): 02cfd10

Training in progress, step 563, checkpoint

last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:efedcd7712efe5df4242d40d0fc157567550dc57198de0fde11a067a253c3786
+ oid sha256:a8c595720a41e9384906f2d3e480d3aa304689f4e0612ea898704da5876df4ce
  size 101752088
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:2a3093ef84d124bf4f3a388a3f58cedd89b5fbf3ec80a866e1189f65649a0f5e
+ oid sha256:aa85267159afa7ee96961d8e086ff9125093d3784893802d3ad17d9a528ea772
  size 203713238
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9e52b4ddcd925a725a65812af6610fe4debc708c6e4fc1ee7e0e17160e2a6fc5
+ oid sha256:da5aca99bcde1ec8b0dc9a1dd61af6a832b8ca17d5e8974363414a00fe156561
  size 14512
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:7d9aa8c4c4812086f9a0cd74c7d98dc727224f492c2c8deb8168a9fa04e2846e
+ oid sha256:6355acd2ed897b92e4c2ba4445c30b5ccb7ab5d77d60dc3141cf3e52bd674a29
  size 14512
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:b2bb049f58262ac24b66ea8e4bbb35c588cda72b0f20c7495d16197e65e5d114
+ oid sha256:1aa6a07fde8b7b9172b2dbbfa971ec598626e705bbfe8cea4899774e3eba905a
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 0.751165371809101,
+ "epoch": 0.9997780244173141,
  "eval_steps": 141,
- "global_step": 423,
+ "global_step": 563,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -3000,6 +3000,986 @@
  "eval_samples_per_second": 6.06,
  "eval_steps_per_second": 1.52,
  "step": 423
3003
+ },
3004
+ {
3005
+ "epoch": 0.7529411764705882,
3006
+ "grad_norm": 0.44857800006866455,
3007
+ "learning_rate": 3.4094199810279924e-05,
3008
+ "loss": 0.6288,
3009
+ "step": 424
3010
+ },
3011
+ {
3012
+ "epoch": 0.7547169811320755,
3013
+ "grad_norm": 0.564264178276062,
3014
+ "learning_rate": 3.363486013983788e-05,
3015
+ "loss": 0.784,
3016
+ "step": 425
3017
+ },
3018
+ {
3019
+ "epoch": 0.7564927857935627,
3020
+ "grad_norm": 0.26752522587776184,
3021
+ "learning_rate": 3.317800934775696e-05,
3022
+ "loss": 0.8435,
3023
+ "step": 426
3024
+ },
3025
+ {
3026
+ "epoch": 0.7582685904550499,
3027
+ "grad_norm": 0.2690868377685547,
3028
+ "learning_rate": 3.2723664567219626e-05,
3029
+ "loss": 0.8125,
3030
+ "step": 427
3031
+ },
3032
+ {
3033
+ "epoch": 0.7600443951165372,
3034
+ "grad_norm": 0.28497472405433655,
3035
+ "learning_rate": 3.227184283742591e-05,
3036
+ "loss": 0.7349,
3037
+ "step": 428
3038
+ },
3039
+ {
3040
+ "epoch": 0.7618201997780244,
3041
+ "grad_norm": 0.3184243142604828,
3042
+ "learning_rate": 3.182256110295437e-05,
3043
+ "loss": 0.8037,
3044
+ "step": 429
3045
+ },
3046
+ {
3047
+ "epoch": 0.7635960044395117,
3048
+ "grad_norm": 0.29851233959198,
3049
+ "learning_rate": 3.137583621312665e-05,
3050
+ "loss": 0.7631,
3051
+ "step": 430
3052
+ },
3053
+ {
3054
+ "epoch": 0.7653718091009989,
3055
+ "grad_norm": 0.31429582834243774,
3056
+ "learning_rate": 3.093168492137557e-05,
3057
+ "loss": 0.7446,
3058
+ "step": 431
3059
+ },
3060
+ {
3061
+ "epoch": 0.7671476137624861,
3062
+ "grad_norm": 0.3191598653793335,
3063
+ "learning_rate": 3.0490123884616796e-05,
3064
+ "loss": 0.7301,
3065
+ "step": 432
3066
+ },
3067
+ {
3068
+ "epoch": 0.7689234184239734,
3069
+ "grad_norm": 0.32959750294685364,
3070
+ "learning_rate": 3.0051169662624225e-05,
3071
+ "loss": 0.7891,
3072
+ "step": 433
3073
+ },
3074
+ {
3075
+ "epoch": 0.7706992230854606,
3076
+ "grad_norm": 0.306445449590683,
3077
+ "learning_rate": 2.9614838717408867e-05,
3078
+ "loss": 0.7205,
3079
+ "step": 434
3080
+ },
3081
+ {
3082
+ "epoch": 0.7724750277469479,
3083
+ "grad_norm": 0.3152560889720917,
3084
+ "learning_rate": 2.9181147412601562e-05,
3085
+ "loss": 0.7436,
3086
+ "step": 435
3087
+ },
3088
+ {
3089
+ "epoch": 0.7742508324084351,
3090
+ "grad_norm": 0.3343624174594879,
3091
+ "learning_rate": 2.8750112012839214e-05,
3092
+ "loss": 0.7004,
3093
+ "step": 436
3094
+ },
3095
+ {
3096
+ "epoch": 0.7760266370699223,
3097
+ "grad_norm": 0.3389514982700348,
3098
+ "learning_rate": 2.8321748683154893e-05,
3099
+ "loss": 0.7037,
3100
+ "step": 437
3101
+ },
3102
+ {
3103
+ "epoch": 0.7778024417314096,
3104
+ "grad_norm": 0.36133134365081787,
3105
+ "learning_rate": 2.789607348837153e-05,
3106
+ "loss": 0.7648,
3107
+ "step": 438
3108
+ },
3109
+ {
3110
+ "epoch": 0.7795782463928967,
3111
+ "grad_norm": 0.3495211601257324,
3112
+ "learning_rate": 2.7473102392499518e-05,
3113
+ "loss": 0.7668,
3114
+ "step": 439
3115
+ },
3116
+ {
3117
+ "epoch": 0.781354051054384,
3118
+ "grad_norm": 0.3677636384963989,
3119
+ "learning_rate": 2.7052851258137935e-05,
3120
+ "loss": 0.7267,
3121
+ "step": 440
3122
+ },
3123
+ {
3124
+ "epoch": 0.7831298557158712,
3125
+ "grad_norm": 0.36717966198921204,
3126
+ "learning_rate": 2.6635335845879737e-05,
3127
+ "loss": 0.7577,
3128
+ "step": 441
3129
+ },
3130
+ {
3131
+ "epoch": 0.7849056603773585,
3132
+ "grad_norm": 0.38047298789024353,
3133
+ "learning_rate": 2.622057181372063e-05,
3134
+ "loss": 0.6682,
3135
+ "step": 442
3136
+ },
3137
+ {
3138
+ "epoch": 0.7866814650388457,
3139
+ "grad_norm": 0.37885257601737976,
3140
+ "learning_rate": 2.5808574716471856e-05,
3141
+ "loss": 0.7454,
3142
+ "step": 443
3143
+ },
3144
+ {
3145
+ "epoch": 0.7884572697003329,
3146
+ "grad_norm": 0.37553516030311584,
3147
+ "learning_rate": 2.5399360005176886e-05,
3148
+ "loss": 0.6721,
3149
+ "step": 444
3150
+ },
3151
+ {
3152
+ "epoch": 0.7902330743618202,
3153
+ "grad_norm": 0.42358729243278503,
3154
+ "learning_rate": 2.4992943026531935e-05,
3155
+ "loss": 0.7339,
3156
+ "step": 445
3157
+ },
3158
+ {
3159
+ "epoch": 0.7920088790233074,
3160
+ "grad_norm": 0.3923121392726898,
3161
+ "learning_rate": 2.4589339022310386e-05,
3162
+ "loss": 0.6952,
3163
+ "step": 446
3164
+ },
3165
+ {
3166
+ "epoch": 0.7937846836847947,
3167
+ "grad_norm": 0.39447784423828125,
3168
+ "learning_rate": 2.4188563128791254e-05,
3169
+ "loss": 0.6898,
3170
+ "step": 447
3171
+ },
3172
+ {
3173
+ "epoch": 0.7955604883462819,
3174
+ "grad_norm": 0.4486071467399597,
3175
+ "learning_rate": 2.379063037619146e-05,
3176
+ "loss": 0.6485,
3177
+ "step": 448
3178
+ },
3179
+ {
3180
+ "epoch": 0.7973362930077691,
3181
+ "grad_norm": 0.47466063499450684,
3182
+ "learning_rate": 2.339555568810221e-05,
3183
+ "loss": 0.6961,
3184
+ "step": 449
3185
+ },
3186
+ {
3187
+ "epoch": 0.7991120976692564,
3188
+ "grad_norm": 0.5662741661071777,
3189
+ "learning_rate": 2.300335388092929e-05,
3190
+ "loss": 0.7295,
3191
+ "step": 450
3192
+ },
3193
+ {
3194
+ "epoch": 0.8008879023307436,
3195
+ "grad_norm": 0.28682488203048706,
3196
+ "learning_rate": 2.2614039663337417e-05,
3197
+ "loss": 0.8068,
3198
+ "step": 451
3199
+ },
3200
+ {
3201
+ "epoch": 0.8026637069922309,
3202
+ "grad_norm": 0.2780812680721283,
3203
+ "learning_rate": 2.222762763569862e-05,
3204
+ "loss": 0.8236,
3205
+ "step": 452
3206
+ },
3207
+ {
3208
+ "epoch": 0.8044395116537181,
3209
+ "grad_norm": 0.29560670256614685,
3210
+ "learning_rate": 2.184413228954468e-05,
3211
+ "loss": 0.7894,
3212
+ "step": 453
3213
+ },
3214
+ {
3215
+ "epoch": 0.8062153163152054,
3216
+ "grad_norm": 0.2896682918071747,
3217
+ "learning_rate": 2.1463568007023704e-05,
3218
+ "loss": 0.7534,
3219
+ "step": 454
3220
+ },
3221
+ {
3222
+ "epoch": 0.8079911209766926,
3223
+ "grad_norm": 0.2878231108188629,
3224
+ "learning_rate": 2.1085949060360654e-05,
3225
+ "loss": 0.7245,
3226
+ "step": 455
3227
+ },
3228
+ {
3229
+ "epoch": 0.8097669256381798,
3230
+ "grad_norm": 0.3214217722415924,
3231
+ "learning_rate": 2.0711289611322204e-05,
3232
+ "loss": 0.7731,
3233
+ "step": 456
3234
+ },
3235
+ {
3236
+ "epoch": 0.8115427302996671,
3237
+ "grad_norm": 0.3201158940792084,
3238
+ "learning_rate": 2.033960371068557e-05,
3239
+ "loss": 0.7475,
3240
+ "step": 457
3241
+ },
3242
+ {
3243
+ "epoch": 0.8133185349611542,
3244
+ "grad_norm": 0.33665937185287476,
3245
+ "learning_rate": 1.9970905297711606e-05,
3246
+ "loss": 0.721,
3247
+ "step": 458
3248
+ },
3249
+ {
3250
+ "epoch": 0.8150943396226416,
3251
+ "grad_norm": 0.3305956721305847,
3252
+ "learning_rate": 1.9605208199621995e-05,
3253
+ "loss": 0.7249,
3254
+ "step": 459
3255
+ },
3256
+ {
3257
+ "epoch": 0.8168701442841287,
3258
+ "grad_norm": 0.3384665548801422,
3259
+ "learning_rate": 1.924252613108073e-05,
3260
+ "loss": 0.724,
3261
+ "step": 460
3262
+ },
3263
+ {
3264
+ "epoch": 0.8186459489456159,
3265
+ "grad_norm": 0.359944224357605,
3266
+ "learning_rate": 1.888287269367979e-05,
3267
+ "loss": 0.7516,
3268
+ "step": 461
3269
+ },
3270
+ {
3271
+ "epoch": 0.8204217536071032,
3272
+ "grad_norm": 0.3357195556163788,
3273
+ "learning_rate": 1.8526261375428955e-05,
3274
+ "loss": 0.7327,
3275
+ "step": 462
3276
+ },
3277
+ {
3278
+ "epoch": 0.8221975582685904,
3279
+ "grad_norm": 0.34272313117980957,
3280
+ "learning_rate": 1.8172705550250092e-05,
3281
+ "loss": 0.7161,
3282
+ "step": 463
3283
+ },
3284
+ {
3285
+ "epoch": 0.8239733629300777,
3286
+ "grad_norm": 0.32507383823394775,
3287
+ "learning_rate": 1.7822218477475494e-05,
3288
+ "loss": 0.6392,
3289
+ "step": 464
3290
+ },
3291
+ {
3292
+ "epoch": 0.8257491675915649,
3293
+ "grad_norm": 0.37359514832496643,
3294
+ "learning_rate": 1.7474813301350666e-05,
3295
+ "loss": 0.7298,
3296
+ "step": 465
3297
+ },
3298
+ {
3299
+ "epoch": 0.8275249722530522,
3300
+ "grad_norm": 0.3348104655742645,
3301
+ "learning_rate": 1.7130503050541368e-05,
3302
+ "loss": 0.6568,
3303
+ "step": 466
3304
+ },
3305
+ {
3306
+ "epoch": 0.8293007769145394,
3307
+ "grad_norm": 0.3460003435611725,
3308
+ "learning_rate": 1.6789300637645e-05,
3309
+ "loss": 0.6742,
3310
+ "step": 467
3311
+ },
3312
+ {
3313
+ "epoch": 0.8310765815760266,
3314
+ "grad_norm": 0.3993259370326996,
3315
+ "learning_rate": 1.6451218858706374e-05,
3316
+ "loss": 0.7365,
3317
+ "step": 468
3318
+ },
3319
+ {
3320
+ "epoch": 0.8328523862375139,
3321
+ "grad_norm": 0.3710460960865021,
3322
+ "learning_rate": 1.6116270392737754e-05,
3323
+ "loss": 0.699,
3324
+ "step": 469
3325
+ },
3326
+ {
3327
+ "epoch": 0.8346281908990011,
3328
+ "grad_norm": 0.41104644536972046,
3329
+ "learning_rate": 1.578446780124344e-05,
3330
+ "loss": 0.7185,
3331
+ "step": 470
3332
+ },
3333
+ {
3334
+ "epoch": 0.8364039955604884,
3335
+ "grad_norm": 0.39822298288345337,
3336
+ "learning_rate": 1.5455823527748626e-05,
3337
+ "loss": 0.6968,
3338
+ "step": 471
3339
+ },
3340
+ {
3341
+ "epoch": 0.8381798002219756,
3342
+ "grad_norm": 0.3909063935279846,
3343
+ "learning_rate": 1.5130349897332763e-05,
3344
+ "loss": 0.6427,
3345
+ "step": 472
3346
+ },
3347
+ {
3348
+ "epoch": 0.8399556048834628,
3349
+ "grad_norm": 0.39908355474472046,
3350
+ "learning_rate": 1.4808059116167305e-05,
3351
+ "loss": 0.6307,
3352
+ "step": 473
3353
+ },
3354
+ {
3355
+ "epoch": 0.8417314095449501,
3356
+ "grad_norm": 0.4725106954574585,
3357
+ "learning_rate": 1.4488963271057943e-05,
3358
+ "loss": 0.7274,
3359
+ "step": 474
3360
+ },
3361
+ {
3362
+ "epoch": 0.8435072142064373,
3363
+ "grad_norm": 0.58518385887146,
3364
+ "learning_rate": 1.4173074328991377e-05,
3365
+ "loss": 0.7112,
3366
+ "step": 475
3367
+ },
3368
+ {
3369
+ "epoch": 0.8452830188679246,
3370
+ "grad_norm": 0.2664174735546112,
3371
+ "learning_rate": 1.3860404136686411e-05,
3372
+ "loss": 0.8515,
3373
+ "step": 476
3374
+ },
3375
+ {
3376
+ "epoch": 0.8470588235294118,
3377
+ "grad_norm": 0.30460578203201294,
3378
+ "learning_rate": 1.355096442014977e-05,
3379
+ "loss": 0.8107,
3380
+ "step": 477
3381
+ },
3382
+ {
3383
+ "epoch": 0.848834628190899,
3384
+ "grad_norm": 0.28965044021606445,
3385
+ "learning_rate": 1.3244766784236307e-05,
3386
+ "loss": 0.7361,
3387
+ "step": 478
3388
+ },
3389
+ {
3390
+ "epoch": 0.8506104328523862,
3391
+ "grad_norm": 0.3329102396965027,
3392
+ "learning_rate": 1.294182271221377e-05,
3393
+ "loss": 0.7712,
3394
+ "step": 479
3395
+ },
3396
+ {
3397
+ "epoch": 0.8523862375138734,
3398
+ "grad_norm": 0.30333012342453003,
3399
+ "learning_rate": 1.2642143565332154e-05,
3400
+ "loss": 0.7245,
3401
+ "step": 480
3402
+ },
3403
+ {
3404
+ "epoch": 0.8541620421753607,
3405
+ "grad_norm": 0.328744500875473,
3406
+ "learning_rate": 1.2345740582397648e-05,
3407
+ "loss": 0.7557,
3408
+ "step": 481
3409
+ },
3410
+ {
3411
+ "epoch": 0.8559378468368479,
3412
+ "grad_norm": 0.33845219016075134,
3413
+ "learning_rate": 1.2052624879351104e-05,
3414
+ "loss": 0.7857,
3415
+ "step": 482
3416
+ },
3417
+ {
3418
+ "epoch": 0.8577136514983352,
3419
+ "grad_norm": 0.3305346667766571,
3420
+ "learning_rate": 1.176280744885121e-05,
3421
+ "loss": 0.7512,
3422
+ "step": 483
3423
+ },
3424
+ {
3425
+ "epoch": 0.8594894561598224,
3426
+ "grad_norm": 0.340707391500473,
3427
+ "learning_rate": 1.1476299159862203e-05,
3428
+ "loss": 0.7678,
3429
+ "step": 484
3430
+ },
3431
+ {
3432
+ "epoch": 0.8612652608213096,
3433
+ "grad_norm": 0.3646427392959595,
3434
+ "learning_rate": 1.119311075724625e-05,
3435
+ "loss": 0.7473,
3436
+ "step": 485
3437
+ },
3438
+ {
3439
+ "epoch": 0.8630410654827969,
3440
+ "grad_norm": 0.35034942626953125,
3441
+ "learning_rate": 1.09132528613605e-05,
3442
+ "loss": 0.7679,
3443
+ "step": 486
3444
+ },
3445
+ {
3446
+ "epoch": 0.8648168701442841,
3447
+ "grad_norm": 0.35471147298812866,
3448
+ "learning_rate": 1.0636735967658784e-05,
3449
+ "loss": 0.7416,
3450
+ "step": 487
3451
+ },
3452
+ {
3453
+ "epoch": 0.8665926748057714,
3454
+ "grad_norm": 0.3597537875175476,
3455
+ "learning_rate": 1.0363570446297999e-05,
3456
+ "loss": 0.7125,
3457
+ "step": 488
3458
+ },
3459
+ {
3460
+ "epoch": 0.8683684794672586,
3461
+ "grad_norm": 0.35092103481292725,
3462
+ "learning_rate": 1.0093766541749205e-05,
3463
+ "loss": 0.692,
3464
+ "step": 489
3465
+ },
3466
+ {
3467
+ "epoch": 0.8701442841287459,
3468
+ "grad_norm": 0.35275334119796753,
3469
+ "learning_rate": 9.827334372413444e-06,
3470
+ "loss": 0.6683,
3471
+ "step": 490
3472
+ },
3473
+ {
3474
+ "epoch": 0.8719200887902331,
3475
+ "grad_norm": 0.3727843463420868,
3476
+ "learning_rate": 9.564283930242257e-06,
3477
+ "loss": 0.665,
3478
+ "step": 491
3479
+ },
3480
+ {
3481
+ "epoch": 0.8736958934517203,
3482
+ "grad_norm": 0.3570787310600281,
3483
+ "learning_rate": 9.30462508036294e-06,
3484
+ "loss": 0.6736,
3485
+ "step": 492
3486
+ },
3487
+ {
3488
+ "epoch": 0.8754716981132076,
3489
+ "grad_norm": 0.39428988099098206,
3490
+ "learning_rate": 9.048367560708604e-06,
3491
+ "loss": 0.7076,
3492
+ "step": 493
3493
+ },
3494
+ {
3495
+ "epoch": 0.8772475027746948,
3496
+ "grad_norm": 0.3717636168003082,
3497
+ "learning_rate": 8.795520981652961e-06,
3498
+ "loss": 0.6807,
3499
+ "step": 494
3500
+ },
3501
+ {
3502
+ "epoch": 0.8790233074361821,
3503
+ "grad_norm": 0.4105593264102936,
3504
+ "learning_rate": 8.546094825649908e-06,
3505
+ "loss": 0.7068,
3506
+ "step": 495
3507
+ },
3508
+ {
3509
+ "epoch": 0.8807991120976693,
3510
+ "grad_norm": 0.45720747113227844,
3511
+ "learning_rate": 8.300098446877923e-06,
3512
+ "loss": 0.7189,
3513
+ "step": 496
3514
+ },
3515
+ {
3516
+ "epoch": 0.8825749167591564,
3517
+ "grad_norm": 0.44911620020866394,
3518
+ "learning_rate": 8.05754107088923e-06,
3519
+ "loss": 0.6891,
3520
+ "step": 497
3521
+ },
3522
+ {
3523
+ "epoch": 0.8843507214206437,
3524
+ "grad_norm": 0.4414433240890503,
3525
+ "learning_rate": 7.818431794263836e-06,
3526
+ "loss": 0.7167,
3527
+ "step": 498
3528
+ },
3529
+ {
3530
+ "epoch": 0.8861265260821309,
3531
+ "grad_norm": 0.49208080768585205,
3532
+ "learning_rate": 7.582779584268373e-06,
3533
+ "loss": 0.7084,
3534
+ "step": 499
3535
+ },
3536
+ {
3537
+ "epoch": 0.8879023307436182,
3538
+ "grad_norm": 0.5831857323646545,
3539
+ "learning_rate": 7.350593278519824e-06,
3540
+ "loss": 0.7877,
3541
+ "step": 500
3542
+ },
3543
+ {
3544
+ "epoch": 0.8896781354051054,
3545
+ "grad_norm": 0.25118499994277954,
3546
+ "learning_rate": 7.121881584654056e-06,
3547
+ "loss": 0.8006,
3548
+ "step": 501
3549
+ },
3550
+ {
3551
+ "epoch": 0.8914539400665926,
3552
+ "grad_norm": 0.2842087745666504,
3553
+ "learning_rate": 6.896653079999249e-06,
3554
+ "loss": 0.7796,
3555
+ "step": 502
3556
+ },
3557
+ {
3558
+ "epoch": 0.8932297447280799,
3559
+ "grad_norm": 0.2935945391654968,
3560
+ "learning_rate": 6.674916211254289e-06,
3561
+ "loss": 0.7614,
3562
+ "step": 503
3563
+ },
3564
+ {
3565
+ "epoch": 0.8950055493895671,
3566
+ "grad_norm": 0.3194078505039215,
3567
+ "learning_rate": 6.45667929417193e-06,
3568
+ "loss": 0.7537,
3569
+ "step": 504
3570
+ },
3571
+ {
3572
+ "epoch": 0.8967813540510544,
3573
+ "grad_norm": 0.32085007429122925,
3574
+ "learning_rate": 6.2419505132469305e-06,
3575
+ "loss": 0.7843,
3576
+ "step": 505
3577
+ },
3578
+ {
3579
+ "epoch": 0.8985571587125416,
3580
+ "grad_norm": 0.32116949558258057,
3581
+ "learning_rate": 6.030737921409169e-06,
3582
+ "loss": 0.736,
3583
+ "step": 506
3584
+ },
3585
+ {
3586
+ "epoch": 0.9003329633740289,
3587
+ "grad_norm": 0.32136133313179016,
3588
+ "learning_rate": 5.823049439721561e-06,
3589
+ "loss": 0.7388,
3590
+ "step": 507
3591
+ },
3592
+ {
3593
+ "epoch": 0.9021087680355161,
3594
+ "grad_norm": 0.33068105578422546,
3595
+ "learning_rate": 5.618892857083069e-06,
3596
+ "loss": 0.6994,
3597
+ "step": 508
3598
+ },
3599
+ {
3600
+ "epoch": 0.9038845726970033,
3601
+ "grad_norm": 0.3484742343425751,
3602
+ "learning_rate": 5.418275829936537e-06,
3603
+ "loss": 0.7431,
3604
+ "step": 509
3605
+ },
3606
+ {
3607
+ "epoch": 0.9056603773584906,
3608
+ "grad_norm": 0.35299912095069885,
3609
+ "learning_rate": 5.221205881981595e-06,
3610
+ "loss": 0.7568,
3611
+ "step": 510
3612
+ },
3613
+ {
3614
+ "epoch": 0.9074361820199778,
3615
+ "grad_norm": 0.34243056178092957,
3616
+ "learning_rate": 5.02769040389246e-06,
3617
+ "loss": 0.6817,
3618
+ "step": 511
3619
+ },
3620
+ {
3621
+ "epoch": 0.9092119866814651,
3622
+ "grad_norm": 0.38018926978111267,
3623
+ "learning_rate": 4.8377366530408254e-06,
3624
+ "loss": 0.759,
3625
+ "step": 512
3626
+ },
3627
+ {
3628
+ "epoch": 0.9109877913429523,
3629
+ "grad_norm": 0.36619412899017334,
3630
+ "learning_rate": 4.65135175322361e-06,
3631
+ "loss": 0.7202,
3632
+ "step": 513
3633
+ },
3634
+ {
3635
+ "epoch": 0.9127635960044395,
3636
+ "grad_norm": 0.38696765899658203,
3637
+ "learning_rate": 4.468542694395861e-06,
3638
+ "loss": 0.7202,
3639
+ "step": 514
3640
+ },
3641
+ {
3642
+ "epoch": 0.9145394006659268,
3643
+ "grad_norm": 0.39391985535621643,
3644
+ "learning_rate": 4.2893163324085885e-06,
3645
+ "loss": 0.7091,
3646
+ "step": 515
3647
+ },
3648
+ {
3649
+ "epoch": 0.916315205327414,
3650
+ "grad_norm": 0.37037429213523865,
3651
+ "learning_rate": 4.1136793887516345e-06,
3652
+ "loss": 0.6974,
3653
+ "step": 516
3654
+ },
3655
+ {
3656
+ "epoch": 0.9180910099889013,
3657
+ "grad_norm": 0.39087411761283875,
3658
+ "learning_rate": 3.941638450301644e-06,
3659
+ "loss": 0.7328,
3660
+ "step": 517
3661
+ },
3662
+ {
3663
+ "epoch": 0.9198668146503884,
3664
+ "grad_norm": 0.38174766302108765,
3665
+ "learning_rate": 3.7731999690749585e-06,
3666
+ "loss": 0.7184,
3667
+ "step": 518
3668
+ },
3669
+ {
3670
+ "epoch": 0.9216426193118757,
3671
+ "grad_norm": 0.3953598737716675,
3672
+ "learning_rate": 3.6083702619857605e-06,
3673
+ "loss": 0.7121,
3674
+ "step": 519
3675
+ },
3676
+ {
3677
+ "epoch": 0.9234184239733629,
3678
+ "grad_norm": 0.39504143595695496,
3679
+ "learning_rate": 3.447155510609057e-06,
3680
+ "loss": 0.6665,
3681
+ "step": 520
3682
+ },
3683
+ {
3684
+ "epoch": 0.9251942286348501,
3685
+ "grad_norm": 0.4065762162208557,
3686
+ "learning_rate": 3.2895617609489336e-06,
3687
+ "loss": 0.7019,
3688
+ "step": 521
3689
+ },
3690
+ {
3691
+ "epoch": 0.9269700332963374,
3692
+ "grad_norm": 0.39577242732048035,
3693
+ "learning_rate": 3.135594923211771e-06,
3694
+ "loss": 0.6444,
3695
+ "step": 522
3696
+ },
3697
+ {
3698
+ "epoch": 0.9287458379578246,
3699
+ "grad_norm": 0.4378613531589508,
3700
+ "learning_rate": 2.9852607715846193e-06,
3701
+ "loss": 0.7066,
3702
+ "step": 523
3703
+ },
3704
+ {
3705
+ "epoch": 0.9305216426193119,
3706
+ "grad_norm": 0.4423007369041443,
3707
+ "learning_rate": 2.838564944018618e-06,
3708
+ "loss": 0.6555,
3709
+ "step": 524
3710
+ },
3711
+ {
3712
+ "epoch": 0.9322974472807991,
3713
+ "grad_norm": 0.5693342089653015,
3714
+ "learning_rate": 2.6955129420176196e-06,
3715
+ "loss": 0.8152,
3716
+ "step": 525
3717
+ },
3718
+ {
3719
+ "epoch": 0.9340732519422863,
3720
+ "grad_norm": 0.2593931555747986,
3721
+ "learning_rate": 2.556110130431788e-06,
3722
+ "loss": 0.813,
3723
+ "step": 526
3724
+ },
3725
+ {
3726
+ "epoch": 0.9358490566037736,
3727
+ "grad_norm": 0.30711933970451355,
3728
+ "learning_rate": 2.420361737256438e-06,
3729
+ "loss": 0.7564,
3730
+ "step": 527
3731
+ },
3732
+ {
3733
+ "epoch": 0.9376248612652608,
3734
+ "grad_norm": 0.29575708508491516,
3735
+ "learning_rate": 2.288272853436013e-06,
3736
+ "loss": 0.7813,
3737
+ "step": 528
3738
+ },
3739
+ {
3740
+ "epoch": 0.9394006659267481,
3741
+ "grad_norm": 0.3270512521266937,
3742
+ "learning_rate": 2.1598484326730837e-06,
3743
+ "loss": 0.7658,
3744
+ "step": 529
3745
+ },
3746
+ {
3747
+ "epoch": 0.9411764705882353,
3748
+ "grad_norm": 0.3134397268295288,
3749
+ "learning_rate": 2.035093291242607e-06,
3750
+ "loss": 0.7335,
3751
+ "step": 530
3752
+ },
3753
+ {
3754
+ "epoch": 0.9429522752497226,
3755
+ "grad_norm": 0.34165453910827637,
3756
+ "learning_rate": 1.914012107811336e-06,
3757
+ "loss": 0.798,
3758
+ "step": 531
3759
+ },
3760
+ {
3761
+ "epoch": 0.9447280799112098,
3762
+ "grad_norm": 0.33172738552093506,
3763
+ "learning_rate": 1.7966094232622855e-06,
3764
+ "loss": 0.7516,
3765
+ "step": 532
3766
+ },
3767
+ {
3768
+ "epoch": 0.946503884572697,
3769
+ "grad_norm": 0.35185980796813965,
3770
+ "learning_rate": 1.6828896405244988e-06,
3771
+ "loss": 0.7745,
3772
+ "step": 533
3773
+ },
3774
+ {
3775
+ "epoch": 0.9482796892341843,
3776
+ "grad_norm": 0.3368275761604309,
3777
+ "learning_rate": 1.572857024407881e-06,
3778
+ "loss": 0.749,
3779
+ "step": 534
3780
+ },
3781
+ {
3782
+ "epoch": 0.9500554938956715,
3783
+ "grad_norm": 0.3556804656982422,
3784
+ "learning_rate": 1.466515701443294e-06,
3785
+ "loss": 0.737,
3786
+ "step": 535
3787
+ },
3788
+ {
3789
+ "epoch": 0.9518312985571588,
3790
+ "grad_norm": 0.35035377740859985,
3791
+ "learning_rate": 1.3638696597277679e-06,
3792
+ "loss": 0.716,
3793
+ "step": 536
3794
+ },
3795
+ {
3796
+ "epoch": 0.953607103218646,
3797
+ "grad_norm": 0.356507807970047,
3798
+ "learning_rate": 1.2649227487749548e-06,
3799
+ "loss": 0.7292,
3800
+ "step": 537
3801
+ },
3802
+ {
3803
+ "epoch": 0.9553829078801331,
3804
+ "grad_norm": 0.3645875155925751,
3805
+ "learning_rate": 1.1696786793707781e-06,
3806
+ "loss": 0.7325,
3807
+ "step": 538
3808
+ },
3809
+ {
3810
+ "epoch": 0.9571587125416204,
3811
+ "grad_norm": 0.35591599345207214,
3812
+ "learning_rate": 1.0781410234342094e-06,
3813
+ "loss": 0.7203,
3814
+ "step": 539
3815
+ },
3816
+ {
3817
+ "epoch": 0.9589345172031076,
3818
+ "grad_norm": 0.35885104537010193,
3819
+ "learning_rate": 9.90313213883376e-07,
3820
+ "loss": 0.665,
3821
+ "step": 540
3822
+ },
3823
+ {
3824
+ "epoch": 0.9607103218645949,
3825
+ "grad_norm": 0.38247016072273254,
3826
+ "learning_rate": 9.061985445067756e-07,
3827
+ "loss": 0.6885,
3828
+ "step": 541
3829
+ },
3830
+ {
3831
+ "epoch": 0.9624861265260821,
3832
+ "grad_norm": 0.38850679993629456,
3833
+ "learning_rate": 8.258001698397744e-07,
3834
+ "loss": 0.707,
3835
+ "step": 542
3836
+ },
3837
+ {
3838
+ "epoch": 0.9642619311875694,
3839
+ "grad_norm": 0.3912898004055023,
3840
+ "learning_rate": 7.491211050462798e-07,
3841
+ "loss": 0.6818,
3842
+ "step": 543
3843
+ },
3844
+ {
3845
+ "epoch": 0.9660377358490566,
3846
+ "grad_norm": 0.3983571529388428,
3847
+ "learning_rate": 6.761642258056978e-07,
3848
+ "loss": 0.6786,
3849
+ "step": 544
3850
+ },
3851
+ {
3852
+ "epoch": 0.9678135405105438,
3853
+ "grad_norm": 0.4406982660293579,
3854
+ "learning_rate": 6.069322682050516e-07,
3855
+ "loss": 0.6564,
3856
+ "step": 545
3857
+ },
3858
+ {
3859
+ "epoch": 0.9695893451720311,
3860
+ "grad_norm": 0.38563239574432373,
3861
+ "learning_rate": 5.414278286363761e-07,
3862
+ "loss": 0.5921,
3863
+ "step": 546
3864
+ },
3865
+ {
3866
+ "epoch": 0.9713651498335183,
3867
+ "grad_norm": 0.4474928081035614,
3868
+ "learning_rate": 4.796533636993727e-07,
3869
+ "loss": 0.681,
3870
+ "step": 547
3871
+ },
3872
+ {
3873
+ "epoch": 0.9731409544950056,
3874
+ "grad_norm": 0.43392056226730347,
3875
+ "learning_rate": 4.216111901092501e-07,
3876
+ "loss": 0.6673,
3877
+ "step": 548
3878
+ },
3879
+ {
3880
+ "epoch": 0.9749167591564928,
3881
+ "grad_norm": 0.5074283480644226,
3882
+ "learning_rate": 3.6730348460985996e-07,
3883
+ "loss": 0.7363,
3884
+ "step": 549
3885
+ },
3886
+ {
3887
+ "epoch": 0.97669256381798,
3888
+ "grad_norm": 0.5351345539093018,
3889
+ "learning_rate": 3.1673228389204055e-07,
3890
+ "loss": 0.6898,
3891
+ "step": 550
3892
+ },
3893
+ {
3894
+ "epoch": 0.9784683684794673,
3895
+ "grad_norm": 0.2640553414821625,
3896
+ "learning_rate": 2.6989948451726643e-07,
3897
+ "loss": 0.7556,
3898
+ "step": 551
3899
+ },
3900
+ {
3901
+ "epoch": 0.9802441731409545,
3902
+ "grad_norm": 0.28559839725494385,
3903
+ "learning_rate": 2.2680684284650533e-07,
3904
+ "loss": 0.7636,
3905
+ "step": 552
3906
+ },
3907
+ {
3908
+ "epoch": 0.9820199778024418,
3909
+ "grad_norm": 0.3131345510482788,
3910
+ "learning_rate": 1.8745597497433765e-07,
3911
+ "loss": 0.7366,
3912
+ "step": 553
3913
+ },
3914
+ {
3915
+ "epoch": 0.983795782463929,
3916
+ "grad_norm": 0.3156748116016388,
3917
+ "learning_rate": 1.518483566683826e-07,
3918
+ "loss": 0.7377,
3919
+ "step": 554
3920
+ },
3921
+ {
3922
+ "epoch": 0.9855715871254163,
3923
+ "grad_norm": 0.34887486696243286,
3924
+ "learning_rate": 1.199853233138981e-07,
3925
+ "loss": 0.7588,
3926
+ "step": 555
3927
+ },
3928
+ {
3929
+ "epoch": 0.9873473917869034,
3930
+ "grad_norm": 0.3328061103820801,
3931
+ "learning_rate": 9.186806986376529e-08,
3932
+ "loss": 0.744,
3933
+ "step": 556
3934
+ },
3935
+ {
3936
+ "epoch": 0.9891231964483906,
3937
+ "grad_norm": 0.36429402232170105,
3938
+ "learning_rate": 6.749765079363534e-08,
3939
+ "loss": 0.7261,
3940
+ "step": 557
3941
+ },
3942
+ {
3943
+ "epoch": 0.9908990011098779,
3944
+ "grad_norm": 0.36065101623535156,
3945
+ "learning_rate": 4.687498006236135e-08,
3946
+ "loss": 0.6815,
3947
+ "step": 558
3948
+ },
3949
+ {
3950
+ "epoch": 0.9926748057713651,
3951
+ "grad_norm": 0.3841758966445923,
3952
+ "learning_rate": 3.000083107780327e-08,
3953
+ "loss": 0.7204,
3954
+ "step": 559
3955
+ },
3956
+ {
3957
+ "epoch": 0.9944506104328524,
3958
+ "grad_norm": 0.43167850375175476,
3959
+ "learning_rate": 1.687583666772907e-08,
3960
+ "loss": 0.725,
3961
+ "step": 560
3962
+ },
3963
+ {
3964
+ "epoch": 0.9962264150943396,
3965
+ "grad_norm": 0.4131225347518921,
3966
+ "learning_rate": 7.500489056133652e-09,
3967
+ "loss": 0.6763,
3968
+ "step": 561
3969
+ },
3970
+ {
3971
+ "epoch": 0.9980022197558268,
3972
+ "grad_norm": 0.4707167446613312,
3973
+ "learning_rate": 1.8751398447758306e-09,
3974
+ "loss": 0.6854,
3975
+ "step": 562
3976
+ },
3977
+ {
3978
+ "epoch": 0.9997780244173141,
3979
+ "grad_norm": 0.4748234152793884,
3980
+ "learning_rate": 0.0,
3981
+ "loss": 0.6381,
3982
+ "step": 563
3983
  }
  ],
  "logging_steps": 1,
@@ -3014,12 +3994,12 @@
  "should_evaluate": false,
  "should_log": false,
  "should_save": true,
- "should_training_stop": false
+ "should_training_stop": true
  },
  "attributes": {}
  }
  },
- "total_flos": 5.571234948741857e+17,
+ "total_flos": 7.415142496788808e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null