File size: 52,555 Bytes
67d0111
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "episode": 17920,
  "epoch": 0.24579258507413554,
  "eval_steps": 200.0,
  "global_step": 350,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "episode": 256,
      "epoch": 0.003511322643916222,
      "eps": 6,
      "loss/policy_avg": -0.07090990990400314,
      "loss/value_avg": 0.0,
      "lr": 3e-06,
      "objective/entropy": 49.42120361328125,
      "objective/kl": 0.006465356796979904,
      "objective/non_score_reward": -0.000646535714622587,
      "objective/rlhf_reward": -1.1137903928756714,
      "objective/scores": -1.109375,
      "policy/approxkl_avg": 27.096786499023438,
      "policy/clipfrac_avg": 0.732421875,
      "policy/entropy_avg": 0.92181396484375,
      "step": 5,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 12,
      "val/ratio": 1.0399832725524902,
      "val/ratio_var": 0.010045886039733887
    },
    {
      "episode": 512,
      "epoch": 0.007022645287832444,
      "eps": 6,
      "loss/policy_avg": -0.06497187167406082,
      "loss/value_avg": 0.0,
      "lr": 2.9923273657289e-06,
      "objective/entropy": 48.286014556884766,
      "objective/kl": 0.8119473457336426,
      "objective/non_score_reward": -0.08119472861289978,
      "objective/rlhf_reward": -1.266162633895874,
      "objective/scores": -1.1875,
      "policy/approxkl_avg": 18.666072845458984,
      "policy/clipfrac_avg": 0.7314453125,
      "policy/entropy_avg": 0.912261962890625,
      "step": 10,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 16,
      "val/ratio": 1.020957112312317,
      "val/ratio_var": 0.00411860179156065
    },
    {
      "episode": 768,
      "epoch": 0.010533967931748666,
      "eps": 6,
      "loss/policy_avg": -0.0872286781668663,
      "loss/value_avg": 0.0,
      "lr": 2.9846547314578008e-06,
      "objective/entropy": 49.34376525878906,
      "objective/kl": 1.9591996669769287,
      "objective/non_score_reward": -0.1959199756383896,
      "objective/rlhf_reward": -1.2858657836914062,
      "objective/scores": -1.09375,
      "policy/approxkl_avg": 20.772502899169922,
      "policy/clipfrac_avg": 0.73828125,
      "policy/entropy_avg": 0.927978515625,
      "step": 15,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 12,
      "val/ratio": 1.0191609859466553,
      "val/ratio_var": 0.00307083735242486
    },
    {
      "episode": 1024,
      "epoch": 0.014045290575664887,
      "eps": 6,
      "loss/policy_avg": -0.07566041499376297,
      "loss/value_avg": 0.0,
      "lr": 2.9769820971867007e-06,
      "objective/entropy": 53.13662338256836,
      "objective/kl": 2.4811532497406006,
      "objective/non_score_reward": -0.24811533093452454,
      "objective/rlhf_reward": -1.2548893690109253,
      "objective/scores": -1.0078125,
      "policy/approxkl_avg": 20.665164947509766,
      "policy/clipfrac_avg": 0.7314453125,
      "policy/entropy_avg": 0.989776611328125,
      "step": 20,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 11,
      "val/ratio": 1.011010766029358,
      "val/ratio_var": 0.004201602190732956
    },
    {
      "episode": 1280,
      "epoch": 0.01755661321958111,
      "eps": 6,
      "loss/policy_avg": -0.08593496680259705,
      "loss/value_avg": 0.0,
      "lr": 2.9693094629156014e-06,
      "objective/entropy": 53.72633743286133,
      "objective/kl": 3.3111624717712402,
      "objective/non_score_reward": -0.3311161994934082,
      "objective/rlhf_reward": -1.339456558227539,
      "objective/scores": -1.0078125,
      "policy/approxkl_avg": 25.559288024902344,
      "policy/clipfrac_avg": 0.7353515625,
      "policy/entropy_avg": 0.997894287109375,
      "step": 25,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 13,
      "val/ratio": 1.0134021043777466,
      "val/ratio_var": 0.0019979747012257576
    },
    {
      "episode": 1536,
      "epoch": 0.021067935863497332,
      "eps": 6,
      "loss/policy_avg": -0.09734417498111725,
      "loss/value_avg": 0.0,
      "lr": 2.9616368286445014e-06,
      "objective/entropy": 51.259735107421875,
      "objective/kl": 5.089182376861572,
      "objective/non_score_reward": -0.5089181661605835,
      "objective/rlhf_reward": -1.2202520370483398,
      "objective/scores": -0.7109375,
      "policy/approxkl_avg": 29.841636657714844,
      "policy/clipfrac_avg": 0.736328125,
      "policy/entropy_avg": 0.960479736328125,
      "step": 30,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 26,
      "val/ratio": 1.0178756713867188,
      "val/ratio_var": 0.009866585955023766
    },
    {
      "episode": 1792,
      "epoch": 0.024579258507413555,
      "eps": 6,
      "loss/policy_avg": -0.06831618398427963,
      "loss/value_avg": 0.0,
      "lr": 2.9539641943734013e-06,
      "objective/entropy": 40.643272399902344,
      "objective/kl": 6.974010944366455,
      "objective/non_score_reward": -0.6974011063575745,
      "objective/rlhf_reward": -1.2684605121612549,
      "objective/scores": -0.5703125,
      "policy/approxkl_avg": 35.33942413330078,
      "policy/clipfrac_avg": 0.6982421875,
      "policy/entropy_avg": 0.7505035400390625,
      "step": 35,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 16,
      "val/ratio": 1.00449800491333,
      "val/ratio_var": 0.0022142010275274515
    },
    {
      "episode": 2048,
      "epoch": 0.028090581151329775,
      "eps": 6,
      "loss/policy_avg": -0.04068079590797424,
      "loss/value_avg": 0.0,
      "lr": 2.946291560102302e-06,
      "objective/entropy": 23.142562866210938,
      "objective/kl": 8.180486679077148,
      "objective/non_score_reward": -0.8180487155914307,
      "objective/rlhf_reward": -1.0729957818984985,
      "objective/scores": -0.255859375,
      "policy/approxkl_avg": 23.68307876586914,
      "policy/clipfrac_avg": 0.5859375,
      "policy/entropy_avg": 0.4361400604248047,
      "step": 40,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 8,
      "val/ratio": 1.0077030658721924,
      "val/ratio_var": 0.0024766812566667795
    },
    {
      "episode": 2304,
      "epoch": 0.031601903795246,
      "eps": 6,
      "loss/policy_avg": -0.07307010889053345,
      "loss/value_avg": 0.0,
      "lr": 2.938618925831202e-06,
      "objective/entropy": 19.376842498779297,
      "objective/kl": 8.770210266113281,
      "objective/non_score_reward": -0.8770210146903992,
      "objective/rlhf_reward": -1.0002652406692505,
      "objective/scores": -0.12353515625,
      "policy/approxkl_avg": 31.00873565673828,
      "policy/clipfrac_avg": 0.5302734375,
      "policy/entropy_avg": 0.33237457275390625,
      "step": 45,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 20,
      "val/ratio": 0.996111273765564,
      "val/ratio_var": 0.001100091845728457
    },
    {
      "episode": 2560,
      "epoch": 0.03511322643916222,
      "eps": 6,
      "loss/policy_avg": -0.04584116116166115,
      "loss/value_avg": 0.0,
      "lr": 2.9309462915601027e-06,
      "objective/entropy": 11.984097480773926,
      "objective/kl": 8.4966402053833,
      "objective/non_score_reward": -0.849664032459259,
      "objective/rlhf_reward": -0.8017911911010742,
      "objective/scores": 0.0478515625,
      "policy/approxkl_avg": 22.561037063598633,
      "policy/clipfrac_avg": 0.451171875,
      "policy/entropy_avg": 0.19393539428710938,
      "step": 50,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 20,
      "val/ratio": 0.9952375888824463,
      "val/ratio_var": 0.000761833623982966
    },
    {
      "episode": 2816,
      "epoch": 0.03862454908307844,
      "eps": 5,
      "loss/policy_avg": -0.029720915481448174,
      "loss/value_avg": 0.0,
      "lr": 2.9232736572890026e-06,
      "objective/entropy": 4.9489898681640625,
      "objective/kl": 8.733837127685547,
      "objective/non_score_reward": -0.8733837604522705,
      "objective/rlhf_reward": -0.7492713928222656,
      "objective/scores": 0.1240234375,
      "policy/approxkl_avg": 16.253189086914062,
      "policy/clipfrac_avg": 0.341796875,
      "policy/entropy_avg": 0.07728099822998047,
      "step": 55,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 18,
      "val/ratio": 0.9972053170204163,
      "val/ratio_var": 0.00032430028659291565
    },
    {
      "episode": 3072,
      "epoch": 0.042135871726994664,
      "eps": 5,
      "loss/policy_avg": -0.01298562902957201,
      "loss/value_avg": 0.0,
      "lr": 2.9156010230179026e-06,
      "objective/entropy": 1.3101667165756226,
      "objective/kl": 8.699792861938477,
      "objective/non_score_reward": -0.8699792623519897,
      "objective/rlhf_reward": -0.5752952098846436,
      "objective/scores": 0.294921875,
      "policy/approxkl_avg": 2.27925968170166,
      "policy/clipfrac_avg": 0.236328125,
      "policy/entropy_avg": 0.02513742446899414,
      "step": 60,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 20,
      "val/ratio": 1.0017118453979492,
      "val/ratio_var": 0.00016639505338389426
    },
    {
      "episode": 3328,
      "epoch": 0.04564719437091089,
      "eps": 5,
      "loss/policy_avg": -0.02618303708732128,
      "loss/value_avg": 0.0,
      "lr": 2.9079283887468033e-06,
      "objective/entropy": 2.3685269355773926,
      "objective/kl": 9.208517074584961,
      "objective/non_score_reward": -0.9208516478538513,
      "objective/rlhf_reward": -0.5182289481163025,
      "objective/scores": 0.40234375,
      "policy/approxkl_avg": 2.6189699172973633,
      "policy/clipfrac_avg": 0.310546875,
      "policy/entropy_avg": 0.04020071029663086,
      "step": 65,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 20,
      "val/ratio": 1.003983497619629,
      "val/ratio_var": 0.0009448421187698841
    },
    {
      "episode": 3584,
      "epoch": 0.04915851701482711,
      "eps": 5,
      "loss/policy_avg": -0.02327096462249756,
      "loss/value_avg": 0.0,
      "lr": 2.9002557544757032e-06,
      "objective/entropy": 2.0416018962860107,
      "objective/kl": 9.701976776123047,
      "objective/non_score_reward": -0.9701976776123047,
      "objective/rlhf_reward": -0.49486449360847473,
      "objective/scores": 0.474609375,
      "policy/approxkl_avg": 1.271956443786621,
      "policy/clipfrac_avg": 0.2734375,
      "policy/entropy_avg": 0.041253089904785156,
      "step": 70,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 16,
      "val/ratio": 1.0039558410644531,
      "val/ratio_var": 0.00041477559716440737
    },
    {
      "episode": 3840,
      "epoch": 0.052669839658743334,
      "eps": 5,
      "loss/policy_avg": -0.033096276223659515,
      "loss/value_avg": 0.0,
      "lr": 2.892583120204604e-06,
      "objective/entropy": 2.7795495986938477,
      "objective/kl": 10.028523445129395,
      "objective/non_score_reward": -1.0028523206710815,
      "objective/rlhf_reward": -0.46555712819099426,
      "objective/scores": 0.5390625,
      "policy/approxkl_avg": 3.055203676223755,
      "policy/clipfrac_avg": 0.3427734375,
      "policy/entropy_avg": 0.053270816802978516,
      "step": 75,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 23,
      "val/ratio": 1.0012407302856445,
      "val/ratio_var": 0.00011274257121840492
    },
    {
      "episode": 4096,
      "epoch": 0.05618116230265955,
      "eps": 5,
      "loss/policy_avg": -0.01961323618888855,
      "loss/value_avg": 0.0,
      "lr": 2.884910485933504e-06,
      "objective/entropy": 2.5525641441345215,
      "objective/kl": 10.111019134521484,
      "objective/non_score_reward": -1.0111019611358643,
      "objective/rlhf_reward": -0.510233461856842,
      "objective/scores": 0.5,
      "policy/approxkl_avg": 1.331697940826416,
      "policy/clipfrac_avg": 0.2861328125,
      "policy/entropy_avg": 0.048857688903808594,
      "step": 80,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 25,
      "val/ratio": 1.011049509048462,
      "val/ratio_var": 0.004252108279615641
    },
    {
      "episode": 4352,
      "epoch": 0.05969248494657577,
      "eps": 5,
      "loss/policy_avg": -0.009127877652645111,
      "loss/value_avg": 0.0,
      "lr": 2.877237851662404e-06,
      "objective/entropy": 3.016789674758911,
      "objective/kl": 11.257818222045898,
      "objective/non_score_reward": -1.125781774520874,
      "objective/rlhf_reward": -0.4276960492134094,
      "objective/scores": 0.69921875,
      "policy/approxkl_avg": 1.4772686958312988,
      "policy/clipfrac_avg": 0.35546875,
      "policy/entropy_avg": 0.053719520568847656,
      "step": 85,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.0042904615402222,
      "val/ratio_var": 0.0008556774700991809
    },
    {
      "episode": 4608,
      "epoch": 0.063203807590492,
      "eps": 5,
      "loss/policy_avg": -0.025049656629562378,
      "loss/value_avg": 0.0,
      "lr": 2.8695652173913046e-06,
      "objective/entropy": 2.5907459259033203,
      "objective/kl": 10.457273483276367,
      "objective/non_score_reward": -1.0457274913787842,
      "objective/rlhf_reward": -0.3816419839859009,
      "objective/scores": 0.6640625,
      "policy/approxkl_avg": 2.3460922241210938,
      "policy/clipfrac_avg": 0.322265625,
      "policy/entropy_avg": 0.04626178741455078,
      "step": 90,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 11,
      "val/ratio": 1.0003862380981445,
      "val/ratio_var": 7.93520302977413e-05
    },
    {
      "episode": 4864,
      "epoch": 0.06671513023440821,
      "eps": 5,
      "loss/policy_avg": -0.01828361675143242,
      "loss/value_avg": 0.0,
      "lr": 2.8618925831202045e-06,
      "objective/entropy": 2.397810220718384,
      "objective/kl": 10.732559204101562,
      "objective/non_score_reward": -1.073256015777588,
      "objective/rlhf_reward": -0.35966813564300537,
      "objective/scores": 0.71484375,
      "policy/approxkl_avg": 1.1093428134918213,
      "policy/clipfrac_avg": 0.32421875,
      "policy/entropy_avg": 0.041881561279296875,
      "step": 95,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 15,
      "val/ratio": 1.0054664611816406,
      "val/ratio_var": 0.0017973663052543998
    },
    {
      "episode": 5120,
      "epoch": 0.07022645287832444,
      "eps": 5,
      "loss/policy_avg": -0.04088423401117325,
      "loss/value_avg": 0.0,
      "lr": 2.8542199488491053e-06,
      "objective/entropy": 2.343449592590332,
      "objective/kl": 11.780994415283203,
      "objective/non_score_reward": -1.1780993938446045,
      "objective/rlhf_reward": -0.4628324806690216,
      "objective/scores": 0.71484375,
      "policy/approxkl_avg": 0.894420325756073,
      "policy/clipfrac_avg": 0.46875,
      "policy/entropy_avg": 0.04486083984375,
      "step": 100,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 11,
      "val/ratio": 1.0009559392929077,
      "val/ratio_var": 4.804596756002866e-05
    },
    {
      "episode": 5376,
      "epoch": 0.07373777552224066,
      "eps": 5,
      "loss/policy_avg": -0.020697183907032013,
      "loss/value_avg": 0.0,
      "lr": 2.846547314578005e-06,
      "objective/entropy": 1.9023351669311523,
      "objective/kl": 10.29288101196289,
      "objective/non_score_reward": -1.0292882919311523,
      "objective/rlhf_reward": -0.29047834873199463,
      "objective/scores": 0.73828125,
      "policy/approxkl_avg": 0.9143690466880798,
      "policy/clipfrac_avg": 0.373046875,
      "policy/entropy_avg": 0.028568267822265625,
      "step": 105,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 10,
      "val/ratio": 1.000715732574463,
      "val/ratio_var": 4.201457340968773e-05
    },
    {
      "episode": 5632,
      "epoch": 0.07724909816615688,
      "eps": 5,
      "loss/policy_avg": -0.012633640319108963,
      "loss/value_avg": 0.0,
      "lr": 2.8388746803069055e-06,
      "objective/entropy": 1.3839142322540283,
      "objective/kl": 10.57151985168457,
      "objective/non_score_reward": -1.0571520328521729,
      "objective/rlhf_reward": -0.2935946583747864,
      "objective/scores": 0.765625,
      "policy/approxkl_avg": 0.6525547504425049,
      "policy/clipfrac_avg": 0.2646484375,
      "policy/entropy_avg": 0.0345916748046875,
      "step": 110,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 10,
      "val/ratio": 0.9999199509620667,
      "val/ratio_var": 2.6978697860613465e-05
    },
    {
      "episode": 5888,
      "epoch": 0.0807604208100731,
      "eps": 5,
      "loss/policy_avg": -0.026668714359402657,
      "loss/value_avg": 0.0,
      "lr": 2.831202046035806e-06,
      "objective/entropy": 2.17741322517395,
      "objective/kl": 11.39688491821289,
      "objective/non_score_reward": -1.139688491821289,
      "objective/rlhf_reward": -0.3027456998825073,
      "objective/scores": 0.8359375,
      "policy/approxkl_avg": 8.829752922058105,
      "policy/clipfrac_avg": 0.35546875,
      "policy/entropy_avg": 0.034277915954589844,
      "step": 115,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 8,
      "val/ratio": 1.0012441873550415,
      "val/ratio_var": 9.009366476675496e-05
    },
    {
      "episode": 6144,
      "epoch": 0.08427174345398933,
      "eps": 5,
      "loss/policy_avg": -0.011602860875427723,
      "loss/value_avg": 0.0,
      "lr": 2.823529411764706e-06,
      "objective/entropy": 1.418602466583252,
      "objective/kl": 10.246469497680664,
      "objective/non_score_reward": -1.0246469974517822,
      "objective/rlhf_reward": -0.22599510848522186,
      "objective/scores": 0.796875,
      "policy/approxkl_avg": 0.31790149211883545,
      "policy/clipfrac_avg": 0.2314453125,
      "policy/entropy_avg": 0.028847694396972656,
      "step": 120,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 9,
      "val/ratio": 1.0009679794311523,
      "val/ratio_var": 3.900106457876973e-05
    },
    {
      "episode": 6400,
      "epoch": 0.08778306609790555,
      "eps": 5,
      "loss/policy_avg": -0.0157505851238966,
      "loss/value_avg": 0.0,
      "lr": 2.8158567774936066e-06,
      "objective/entropy": 1.936393141746521,
      "objective/kl": 10.550077438354492,
      "objective/non_score_reward": -1.0550076961517334,
      "objective/rlhf_reward": -0.252943217754364,
      "objective/scores": 0.80078125,
      "policy/approxkl_avg": 6.545133113861084,
      "policy/clipfrac_avg": 0.341796875,
      "policy/entropy_avg": 0.039971351623535156,
      "step": 125,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 12,
      "val/ratio": 1.0001187324523926,
      "val/ratio_var": 0.00011527155584190041
    },
    {
      "episode": 6656,
      "epoch": 0.09129438874182177,
      "eps": 5,
      "loss/policy_avg": -0.00908716581761837,
      "loss/value_avg": 0.0,
      "lr": 2.8081841432225065e-06,
      "objective/entropy": 1.9167767763137817,
      "objective/kl": 10.831771850585938,
      "objective/non_score_reward": -1.0831772089004517,
      "objective/rlhf_reward": -0.24270595610141754,
      "objective/scores": 0.83984375,
      "policy/approxkl_avg": 13.507976531982422,
      "policy/clipfrac_avg": 0.25,
      "policy/entropy_avg": 0.034499168395996094,
      "step": 130,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 1.0004911422729492,
      "val/ratio_var": 0.00018595268193166703
    },
    {
      "episode": 6912,
      "epoch": 0.094805711385738,
      "eps": 5,
      "loss/policy_avg": -0.017197387292981148,
      "loss/value_avg": 0.0,
      "lr": 2.800511508951407e-06,
      "objective/entropy": 1.7237651348114014,
      "objective/kl": 11.095592498779297,
      "objective/non_score_reward": -1.1095592975616455,
      "objective/rlhf_reward": -0.21057555079460144,
      "objective/scores": 0.8984375,
      "policy/approxkl_avg": 2.7560040950775146,
      "policy/clipfrac_avg": 0.2841796875,
      "policy/entropy_avg": 0.032952308654785156,
      "step": 135,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 0.9994020462036133,
      "val/ratio_var": 3.074964843108319e-05
    },
    {
      "episode": 7168,
      "epoch": 0.09831703402965422,
      "eps": 5,
      "loss/policy_avg": -0.012010859325528145,
      "loss/value_avg": 0.0,
      "lr": 2.792838874680307e-06,
      "objective/entropy": 1.5862581729888916,
      "objective/kl": 10.674396514892578,
      "objective/non_score_reward": -1.0674396753311157,
      "objective/rlhf_reward": -0.14433012902736664,
      "objective/scores": 0.921875,
      "policy/approxkl_avg": 1.1186727285385132,
      "policy/clipfrac_avg": 0.2783203125,
      "policy/entropy_avg": 0.0295562744140625,
      "step": 140,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 13,
      "val/ratio": 1.0007727146148682,
      "val/ratio_var": 4.557183274300769e-05
    },
    {
      "episode": 7424,
      "epoch": 0.10182835667357044,
      "eps": 5,
      "loss/policy_avg": -0.013728385791182518,
      "loss/value_avg": 0.0,
      "lr": 2.785166240409207e-06,
      "objective/entropy": 1.5388869047164917,
      "objective/kl": 10.359582901000977,
      "objective/non_score_reward": -1.035958170890808,
      "objective/rlhf_reward": -0.14511710405349731,
      "objective/scores": 0.890625,
      "policy/approxkl_avg": 0.5204602479934692,
      "policy/clipfrac_avg": 0.283203125,
      "policy/entropy_avg": 0.028924942016601562,
      "step": 145,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 14,
      "val/ratio": 1.056097149848938,
      "val/ratio_var": 0.13372056186199188
    },
    {
      "episode": 7680,
      "epoch": 0.10533967931748667,
      "eps": 5,
      "loss/policy_avg": -0.014945434406399727,
      "loss/value_avg": 0.0,
      "lr": 2.7774936061381074e-06,
      "objective/entropy": 2.0769755840301514,
      "objective/kl": 11.147063255310059,
      "objective/non_score_reward": -1.11470627784729,
      "objective/rlhf_reward": -0.08940108120441437,
      "objective/scores": 1.0234375,
      "policy/approxkl_avg": 0.5961493253707886,
      "policy/clipfrac_avg": 0.3681640625,
      "policy/entropy_avg": 0.037804603576660156,
      "step": 150,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 13,
      "val/ratio": 1.0033739805221558,
      "val/ratio_var": 0.00030022990540601313
    },
    {
      "episode": 7936,
      "epoch": 0.10885100196140288,
      "eps": 5,
      "loss/policy_avg": -0.02276831492781639,
      "loss/value_avg": 0.0,
      "lr": 2.7698209718670078e-06,
      "objective/entropy": 2.1412830352783203,
      "objective/kl": 11.697949409484863,
      "objective/non_score_reward": -1.169795036315918,
      "objective/rlhf_reward": -0.13582009077072144,
      "objective/scores": 1.03125,
      "policy/approxkl_avg": 0.7155288457870483,
      "policy/clipfrac_avg": 0.3193359375,
      "policy/entropy_avg": 0.037835121154785156,
      "step": 155,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 13,
      "val/ratio": 1.0014090538024902,
      "val/ratio_var": 5.2470270020421594e-05
    },
    {
      "episode": 8192,
      "epoch": 0.1123623246053191,
      "eps": 5,
      "loss/policy_avg": -0.013076605275273323,
      "loss/value_avg": 0.0,
      "lr": 2.762148337595908e-06,
      "objective/entropy": 1.634714126586914,
      "objective/kl": 11.629154205322266,
      "objective/non_score_reward": -1.1629154682159424,
      "objective/rlhf_reward": -0.28488799929618835,
      "objective/scores": 0.87890625,
      "policy/approxkl_avg": 0.4181188941001892,
      "policy/clipfrac_avg": 0.3037109375,
      "policy/entropy_avg": 0.029273509979248047,
      "step": 160,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 15,
      "val/ratio": 1.0008339881896973,
      "val/ratio_var": 1.4662801731901709e-05
    },
    {
      "episode": 8448,
      "epoch": 0.11587364724923532,
      "eps": 5,
      "loss/policy_avg": -0.01651182770729065,
      "loss/value_avg": 0.0,
      "lr": 2.7544757033248085e-06,
      "objective/entropy": 1.9540742635726929,
      "objective/kl": 11.4830322265625,
      "objective/non_score_reward": -1.1483032703399658,
      "objective/rlhf_reward": -0.05983233451843262,
      "objective/scores": 1.0859375,
      "policy/approxkl_avg": 18.791297912597656,
      "policy/clipfrac_avg": 0.2880859375,
      "policy/entropy_avg": 0.03601264953613281,
      "step": 165,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 16,
      "val/ratio": 1.0220942497253418,
      "val/ratio_var": 0.02208283357322216
    },
    {
      "episode": 8704,
      "epoch": 0.11938496989315155,
      "eps": 5,
      "loss/policy_avg": -0.013821810483932495,
      "loss/value_avg": 0.0,
      "lr": 2.7468030690537084e-06,
      "objective/entropy": 1.6243339776992798,
      "objective/kl": 11.435280799865723,
      "objective/non_score_reward": -1.1435281038284302,
      "objective/rlhf_reward": -0.12443088740110397,
      "objective/scores": 1.015625,
      "policy/approxkl_avg": 0.29013216495513916,
      "policy/clipfrac_avg": 0.28125,
      "policy/entropy_avg": 0.03498649597167969,
      "step": 170,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 15,
      "val/ratio": 1.0027971267700195,
      "val/ratio_var": 0.0002298366161994636
    },
    {
      "episode": 8960,
      "epoch": 0.12289629253706777,
      "eps": 5,
      "loss/policy_avg": -0.011003649793565273,
      "loss/value_avg": 0.0,
      "lr": 2.7391304347826087e-06,
      "objective/entropy": 2.000375986099243,
      "objective/kl": 11.78514575958252,
      "objective/non_score_reward": -1.1785145998001099,
      "objective/rlhf_reward": -0.2609584331512451,
      "objective/scores": 0.91796875,
      "policy/approxkl_avg": 0.8603074550628662,
      "policy/clipfrac_avg": 0.2998046875,
      "policy/entropy_avg": 0.034775733947753906,
      "step": 175,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 19,
      "val/ratio": 1.0012288093566895,
      "val/ratio_var": 3.532394111971371e-05
    },
    {
      "episode": 9216,
      "epoch": 0.126407615180984,
      "eps": 5,
      "loss/policy_avg": -0.010885423980653286,
      "loss/value_avg": 0.0,
      "lr": 2.731457800511509e-06,
      "objective/entropy": 1.5240473747253418,
      "objective/kl": 12.420597076416016,
      "objective/non_score_reward": -1.2420598268508911,
      "objective/rlhf_reward": -0.16641265153884888,
      "objective/scores": 1.078125,
      "policy/approxkl_avg": 0.46217110753059387,
      "policy/clipfrac_avg": 0.2783203125,
      "policy/entropy_avg": 0.029424667358398438,
      "step": 180,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 20,
      "val/ratio": 1.0007582902908325,
      "val/ratio_var": 2.4759892767178826e-05
    },
    {
      "episode": 9472,
      "epoch": 0.12991893782490022,
      "eps": 5,
      "loss/policy_avg": -0.01097183395177126,
      "loss/value_avg": 0.0,
      "lr": 2.7237851662404094e-06,
      "objective/entropy": 1.6292238235473633,
      "objective/kl": 12.73173713684082,
      "objective/non_score_reward": -1.2731736898422241,
      "objective/rlhf_reward": -0.10916168242692947,
      "objective/scores": 1.1640625,
      "policy/approxkl_avg": 0.5525862574577332,
      "policy/clipfrac_avg": 0.310546875,
      "policy/entropy_avg": 0.031815528869628906,
      "step": 185,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 22,
      "val/ratio": 1.0027148723602295,
      "val/ratio_var": 0.00016600274830125272
    },
    {
      "episode": 9728,
      "epoch": 0.13343026046881643,
      "eps": 5,
      "loss/policy_avg": -0.010572239756584167,
      "loss/value_avg": 0.0,
      "lr": 2.7161125319693097e-06,
      "objective/entropy": 2.028618335723877,
      "objective/kl": 12.439943313598633,
      "objective/non_score_reward": -1.2439942359924316,
      "objective/rlhf_reward": -0.06748821586370468,
      "objective/scores": 1.171875,
      "policy/approxkl_avg": 0.4930054843425751,
      "policy/clipfrac_avg": 0.2841796875,
      "policy/entropy_avg": 0.03688812255859375,
      "step": 190,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 22,
      "val/ratio": 1.001340627670288,
      "val/ratio_var": 4.4035481550963596e-05
    },
    {
      "episode": 9984,
      "epoch": 0.13694158311273266,
      "eps": 5,
      "loss/policy_avg": -0.019254155457019806,
      "loss/value_avg": 0.0,
      "lr": 2.7084398976982097e-06,
      "objective/entropy": 2.295351266860962,
      "objective/kl": 13.32223892211914,
      "objective/non_score_reward": -1.332223892211914,
      "objective/rlhf_reward": -0.1836824268102646,
      "objective/scores": 1.1484375,
      "policy/approxkl_avg": 3.1426281929016113,
      "policy/clipfrac_avg": 0.3251953125,
      "policy/entropy_avg": 0.03939247131347656,
      "step": 195,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 17,
      "val/ratio": 1.0032271146774292,
      "val/ratio_var": 0.00019827872165478766
    },
    {
      "episode": 10240,
      "epoch": 0.14045290575664887,
      "eps": 5,
      "loss/policy_avg": -0.018122296780347824,
      "loss/value_avg": 0.0,
      "lr": 2.70076726342711e-06,
      "objective/entropy": 2.345075845718384,
      "objective/kl": 12.536066055297852,
      "objective/non_score_reward": -1.2536065578460693,
      "objective/rlhf_reward": -0.056986674666404724,
      "objective/scores": 1.1953125,
      "policy/approxkl_avg": 27.5201473236084,
      "policy/clipfrac_avg": 0.3046875,
      "policy/entropy_avg": 0.04156017303466797,
      "step": 200,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 20,
      "val/ratio": 0.9993807077407837,
      "val/ratio_var": 0.00011275127326371148
    },
    {
      "episode": 10496,
      "epoch": 0.1439642284005651,
      "eps": 5,
      "loss/policy_avg": -0.019295353442430496,
      "loss/value_avg": 0.0,
      "lr": 2.6930946291560103e-06,
      "objective/entropy": 2.091012477874756,
      "objective/kl": 12.746508598327637,
      "objective/non_score_reward": -1.2746508121490479,
      "objective/rlhf_reward": -0.09065462648868561,
      "objective/scores": 1.1875,
      "policy/approxkl_avg": 0.5554059743881226,
      "policy/clipfrac_avg": 0.2998046875,
      "policy/entropy_avg": 0.03620719909667969,
      "step": 205,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 17,
      "val/ratio": 1.001387119293213,
      "val/ratio_var": 3.4958156902575865e-05
    },
    {
      "episode": 10752,
      "epoch": 0.14747555104448132,
      "eps": 5,
      "loss/policy_avg": -0.010203800164163113,
      "loss/value_avg": 0.0,
      "lr": 2.6854219948849107e-06,
      "objective/entropy": 2.1808600425720215,
      "objective/kl": 12.404802322387695,
      "objective/non_score_reward": -1.2404803037643433,
      "objective/rlhf_reward": -0.059675075113773346,
      "objective/scores": 1.1796875,
      "policy/approxkl_avg": 0.5876989364624023,
      "policy/clipfrac_avg": 0.27734375,
      "policy/entropy_avg": 0.041385650634765625,
      "step": 210,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 14,
      "val/ratio": 1.0035128593444824,
      "val/ratio_var": 0.0004931605653837323
    },
    {
      "episode": 11008,
      "epoch": 0.15098687368839755,
      "eps": 5,
      "loss/policy_avg": -0.018955286592245102,
      "loss/value_avg": 0.0,
      "lr": 2.677749360613811e-06,
      "objective/entropy": 1.968322992324829,
      "objective/kl": 13.322561264038086,
      "objective/non_score_reward": -1.3322560787200928,
      "objective/rlhf_reward": -0.0670965313911438,
      "objective/scores": 1.265625,
      "policy/approxkl_avg": 0.39782679080963135,
      "policy/clipfrac_avg": 0.373046875,
      "policy/entropy_avg": 0.03279399871826172,
      "step": 215,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 14,
      "val/ratio": 1.0017969608306885,
      "val/ratio_var": 6.843745359219611e-05
    },
    {
      "episode": 11264,
      "epoch": 0.15449819633231376,
      "eps": 5,
      "loss/policy_avg": -0.014947709627449512,
      "loss/value_avg": 0.0,
      "lr": 2.670076726342711e-06,
      "objective/entropy": 1.7985560894012451,
      "objective/kl": 12.856376647949219,
      "objective/non_score_reward": -1.285637617111206,
      "objective/rlhf_reward": -0.03251491114497185,
      "objective/scores": 1.25,
      "policy/approxkl_avg": 0.4516296982765198,
      "policy/clipfrac_avg": 0.3671875,
      "policy/entropy_avg": 0.031859397888183594,
      "step": 220,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 13,
      "val/ratio": 1.0009992122650146,
      "val/ratio_var": 4.1194460209226236e-05
    },
    {
      "episode": 11520,
      "epoch": 0.15800951897623,
      "eps": 5,
      "loss/policy_avg": -0.019819077104330063,
      "loss/value_avg": 0.0,
      "lr": 2.6624040920716113e-06,
      "objective/entropy": 1.5284242630004883,
      "objective/kl": 14.283391952514648,
      "objective/non_score_reward": -1.4283392429351807,
      "objective/rlhf_reward": -0.014965277165174484,
      "objective/scores": 1.4140625,
      "policy/approxkl_avg": 1.5518393516540527,
      "policy/clipfrac_avg": 0.2744140625,
      "policy/entropy_avg": 0.026048660278320312,
      "step": 225,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 8,
      "val/ratio": 1.0007925033569336,
      "val/ratio_var": 3.721785105881281e-05
    },
    {
      "episode": 11776,
      "epoch": 0.1615208416201462,
      "eps": 5,
      "loss/policy_avg": -0.015632648020982742,
      "loss/value_avg": 0.0,
      "lr": 2.6547314578005116e-06,
      "objective/entropy": 1.5101430416107178,
      "objective/kl": 13.435927391052246,
      "objective/non_score_reward": -1.3435927629470825,
      "objective/rlhf_reward": 0.017792798578739166,
      "objective/scores": 1.359375,
      "policy/approxkl_avg": 0.22922199964523315,
      "policy/clipfrac_avg": 0.271484375,
      "policy/entropy_avg": 0.025536060333251953,
      "step": 230,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 11,
      "val/ratio": 1.0355898141860962,
      "val/ratio_var": 0.09009659290313721
    },
    {
      "episode": 12032,
      "epoch": 0.16503216426406245,
      "eps": 5,
      "loss/policy_avg": -0.014460040256381035,
      "loss/value_avg": 0.0,
      "lr": 2.647058823529412e-06,
      "objective/entropy": 1.412046194076538,
      "objective/kl": 13.981653213500977,
      "objective/non_score_reward": -1.3981653451919556,
      "objective/rlhf_reward": -0.19434592127799988,
      "objective/scores": 1.203125,
      "policy/approxkl_avg": 0.48358476161956787,
      "policy/clipfrac_avg": 0.287109375,
      "policy/entropy_avg": 0.027116775512695312,
      "step": 235,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 19,
      "val/ratio": 1.0002431869506836,
      "val/ratio_var": 1.317531587119447e-05
    },
    {
      "episode": 12288,
      "epoch": 0.16854348690797866,
      "eps": 5,
      "loss/policy_avg": -0.0144148338586092,
      "loss/value_avg": 0.0,
      "lr": 2.6393861892583123e-06,
      "objective/entropy": 1.5728825330734253,
      "objective/kl": 13.091099739074707,
      "objective/non_score_reward": -1.3091099262237549,
      "objective/rlhf_reward": -0.12438549101352692,
      "objective/scores": 1.1875,
      "policy/approxkl_avg": 0.5084937810897827,
      "policy/clipfrac_avg": 0.2587890625,
      "policy/entropy_avg": 0.028881072998046875,
      "step": 240,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 14,
      "val/ratio": 1.004534125328064,
      "val/ratio_var": 0.001037073670886457
    },
    {
      "episode": 12544,
      "epoch": 0.17205480955189487,
      "eps": 5,
      "loss/policy_avg": -0.02535724639892578,
      "loss/value_avg": 0.0,
      "lr": 2.6317135549872122e-06,
      "objective/entropy": 1.6895666122436523,
      "objective/kl": 13.036446571350098,
      "objective/non_score_reward": -1.3036446571350098,
      "objective/rlhf_reward": -0.08131173253059387,
      "objective/scores": 1.21875,
      "policy/approxkl_avg": 1.397173285484314,
      "policy/clipfrac_avg": 0.296875,
      "policy/entropy_avg": 0.025877952575683594,
      "step": 245,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 11,
      "val/ratio": 0.9993641972541809,
      "val/ratio_var": 3.554957584128715e-05
    },
    {
      "episode": 12800,
      "epoch": 0.1755661321958111,
      "eps": 5,
      "loss/policy_avg": -0.013989413157105446,
      "loss/value_avg": 0.0,
      "lr": 2.6240409207161126e-06,
      "objective/entropy": 1.4321318864822388,
      "objective/kl": 13.751260757446289,
      "objective/non_score_reward": -1.3751261234283447,
      "objective/rlhf_reward": 0.024946460500359535,
      "objective/scores": 1.3984375,
      "policy/approxkl_avg": 0.3265579044818878,
      "policy/clipfrac_avg": 0.3095703125,
      "policy/entropy_avg": 0.02507495880126953,
      "step": 250,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 14,
      "val/ratio": 1.0016669034957886,
      "val/ratio_var": 3.951354665332474e-05
    },
    {
      "episode": 13056,
      "epoch": 0.1790774548397273,
      "eps": 5,
      "loss/policy_avg": -0.01614242233335972,
      "loss/value_avg": 0.0,
      "lr": 2.616368286445013e-06,
      "objective/entropy": 1.2477443218231201,
      "objective/kl": 14.385757446289062,
      "objective/non_score_reward": -1.4385757446289062,
      "objective/rlhf_reward": -0.048571567982435226,
      "objective/scores": 1.390625,
      "policy/approxkl_avg": 0.38643181324005127,
      "policy/clipfrac_avg": 0.33203125,
      "policy/entropy_avg": 0.02417755126953125,
      "step": 255,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 19,
      "val/ratio": 1.0020815134048462,
      "val/ratio_var": 0.00012469623470678926
    },
    {
      "episode": 13312,
      "epoch": 0.18258877748364355,
      "eps": 5,
      "loss/policy_avg": -0.013632966205477715,
      "loss/value_avg": 0.0,
      "lr": 2.6086956521739132e-06,
      "objective/entropy": 1.5228471755981445,
      "objective/kl": 14.862211227416992,
      "objective/non_score_reward": -1.486221194267273,
      "objective/rlhf_reward": -0.07545565813779831,
      "objective/scores": 1.40625,
      "policy/approxkl_avg": 2.011383056640625,
      "policy/clipfrac_avg": 0.3359375,
      "policy/entropy_avg": 0.027433395385742188,
      "step": 260,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 17,
      "val/ratio": 0.9998003244400024,
      "val/ratio_var": 2.738163857429754e-05
    },
    {
      "episode": 13568,
      "epoch": 0.18610010012755976,
      "eps": 5,
      "loss/policy_avg": -0.020372817292809486,
      "loss/value_avg": 0.0,
      "lr": 2.6010230179028136e-06,
      "objective/entropy": 1.633180856704712,
      "objective/kl": 14.094629287719727,
      "objective/non_score_reward": -1.4094629287719727,
      "objective/rlhf_reward": -0.015713702887296677,
      "objective/scores": 1.390625,
      "policy/approxkl_avg": 0.47778478264808655,
      "policy/clipfrac_avg": 0.3701171875,
      "policy/entropy_avg": 0.02643442153930664,
      "step": 265,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 14,
      "val/ratio": 1.0079278945922852,
      "val/ratio_var": 0.0023215855471789837
    },
    {
      "episode": 13824,
      "epoch": 0.189611422771476,
      "eps": 5,
      "loss/policy_avg": -0.01413625106215477,
      "loss/value_avg": 0.0,
      "lr": 2.5933503836317135e-06,
      "objective/entropy": 1.2899070978164673,
      "objective/kl": 14.59975528717041,
      "objective/non_score_reward": -1.4599756002426147,
      "objective/rlhf_reward": -0.09675531834363937,
      "objective/scores": 1.359375,
      "policy/approxkl_avg": 0.4568091630935669,
      "policy/clipfrac_avg": 0.3076171875,
      "policy/entropy_avg": 0.02667713165283203,
      "step": 270,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 19,
      "val/ratio": 1.003631830215454,
      "val/ratio_var": 0.00041694415267556906
    },
    {
      "episode": 14080,
      "epoch": 0.1931227454153922,
      "eps": 5,
      "loss/policy_avg": -0.018304049968719482,
      "loss/value_avg": 0.0,
      "lr": 2.585677749360614e-06,
      "objective/entropy": 1.3464603424072266,
      "objective/kl": 14.915502548217773,
      "objective/non_score_reward": -1.491550326347351,
      "objective/rlhf_reward": -0.013601185753941536,
      "objective/scores": 1.4765625,
      "policy/approxkl_avg": 0.5154660940170288,
      "policy/clipfrac_avg": 0.3251953125,
      "policy/entropy_avg": 0.024927139282226562,
      "step": 275,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 20,
      "val/ratio": 1.0014865398406982,
      "val/ratio_var": 8.263294148491696e-05
    },
    {
      "episode": 14336,
      "epoch": 0.19663406805930844,
      "eps": 5,
      "loss/policy_avg": -0.009162629023194313,
      "loss/value_avg": 0.0,
      "lr": 2.578005115089514e-06,
      "objective/entropy": 1.3251242637634277,
      "objective/kl": 14.600137710571289,
      "objective/non_score_reward": -1.460013747215271,
      "objective/rlhf_reward": -0.10571230947971344,
      "objective/scores": 1.3515625,
      "policy/approxkl_avg": 0.4187917411327362,
      "policy/clipfrac_avg": 0.3046875,
      "policy/entropy_avg": 0.023657798767089844,
      "step": 280,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 16,
      "val/ratio": 1.008554458618164,
      "val/ratio_var": 0.001467635971494019
    },
    {
      "episode": 14592,
      "epoch": 0.20014539070322465,
      "eps": 5,
      "loss/policy_avg": -0.01609072834253311,
      "loss/value_avg": 0.0,
      "lr": 2.5703324808184145e-06,
      "objective/entropy": 1.3078004121780396,
      "objective/kl": 14.999523162841797,
      "objective/non_score_reward": -1.4999523162841797,
      "objective/rlhf_reward": -0.15238332748413086,
      "objective/scores": 1.3515625,
      "policy/approxkl_avg": 0.3968128561973572,
      "policy/clipfrac_avg": 0.36328125,
      "policy/entropy_avg": 0.023943424224853516,
      "step": 285,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 12,
      "val/ratio": 1.0019521713256836,
      "val/ratio_var": 8.228437945945188e-05
    },
    {
      "episode": 14848,
      "epoch": 0.2036567133471409,
      "eps": 5,
      "loss/policy_avg": -0.014186807908117771,
      "loss/value_avg": 0.0,
      "lr": 2.562659846547315e-06,
      "objective/entropy": 1.2583755254745483,
      "objective/kl": 15.623100280761719,
      "objective/non_score_reward": -1.5623100996017456,
      "objective/rlhf_reward": -0.09625323116779327,
      "objective/scores": 1.46875,
      "policy/approxkl_avg": 0.5678977370262146,
      "policy/clipfrac_avg": 0.3076171875,
      "policy/entropy_avg": 0.024990558624267578,
      "step": 290,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 13,
      "val/ratio": 1.0014848709106445,
      "val/ratio_var": 4.219371476210654e-05
    },
    {
      "episode": 15104,
      "epoch": 0.2071680359910571,
      "eps": 5,
      "loss/policy_avg": -0.013804701156914234,
      "loss/value_avg": 0.0,
      "lr": 2.5549872122762148e-06,
      "objective/entropy": 1.568720817565918,
      "objective/kl": 14.687668800354004,
      "objective/non_score_reward": -1.4687669277191162,
      "objective/rlhf_reward": -0.17009752988815308,
      "objective/scores": 1.296875,
      "policy/approxkl_avg": 0.3046334981918335,
      "policy/clipfrac_avg": 0.26953125,
      "policy/entropy_avg": 0.027862548828125,
      "step": 295,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 16,
      "val/ratio": 1.0005717277526855,
      "val/ratio_var": 1.1324932529532816e-05
    },
    {
      "episode": 15360,
      "epoch": 0.21067935863497333,
      "eps": 5,
      "loss/policy_avg": -0.018133502453565598,
      "loss/value_avg": 0.0,
      "lr": 2.547314578005115e-06,
      "objective/entropy": 1.2987349033355713,
      "objective/kl": 13.89183235168457,
      "objective/non_score_reward": -1.3891831636428833,
      "objective/rlhf_reward": -0.12500587105751038,
      "objective/scores": 1.265625,
      "policy/approxkl_avg": 0.31936001777648926,
      "policy/clipfrac_avg": 0.33984375,
      "policy/entropy_avg": 0.02469015121459961,
      "step": 300,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 18,
      "val/ratio": 1.0020601749420166,
      "val/ratio_var": 3.1293042411562055e-05
    },
    {
      "episode": 15616,
      "epoch": 0.21419068127888954,
      "eps": 5,
      "loss/policy_avg": -0.01710616797208786,
      "loss/value_avg": 0.0,
      "lr": 2.5396419437340155e-06,
      "objective/entropy": 1.4288297891616821,
      "objective/kl": 14.952780723571777,
      "objective/non_score_reward": -1.4952781200408936,
      "objective/rlhf_reward": -0.15792769193649292,
      "objective/scores": 1.3359375,
      "policy/approxkl_avg": 0.6461950540542603,
      "policy/clipfrac_avg": 0.3125,
      "policy/entropy_avg": 0.02637958526611328,
      "step": 305,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 19,
      "val/ratio": 1.0013606548309326,
      "val/ratio_var": 4.203971548122354e-05
    },
    {
      "episode": 15872,
      "epoch": 0.21770200392280575,
      "eps": 5,
      "loss/policy_avg": -0.016128187999129295,
      "loss/value_avg": 0.0,
      "lr": 2.531969309462916e-06,
      "objective/entropy": 1.3288850784301758,
      "objective/kl": 15.583921432495117,
      "objective/non_score_reward": -1.55839204788208,
      "objective/rlhf_reward": -0.07665687799453735,
      "objective/scores": 1.484375,
      "policy/approxkl_avg": 0.3284182548522949,
      "policy/clipfrac_avg": 0.328125,
      "policy/entropy_avg": 0.02404165267944336,
      "step": 310,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 16,
      "val/ratio": 1.0064442157745361,
      "val/ratio_var": 0.0015344778075814247
    },
    {
      "episode": 16128,
      "epoch": 0.221213326566722,
      "eps": 5,
      "loss/policy_avg": -0.015278931707143784,
      "loss/value_avg": 0.0,
      "lr": 2.524296675191816e-06,
      "objective/entropy": 1.3236112594604492,
      "objective/kl": 14.773448944091797,
      "objective/non_score_reward": -1.4773449897766113,
      "objective/rlhf_reward": -0.08708612620830536,
      "objective/scores": 1.390625,
      "policy/approxkl_avg": 0.2980467975139618,
      "policy/clipfrac_avg": 0.34765625,
      "policy/entropy_avg": 0.02494335174560547,
      "step": 315,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 1.0008249282836914,
      "val/ratio_var": 1.927867742779199e-05
    },
    {
      "episode": 16384,
      "epoch": 0.2247246492106382,
      "eps": 5,
      "loss/policy_avg": -0.020951703190803528,
      "loss/value_avg": 0.0,
      "lr": 2.516624040920716e-06,
      "objective/entropy": 1.2670817375183105,
      "objective/kl": 14.8348970413208,
      "objective/non_score_reward": -1.483489751815796,
      "objective/rlhf_reward": 0.03382519632577896,
      "objective/scores": 1.515625,
      "policy/approxkl_avg": 1.0973663330078125,
      "policy/clipfrac_avg": 0.361328125,
      "policy/entropy_avg": 0.02091073989868164,
      "step": 320,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 15,
      "val/ratio": 1.003138542175293,
      "val/ratio_var": 0.0001205340595333837
    },
    {
      "episode": 16640,
      "epoch": 0.22823597185455444,
      "eps": 5,
      "loss/policy_avg": -0.006676271557807922,
      "loss/value_avg": 0.0,
      "lr": 2.5089514066496164e-06,
      "objective/entropy": 1.191056728363037,
      "objective/kl": 16.46404457092285,
      "objective/non_score_reward": -1.646404504776001,
      "objective/rlhf_reward": -0.14990828931331635,
      "objective/scores": 1.5,
      "policy/approxkl_avg": 0.31475046277046204,
      "policy/clipfrac_avg": 0.2578125,
      "policy/entropy_avg": 0.021608352661132812,
      "step": 325,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 25,
      "val/ratio": 1.000575065612793,
      "val/ratio_var": 1.735856494633481e-05
    },
    {
      "episode": 16896,
      "epoch": 0.23174729449847065,
      "eps": 5,
      "loss/policy_avg": -0.02127978205680847,
      "loss/value_avg": 0.0,
      "lr": 2.5012787723785167e-06,
      "objective/entropy": 1.490570068359375,
      "objective/kl": 16.22044563293457,
      "objective/non_score_reward": -1.622044563293457,
      "objective/rlhf_reward": -0.015120631083846092,
      "objective/scores": 1.609375,
      "policy/approxkl_avg": 2.5871665477752686,
      "policy/clipfrac_avg": 0.35546875,
      "policy/entropy_avg": 0.027353286743164062,
      "step": 330,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 25,
      "val/ratio": 1.003063440322876,
      "val/ratio_var": 6.509448576252908e-05
    },
    {
      "episode": 17152,
      "epoch": 0.23525861714238688,
      "eps": 5,
      "loss/policy_avg": -0.013108542189002037,
      "loss/value_avg": 0.0,
      "lr": 2.493606138107417e-06,
      "objective/entropy": 1.2718842029571533,
      "objective/kl": 16.047882080078125,
      "objective/non_score_reward": -1.604788064956665,
      "objective/rlhf_reward": -0.12415145337581635,
      "objective/scores": 1.484375,
      "policy/approxkl_avg": 0.2758824825286865,
      "policy/clipfrac_avg": 0.287109375,
      "policy/entropy_avg": 0.024268627166748047,
      "step": 335,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 16,
      "val/ratio": 1.0036962032318115,
      "val/ratio_var": 0.0003905180492438376
    },
    {
      "episode": 17408,
      "epoch": 0.2387699397863031,
      "eps": 5,
      "loss/policy_avg": -0.014837839640676975,
      "loss/value_avg": 0.0,
      "lr": 2.4859335038363174e-06,
      "objective/entropy": 1.3406567573547363,
      "objective/kl": 16.428348541259766,
      "objective/non_score_reward": -1.642835021018982,
      "objective/rlhf_reward": 0.018702151253819466,
      "objective/scores": 1.6640625,
      "policy/approxkl_avg": 0.192110076546669,
      "policy/clipfrac_avg": 0.3388671875,
      "policy/entropy_avg": 0.025295734405517578,
      "step": 340,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 19,
      "val/ratio": 1.0008959770202637,
      "val/ratio_var": 1.205760781886056e-05
    },
    {
      "episode": 17664,
      "epoch": 0.24228126243021933,
      "eps": 5,
      "loss/policy_avg": -0.01899782381951809,
      "loss/value_avg": 0.0,
      "lr": 2.4782608695652173e-06,
      "objective/entropy": 1.5880205631256104,
      "objective/kl": 15.775943756103516,
      "objective/non_score_reward": -1.577594518661499,
      "objective/rlhf_reward": -0.08363974094390869,
      "objective/scores": 1.4921875,
      "policy/approxkl_avg": 0.9254180192947388,
      "policy/clipfrac_avg": 0.3251953125,
      "policy/entropy_avg": 0.024907588958740234,
      "step": 345,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 15,
      "val/ratio": 1.000749111175537,
      "val/ratio_var": 2.8957187168998644e-05
    },
    {
      "episode": 17920,
      "epoch": 0.24579258507413554,
      "eps": 5,
      "loss/policy_avg": -0.014669202268123627,
      "loss/value_avg": 0.0,
      "lr": 2.4705882352941177e-06,
      "objective/entropy": 1.2656543254852295,
      "objective/kl": 16.034730911254883,
      "objective/non_score_reward": -1.60347318649292,
      "objective/rlhf_reward": -0.011744961142539978,
      "objective/scores": 1.59375,
      "policy/approxkl_avg": 1.004683017730713,
      "policy/clipfrac_avg": 0.2646484375,
      "policy/entropy_avg": 0.023741722106933594,
      "step": 350,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 19,
      "val/ratio": 1.0009608268737793,
      "val/ratio_var": 4.649764014175162e-05
    }
  ],
  "logging_steps": 100,
  "max_steps": 391,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1.3716104077797742,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0,
  "train_batch_size": null,
  "trial_name": null,
  "trial_params": null
}