Text Generation
Safetensors
Russian
qwen2
conversational
RefalMachine committed on
Commit
616a2e7
·
verified ·
1 Parent(s): e7be766

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +2 -0
  2. config.json +1 -1
  3. llmtf_eval/NEREL.jsonl +0 -0
  4. llmtf_eval/NEREL_params.jsonl +54 -0
  5. llmtf_eval/NEREL_total.jsonl +10 -0
  6. llmtf_eval/daru_treewayabstractive.jsonl +0 -0
  7. llmtf_eval/daru_treewayabstractive_params.jsonl +3 -3
  8. llmtf_eval/daru_treewayabstractive_total.jsonl +3 -3
  9. llmtf_eval/darumeru_MultiQ.jsonl +0 -0
  10. llmtf_eval/darumeru_MultiQ_params.jsonl +3 -3
  11. llmtf_eval/darumeru_MultiQ_total.jsonl +3 -3
  12. llmtf_eval/darumeru_PARus.jsonl +486 -486
  13. llmtf_eval/darumeru_PARus_params.jsonl +3 -3
  14. llmtf_eval/darumeru_PARus_total.jsonl +2 -2
  15. llmtf_eval/darumeru_RCB.jsonl +761 -761
  16. llmtf_eval/darumeru_RCB_params.jsonl +3 -3
  17. llmtf_eval/darumeru_RCB_total.jsonl +3 -3
  18. llmtf_eval/darumeru_RWSD.jsonl +492 -492
  19. llmtf_eval/darumeru_RWSD_params.jsonl +3 -3
  20. llmtf_eval/darumeru_RWSD_total.jsonl +2 -2
  21. llmtf_eval/darumeru_USE.jsonl +0 -0
  22. llmtf_eval/darumeru_USE_params.jsonl +54 -0
  23. llmtf_eval/darumeru_USE_total.jsonl +7 -0
  24. llmtf_eval/darumeru_cp_para_ru.jsonl +0 -0
  25. llmtf_eval/darumeru_cp_para_ru_params.jsonl +3 -3
  26. llmtf_eval/darumeru_cp_para_ru_total.jsonl +5 -4
  27. llmtf_eval/evaluation_log.txt +265 -244
  28. llmtf_eval/evaluation_results.txt +2 -2
  29. llmtf_eval/llm_as_judge.jsonl +0 -0
  30. llmtf_eval/llm_as_judge_params.jsonl +22 -0
  31. llmtf_eval/llm_as_judge_total.jsonl +7 -0
  32. llmtf_eval/nlpcoreteam_enMMLU.jsonl +2 -2
  33. llmtf_eval/nlpcoreteam_enMMLU_params.jsonl +3 -3
  34. llmtf_eval/nlpcoreteam_enMMLU_total.jsonl +2 -2
  35. llmtf_eval/nlpcoreteam_ruMMLU.jsonl +2 -2
  36. llmtf_eval/nlpcoreteam_ruMMLU_params.jsonl +3 -3
  37. llmtf_eval/nlpcoreteam_ruMMLU_total.jsonl +2 -2
  38. llmtf_eval/ruopinionne.jsonl +0 -0
  39. llmtf_eval/ruopinionne_params.jsonl +54 -0
  40. llmtf_eval/ruopinionne_total.jsonl +7 -0
  41. llmtf_eval/ruparam.jsonl +3 -0
  42. llmtf_eval/ruparam_params.jsonl +54 -0
  43. llmtf_eval/ruparam_total.jsonl +7 -0
  44. llmtf_eval/shlepa_books_mc.jsonl +0 -0
  45. llmtf_eval/shlepa_books_mc_params.jsonl +54 -0
  46. llmtf_eval/shlepa_books_mc_total.jsonl +7 -0
  47. llmtf_eval/shlepa_law_mc.jsonl +0 -0
  48. llmtf_eval/shlepa_law_mc_params.jsonl +54 -0
  49. llmtf_eval/shlepa_law_mc_total.jsonl +7 -0
  50. llmtf_eval/shlepa_movie_mc.jsonl +0 -0
.gitattributes CHANGED
@@ -37,3 +37,5 @@ llmtf_eval/daru_treewayextractive.jsonl filter=lfs diff=lfs merge=lfs -text
37
  llmtf_eval/nlpcoreteam_enMMLU.jsonl filter=lfs diff=lfs merge=lfs -text
38
  llmtf_eval/nlpcoreteam_ruMMLU.jsonl filter=lfs diff=lfs merge=lfs -text
39
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
 
37
  llmtf_eval/nlpcoreteam_enMMLU.jsonl filter=lfs diff=lfs merge=lfs -text
38
  llmtf_eval/nlpcoreteam_ruMMLU.jsonl filter=lfs diff=lfs merge=lfs -text
39
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
40
+ llmtf_eval/ruparam.jsonl filter=lfs diff=lfs merge=lfs -text
41
+ llmtf_eval/vikhrmodels_habr_qa_sbs.jsonl filter=lfs diff=lfs merge=lfs -text
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "/scratch/tikhomirov/workdir/data/models/ruadapt_qwen2.5_1.5B_ext_u48_part1-2_lr5e4_bs128_reinit_peft_mlp_32_128_lm_head_mts_1e4_as1.5_nm_pv21/kto1",
3
  "architectures": [
4
  "Qwen2ForCausalLM"
5
  ],
 
1
  {
2
+ "_name_or_path": "/scratch/tikhomirov/workdir/data/models/ruadapt_qwen2.5_1.5B_ext_u48_part1-2_lr5e4_bs128_reinit_peft_mlp_32_128_lm_head_mts_1e4_as1.75_v1/sft1",
3
  "architectures": [
4
  "Qwen2ForCausalLM"
5
  ],
llmtf_eval/NEREL.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
llmtf_eval/NEREL_params.jsonl ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "custom_generation_config": null,
3
+ "model_params": {
4
+ "model_name_or_path": "/scratch/tikhomirov/workdir/data/models/ruadapt_qwen2.5_1.5B_ext_u48_part1-2_lr5e4_bs128_reinit_peft_mlp_32_128_lm_head_mts_1e4_as1.75_v1/simpo2",
5
+ "generation_config": {
6
+ "bos_token_id": 145109,
7
+ "do_sample": true,
8
+ "eos_token_id": [
9
+ 145111
10
+ ],
11
+ "max_length": 32768,
12
+ "max_new_tokens": 128,
13
+ "pad_token_id": 145109,
14
+ "stop_strings": [
15
+ "<|im_end|>"
16
+ ],
17
+ "temperature": 0.1,
18
+ "top_k": 40,
19
+ "top_p": 0.9,
20
+ "transformers_version": "4.45.2",
21
+ "trust_remote_code": false
22
+ },
23
+ "conversation_template": {
24
+ "system_prompt": "",
25
+ "system_message_template": "<|im_start|>{role}\n{content}<|im_end|>\n",
26
+ "user_message_template": "<|im_start|>{role}\n{content}<|im_end|>\n",
27
+ "bot_message_template": "<|im_start|>{role}\n{content}<|im_end|>\n",
28
+ "bot_message_template_incomplete": "<|im_start|>{role}\n{content}",
29
+ "user_role": "user",
30
+ "bot_role": "assistant",
31
+ "system_role": "system",
32
+ "global_prefix": "",
33
+ "suffix": "<|im_start|>assistant\n",
34
+ "add_special_tokens": false,
35
+ "eos_token": "<|im_end|>"
36
+ },
37
+ "load_in_8bit": false,
38
+ "torch_dtype": "auto",
39
+ "attn_implementation": "flash_attention_2",
40
+ "device_map": "cuda:0",
41
+ "use_fast_tokenizer": true,
42
+ "leading_space": false,
43
+ "space_token": null,
44
+ "trust_remote_code": false,
45
+ "max_model_len": 32768
46
+ },
47
+ "task_params": {
48
+ "max_len": 4000,
49
+ "few_shot_count": 0,
50
+ "batch_size": 16,
51
+ "max_sample_per_dataset": 200,
52
+ "method": "generate"
53
+ }
54
+ }
llmtf_eval/NEREL_total.jsonl ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task_name": "NEREL",
3
+ "results": {
4
+ "tp": 2.0,
5
+ "fp": 27.0,
6
+ "fn": 519.0,
7
+ "micro-f1": 0.00727272727272595
8
+ },
9
+ "leaderboard_result": 0.00727272727272595
10
+ }
llmtf_eval/daru_treewayabstractive.jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
llmtf_eval/daru_treewayabstractive_params.jsonl CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "custom_generation_config": null,
3
  "model_params": {
4
- "model_name_or_path": "/scratch/tikhomirov/workdir/data/models/ruadapt_qwen2.5_1.5B_ext_u48_part1-2_lr5e4_bs128_reinit_peft_mlp_32_128_lm_head_mts_1e4_as1.5_nm_pv21/simpo2",
5
  "generation_config": {
6
  "bos_token_id": 145109,
7
  "do_sample": true,
@@ -36,7 +36,7 @@
36
  },
37
  "load_in_8bit": false,
38
  "torch_dtype": "auto",
39
- "use_flash_attention_2": true,
40
  "device_map": "cuda:0",
41
  "use_fast_tokenizer": true,
42
  "leading_space": false,
@@ -47,7 +47,7 @@
47
  "task_params": {
48
  "max_len": 4000,
49
  "few_shot_count": 0,
50
- "batch_size": 2,
51
  "max_sample_per_dataset": 200,
52
  "method": "generate"
53
  }
 
1
  {
2
  "custom_generation_config": null,
3
  "model_params": {
4
+ "model_name_or_path": "/scratch/tikhomirov/workdir/data/models/ruadapt_qwen2.5_1.5B_ext_u48_part1-2_lr5e4_bs128_reinit_peft_mlp_32_128_lm_head_mts_1e4_as1.75_v1/simpo2",
5
  "generation_config": {
6
  "bos_token_id": 145109,
7
  "do_sample": true,
 
36
  },
37
  "load_in_8bit": false,
38
  "torch_dtype": "auto",
39
+ "attn_implementation": "flash_attention_2",
40
  "device_map": "cuda:0",
41
  "use_fast_tokenizer": true,
42
  "leading_space": false,
 
47
  "task_params": {
48
  "max_len": 4000,
49
  "few_shot_count": 0,
50
+ "batch_size": 16,
51
  "max_sample_per_dataset": 200,
52
  "method": "generate"
53
  }
llmtf_eval/daru_treewayabstractive_total.jsonl CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "task_name": "daru/treewayabstractive",
3
  "results": {
4
- "rouge1": 0.31023763628891676,
5
- "rouge2": 0.09443696323171702
6
  },
7
- "leaderboard_result": 0.2023372997603169
8
  }
 
1
  {
2
  "task_name": "daru/treewayabstractive",
3
  "results": {
4
+ "rouge1": 0.3138417117532064,
5
+ "rouge2": 0.10462617373556911
6
  },
7
+ "leaderboard_result": 0.20923394274438775
8
  }
llmtf_eval/darumeru_MultiQ.jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
llmtf_eval/darumeru_MultiQ_params.jsonl CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "custom_generation_config": null,
3
  "model_params": {
4
- "model_name_or_path": "/scratch/tikhomirov/workdir/data/models/ruadapt_qwen2.5_1.5B_ext_u48_part1-2_lr5e4_bs128_reinit_peft_mlp_32_128_lm_head_mts_1e4_as1.5_nm_pv21/simpo2",
5
  "generation_config": {
6
  "bos_token_id": 145109,
7
  "do_sample": true,
@@ -36,7 +36,7 @@
36
  },
37
  "load_in_8bit": false,
38
  "torch_dtype": "auto",
39
- "use_flash_attention_2": true,
40
  "device_map": "cuda:0",
41
  "use_fast_tokenizer": true,
42
  "leading_space": false,
@@ -47,7 +47,7 @@
47
  "task_params": {
48
  "max_len": 4000,
49
  "few_shot_count": 0,
50
- "batch_size": 2,
51
  "max_sample_per_dataset": 10000000000000,
52
  "method": "generate"
53
  }
 
1
  {
2
  "custom_generation_config": null,
3
  "model_params": {
4
+ "model_name_or_path": "/scratch/tikhomirov/workdir/data/models/ruadapt_qwen2.5_1.5B_ext_u48_part1-2_lr5e4_bs128_reinit_peft_mlp_32_128_lm_head_mts_1e4_as1.75_v1/simpo2",
5
  "generation_config": {
6
  "bos_token_id": 145109,
7
  "do_sample": true,
 
36
  },
37
  "load_in_8bit": false,
38
  "torch_dtype": "auto",
39
+ "attn_implementation": "flash_attention_2",
40
  "device_map": "cuda:0",
41
  "use_fast_tokenizer": true,
42
  "leading_space": false,
 
47
  "task_params": {
48
  "max_len": 4000,
49
  "few_shot_count": 0,
50
+ "batch_size": 16,
51
  "max_sample_per_dataset": 10000000000000,
52
  "method": "generate"
53
  }
llmtf_eval/darumeru_MultiQ_total.jsonl CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "task_name": "darumeru/MultiQ",
3
  "results": {
4
- "f1": 0.20613243758223346,
5
- "em": 0.11281070745697896
6
  },
7
- "leaderboard_result": 0.1594715725196062
8
  }
 
1
  {
2
  "task_name": "darumeru/MultiQ",
3
  "results": {
4
+ "f1": 0.3016876852043635,
5
+ "em": 0.21319311663479923
6
  },
7
+ "leaderboard_result": 0.25744040091958137
8
  }
llmtf_eval/darumeru_PARus.jsonl CHANGED
@@ -6,8 +6,8 @@
6
  }
7
  },
8
  "predict": {
9
- "1": 0.6791735887527466,
10
- "2": 0.3208189010620117
11
  },
12
  "sample": {
13
  "messages": [
@@ -47,8 +47,8 @@
47
  }
48
  },
49
  "predict": {
50
- "1": 0.7772925496101379,
51
- "2": 0.22269804775714874
52
  },
53
  "sample": {
54
  "messages": [
@@ -88,8 +88,8 @@
88
  }
89
  },
90
  "predict": {
91
- "1": 0.9324424862861633,
92
- "2": 0.06754591315984726
93
  },
94
  "sample": {
95
  "messages": [
@@ -129,8 +129,8 @@
129
  }
130
  },
131
  "predict": {
132
- "1": 0.05340291187167168,
133
- "2": 0.9465891718864441
134
  },
135
  "sample": {
136
  "messages": [
@@ -170,8 +170,8 @@
170
  }
171
  },
172
  "predict": {
173
- "1": 0.9980727434158325,
174
- "2": 0.0019267337629571557
175
  },
176
  "sample": {
177
  "messages": [
@@ -211,8 +211,8 @@
211
  }
212
  },
213
  "predict": {
214
- "1": 1.3709549193663406e-06,
215
- "2": 0.9999969005584717
216
  },
217
  "sample": {
218
  "messages": [
@@ -252,8 +252,8 @@
252
  }
253
  },
254
  "predict": {
255
- "1": 0.003593580098822713,
256
- "2": 0.9964001178741455
257
  },
258
  "sample": {
259
  "messages": [
@@ -293,8 +293,8 @@
293
  }
294
  },
295
  "predict": {
296
- "1": 0.18242034316062927,
297
- "2": 0.8175512552261353
298
  },
299
  "sample": {
300
  "messages": [
@@ -334,8 +334,8 @@
334
  }
335
  },
336
  "predict": {
337
- "1": 0.1480443924665451,
338
- "2": 0.8519367575645447
339
  },
340
  "sample": {
341
  "messages": [
@@ -375,8 +375,8 @@
375
  }
376
  },
377
  "predict": {
378
- "1": 0.7981746196746826,
379
- "2": 0.20181015133857727
380
  },
381
  "sample": {
382
  "messages": [
@@ -416,8 +416,8 @@
416
  }
417
  },
418
  "predict": {
419
- "1": 0.5312058329582214,
420
- "2": 0.4687875211238861
421
  },
422
  "sample": {
423
  "messages": [
@@ -457,8 +457,8 @@
457
  }
458
  },
459
  "predict": {
460
- "1": 0.0019267323659732938,
461
- "2": 0.9980720281600952
462
  },
463
  "sample": {
464
  "messages": [
@@ -498,8 +498,8 @@
498
  }
499
  },
500
  "predict": {
501
- "1": 0.00460954662412405,
502
- "2": 0.9953848719596863
503
  },
504
  "sample": {
505
  "messages": [
@@ -539,8 +539,8 @@
539
  }
540
  },
541
  "predict": {
542
- "1": 0.0059110443107783794,
543
- "2": 0.994084894657135
544
  },
545
  "sample": {
546
  "messages": [
@@ -580,8 +580,8 @@
580
  }
581
  },
582
  "predict": {
583
- "1": 0.5926629304885864,
584
- "2": 0.4073309004306793
585
  },
586
  "sample": {
587
  "messages": [
@@ -621,8 +621,8 @@
621
  }
622
  },
623
  "predict": {
624
- "1": 0.004609555937349796,
625
- "2": 0.9953868985176086
626
  },
627
  "sample": {
628
  "messages": [
@@ -662,8 +662,8 @@
662
  }
663
  },
664
  "predict": {
665
- "1": 0.02595728635787964,
666
- "2": 0.9740400314331055
667
  },
668
  "sample": {
669
  "messages": [
@@ -703,8 +703,8 @@
703
  }
704
  },
705
  "predict": {
706
- "1": 0.20181089639663696,
707
- "2": 0.7981775999069214
708
  },
709
  "sample": {
710
  "messages": [
@@ -739,13 +739,13 @@
739
  {
740
  "metric": {
741
  "acc": {
742
- "val": false,
743
  "id": 9
744
  }
745
  },
746
  "predict": {
747
- "1": 0.18240754306316376,
748
- "2": 0.8174938559532166
749
  },
750
  "sample": {
751
  "messages": [
@@ -774,7 +774,7 @@
774
  "prompt_len": 78,
775
  "generated_len": 1,
776
  "generated_cumulative_logprob": "TODO: calculate for hf model",
777
- "generated_token": "2"
778
  }
779
  }
780
  {
@@ -785,8 +785,8 @@
785
  }
786
  },
787
  "predict": {
788
- "1": 0.005219845101237297,
789
- "2": 0.9947263598442078
790
  },
791
  "sample": {
792
  "messages": [
@@ -821,13 +821,13 @@
821
  {
822
  "metric": {
823
  "acc": {
824
- "val": false,
825
  "id": 10
826
  }
827
  },
828
  "predict": {
829
- "1": 0.4073287546634674,
830
- "2": 0.5926598310470581
831
  },
832
  "sample": {
833
  "messages": [
@@ -856,7 +856,7 @@
856
  "prompt_len": 87,
857
  "generated_len": 1,
858
  "generated_cumulative_logprob": "TODO: calculate for hf model",
859
- "generated_token": "2"
860
  }
861
  }
862
  {
@@ -867,8 +867,8 @@
867
  }
868
  },
869
  "predict": {
870
- "1": 0.1480454057455063,
871
- "2": 0.8519425392150879
872
  },
873
  "sample": {
874
  "messages": [
@@ -908,8 +908,8 @@
908
  }
909
  },
910
  "predict": {
911
- "1": 0.03732459619641304,
912
- "2": 0.9626140594482422
913
  },
914
  "sample": {
915
  "messages": [
@@ -949,8 +949,8 @@
949
  }
950
  },
951
  "predict": {
952
- "1": 0.26892003417015076,
953
- "2": 0.7310003638267517
954
  },
955
  "sample": {
956
  "messages": [
@@ -990,8 +990,8 @@
990
  }
991
  },
992
  "predict": {
993
- "1": 0.26893946528434753,
994
- "2": 0.7310531735420227
995
  },
996
  "sample": {
997
  "messages": [
@@ -1031,8 +1031,8 @@
1031
  }
1032
  },
1033
  "predict": {
1034
- "1": 0.5312056541442871,
1035
- "2": 0.4687873125076294
1036
  },
1037
  "sample": {
1038
  "messages": [
@@ -1072,8 +1072,8 @@
1072
  }
1073
  },
1074
  "predict": {
1075
- "1": 0.025957094505429268,
1076
- "2": 0.9740327596664429
1077
  },
1078
  "sample": {
1079
  "messages": [
@@ -1108,13 +1108,13 @@
1108
  {
1109
  "metric": {
1110
  "acc": {
1111
- "val": false,
1112
  "id": 13
1113
  }
1114
  },
1115
  "predict": {
1116
- "1": 0.2689380645751953,
1117
- "2": 0.7310493588447571
1118
  },
1119
  "sample": {
1120
  "messages": [
@@ -1143,7 +1143,7 @@
1143
  "prompt_len": 106,
1144
  "generated_len": 1,
1145
  "generated_cumulative_logprob": "TODO: calculate for hf model",
1146
- "generated_token": "2"
1147
  }
1148
  }
1149
  {
@@ -1154,8 +1154,8 @@
1154
  }
1155
  },
1156
  "predict": {
1157
- "1": 0.09534876048564911,
1158
- "2": 0.9046438932418823
1159
  },
1160
  "sample": {
1161
  "messages": [
@@ -1195,8 +1195,8 @@
1195
  }
1196
  },
1197
  "predict": {
1198
- "1": 0.43782007694244385,
1199
- "2": 0.5621721148490906
1200
  },
1201
  "sample": {
1202
  "messages": [
@@ -1231,13 +1231,13 @@
1231
  {
1232
  "metric": {
1233
  "acc": {
1234
- "val": false,
1235
  "id": 15
1236
  }
1237
  },
1238
  "predict": {
1239
- "1": 0.3208208382129669,
1240
- "2": 0.6791777014732361
1241
  },
1242
  "sample": {
1243
  "messages": [
@@ -1266,7 +1266,7 @@
1266
  "prompt_len": 91,
1267
  "generated_len": 1,
1268
  "generated_cumulative_logprob": "TODO: calculate for hf model",
1269
- "generated_token": "2"
1270
  }
1271
  }
1272
  {
@@ -1277,8 +1277,8 @@
1277
  }
1278
  },
1279
  "predict": {
1280
- "1": 0.0013250199845060706,
1281
- "2": 0.9986732006072998
1282
  },
1283
  "sample": {
1284
  "messages": [
@@ -1313,13 +1313,13 @@
1313
  {
1314
  "metric": {
1315
  "acc": {
1316
- "val": false,
1317
  "id": 16
1318
  }
1319
  },
1320
  "predict": {
1321
- "1": 0.8354753851890564,
1322
- "2": 0.16451483964920044
1323
  },
1324
  "sample": {
1325
  "messages": [
@@ -1348,7 +1348,7 @@
1348
  "prompt_len": 92,
1349
  "generated_len": 1,
1350
  "generated_cumulative_logprob": "TODO: calculate for hf model",
1351
- "generated_token": "1"
1352
  }
1353
  }
1354
  {
@@ -1359,8 +1359,8 @@
1359
  }
1360
  },
1361
  "predict": {
1362
- "1": 0.437816858291626,
1363
- "2": 0.5621679425239563
1364
  },
1365
  "sample": {
1366
  "messages": [
@@ -1395,13 +1395,13 @@
1395
  {
1396
  "metric": {
1397
  "acc": {
1398
- "val": false,
1399
  "id": 17
1400
  }
1401
  },
1402
  "predict": {
1403
- "1": 0.047425612807273865,
1404
- "2": 0.9525689482688904
1405
  },
1406
  "sample": {
1407
  "messages": [
@@ -1430,7 +1430,7 @@
1430
  "prompt_len": 75,
1431
  "generated_len": 1,
1432
  "generated_cumulative_logprob": "TODO: calculate for hf model",
1433
- "generated_token": "2"
1434
  }
1435
  }
1436
  {
@@ -1441,8 +1441,8 @@
1441
  }
1442
  },
1443
  "predict": {
1444
- "1": 0.003172677243128419,
1445
- "2": 0.9968255758285522
1446
  },
1447
  "sample": {
1448
  "messages": [
@@ -1482,8 +1482,8 @@
1482
  }
1483
  },
1484
  "predict": {
1485
- "1": 0.04741906747221947,
1486
- "2": 0.9524374604225159
1487
  },
1488
  "sample": {
1489
  "messages": [
@@ -1518,13 +1518,13 @@
1518
  {
1519
  "metric": {
1520
  "acc": {
1521
- "val": false,
1522
  "id": 18
1523
  }
1524
  },
1525
  "predict": {
1526
- "1": 0.22267712652683258,
1527
- "2": 0.7772195339202881
1528
  },
1529
  "sample": {
1530
  "messages": [
@@ -1553,7 +1553,7 @@
1553
  "prompt_len": 74,
1554
  "generated_len": 1,
1555
  "generated_cumulative_logprob": "TODO: calculate for hf model",
1556
- "generated_token": "2"
1557
  }
1558
  }
1559
  {
@@ -1564,8 +1564,8 @@
1564
  }
1565
  },
1566
  "predict": {
1567
- "1": 0.9959231615066528,
1568
- "2": 0.004070110619068146
1569
  },
1570
  "sample": {
1571
  "messages": [
@@ -1605,8 +1605,8 @@
1605
  }
1606
  },
1607
  "predict": {
1608
- "1": 0.02931191585958004,
1609
- "2": 0.9706773161888123
1610
  },
1611
  "sample": {
1612
  "messages": [
@@ -1641,13 +1641,13 @@
1641
  {
1642
  "metric": {
1643
  "acc": {
1644
- "val": false,
1645
  "id": 20
1646
  }
1647
  },
1648
  "predict": {
1649
- "1": 0.29421061277389526,
1650
- "2": 0.7057746052742004
1651
  },
1652
  "sample": {
1653
  "messages": [
@@ -1676,7 +1676,7 @@
1676
  "prompt_len": 80,
1677
  "generated_len": 1,
1678
  "generated_cumulative_logprob": "TODO: calculate for hf model",
1679
- "generated_token": "2"
1680
  }
1681
  }
1682
  {
@@ -1687,8 +1687,8 @@
1687
  }
1688
  },
1689
  "predict": {
1690
- "1": 0.009708438068628311,
1691
- "2": 0.9902875423431396
1692
  },
1693
  "sample": {
1694
  "messages": [
@@ -1728,8 +1728,8 @@
1728
  }
1729
  },
1730
  "predict": {
1731
- "1": 0.005220096092671156,
1732
- "2": 0.9947742819786072
1733
  },
1734
  "sample": {
1735
  "messages": [
@@ -1769,8 +1769,8 @@
1769
  }
1770
  },
1771
  "predict": {
1772
- "1": 0.8807904720306396,
1773
- "2": 0.11920202523469925
1774
  },
1775
  "sample": {
1776
  "messages": [
@@ -1810,8 +1810,8 @@
1810
  }
1811
  },
1812
  "predict": {
1813
- "1": 0.00029595597879961133,
1814
- "2": 0.9997001886367798
1815
  },
1816
  "sample": {
1817
  "messages": [
@@ -1846,13 +1846,13 @@
1846
  {
1847
  "metric": {
1848
  "acc": {
1849
- "val": false,
1850
  "id": 22
1851
  }
1852
  },
1853
  "predict": {
1854
- "1": 0.7981850504875183,
1855
- "2": 0.20181278884410858
1856
  },
1857
  "sample": {
1858
  "messages": [
@@ -1881,7 +1881,7 @@
1881
  "prompt_len": 96,
1882
  "generated_len": 1,
1883
  "generated_cumulative_logprob": "TODO: calculate for hf model",
1884
- "generated_token": "1"
1885
  }
1886
  }
1887
  {
@@ -1892,8 +1892,8 @@
1892
  }
1893
  },
1894
  "predict": {
1895
- "1": 0.37753984332084656,
1896
- "2": 0.6224579215049744
1897
  },
1898
  "sample": {
1899
  "messages": [
@@ -1933,8 +1933,8 @@
1933
  }
1934
  },
1935
  "predict": {
1936
- "1": 0.003172674449160695,
1937
- "2": 0.9968246221542358
1938
  },
1939
  "sample": {
1940
  "messages": [
@@ -1969,13 +1969,13 @@
1969
  {
1970
  "metric": {
1971
  "acc": {
1972
- "val": true,
1973
  "id": 24
1974
  }
1975
  },
1976
  "predict": {
1977
- "1": 0.9398970007896423,
1978
- "2": 0.06008560210466385
1979
  },
1980
  "sample": {
1981
  "messages": [
@@ -2004,7 +2004,7 @@
2004
  "prompt_len": 85,
2005
  "generated_len": 1,
2006
  "generated_cumulative_logprob": "TODO: calculate for hf model",
2007
- "generated_token": "1"
2008
  }
2009
  }
2010
  {
@@ -2015,8 +2015,8 @@
2015
  }
2016
  },
2017
  "predict": {
2018
- "1": 0.24508117139339447,
2019
- "2": 0.7549031972885132
2020
  },
2021
  "sample": {
2022
  "messages": [
@@ -2056,8 +2056,8 @@
2056
  }
2057
  },
2058
  "predict": {
2059
- "1": 0.07585714012384415,
2060
- "2": 0.9241291880607605
2061
  },
2062
  "sample": {
2063
  "messages": [
@@ -2097,8 +2097,8 @@
2097
  }
2098
  },
2099
  "predict": {
2100
- "1": 0.8807794451713562,
2101
- "2": 0.11920053511857986
2102
  },
2103
  "sample": {
2104
  "messages": [
@@ -2138,8 +2138,8 @@
2138
  }
2139
  },
2140
  "predict": {
2141
- "1": 0.033085692673921585,
2142
- "2": 0.9669057130813599
2143
  },
2144
  "sample": {
2145
  "messages": [
@@ -2174,13 +2174,13 @@
2174
  {
2175
  "metric": {
2176
  "acc": {
2177
- "val": false,
2178
  "id": 26
2179
  }
2180
  },
2181
  "predict": {
2182
- "1": 0.13296259939670563,
2183
- "2": 0.8670250773429871
2184
  },
2185
  "sample": {
2186
  "messages": [
@@ -2209,7 +2209,7 @@
2209
  "prompt_len": 80,
2210
  "generated_len": 1,
2211
  "generated_cumulative_logprob": "TODO: calculate for hf model",
2212
- "generated_token": "2"
2213
  }
2214
  }
2215
  {
@@ -2220,8 +2220,8 @@
2220
  }
2221
  },
2222
  "predict": {
2223
- "1": 0.0004305326147004962,
2224
- "2": 0.9995126724243164
2225
  },
2226
  "sample": {
2227
  "messages": [
@@ -2261,8 +2261,8 @@
2261
  }
2262
  },
2263
  "predict": {
2264
- "1": 0.3208002746105194,
2265
- "2": 0.6791341304779053
2266
  },
2267
  "sample": {
2268
  "messages": [
@@ -2302,8 +2302,8 @@
2302
  }
2303
  },
2304
  "predict": {
2305
- "1": 0.014063562266528606,
2306
- "2": 0.9859318733215332
2307
  },
2308
  "sample": {
2309
  "messages": [
@@ -2343,8 +2343,8 @@
2343
  }
2344
  },
2345
  "predict": {
2346
- "1": 0.5926593542098999,
2347
- "2": 0.40732845664024353
2348
  },
2349
  "sample": {
2350
  "messages": [
@@ -2379,13 +2379,13 @@
2379
  {
2380
  "metric": {
2381
  "acc": {
2382
- "val": true,
2383
  "id": 29
2384
  }
2385
  },
2386
  "predict": {
2387
- "1": 0.4378196895122528,
2388
- "2": 0.5621716380119324
2389
  },
2390
  "sample": {
2391
  "messages": [
@@ -2414,19 +2414,19 @@
2414
  "prompt_len": 91,
2415
  "generated_len": 1,
2416
  "generated_cumulative_logprob": "TODO: calculate for hf model",
2417
- "generated_token": "2"
2418
  }
2419
  }
2420
  {
2421
  "metric": {
2422
  "acc": {
2423
- "val": false,
2424
  "id": 29
2425
  }
2426
  },
2427
  "predict": {
2428
- "1": 0.407329797744751,
2429
- "2": 0.5926613211631775
2430
  },
2431
  "sample": {
2432
  "messages": [
@@ -2455,7 +2455,7 @@
2455
  "prompt_len": 91,
2456
  "generated_len": 1,
2457
  "generated_cumulative_logprob": "TODO: calculate for hf model",
2458
- "generated_token": "2"
2459
  }
2460
  }
2461
  {
@@ -2466,8 +2466,8 @@
2466
  }
2467
  },
2468
  "predict": {
2469
- "1": 0.00015842854918446392,
2470
- "2": 0.9997932314872742
2471
  },
2472
  "sample": {
2473
  "messages": [
@@ -2507,8 +2507,8 @@
2507
  }
2508
  },
2509
  "predict": {
2510
- "1": 0.07585276663303375,
2511
- "2": 0.9240758419036865
2512
  },
2513
  "sample": {
2514
  "messages": [
@@ -2548,8 +2548,8 @@
2548
  }
2549
  },
2550
  "predict": {
2551
- "1": 0.0021827055606991053,
2552
- "2": 0.9978122711181641
2553
  },
2554
  "sample": {
2555
  "messages": [
@@ -2584,13 +2584,13 @@
2584
  {
2585
  "metric": {
2586
  "acc": {
2587
- "val": false,
2588
  "id": 31
2589
  }
2590
  },
2591
  "predict": {
2592
- "1": 0.2689363360404968,
2593
- "2": 0.7310447096824646
2594
  },
2595
  "sample": {
2596
  "messages": [
@@ -2619,19 +2619,19 @@
2619
  "prompt_len": 80,
2620
  "generated_len": 1,
2621
  "generated_cumulative_logprob": "TODO: calculate for hf model",
2622
- "generated_token": "2"
2623
  }
2624
  }
2625
  {
2626
  "metric": {
2627
  "acc": {
2628
- "val": false,
2629
  "id": 32
2630
  }
2631
  },
2632
  "predict": {
2633
- "1": 0.0474257692694664,
2634
- "2": 0.9525721073150635
2635
  },
2636
  "sample": {
2637
  "messages": [
@@ -2660,7 +2660,7 @@
2660
  "prompt_len": 83,
2661
  "generated_len": 1,
2662
  "generated_cumulative_logprob": "TODO: calculate for hf model",
2663
- "generated_token": "2"
2664
  }
2665
  }
2666
  {
@@ -2671,8 +2671,8 @@
2671
  }
2672
  },
2673
  "predict": {
2674
- "1": 0.0010322295129299164,
2675
- "2": 0.9989663362503052
2676
  },
2677
  "sample": {
2678
  "messages": [
@@ -2712,8 +2712,8 @@
2712
  }
2713
  },
2714
  "predict": {
2715
- "1": 0.01590634509921074,
2716
- "2": 0.9840907454490662
2717
  },
2718
  "sample": {
2719
  "messages": [
@@ -2753,8 +2753,8 @@
2753
  }
2754
  },
2755
  "predict": {
2756
- "1": 0.2450830489397049,
2757
- "2": 0.7549089789390564
2758
  },
2759
  "sample": {
2760
  "messages": [
@@ -2794,8 +2794,8 @@
2794
  }
2795
  },
2796
  "predict": {
2797
- "1": 0.9525681734085083,
2798
- "2": 0.04742557555437088
2799
  },
2800
  "sample": {
2801
  "messages": [
@@ -2835,8 +2835,8 @@
2835
  }
2836
  },
2837
  "predict": {
2838
- "1": 0.7549096941947937,
2839
- "2": 0.2450833022594452
2840
  },
2841
  "sample": {
2842
  "messages": [
@@ -2876,8 +2876,8 @@
2876
  }
2877
  },
2878
  "predict": {
2879
- "1": 0.047425732016563416,
2880
- "2": 0.9525713324546814
2881
  },
2882
  "sample": {
2883
  "messages": [
@@ -2917,8 +2917,8 @@
2917
  }
2918
  },
2919
  "predict": {
2920
- "1": 0.029312150552868843,
2921
- "2": 0.9706850647926331
2922
  },
2923
  "sample": {
2924
  "messages": [
@@ -2958,8 +2958,8 @@
2958
  }
2959
  },
2960
  "predict": {
2961
- "1": 0.017984773963689804,
2962
- "2": 0.9819353818893433
2963
  },
2964
  "sample": {
2965
  "messages": [
@@ -2999,8 +2999,8 @@
2999
  }
3000
  },
3001
  "predict": {
3002
- "1": 0.017984716221690178,
3003
- "2": 0.9819322824478149
3004
  },
3005
  "sample": {
3006
  "messages": [
@@ -3040,8 +3040,8 @@
3040
  }
3041
  },
3042
  "predict": {
3043
- "1": 0.10668963193893433,
3044
- "2": 0.8933013677597046
3045
  },
3046
  "sample": {
3047
  "messages": [
@@ -3081,8 +3081,8 @@
3081
  }
3082
  },
3083
  "predict": {
3084
- "1": 0.7310492396354675,
3085
- "2": 0.26893800497055054
3086
  },
3087
  "sample": {
3088
  "messages": [
@@ -3122,8 +3122,8 @@
3122
  }
3123
  },
3124
  "predict": {
3125
- "1": 0.06008534878492355,
3126
- "2": 0.9398930072784424
3127
  },
3128
  "sample": {
3129
  "messages": [
@@ -3158,13 +3158,13 @@
3158
  {
3159
  "metric": {
3160
  "acc": {
3161
- "val": false,
3162
  "id": 38
3163
  }
3164
  },
3165
  "predict": {
3166
- "1": 0.2018088847398758,
3167
- "2": 0.7981696128845215
3168
  },
3169
  "sample": {
3170
  "messages": [
@@ -3193,19 +3193,19 @@
3193
  "prompt_len": 85,
3194
  "generated_len": 1,
3195
  "generated_cumulative_logprob": "TODO: calculate for hf model",
3196
- "generated_token": "2"
3197
  }
3198
  }
3199
  {
3200
  "metric": {
3201
  "acc": {
3202
- "val": false,
3203
  "id": 39
3204
  }
3205
  },
3206
  "predict": {
3207
- "1": 0.34864377975463867,
3208
- "2": 0.6513522863388062
3209
  },
3210
  "sample": {
3211
  "messages": [
@@ -3234,7 +3234,7 @@
3234
  "prompt_len": 86,
3235
  "generated_len": 1,
3236
  "generated_cumulative_logprob": "TODO: calculate for hf model",
3237
- "generated_token": "2"
3238
  }
3239
  }
3240
  {
@@ -3245,8 +3245,8 @@
3245
  }
3246
  },
3247
  "predict": {
3248
- "1": 0.0011695047141984105,
3249
- "2": 0.99882572889328
3250
  },
3251
  "sample": {
3252
  "messages": [
@@ -3286,8 +3286,8 @@
3286
  }
3287
  },
3288
  "predict": {
3289
- "1": 0.29419824481010437,
3290
- "2": 0.7057448625564575
3291
  },
3292
  "sample": {
3293
  "messages": [
@@ -3322,13 +3322,13 @@
3322
  {
3323
  "metric": {
3324
  "acc": {
3325
- "val": false,
3326
  "id": 40
3327
  }
3328
  },
3329
  "predict": {
3330
- "1": 0.6224275827407837,
3331
- "2": 0.37752142548561096
3332
  },
3333
  "sample": {
3334
  "messages": [
@@ -3357,7 +3357,7 @@
3357
  "prompt_len": 91,
3358
  "generated_len": 1,
3359
  "generated_cumulative_logprob": "TODO: calculate for hf model",
3360
- "generated_token": "1"
3361
  }
3362
  }
3363
  {
@@ -3368,8 +3368,8 @@
3368
  }
3369
  },
3370
  "predict": {
3371
- "1": 0.4999956488609314,
3372
- "2": 0.4999956488609314
3373
  },
3374
  "sample": {
3375
  "messages": [
@@ -3404,13 +3404,13 @@
3404
  {
3405
  "metric": {
3406
  "acc": {
3407
- "val": true,
3408
  "id": 41
3409
  }
3410
  },
3411
  "predict": {
3412
- "1": 0.06754608452320099,
3413
- "2": 0.9324448704719543
3414
  },
3415
  "sample": {
3416
  "messages": [
@@ -3439,19 +3439,19 @@
3439
  "prompt_len": 77,
3440
  "generated_len": 1,
3441
  "generated_cumulative_logprob": "TODO: calculate for hf model",
3442
- "generated_token": "2"
3443
  }
3444
  }
3445
  {
3446
  "metric": {
3447
  "acc": {
3448
- "val": true,
3449
  "id": 42
3450
  }
3451
  },
3452
  "predict": {
3453
- "1": 0.34864187240600586,
3454
- "2": 0.6513487696647644
3455
  },
3456
  "sample": {
3457
  "messages": [
@@ -3480,19 +3480,19 @@
3480
  "prompt_len": 84,
3481
  "generated_len": 1,
3482
  "generated_cumulative_logprob": "TODO: calculate for hf model",
3483
- "generated_token": "2"
3484
  }
3485
  }
3486
  {
3487
  "metric": {
3488
  "acc": {
3489
- "val": false,
3490
  "id": 42
3491
  }
3492
  },
3493
  "predict": {
3494
- "1": 0.18242338299751282,
3495
- "2": 0.817564845085144
3496
  },
3497
  "sample": {
3498
  "messages": [
@@ -3521,7 +3521,7 @@
3521
  "prompt_len": 84,
3522
  "generated_len": 1,
3523
  "generated_cumulative_logprob": "TODO: calculate for hf model",
3524
- "generated_token": "2"
3525
  }
3526
  }
3527
  {
@@ -3532,8 +3532,8 @@
3532
  }
3533
  },
3534
  "predict": {
3535
- "1": 0.0009110159589909017,
3536
- "2": 0.9990503191947937
3537
  },
3538
  "sample": {
3539
  "messages": [
@@ -3573,8 +3573,8 @@
3573
  }
3574
  },
3575
  "predict": {
3576
- "1": 0.09534304589033127,
3577
- "2": 0.9045896530151367
3578
  },
3579
  "sample": {
3580
  "messages": [
@@ -3614,8 +3614,8 @@
3614
  }
3615
  },
3616
  "predict": {
3617
- "1": 3.288740344942198e-06,
3618
- "2": 0.9999940395355225
3619
  },
3620
  "sample": {
3621
  "messages": [
@@ -3655,8 +3655,8 @@
3655
  }
3656
  },
3657
  "predict": {
3658
- "1": 0.8670345544815063,
3659
- "2": 0.13296405971050262
3660
  },
3661
  "sample": {
3662
  "messages": [
@@ -3696,8 +3696,8 @@
3696
  }
3697
  },
3698
  "predict": {
3699
- "1": 0.04208729416131973,
3700
- "2": 0.9579023718833923
3701
  },
3702
  "sample": {
3703
  "messages": [
@@ -3737,8 +3737,8 @@
3737
  }
3738
  },
3739
  "predict": {
3740
- "1": 0.8933014273643494,
3741
- "2": 0.10668964684009552
3742
  },
3743
  "sample": {
3744
  "messages": [
@@ -3773,13 +3773,13 @@
3773
  {
3774
  "metric": {
3775
  "acc": {
3776
- "val": false,
3777
  "id": 46
3778
  }
3779
  },
3780
  "predict": {
3781
- "1": 0.11920077353715897,
3782
- "2": 0.8807812333106995
3783
  },
3784
  "sample": {
3785
  "messages": [
@@ -3808,19 +3808,19 @@
3808
  "prompt_len": 97,
3809
  "generated_len": 1,
3810
  "generated_cumulative_logprob": "TODO: calculate for hf model",
3811
- "generated_token": "2"
3812
  }
3813
  }
3814
  {
3815
  "metric": {
3816
  "acc": {
3817
- "val": true,
3818
  "id": 46
3819
  }
3820
  },
3821
  "predict": {
3822
- "1": 0.26893293857574463,
3823
- "2": 0.7310354709625244
3824
  },
3825
  "sample": {
3826
  "messages": [
@@ -3849,7 +3849,7 @@
3849
  "prompt_len": 97,
3850
  "generated_len": 1,
3851
  "generated_cumulative_logprob": "TODO: calculate for hf model",
3852
- "generated_token": "2"
3853
  }
3854
  }
3855
  {
@@ -3860,8 +3860,8 @@
3860
  }
3861
  },
3862
  "predict": {
3863
- "1": 0.4687843322753906,
3864
- "2": 0.5312022566795349
3865
  },
3866
  "sample": {
3867
  "messages": [
@@ -3896,13 +3896,13 @@
3896
  {
3897
  "metric": {
3898
  "acc": {
3899
- "val": true,
3900
  "id": 47
3901
  }
3902
  },
3903
  "predict": {
3904
- "1": 0.11920198798179626,
3905
- "2": 0.8807901740074158
3906
  },
3907
  "sample": {
3908
  "messages": [
@@ -3931,7 +3931,7 @@
3931
  "prompt_len": 84,
3932
  "generated_len": 1,
3933
  "generated_cumulative_logprob": "TODO: calculate for hf model",
3934
- "generated_token": "2"
3935
  }
3936
  }
3937
  {
@@ -3942,8 +3942,8 @@
3942
  }
3943
  },
3944
  "predict": {
3945
- "1": 0.0004305548791307956,
3946
- "2": 0.9995643496513367
3947
  },
3948
  "sample": {
3949
  "messages": [
@@ -3978,13 +3978,13 @@
3978
  {
3979
  "metric": {
3980
  "acc": {
3981
- "val": false,
3982
  "id": 48
3983
  }
3984
  },
3985
  "predict": {
3986
- "1": 0.29421404004096985,
3987
- "2": 0.7057828307151794
3988
  },
3989
  "sample": {
3990
  "messages": [
@@ -4013,7 +4013,7 @@
4013
  "prompt_len": 87,
4014
  "generated_len": 1,
4015
  "generated_cumulative_logprob": "TODO: calculate for hf model",
4016
- "generated_token": "2"
4017
  }
4018
  }
4019
  {
@@ -4024,8 +4024,8 @@
4024
  }
4025
  },
4026
  "predict": {
4027
- "1": 0.1645117849111557,
4028
- "2": 0.83545982837677
4029
  },
4030
  "sample": {
4031
  "messages": [
@@ -4065,8 +4065,8 @@
4065
  }
4066
  },
4067
  "predict": {
4068
- "1": 0.06008599326014519,
4069
- "2": 0.9399030804634094
4070
  },
4071
  "sample": {
4072
  "messages": [
@@ -4106,8 +4106,8 @@
4106
  }
4107
  },
4108
  "predict": {
4109
- "1": 0.053403060883283615,
4110
- "2": 0.946591854095459
4111
  },
4112
  "sample": {
4113
  "messages": [
@@ -4147,8 +4147,8 @@
4147
  }
4148
  },
4149
  "predict": {
4150
- "1": 0.00026118988171219826,
4151
- "2": 0.9997372031211853
4152
  },
4153
  "sample": {
4154
  "messages": [
@@ -4188,8 +4188,8 @@
4188
  }
4189
  },
4190
  "predict": {
4191
- "1": 0.01798619143664837,
4192
- "2": 0.9820127487182617
4193
  },
4194
  "sample": {
4195
  "messages": [
@@ -4224,13 +4224,13 @@
4224
  {
4225
  "metric": {
4226
  "acc": {
4227
- "val": false,
4228
  "id": 51
4229
  }
4230
  },
4231
  "predict": {
4232
- "1": 0.3486437201499939,
4233
- "2": 0.6513522267341614
4234
  },
4235
  "sample": {
4236
  "messages": [
@@ -4259,7 +4259,7 @@
4259
  "prompt_len": 79,
4260
  "generated_len": 1,
4261
  "generated_cumulative_logprob": "TODO: calculate for hf model",
4262
- "generated_token": "2"
4263
  }
4264
  }
4265
  {
@@ -4270,8 +4270,8 @@
4270
  }
4271
  },
4272
  "predict": {
4273
- "1": 0.00017952173948287964,
4274
- "2": 0.999785840511322
4275
  },
4276
  "sample": {
4277
  "messages": [
@@ -4311,8 +4311,8 @@
4311
  }
4312
  },
4313
  "predict": {
4314
- "1": 0.003593507455661893,
4315
- "2": 0.9963800311088562
4316
  },
4317
  "sample": {
4318
  "messages": [
@@ -4352,8 +4352,8 @@
4352
  }
4353
  },
4354
  "predict": {
4355
- "1": 0.04208560660481453,
4356
- "2": 0.957863986492157
4357
  },
4358
  "sample": {
4359
  "messages": [
@@ -4393,8 +4393,8 @@
4393
  }
4394
  },
4395
  "predict": {
4396
- "1": 0.02595537342131138,
4397
- "2": 0.9739682078361511
4398
  },
4399
  "sample": {
4400
  "messages": [
@@ -4429,13 +4429,13 @@
4429
  {
4430
  "metric": {
4431
  "acc": {
4432
- "val": false,
4433
  "id": 54
4434
  }
4435
  },
4436
  "predict": {
4437
- "1": 0.34864017367362976,
4438
- "2": 0.6513455510139465
4439
  },
4440
  "sample": {
4441
  "messages": [
@@ -4464,7 +4464,7 @@
4464
  "prompt_len": 93,
4465
  "generated_len": 1,
4466
  "generated_cumulative_logprob": "TODO: calculate for hf model",
4467
- "generated_token": "2"
4468
  }
4469
  }
4470
  {
@@ -4475,8 +4475,8 @@
4475
  }
4476
  },
4477
  "predict": {
4478
- "1": 0.3208187520503998,
4479
- "2": 0.6791732311248779
4480
  },
4481
  "sample": {
4482
  "messages": [
@@ -4516,8 +4516,8 @@
4516
  }
4517
  },
4518
  "predict": {
4519
- "1": 0.04208741337060928,
4520
- "2": 0.957905113697052
4521
  },
4522
  "sample": {
4523
  "messages": [
@@ -4557,8 +4557,8 @@
4557
  }
4558
  },
4559
  "predict": {
4560
- "1": 0.6224420070648193,
4561
- "2": 0.37753015756607056
4562
  },
4563
  "sample": {
4564
  "messages": [
@@ -4598,8 +4598,8 @@
4598
  }
4599
  },
4600
  "predict": {
4601
- "1": 0.00017952758935280144,
4602
- "2": 0.9998183846473694
4603
  },
4604
  "sample": {
4605
  "messages": [
@@ -4639,8 +4639,8 @@
4639
  }
4640
  },
4641
  "predict": {
4642
- "1": 0.04208764061331749,
4643
- "2": 0.9579102993011475
4644
  },
4645
  "sample": {
4646
  "messages": [
@@ -4680,8 +4680,8 @@
4680
  }
4681
  },
4682
  "predict": {
4683
- "1": 0.9982919096946716,
4684
- "2": 0.0017007100395858288
4685
  },
4686
  "sample": {
4687
  "messages": [
@@ -4721,8 +4721,8 @@
4721
  }
4722
  },
4723
  "predict": {
4724
- "1": 3.120183464488946e-05,
4725
- "2": 0.9999662637710571
4726
  },
4727
  "sample": {
4728
  "messages": [
@@ -4762,8 +4762,8 @@
4762
  }
4763
  },
4764
  "predict": {
4765
- "1": 0.13296319544315338,
4766
- "2": 0.8670289516448975
4767
  },
4768
  "sample": {
4769
  "messages": [
@@ -4798,13 +4798,13 @@
4798
  {
4799
  "metric": {
4800
  "acc": {
4801
- "val": false,
4802
  "id": 58
4803
  }
4804
  },
4805
  "predict": {
4806
- "1": 0.34863853454589844,
4807
- "2": 0.651342511177063
4808
  },
4809
  "sample": {
4810
  "messages": [
@@ -4833,7 +4833,7 @@
4833
  "prompt_len": 72,
4834
  "generated_len": 1,
4835
  "generated_cumulative_logprob": "TODO: calculate for hf model",
4836
- "generated_token": "2"
4837
  }
4838
  }
4839
  {
@@ -4844,8 +4844,8 @@
4844
  }
4845
  },
4846
  "predict": {
4847
- "1": 0.022976001724600792,
4848
- "2": 0.9769644141197205
4849
  },
4850
  "sample": {
4851
  "messages": [
@@ -4885,8 +4885,8 @@
4885
  }
4886
  },
4887
  "predict": {
4888
- "1": 0.0015010697534307837,
4889
- "2": 0.9984240531921387
4890
  },
4891
  "sample": {
4892
  "messages": [
@@ -4921,13 +4921,13 @@
4921
  {
4922
  "metric": {
4923
  "acc": {
4924
- "val": true,
4925
  "id": 60
4926
  }
4927
  },
4928
  "predict": {
4929
- "1": 0.26893940567970276,
4930
- "2": 0.7310529947280884
4931
  },
4932
  "sample": {
4933
  "messages": [
@@ -4956,7 +4956,7 @@
4956
  "prompt_len": 94,
4957
  "generated_len": 1,
4958
  "generated_cumulative_logprob": "TODO: calculate for hf model",
4959
- "generated_token": "2"
4960
  }
4961
  }
4962
  {
@@ -4967,8 +4967,8 @@
4967
  }
4968
  },
4969
  "predict": {
4970
- "1": 0.9046451449394226,
4971
- "2": 0.09534889459609985
4972
  },
4973
  "sample": {
4974
  "messages": [
@@ -5003,13 +5003,13 @@
5003
  {
5004
  "metric": {
5005
  "acc": {
5006
- "val": false,
5007
  "id": 61
5008
  }
5009
  },
5010
  "predict": {
5011
- "1": 0.3208046853542328,
5012
- "2": 0.679143488407135
5013
  },
5014
  "sample": {
5015
  "messages": [
@@ -5038,7 +5038,7 @@
5038
  "prompt_len": 80,
5039
  "generated_len": 1,
5040
  "generated_cumulative_logprob": "TODO: calculate for hf model",
5041
- "generated_token": "2"
5042
  }
5043
  }
5044
  {
@@ -5049,8 +5049,8 @@
5049
  }
5050
  },
5051
  "predict": {
5052
- "1": 0.01098682638257742,
5053
- "2": 0.9890025854110718
5054
  },
5055
  "sample": {
5056
  "messages": [
@@ -5090,8 +5090,8 @@
5090
  }
5091
  },
5092
  "predict": {
5093
- "1": 0.3486441969871521,
5094
- "2": 0.651353120803833
5095
  },
5096
  "sample": {
5097
  "messages": [
@@ -5131,8 +5131,8 @@
5131
  }
5132
  },
5133
  "predict": {
5134
- "1": 8.481059921905398e-05,
5135
- "2": 0.9999099969863892
5136
  },
5137
  "sample": {
5138
  "messages": [
@@ -5167,13 +5167,13 @@
5167
  {
5168
  "metric": {
5169
  "acc": {
5170
- "val": true,
5171
  "id": 63
5172
  }
5173
  },
5174
  "predict": {
5175
- "1": 0.29421329498291016,
5176
- "2": 0.7057809829711914
5177
  },
5178
  "sample": {
5179
  "messages": [
@@ -5202,19 +5202,19 @@
5202
  "prompt_len": 87,
5203
  "generated_len": 1,
5204
  "generated_cumulative_logprob": "TODO: calculate for hf model",
5205
- "generated_token": "2"
5206
  }
5207
  }
5208
  {
5209
  "metric": {
5210
  "acc": {
5211
- "val": false,
5212
  "id": 63
5213
  }
5214
  },
5215
  "predict": {
5216
- "1": 0.1480463743209839,
5217
- "2": 0.8519482016563416
5218
  },
5219
  "sample": {
5220
  "messages": [
@@ -5243,7 +5243,7 @@
5243
  "prompt_len": 87,
5244
  "generated_len": 1,
5245
  "generated_cumulative_logprob": "TODO: calculate for hf model",
5246
- "generated_token": "2"
5247
  }
5248
  }
5249
  {
@@ -5254,8 +5254,8 @@
5254
  }
5255
  },
5256
  "predict": {
5257
- "1": 0.18242448568344116,
5258
- "2": 0.8175697922706604
5259
  },
5260
  "sample": {
5261
  "messages": [
@@ -5295,8 +5295,8 @@
5295
  }
5296
  },
5297
  "predict": {
5298
- "1": 1.1478750820970163e-05,
5299
- "2": 0.999987006187439
5300
  },
5301
  "sample": {
5302
  "messages": [
@@ -5336,8 +5336,8 @@
5336
  }
5337
  },
5338
  "predict": {
5339
- "1": 0.29418981075286865,
5340
- "2": 0.7057247161865234
5341
  },
5342
  "sample": {
5343
  "messages": [
@@ -5377,8 +5377,8 @@
5377
  }
5378
  },
5379
  "predict": {
5380
- "1": 0.06754294782876968,
5381
- "2": 0.9324015378952026
5382
  },
5383
  "sample": {
5384
  "messages": [
@@ -5418,8 +5418,8 @@
5418
  }
5419
  },
5420
  "predict": {
5421
- "1": 0.40732821822166443,
5422
- "2": 0.592659056186676
5423
  },
5424
  "sample": {
5425
  "messages": [
@@ -5459,8 +5459,8 @@
5459
  }
5460
  },
5461
  "predict": {
5462
- "1": 0.06754574179649353,
5463
- "2": 0.9324401021003723
5464
  },
5465
  "sample": {
5466
  "messages": [
@@ -5500,8 +5500,8 @@
5500
  }
5501
  },
5502
  "predict": {
5503
- "1": 0.651343047618866,
5504
- "2": 0.3486388325691223
5505
  },
5506
  "sample": {
5507
  "messages": [
@@ -5541,8 +5541,8 @@
5541
  }
5542
  },
5543
  "predict": {
5544
- "1": 0.9668970108032227,
5545
- "2": 0.03308539465069771
5546
  },
5547
  "sample": {
5548
  "messages": [
@@ -5577,13 +5577,13 @@
5577
  {
5578
  "metric": {
5579
  "acc": {
5580
- "val": false,
5581
  "id": 68
5582
  }
5583
  },
5584
  "predict": {
5585
- "1": 0.14804446697235107,
5586
- "2": 0.8519371747970581
5587
  },
5588
  "sample": {
5589
  "messages": [
@@ -5612,7 +5612,7 @@
5612
  "prompt_len": 87,
5613
  "generated_len": 1,
5614
  "generated_cumulative_logprob": "TODO: calculate for hf model",
5615
- "generated_token": "2"
5616
  }
5617
  }
5618
  {
@@ -5623,8 +5623,8 @@
5623
  }
5624
  },
5625
  "predict": {
5626
- "1": 0.007577181328088045,
5627
- "2": 0.9924150109291077
5628
  },
5629
  "sample": {
5630
  "messages": [
@@ -5664,8 +5664,8 @@
5664
  }
5665
  },
5666
  "predict": {
5667
- "1": 0.8519507050514221,
5668
- "2": 0.1480468213558197
5669
  },
5670
  "sample": {
5671
  "messages": [
@@ -5705,8 +5705,8 @@
5705
  }
5706
  },
5707
  "predict": {
5708
- "1": 0.007577224634587765,
5709
- "2": 0.9924206137657166
5710
  },
5711
  "sample": {
5712
  "messages": [
@@ -5746,8 +5746,8 @@
5746
  }
5747
  },
5748
  "predict": {
5749
- "1": 0.07585762441158295,
5750
- "2": 0.9241350889205933
5751
  },
5752
  "sample": {
5753
  "messages": [
@@ -5787,8 +5787,8 @@
5787
  }
5788
  },
5789
  "predict": {
5790
- "1": 0.26893875002861023,
5791
- "2": 0.7310512661933899
5792
  },
5793
  "sample": {
5794
  "messages": [
@@ -5828,8 +5828,8 @@
5828
  }
5829
  },
5830
  "predict": {
5831
- "1": 0.004070112016052008,
5832
- "2": 0.9959235191345215
5833
  },
5834
  "sample": {
5835
  "messages": [
@@ -5869,8 +5869,8 @@
5869
  }
5870
  },
5871
  "predict": {
5872
- "1": 0.001325014280155301,
5873
- "2": 0.998668909072876
5874
  },
5875
  "sample": {
5876
  "messages": [
@@ -5910,8 +5910,8 @@
5910
  }
5911
  },
5912
  "predict": {
5913
- "1": 0.6791690587997437,
5914
- "2": 0.3208167552947998
5915
  },
5916
  "sample": {
5917
  "messages": [
@@ -5951,8 +5951,8 @@
5951
  }
5952
  },
5953
  "predict": {
5954
- "1": 0.009708426892757416,
5955
- "2": 0.9902864098548889
5956
  },
5957
  "sample": {
5958
  "messages": [
@@ -5992,8 +5992,8 @@
5992
  }
5993
  },
5994
  "predict": {
5995
- "1": 0.0003353240608703345,
5996
- "2": 0.9995869994163513
5997
  },
5998
  "sample": {
5999
  "messages": [
@@ -6028,13 +6028,13 @@
6028
  {
6029
  "metric": {
6030
  "acc": {
6031
- "val": false,
6032
  "id": 73
6033
  }
6034
  },
6035
  "predict": {
6036
- "1": 0.07585203647613525,
6037
- "2": 0.924066960811615
6038
  },
6039
  "sample": {
6040
  "messages": [
@@ -6063,7 +6063,7 @@
6063
  "prompt_len": 70,
6064
  "generated_len": 1,
6065
  "generated_cumulative_logprob": "TODO: calculate for hf model",
6066
- "generated_token": "2"
6067
  }
6068
  }
6069
  {
@@ -6074,8 +6074,8 @@
6074
  }
6075
  },
6076
  "predict": {
6077
- "1": 0.010986875742673874,
6078
- "2": 0.9890069961547852
6079
  },
6080
  "sample": {
6081
  "messages": [
@@ -6115,8 +6115,8 @@
6115
  }
6116
  },
6117
  "predict": {
6118
- "1": 0.9579020738601685,
6119
- "2": 0.04208727926015854
6120
  },
6121
  "sample": {
6122
  "messages": [
@@ -6151,13 +6151,13 @@
6151
  {
6152
  "metric": {
6153
  "acc": {
6154
- "val": true,
6155
  "id": 75
6156
  }
6157
  },
6158
  "predict": {
6159
- "1": 0.16451428830623627,
6160
- "2": 0.8354725241661072
6161
  },
6162
  "sample": {
6163
  "messages": [
@@ -6186,7 +6186,7 @@
6186
  "prompt_len": 90,
6187
  "generated_len": 1,
6188
  "generated_cumulative_logprob": "TODO: calculate for hf model",
6189
- "generated_token": "2"
6190
  }
6191
  }
6192
  {
@@ -6197,8 +6197,8 @@
6197
  }
6198
  },
6199
  "predict": {
6200
- "1": 0.3486413061618805,
6201
- "2": 0.6513476967811584
6202
  },
6203
  "sample": {
6204
  "messages": [
@@ -6238,8 +6238,8 @@
6238
  }
6239
  },
6240
  "predict": {
6241
- "1": 2.430020867905114e-05,
6242
- "2": 0.9999743700027466
6243
  },
6244
  "sample": {
6245
  "messages": [
@@ -6279,8 +6279,8 @@
6279
  }
6280
  },
6281
  "predict": {
6282
- "1": 0.5621755123138428,
6283
- "2": 0.43782275915145874
6284
  },
6285
  "sample": {
6286
  "messages": [
@@ -6320,8 +6320,8 @@
6320
  }
6321
  },
6322
  "predict": {
6323
- "1": 0.2018096148967743,
6324
- "2": 0.7981725335121155
6325
  },
6326
  "sample": {
6327
  "messages": [
@@ -6361,8 +6361,8 @@
6361
  }
6362
  },
6363
  "predict": {
6364
- "1": 0.020332183688879013,
6365
- "2": 0.9796594977378845
6366
  },
6367
  "sample": {
6368
  "messages": [
@@ -6402,8 +6402,8 @@
6402
  }
6403
  },
6404
  "predict": {
6405
- "1": 0.0009110494866035879,
6406
- "2": 0.9990870952606201
6407
  },
6408
  "sample": {
6409
  "messages": [
@@ -6443,8 +6443,8 @@
6443
  }
6444
  },
6445
  "predict": {
6446
- "1": 0.04742567986249924,
6447
- "2": 0.9525702595710754
6448
  },
6449
  "sample": {
6450
  "messages": [
@@ -6479,13 +6479,13 @@
6479
  {
6480
  "metric": {
6481
  "acc": {
6482
- "val": false,
6483
  "id": 79
6484
  }
6485
  },
6486
  "predict": {
6487
- "1": 0.16451598703861237,
6488
- "2": 0.8354811668395996
6489
  },
6490
  "sample": {
6491
  "messages": [
@@ -6514,7 +6514,7 @@
6514
  "prompt_len": 95,
6515
  "generated_len": 1,
6516
  "generated_cumulative_logprob": "TODO: calculate for hf model",
6517
- "generated_token": "2"
6518
  }
6519
  }
6520
  {
@@ -6525,8 +6525,8 @@
6525
  }
6526
  },
6527
  "predict": {
6528
- "1": 2.7535574190551415e-05,
6529
- "2": 0.9999681711196899
6530
  },
6531
  "sample": {
6532
  "messages": [
@@ -6566,8 +6566,8 @@
6566
  }
6567
  },
6568
  "predict": {
6569
- "1": 0.34864094853401184,
6570
- "2": 0.6513469815254211
6571
  },
6572
  "sample": {
6573
  "messages": [
@@ -6607,8 +6607,8 @@
6607
  }
6608
  },
6609
  "predict": {
6610
- "1": 0.0006263329414650798,
6611
- "2": 0.9993718266487122
6612
  },
6613
  "sample": {
6614
  "messages": [
@@ -6648,8 +6648,8 @@
6648
  }
6649
  },
6650
  "predict": {
6651
- "1": 0.00460954662412405,
6652
- "2": 0.9953848719596863
6653
  },
6654
  "sample": {
6655
  "messages": [
@@ -6689,8 +6689,8 @@
6689
  }
6690
  },
6691
  "predict": {
6692
- "1": 0.2450818568468094,
6693
- "2": 0.7549052834510803
6694
  },
6695
  "sample": {
6696
  "messages": [
@@ -6725,13 +6725,13 @@
6725
  {
6726
  "metric": {
6727
  "acc": {
6728
- "val": false,
6729
  "id": 82
6730
  }
6731
  },
6732
  "predict": {
6733
- "1": 0.4686311185359955,
6734
- "2": 0.5310286283493042
6735
  },
6736
  "sample": {
6737
  "messages": [
@@ -6760,7 +6760,7 @@
6760
  "prompt_len": 85,
6761
  "generated_len": 1,
6762
  "generated_cumulative_logprob": "TODO: calculate for hf model",
6763
- "generated_token": "2"
6764
  }
6765
  }
6766
  {
@@ -6771,8 +6771,8 @@
6771
  }
6772
  },
6773
  "predict": {
6774
- "1": 0.02297515980899334,
6775
- "2": 0.9769286513328552
6776
  },
6777
  "sample": {
6778
  "messages": [
@@ -6812,8 +6812,8 @@
6812
  }
6813
  },
6814
  "predict": {
6815
- "1": 0.24508321285247803,
6816
- "2": 0.7549094557762146
6817
  },
6818
  "sample": {
6819
  "messages": [
@@ -6853,8 +6853,8 @@
6853
  }
6854
  },
6855
  "predict": {
6856
- "1": 0.22269901633262634,
6857
- "2": 0.7772959470748901
6858
  },
6859
  "sample": {
6860
  "messages": [
@@ -6894,8 +6894,8 @@
6894
  }
6895
  },
6896
  "predict": {
6897
- "1": 0.010986320674419403,
6898
- "2": 0.9889569878578186
6899
  },
6900
  "sample": {
6901
  "messages": [
@@ -6935,8 +6935,8 @@
6935
  }
6936
  },
6937
  "predict": {
6938
- "1": 0.16449899971485138,
6939
- "2": 0.8353949189186096
6940
  },
6941
  "sample": {
6942
  "messages": [
@@ -6976,8 +6976,8 @@
6976
  }
6977
  },
6978
  "predict": {
6979
- "1": 0.04208748787641525,
6980
- "2": 0.9579067826271057
6981
  },
6982
  "sample": {
6983
  "messages": [
@@ -7012,13 +7012,13 @@
7012
  {
7013
  "metric": {
7014
  "acc": {
7015
- "val": true,
7016
  "id": 85
7017
  }
7018
  },
7019
  "predict": {
7020
- "1": 0.29421183466911316,
7021
- "2": 0.7057775259017944
7022
  },
7023
  "sample": {
7024
  "messages": [
@@ -7047,7 +7047,7 @@
7047
  "prompt_len": 77,
7048
  "generated_len": 1,
7049
  "generated_cumulative_logprob": "TODO: calculate for hf model",
7050
- "generated_token": "2"
7051
  }
7052
  }
7053
  {
@@ -7058,8 +7058,8 @@
7058
  }
7059
  },
7060
  "predict": {
7061
- "1": 0.0046095517463982105,
7062
- "2": 0.9953859448432922
7063
  },
7064
  "sample": {
7065
  "messages": [
@@ -7099,8 +7099,8 @@
7099
  }
7100
  },
7101
  "predict": {
7102
- "1": 0.10668997466564178,
7103
- "2": 0.8933042287826538
7104
  },
7105
  "sample": {
7106
  "messages": [
@@ -7140,8 +7140,8 @@
7140
  }
7141
  },
7142
  "predict": {
7143
- "1": 0.004070121794939041,
7144
- "2": 0.995926022529602
7145
  },
7146
  "sample": {
7147
  "messages": [
@@ -7176,13 +7176,13 @@
7176
  {
7177
  "metric": {
7178
  "acc": {
7179
- "val": true,
7180
  "id": 87
7181
  }
7182
  },
7183
  "predict": {
7184
- "1": 0.40732234716415405,
7185
- "2": 0.5926504731178284
7186
  },
7187
  "sample": {
7188
  "messages": [
@@ -7211,7 +7211,7 @@
7211
  "prompt_len": 84,
7212
  "generated_len": 1,
7213
  "generated_cumulative_logprob": "TODO: calculate for hf model",
7214
- "generated_token": "2"
7215
  }
7216
  }
7217
  {
@@ -7222,8 +7222,8 @@
7222
  }
7223
  },
7224
  "predict": {
7225
- "1": 0.5621723532676697,
7226
- "2": 0.4378202557563782
7227
  },
7228
  "sample": {
7229
  "messages": [
@@ -7263,8 +7263,8 @@
7263
  }
7264
  },
7265
  "predict": {
7266
- "1": 0.0017007191199809313,
7267
- "2": 0.9982972741127014
7268
  },
7269
  "sample": {
7270
  "messages": [
@@ -7304,8 +7304,8 @@
7304
  }
7305
  },
7306
  "predict": {
7307
- "1": 0.7310516238212585,
7308
- "2": 0.2689388692378998
7309
  },
7310
  "sample": {
7311
  "messages": [
@@ -7345,8 +7345,8 @@
7345
  }
7346
  },
7347
  "predict": {
7348
- "1": 3.2887437555473298e-06,
7349
- "2": 0.9999951124191284
7350
  },
7351
  "sample": {
7352
  "messages": [
@@ -7386,8 +7386,8 @@
7386
  }
7387
  },
7388
  "predict": {
7389
- "1": 0.09534507989883423,
7390
- "2": 0.904608964920044
7391
  },
7392
  "sample": {
7393
  "messages": [
@@ -7427,8 +7427,8 @@
7427
  }
7428
  },
7429
  "predict": {
7430
- "1": 0.02931118570268154,
7431
- "2": 0.9706531763076782
7432
  },
7433
  "sample": {
7434
  "messages": [
@@ -7468,8 +7468,8 @@
7468
  }
7469
  },
7470
  "predict": {
7471
- "1": 0.562170147895813,
7472
- "2": 0.4378185570240021
7473
  },
7474
  "sample": {
7475
  "messages": [
@@ -7504,13 +7504,13 @@
7504
  {
7505
  "metric": {
7506
  "acc": {
7507
- "val": false,
7508
  "id": 91
7509
  }
7510
  },
7511
  "predict": {
7512
- "1": 0.4378180503845215,
7513
- "2": 0.5621694922447205
7514
  },
7515
  "sample": {
7516
  "messages": [
@@ -7539,7 +7539,7 @@
7539
  "prompt_len": 100,
7540
  "generated_len": 1,
7541
  "generated_cumulative_logprob": "TODO: calculate for hf model",
7542
- "generated_token": "2"
7543
  }
7544
  }
7545
  {
@@ -7550,8 +7550,8 @@
7550
  }
7551
  },
7552
  "predict": {
7553
- "1": 0.005911040119826794,
7554
- "2": 0.9940841794013977
7555
  },
7556
  "sample": {
7557
  "messages": [
@@ -7586,13 +7586,13 @@
7586
  {
7587
  "metric": {
7588
  "acc": {
7589
- "val": false,
7590
  "id": 92
7591
  }
7592
  },
7593
  "predict": {
7594
- "1": 0.18242233991622925,
7595
- "2": 0.8175601959228516
7596
  },
7597
  "sample": {
7598
  "messages": [
@@ -7621,7 +7621,7 @@
7621
  "prompt_len": 90,
7622
  "generated_len": 1,
7623
  "generated_cumulative_logprob": "TODO: calculate for hf model",
7624
- "generated_token": "2"
7625
  }
7626
  }
7627
  {
@@ -7632,8 +7632,8 @@
7632
  }
7633
  },
7634
  "predict": {
7635
- "1": 0.9046250581741333,
7636
- "2": 0.09534677863121033
7637
  },
7638
  "sample": {
7639
  "messages": [
@@ -7673,8 +7673,8 @@
7673
  }
7674
  },
7675
  "predict": {
7676
- "1": 0.0008040774846449494,
7677
- "2": 0.9991853833198547
7678
  },
7679
  "sample": {
7680
  "messages": [
@@ -7714,8 +7714,8 @@
7714
  }
7715
  },
7716
  "predict": {
7717
- "1": 0.9859333634376526,
7718
- "2": 0.014063583686947823
7719
  },
7720
  "sample": {
7721
  "messages": [
@@ -7755,8 +7755,8 @@
7755
  }
7756
  },
7757
  "predict": {
7758
- "1": 0.007577226497232914,
7759
- "2": 0.9924208521842957
7760
  },
7761
  "sample": {
7762
  "messages": [
@@ -7796,8 +7796,8 @@
7796
  }
7797
  },
7798
  "predict": {
7799
- "1": 0.020332276821136475,
7800
- "2": 0.9796639680862427
7801
  },
7802
  "sample": {
7803
  "messages": [
@@ -7832,13 +7832,13 @@
7832
  {
7833
  "metric": {
7834
  "acc": {
7835
- "val": false,
7836
  "id": 95
7837
  }
7838
  },
7839
  "predict": {
7840
- "1": 0.34864354133605957,
7841
- "2": 0.6513518691062927
7842
  },
7843
  "sample": {
7844
  "messages": [
@@ -7867,7 +7867,7 @@
7867
  "prompt_len": 85,
7868
  "generated_len": 1,
7869
  "generated_cumulative_logprob": "TODO: calculate for hf model",
7870
- "generated_token": "2"
7871
  }
7872
  }
7873
  {
@@ -7878,8 +7878,8 @@
7878
  }
7879
  },
7880
  "predict": {
7881
- "1": 0.9740384221076965,
7882
- "2": 0.025957245379686356
7883
  },
7884
  "sample": {
7885
  "messages": [
@@ -7919,8 +7919,8 @@
7919
  }
7920
  },
7921
  "predict": {
7922
- "1": 0.0019267325988039374,
7923
- "2": 0.9980721473693848
7924
  },
7925
  "sample": {
7926
  "messages": [
@@ -7960,8 +7960,8 @@
7960
  }
7961
  },
7962
  "predict": {
7963
- "1": 0.13296294212341309,
7964
- "2": 0.8670272827148438
7965
  },
7966
  "sample": {
7967
  "messages": [
@@ -7996,13 +7996,13 @@
7996
  {
7997
  "metric": {
7998
  "acc": {
7999
- "val": false,
8000
  "id": 97
8001
  }
8002
  },
8003
  "predict": {
8004
- "1": 0.979662299156189,
8005
- "2": 0.02033224143087864
8006
  },
8007
  "sample": {
8008
  "messages": [
@@ -8031,7 +8031,7 @@
8031
  "prompt_len": 86,
8032
  "generated_len": 1,
8033
  "generated_cumulative_logprob": "TODO: calculate for hf model",
8034
- "generated_token": "1"
8035
  }
8036
  }
8037
  {
@@ -8042,8 +8042,8 @@
8042
  }
8043
  },
8044
  "predict": {
8045
- "1": 0.26893991231918335,
8046
- "2": 0.731054425239563
8047
  },
8048
  "sample": {
8049
  "messages": [
@@ -8083,8 +8083,8 @@
8083
  }
8084
  },
8085
  "predict": {
8086
- "1": 0.13296303153038025,
8087
- "2": 0.8670278191566467
8088
  },
8089
  "sample": {
8090
  "messages": [
@@ -8124,8 +8124,8 @@
8124
  }
8125
  },
8126
  "predict": {
8127
- "1": 0.010986246168613434,
8128
- "2": 0.9889503121376038
8129
  },
8130
  "sample": {
8131
  "messages": [
@@ -8165,8 +8165,8 @@
8165
  }
8166
  },
8167
  "predict": {
8168
- "1": 0.11918793618679047,
8169
- "2": 0.880686342716217
8170
  },
8171
  "sample": {
8172
  "messages": [
 
6
  }
7
  },
8
  "predict": {
9
+ "1": 0.5619974732398987,
10
+ "2": 0.4376840591430664
11
  },
12
  "sample": {
13
  "messages": [
 
47
  }
48
  },
49
  "predict": {
50
+ "1": 0.622265100479126,
51
+ "2": 0.37742286920547485
52
  },
53
  "sample": {
54
  "messages": [
 
88
  }
89
  },
90
  "predict": {
91
+ "1": 0.6510406732559204,
92
+ "2": 0.34847697615623474
93
  },
94
  "sample": {
95
  "messages": [
 
129
  }
130
  },
131
  "predict": {
132
+ "1": 0.32066380977630615,
133
+ "2": 0.678845226764679
134
  },
135
  "sample": {
136
  "messages": [
 
170
  }
171
  },
172
  "predict": {
173
+ "1": 0.7771914601325989,
174
+ "2": 0.2226690798997879
175
  },
176
  "sample": {
177
  "messages": [
 
211
  }
212
  },
213
  "predict": {
214
+ "1": 0.04741984233260155,
215
+ "2": 0.9524530172348022
216
  },
217
  "sample": {
218
  "messages": [
 
252
  }
253
  },
254
  "predict": {
255
+ "1": 0.24495473504066467,
256
+ "2": 0.7545137405395508
257
  },
258
  "sample": {
259
  "messages": [
 
293
  }
294
  },
295
  "predict": {
296
+ "1": 0.4071190059185028,
297
+ "2": 0.5923546552658081
298
  },
299
  "sample": {
300
  "messages": [
 
334
  }
335
  },
336
  "predict": {
337
+ "1": 0.46866750717163086,
338
+ "2": 0.5310698747634888
339
  },
340
  "sample": {
341
  "messages": [
 
375
  }
376
  },
377
  "predict": {
378
+ "1": 0.6511845588684082,
379
+ "2": 0.34855398535728455
380
  },
381
  "sample": {
382
  "messages": [
 
416
  }
417
  },
418
  "predict": {
419
+ "1": 0.5619243383407593,
420
+ "2": 0.4376271367073059
421
  },
422
  "sample": {
423
  "messages": [
 
457
  }
458
  },
459
  "predict": {
460
+ "1": 0.2688515782356262,
461
+ "2": 0.7308142781257629
462
  },
463
  "sample": {
464
  "messages": [
 
498
  }
499
  },
500
  "predict": {
501
+ "1": 0.29402416944503784,
502
+ "2": 0.7053273320198059
503
  },
504
  "sample": {
505
  "messages": [
 
539
  }
540
  },
541
  "predict": {
542
+ "1": 0.32064735889434814,
543
+ "2": 0.6788104176521301
544
  },
545
  "sample": {
546
  "messages": [
 
580
  }
581
  },
582
  "predict": {
583
+ "1": 0.730933427810669,
584
+ "2": 0.2688954174518585
585
  },
586
  "sample": {
587
  "messages": [
 
621
  }
622
  },
623
  "predict": {
624
+ "1": 0.4072516858577728,
625
+ "2": 0.5925476551055908
626
  },
627
  "sample": {
628
  "messages": [
 
662
  }
663
  },
664
  "predict": {
665
+ "1": 0.37738102674484253,
666
+ "2": 0.6221961379051208
667
  },
668
  "sample": {
669
  "messages": [
 
703
  }
704
  },
705
  "predict": {
706
+ "1": 0.3773842751979828,
707
+ "2": 0.6222015023231506
708
  },
709
  "sample": {
710
  "messages": [
 
739
  {
740
  "metric": {
741
  "acc": {
742
+ "val": true,
743
  "id": 9
744
  }
745
  },
746
  "predict": {
747
+ "1": 0.4990942180156708,
748
+ "2": 0.4990942180156708
749
  },
750
  "sample": {
751
  "messages": [
 
774
  "prompt_len": 78,
775
  "generated_len": 1,
776
  "generated_cumulative_logprob": "TODO: calculate for hf model",
777
+ "generated_token": "1"
778
  }
779
  }
780
  {
 
785
  }
786
  },
787
  "predict": {
788
+ "1": 0.2684953212738037,
789
+ "2": 0.7298458218574524
790
  },
791
  "sample": {
792
  "messages": [
 
821
  {
822
  "metric": {
823
  "acc": {
824
+ "val": true,
825
  "id": 10
826
  }
827
  },
828
  "predict": {
829
+ "1": 0.6222006678581238,
830
+ "2": 0.3773837685585022
831
  },
832
  "sample": {
833
  "messages": [
 
856
  "prompt_len": 87,
857
  "generated_len": 1,
858
  "generated_cumulative_logprob": "TODO: calculate for hf model",
859
+ "generated_token": "1"
860
  }
861
  }
862
  {
 
867
  }
868
  },
869
  "predict": {
870
+ "1": 0.46855002641677856,
871
+ "2": 0.5309367775917053
872
  },
873
  "sample": {
874
  "messages": [
 
908
  }
909
  },
910
  "predict": {
911
+ "1": 0.320069283246994,
912
+ "2": 0.6775866150856018
913
  },
914
  "sample": {
915
  "messages": [
 
949
  }
950
  },
951
  "predict": {
952
+ "1": 0.467572420835495,
953
+ "2": 0.5298289656639099
954
  },
955
  "sample": {
956
  "messages": [
 
990
  }
991
  },
992
  "predict": {
993
+ "1": 0.4377047121524811,
994
+ "2": 0.5620239973068237
995
  },
996
  "sample": {
997
  "messages": [
 
1031
  }
1032
  },
1033
  "predict": {
1034
+ "1": 0.5620283484458923,
1035
+ "2": 0.4377081096172333
1036
  },
1037
  "sample": {
1038
  "messages": [
 
1072
  }
1073
  },
1074
  "predict": {
1075
+ "1": 0.37745413184165955,
1076
+ "2": 0.6223166584968567
1077
  },
1078
  "sample": {
1079
  "messages": [
 
1108
  {
1109
  "metric": {
1110
  "acc": {
1111
+ "val": true,
1112
  "id": 13
1113
  }
1114
  },
1115
  "predict": {
1116
+ "1": 0.4998833239078522,
1117
+ "2": 0.4998833239078522
1118
  },
1119
  "sample": {
1120
  "messages": [
 
1143
  "prompt_len": 106,
1144
  "generated_len": 1,
1145
  "generated_cumulative_logprob": "TODO: calculate for hf model",
1146
+ "generated_token": "1"
1147
  }
1148
  }
1149
  {
 
1154
  }
1155
  },
1156
  "predict": {
1157
+ "1": 0.43762025237083435,
1158
+ "2": 0.5619155168533325
1159
  },
1160
  "sample": {
1161
  "messages": [
 
1195
  }
1196
  },
1197
  "predict": {
1198
+ "1": 0.4685898721218109,
1199
+ "2": 0.5309818983078003
1200
  },
1201
  "sample": {
1202
  "messages": [
 
1231
  {
1232
  "metric": {
1233
  "acc": {
1234
+ "val": true,
1235
  "id": 15
1236
  }
1237
  },
1238
  "predict": {
1239
+ "1": 0.6512307524681091,
1240
+ "2": 0.34857872128486633
1241
  },
1242
  "sample": {
1243
  "messages": [
 
1266
  "prompt_len": 91,
1267
  "generated_len": 1,
1268
  "generated_cumulative_logprob": "TODO: calculate for hf model",
1269
+ "generated_token": "1"
1270
  }
1271
  }
1272
  {
 
1277
  }
1278
  },
1279
  "predict": {
1280
+ "1": 0.2688847780227661,
1281
+ "2": 0.7309045195579529
1282
  },
1283
  "sample": {
1284
  "messages": [
 
1313
  {
1314
  "metric": {
1315
  "acc": {
1316
+ "val": true,
1317
  "id": 16
1318
  }
1319
  },
1320
  "predict": {
1321
+ "1": 0.4685722291469574,
1322
+ "2": 0.5309618711471558
1323
  },
1324
  "sample": {
1325
  "messages": [
 
1348
  "prompt_len": 92,
1349
  "generated_len": 1,
1350
  "generated_cumulative_logprob": "TODO: calculate for hf model",
1351
+ "generated_token": "2"
1352
  }
1353
  }
1354
  {
 
1359
  }
1360
  },
1361
  "predict": {
1362
+ "1": 0.3773617446422577,
1363
+ "2": 0.6221643090248108
1364
  },
1365
  "sample": {
1366
  "messages": [
 
1395
  {
1396
  "metric": {
1397
  "acc": {
1398
+ "val": true,
1399
  "id": 17
1400
  }
1401
  },
1402
  "predict": {
1403
+ "1": 0.5308536887168884,
1404
+ "2": 0.46847671270370483
1405
  },
1406
  "sample": {
1407
  "messages": [
 
1430
  "prompt_len": 75,
1431
  "generated_len": 1,
1432
  "generated_cumulative_logprob": "TODO: calculate for hf model",
1433
+ "generated_token": "1"
1434
  }
1435
  }
1436
  {
 
1441
  }
1442
  },
1443
  "predict": {
1444
+ "1": 0.348453551530838,
1445
+ "2": 0.6509969234466553
1446
  },
1447
  "sample": {
1448
  "messages": [
 
1482
  }
1483
  },
1484
  "predict": {
1485
+ "1": 0.4063173532485962,
1486
+ "2": 0.5911882519721985
1487
  },
1488
  "sample": {
1489
  "messages": [
 
1518
  {
1519
  "metric": {
1520
  "acc": {
1521
+ "val": true,
1522
  "id": 18
1523
  }
1524
  },
1525
  "predict": {
1526
+ "1": 0.4990358054637909,
1527
+ "2": 0.4990358054637909
1528
  },
1529
  "sample": {
1530
  "messages": [
 
1553
  "prompt_len": 74,
1554
  "generated_len": 1,
1555
  "generated_cumulative_logprob": "TODO: calculate for hf model",
1556
+ "generated_token": "1"
1557
  }
1558
  }
1559
  {
 
1564
  }
1565
  },
1566
  "predict": {
1567
+ "1": 0.730846107006073,
1568
+ "2": 0.2688632905483246
1569
  },
1570
  "sample": {
1571
  "messages": [
 
1605
  }
1606
  },
1607
  "predict": {
1608
+ "1": 0.4376976490020752,
1609
+ "2": 0.5620148777961731
1610
  },
1611
  "sample": {
1612
  "messages": [
 
1641
  {
1642
  "metric": {
1643
  "acc": {
1644
+ "val": true,
1645
  "id": 20
1646
  }
1647
  },
1648
  "predict": {
1649
+ "1": 0.592413604259491,
1650
+ "2": 0.4071595370769501
1651
  },
1652
  "sample": {
1653
  "messages": [
 
1676
  "prompt_len": 80,
1677
  "generated_len": 1,
1678
  "generated_cumulative_logprob": "TODO: calculate for hf model",
1679
+ "generated_token": "1"
1680
  }
1681
  }
1682
  {
 
1687
  }
1688
  },
1689
  "predict": {
1690
+ "1": 0.3774004280567169,
1691
+ "2": 0.6222280859947205
1692
  },
1693
  "sample": {
1694
  "messages": [
 
1728
  }
1729
  },
1730
  "predict": {
1731
+ "1": 0.34851840138435364,
1732
+ "2": 0.6511180400848389
1733
  },
1734
  "sample": {
1735
  "messages": [
 
1769
  }
1770
  },
1771
  "predict": {
1772
+ "1": 0.7055380940437317,
1773
+ "2": 0.29411202669143677
1774
  },
1775
  "sample": {
1776
  "messages": [
 
1810
  }
1811
  },
1812
  "predict": {
1813
+ "1": 0.3207487165927887,
1814
+ "2": 0.6790250539779663
1815
  },
1816
  "sample": {
1817
  "messages": [
 
1846
  {
1847
  "metric": {
1848
  "acc": {
1849
+ "val": true,
1850
  "id": 22
1851
  }
1852
  },
1853
  "predict": {
1854
+ "1": 0.4686760902404785,
1855
+ "2": 0.5310795903205872
1856
  },
1857
  "sample": {
1858
  "messages": [
 
1881
  "prompt_len": 96,
1882
  "generated_len": 1,
1883
  "generated_cumulative_logprob": "TODO: calculate for hf model",
1884
+ "generated_token": "2"
1885
  }
1886
  }
1887
  {
 
1892
  }
1893
  },
1894
  "predict": {
1895
+ "1": 0.43775004148483276,
1896
+ "2": 0.5620821714401245
1897
  },
1898
  "sample": {
1899
  "messages": [
 
1933
  }
1934
  },
1935
  "predict": {
1936
+ "1": 0.32076162099838257,
1937
+ "2": 0.6790523529052734
1938
  },
1939
  "sample": {
1940
  "messages": [
 
1969
  {
1970
  "metric": {
1971
  "acc": {
1972
+ "val": false,
1973
  "id": 24
1974
  }
1975
  },
1976
  "predict": {
1977
+ "1": 0.37734633684158325,
1978
+ "2": 0.6221389174461365
1979
  },
1980
  "sample": {
1981
  "messages": [
 
2004
  "prompt_len": 85,
2005
  "generated_len": 1,
2006
  "generated_cumulative_logprob": "TODO: calculate for hf model",
2007
+ "generated_token": "2"
2008
  }
2009
  }
2010
  {
 
2015
  }
2016
  },
2017
  "predict": {
2018
+ "1": 0.37736234068870544,
2019
+ "2": 0.622165322303772
2020
  },
2021
  "sample": {
2022
  "messages": [
 
2056
  }
2057
  },
2058
  "predict": {
2059
+ "1": 0.407107949256897,
2060
+ "2": 0.5923385620117188
2061
  },
2062
  "sample": {
2063
  "messages": [
 
2097
  }
2098
  },
2099
  "predict": {
2100
+ "1": 0.5619068145751953,
2101
+ "2": 0.43761345744132996
2102
  },
2103
  "sample": {
2104
  "messages": [
 
2138
  }
2139
  },
2140
  "predict": {
2141
+ "1": 0.43754732608795166,
2142
+ "2": 0.5618218779563904
2143
  },
2144
  "sample": {
2145
  "messages": [
 
2174
  {
2175
  "metric": {
2176
  "acc": {
2177
+ "val": true,
2178
  "id": 26
2179
  }
2180
  },
2181
  "predict": {
2182
+ "1": 0.5308565497398376,
2183
+ "2": 0.4684792160987854
2184
  },
2185
  "sample": {
2186
  "messages": [
 
2209
  "prompt_len": 80,
2210
  "generated_len": 1,
2211
  "generated_cumulative_logprob": "TODO: calculate for hf model",
2212
+ "generated_token": "1"
2213
  }
2214
  }
2215
  {
 
2220
  }
2221
  },
2222
  "predict": {
2223
+ "1": 0.3765822649002075,
2224
+ "2": 0.6208791732788086
2225
  },
2226
  "sample": {
2227
  "messages": [
 
2261
  }
2262
  },
2263
  "predict": {
2264
+ "1": 0.4678250551223755,
2265
+ "2": 0.5301152467727661
2266
  },
2267
  "sample": {
2268
  "messages": [
 
2302
  }
2303
  },
2304
  "predict": {
2305
+ "1": 0.4685414433479309,
2306
+ "2": 0.5309270024299622
2307
  },
2308
  "sample": {
2309
  "messages": [
 
2343
  }
2344
  },
2345
  "predict": {
2346
+ "1": 0.49978330731391907,
2347
+ "2": 0.49978330731391907
2348
  },
2349
  "sample": {
2350
  "messages": [
 
2379
  {
2380
  "metric": {
2381
  "acc": {
2382
+ "val": false,
2383
  "id": 29
2384
  }
2385
  },
2386
  "predict": {
2387
+ "1": 0.5925133228302002,
2388
+ "2": 0.4072280526161194
2389
  },
2390
  "sample": {
2391
  "messages": [
 
2414
  "prompt_len": 91,
2415
  "generated_len": 1,
2416
  "generated_cumulative_logprob": "TODO: calculate for hf model",
2417
+ "generated_token": "1"
2418
  }
2419
  }
2420
  {
2421
  "metric": {
2422
  "acc": {
2423
+ "val": true,
2424
  "id": 29
2425
  }
2426
  },
2427
  "predict": {
2428
+ "1": 0.5924868583679199,
2429
+ "2": 0.4072098731994629
2430
  },
2431
  "sample": {
2432
  "messages": [
 
2455
  "prompt_len": 91,
2456
  "generated_len": 1,
2457
  "generated_cumulative_logprob": "TODO: calculate for hf model",
2458
+ "generated_token": "1"
2459
  }
2460
  }
2461
  {
 
2466
  }
2467
  },
2468
  "predict": {
2469
+ "1": 0.2013276070356369,
2470
+ "2": 0.7962661385536194
2471
  },
2472
  "sample": {
2473
  "messages": [
 
2507
  }
2508
  },
2509
  "predict": {
2510
+ "1": 0.40639403462409973,
2511
+ "2": 0.591299831867218
2512
  },
2513
  "sample": {
2514
  "messages": [
 
2548
  }
2549
  },
2550
  "predict": {
2551
+ "1": 0.4685439169406891,
2552
+ "2": 0.5309298038482666
2553
  },
2554
  "sample": {
2555
  "messages": [
 
2584
  {
2585
  "metric": {
2586
  "acc": {
2587
+ "val": true,
2588
  "id": 31
2589
  }
2590
  },
2591
  "predict": {
2592
+ "1": 0.49973681569099426,
2593
+ "2": 0.49973681569099426
2594
  },
2595
  "sample": {
2596
  "messages": [
 
2619
  "prompt_len": 80,
2620
  "generated_len": 1,
2621
  "generated_cumulative_logprob": "TODO: calculate for hf model",
2622
+ "generated_token": "1"
2623
  }
2624
  }
2625
  {
2626
  "metric": {
2627
  "acc": {
2628
+ "val": true,
2629
  "id": 32
2630
  }
2631
  },
2632
  "predict": {
2633
+ "1": 0.5311156511306763,
2634
+ "2": 0.4687079191207886
2635
  },
2636
  "sample": {
2637
  "messages": [
 
2660
  "prompt_len": 83,
2661
  "generated_len": 1,
2662
  "generated_cumulative_logprob": "TODO: calculate for hf model",
2663
+ "generated_token": "1"
2664
  }
2665
  }
2666
  {
 
2671
  }
2672
  },
2673
  "predict": {
2674
+ "1": 0.29415959119796753,
2675
+ "2": 0.7056521773338318
2676
  },
2677
  "sample": {
2678
  "messages": [
 
2712
  }
2713
  },
2714
  "predict": {
2715
+ "1": 0.4071362614631653,
2716
+ "2": 0.5923797488212585
2717
  },
2718
  "sample": {
2719
  "messages": [
 
2753
  }
2754
  },
2755
  "predict": {
2756
+ "1": 0.43763670325279236,
2757
+ "2": 0.561936616897583
2758
  },
2759
  "sample": {
2760
  "messages": [
 
2794
  }
2795
  },
2796
  "predict": {
2797
+ "1": 0.8173764944076538,
2798
+ "2": 0.18238134682178497
2799
  },
2800
  "sample": {
2801
  "messages": [
 
2835
  }
2836
  },
2837
  "predict": {
2838
+ "1": 0.5924760699272156,
2839
+ "2": 0.40720245242118835
2840
  },
2841
  "sample": {
2842
  "messages": [
 
2876
  }
2877
  },
2878
  "predict": {
2879
+ "1": 0.43765684962272644,
2880
+ "2": 0.5619625449180603
2881
  },
2882
  "sample": {
2883
  "messages": [
 
2917
  }
2918
  },
2919
  "predict": {
2920
+ "1": 0.29410114884376526,
2921
+ "2": 0.7055119872093201
2922
  },
2923
  "sample": {
2924
  "messages": [
 
2958
  }
2959
  },
2960
  "predict": {
2961
+ "1": 0.37653520703315735,
2962
+ "2": 0.620801568031311
2963
  },
2964
  "sample": {
2965
  "messages": [
 
2999
  }
3000
  },
3001
  "predict": {
3002
+ "1": 0.4062846899032593,
3003
+ "2": 0.5911407470703125
3004
  },
3005
  "sample": {
3006
  "messages": [
 
3040
  }
3041
  },
3042
  "predict": {
3043
+ "1": 0.46869802474975586,
3044
+ "2": 0.5311044454574585
3045
  },
3046
  "sample": {
3047
  "messages": [
 
3081
  }
3082
  },
3083
  "predict": {
3084
+ "1": 0.6512240171432495,
3085
+ "2": 0.3485751152038574
3086
  },
3087
  "sample": {
3088
  "messages": [
 
3122
  }
3123
  },
3124
  "predict": {
3125
+ "1": 0.2687711715698242,
3126
+ "2": 0.7305957078933716
3127
  },
3128
  "sample": {
3129
  "messages": [
 
3158
  {
3159
  "metric": {
3160
  "acc": {
3161
+ "val": true,
3162
  "id": 38
3163
  }
3164
  },
3165
  "predict": {
3166
+ "1": 0.5308924913406372,
3167
+ "2": 0.4685109555721283
3168
  },
3169
  "sample": {
3170
  "messages": [
 
3193
  "prompt_len": 85,
3194
  "generated_len": 1,
3195
  "generated_cumulative_logprob": "TODO: calculate for hf model",
3196
+ "generated_token": "1"
3197
  }
3198
  }
3199
  {
3200
  "metric": {
3201
  "acc": {
3202
+ "val": true,
3203
  "id": 39
3204
  }
3205
  },
3206
  "predict": {
3207
+ "1": 0.5620588660240173,
3208
+ "2": 0.43773189187049866
3209
  },
3210
  "sample": {
3211
  "messages": [
 
3234
  "prompt_len": 86,
3235
  "generated_len": 1,
3236
  "generated_cumulative_logprob": "TODO: calculate for hf model",
3237
+ "generated_token": "1"
3238
  }
3239
  }
3240
  {
 
3245
  }
3246
  },
3247
  "predict": {
3248
+ "1": 0.22264744341373444,
3249
+ "2": 0.7771158814430237
3250
  },
3251
  "sample": {
3252
  "messages": [
 
3286
  }
3287
  },
3288
  "predict": {
3289
+ "1": 0.4071022868156433,
3290
+ "2": 0.5923303365707397
3291
  },
3292
  "sample": {
3293
  "messages": [
 
3322
  {
3323
  "metric": {
3324
  "acc": {
3325
+ "val": true,
3326
  "id": 40
3327
  }
3328
  },
3329
  "predict": {
3330
+ "1": 0.4685532748699188,
3331
+ "2": 0.5309404134750366
3332
  },
3333
  "sample": {
3334
  "messages": [
 
3357
  "prompt_len": 91,
3358
  "generated_len": 1,
3359
  "generated_cumulative_logprob": "TODO: calculate for hf model",
3360
+ "generated_token": "2"
3361
  }
3362
  }
3363
  {
 
3368
  }
3369
  },
3370
  "predict": {
3371
+ "1": 0.5618895888328552,
3372
+ "2": 0.4376000463962555
3373
  },
3374
  "sample": {
3375
  "messages": [
 
3404
  {
3405
  "metric": {
3406
  "acc": {
3407
+ "val": false,
3408
  "id": 41
3409
  }
3410
  },
3411
  "predict": {
3412
+ "1": 0.4997299611568451,
3413
+ "2": 0.4997299611568451
3414
  },
3415
  "sample": {
3416
  "messages": [
 
3439
  "prompt_len": 77,
3440
  "generated_len": 1,
3441
  "generated_cumulative_logprob": "TODO: calculate for hf model",
3442
+ "generated_token": "1"
3443
  }
3444
  }
3445
  {
3446
  "metric": {
3447
  "acc": {
3448
+ "val": false,
3449
  "id": 42
3450
  }
3451
  },
3452
  "predict": {
3453
+ "1": 0.4997691512107849,
3454
+ "2": 0.4997691512107849
3455
  },
3456
  "sample": {
3457
  "messages": [
 
3480
  "prompt_len": 84,
3481
  "generated_len": 1,
3482
  "generated_cumulative_logprob": "TODO: calculate for hf model",
3483
+ "generated_token": "1"
3484
  }
3485
  }
3486
  {
3487
  "metric": {
3488
  "acc": {
3489
+ "val": true,
3490
  "id": 42
3491
  }
3492
  },
3493
  "predict": {
3494
+ "1": 0.49974381923675537,
3495
+ "2": 0.49974381923675537
3496
  },
3497
  "sample": {
3498
  "messages": [
 
3521
  "prompt_len": 84,
3522
  "generated_len": 1,
3523
  "generated_cumulative_logprob": "TODO: calculate for hf model",
3524
+ "generated_token": "1"
3525
  }
3526
  }
3527
  {
 
3532
  }
3533
  },
3534
  "predict": {
3535
+ "1": 0.32023319602012634,
3536
+ "2": 0.6779336333274841
3537
  },
3538
  "sample": {
3539
  "messages": [
 
3573
  }
3574
  },
3575
  "predict": {
3576
+ "1": 0.43682730197906494,
3577
+ "2": 0.5608973503112793
3578
  },
3579
  "sample": {
3580
  "messages": [
 
3614
  }
3615
  },
3616
  "predict": {
3617
+ "1": 0.04208233952522278,
3618
+ "2": 0.9577896595001221
3619
  },
3620
  "sample": {
3621
  "messages": [
 
3655
  }
3656
  },
3657
  "predict": {
3658
+ "1": 0.6223546266555786,
3659
+ "2": 0.37747716903686523
3660
  },
3661
  "sample": {
3662
  "messages": [
 
3696
  }
3697
  },
3698
  "predict": {
3699
+ "1": 0.3774266541004181,
3700
+ "2": 0.6222713589668274
3701
  },
3702
  "sample": {
3703
  "messages": [
 
3737
  }
3738
  },
3739
  "predict": {
3740
+ "1": 0.6789510846138,
3741
+ "2": 0.3207138180732727
3742
  },
3743
  "sample": {
3744
  "messages": [
 
3773
  {
3774
  "metric": {
3775
  "acc": {
3776
+ "val": true,
3777
  "id": 46
3778
  }
3779
  },
3780
  "predict": {
3781
+ "1": 0.5922853350639343,
3782
+ "2": 0.40707138180732727
3783
  },
3784
  "sample": {
3785
  "messages": [
 
3808
  "prompt_len": 97,
3809
  "generated_len": 1,
3810
  "generated_cumulative_logprob": "TODO: calculate for hf model",
3811
+ "generated_token": "1"
3812
  }
3813
  }
3814
  {
3815
  "metric": {
3816
  "acc": {
3817
+ "val": false,
3818
  "id": 46
3819
  }
3820
  },
3821
  "predict": {
3822
+ "1": 0.49966880679130554,
3823
+ "2": 0.49966880679130554
3824
  },
3825
  "sample": {
3826
  "messages": [
 
3849
  "prompt_len": 97,
3850
  "generated_len": 1,
3851
  "generated_cumulative_logprob": "TODO: calculate for hf model",
3852
+ "generated_token": "1"
3853
  }
3854
  }
3855
  {
 
3860
  }
3861
  },
3862
  "predict": {
3863
+ "1": 0.4375786781311035,
3864
+ "2": 0.5618621706962585
3865
  },
3866
  "sample": {
3867
  "messages": [
 
3896
  {
3897
  "metric": {
3898
  "acc": {
3899
+ "val": false,
3900
  "id": 47
3901
  }
3902
  },
3903
  "predict": {
3904
+ "1": 0.49976199865341187,
3905
+ "2": 0.49976199865341187
3906
  },
3907
  "sample": {
3908
  "messages": [
 
3931
  "prompt_len": 84,
3932
  "generated_len": 1,
3933
  "generated_cumulative_logprob": "TODO: calculate for hf model",
3934
+ "generated_token": "1"
3935
  }
3936
  }
3937
  {
 
3942
  }
3943
  },
3944
  "predict": {
3945
+ "1": 0.29415035247802734,
3946
+ "2": 0.7056300044059753
3947
  },
3948
  "sample": {
3949
  "messages": [
 
3978
  {
3979
  "metric": {
3980
  "acc": {
3981
+ "val": true,
3982
  "id": 48
3983
  }
3984
  },
3985
  "predict": {
3986
+ "1": 0.5620615482330322,
3987
+ "2": 0.4377339780330658
3988
  },
3989
  "sample": {
3990
  "messages": [
 
4013
  "prompt_len": 87,
4014
  "generated_len": 1,
4015
  "generated_cumulative_logprob": "TODO: calculate for hf model",
4016
+ "generated_token": "1"
4017
  }
4018
  }
4019
  {
 
4024
  }
4025
  },
4026
  "predict": {
4027
+ "1": 0.468570351600647,
4028
+ "2": 0.5309597849845886
4029
  },
4030
  "sample": {
4031
  "messages": [
 
4065
  }
4066
  },
4067
  "predict": {
4068
+ "1": 0.34851348400115967,
4069
+ "2": 0.6511088609695435
4070
  },
4071
  "sample": {
4072
  "messages": [
 
4106
  }
4107
  },
4108
  "predict": {
4109
+ "1": 0.3773762285709381,
4110
+ "2": 0.6221882104873657
4111
  },
4112
  "sample": {
4113
  "messages": [
 
4147
  }
4148
  },
4149
  "predict": {
4150
+ "1": 0.13292139768600464,
4151
+ "2": 0.8667563796043396
4152
  },
4153
  "sample": {
4154
  "messages": [
 
4188
  }
4189
  },
4190
  "predict": {
4191
+ "1": 0.3484882414340973,
4192
+ "2": 0.6510617136955261
4193
  },
4194
  "sample": {
4195
  "messages": [
 
4224
  {
4225
  "metric": {
4226
  "acc": {
4227
+ "val": true,
4228
  "id": 51
4229
  }
4230
  },
4231
  "predict": {
4232
+ "1": 0.6221812963485718,
4233
+ "2": 0.37737205624580383
4234
  },
4235
  "sample": {
4236
  "messages": [
 
4259
  "prompt_len": 79,
4260
  "generated_len": 1,
4261
  "generated_cumulative_logprob": "TODO: calculate for hf model",
4262
+ "generated_token": "1"
4263
  }
4264
  }
4265
  {
 
4270
  }
4271
  },
4272
  "predict": {
4273
+ "1": 0.3768748641014099,
4274
+ "2": 0.6213616132736206
4275
  },
4276
  "sample": {
4277
  "messages": [
 
4311
  }
4312
  },
4313
  "predict": {
4314
+ "1": 0.24472229182720184,
4315
+ "2": 0.7537977695465088
4316
  },
4317
  "sample": {
4318
  "messages": [
 
4352
  }
4353
  },
4354
  "predict": {
4355
+ "1": 0.3767865002155304,
4356
+ "2": 0.6212159395217896
4357
  },
4358
  "sample": {
4359
  "messages": [
 
4393
  }
4394
  },
4395
  "predict": {
4396
+ "1": 0.3766106069087982,
4397
+ "2": 0.6209259033203125
4398
  },
4399
  "sample": {
4400
  "messages": [
 
4429
  {
4430
  "metric": {
4431
  "acc": {
4432
+ "val": true,
4433
  "id": 54
4434
  }
4435
  },
4436
  "predict": {
4437
+ "1": 0.5619992613792419,
4438
+ "2": 0.437685489654541
4439
  },
4440
  "sample": {
4441
  "messages": [
 
4464
  "prompt_len": 93,
4465
  "generated_len": 1,
4466
  "generated_cumulative_logprob": "TODO: calculate for hf model",
4467
+ "generated_token": "1"
4468
  }
4469
  }
4470
  {
 
4475
  }
4476
  },
4477
  "predict": {
4478
+ "1": 0.46865031123161316,
4479
+ "2": 0.5310503840446472
4480
  },
4481
  "sample": {
4482
  "messages": [
 
4516
  }
4517
  },
4518
  "predict": {
4519
+ "1": 0.34849637746810913,
4520
+ "2": 0.6510769128799438
4521
  },
4522
  "sample": {
4523
  "messages": [
 
4557
  }
4558
  },
4559
  "predict": {
4560
+ "1": 0.561853289604187,
4561
+ "2": 0.43757179379463196
4562
  },
4563
  "sample": {
4564
  "messages": [
 
4598
  }
4599
  },
4600
  "predict": {
4601
+ "1": 0.18239745497703552,
4602
+ "2": 0.817448616027832
4603
  },
4604
  "sample": {
4605
  "messages": [
 
4639
  }
4640
  },
4641
  "predict": {
4642
+ "1": 0.3774791359901428,
4643
+ "2": 0.6223578453063965
4644
  },
4645
  "sample": {
4646
  "messages": [
 
4680
  }
4681
  },
4682
  "predict": {
4683
+ "1": 0.6509857773780823,
4684
+ "2": 0.3484475910663605
4685
  },
4686
  "sample": {
4687
  "messages": [
 
4721
  }
4722
  },
4723
  "predict": {
4724
+ "1": 0.09530383348464966,
4725
+ "2": 0.9042176604270935
4726
  },
4727
  "sample": {
4728
  "messages": [
 
4762
  }
4763
  },
4764
  "predict": {
4765
+ "1": 0.4685486853122711,
4766
+ "2": 0.5309352278709412
4767
  },
4768
  "sample": {
4769
  "messages": [
 
4798
  {
4799
  "metric": {
4800
  "acc": {
4801
+ "val": true,
4802
  "id": 58
4803
  }
4804
  },
4805
  "predict": {
4806
+ "1": 0.49975061416625977,
4807
+ "2": 0.49975061416625977
4808
  },
4809
  "sample": {
4810
  "messages": [
 
4833
  "prompt_len": 72,
4834
  "generated_len": 1,
4835
  "generated_cumulative_logprob": "TODO: calculate for hf model",
4836
+ "generated_token": "1"
4837
  }
4838
  }
4839
  {
 
4844
  }
4845
  },
4846
  "predict": {
4847
+ "1": 0.3765953481197357,
4848
+ "2": 0.6209007501602173
4849
  },
4850
  "sample": {
4851
  "messages": [
 
4885
  }
4886
  },
4887
  "predict": {
4888
+ "1": 0.22216132283210754,
4889
+ "2": 0.7754191756248474
4890
  },
4891
  "sample": {
4892
  "messages": [
 
4921
  {
4922
  "metric": {
4923
  "acc": {
4924
+ "val": false,
4925
  "id": 60
4926
  }
4927
  },
4928
  "predict": {
4929
+ "1": 0.5310652256011963,
4930
+ "2": 0.46866339445114136
4931
  },
4932
  "sample": {
4933
  "messages": [
 
4956
  "prompt_len": 94,
4957
  "generated_len": 1,
4958
  "generated_cumulative_logprob": "TODO: calculate for hf model",
4959
+ "generated_token": "1"
4960
  }
4961
  }
4962
  {
 
4967
  }
4968
  },
4969
  "predict": {
4970
+ "1": 0.6511978507041931,
4971
+ "2": 0.3485611081123352
4972
  },
4973
  "sample": {
4974
  "messages": [
 
5003
  {
5004
  "metric": {
5005
  "acc": {
5006
+ "val": true,
5007
  "id": 61
5008
  }
5009
  },
5010
  "predict": {
5011
+ "1": 0.5922608971595764,
5012
+ "2": 0.4070545732975006
5013
  },
5014
  "sample": {
5015
  "messages": [
 
5038
  "prompt_len": 80,
5039
  "generated_len": 1,
5040
  "generated_cumulative_logprob": "TODO: calculate for hf model",
5041
+ "generated_token": "1"
5042
  }
5043
  }
5044
  {
 
5049
  }
5050
  },
5051
  "predict": {
5052
+ "1": 0.3484278917312622,
5053
+ "2": 0.6509489417076111
5054
  },
5055
  "sample": {
5056
  "messages": [
 
5090
  }
5091
  },
5092
  "predict": {
5093
+ "1": 0.4072406589984894,
5094
+ "2": 0.5925316214561462
5095
  },
5096
  "sample": {
5097
  "messages": [
 
5131
  }
5132
  },
5133
  "predict": {
5134
+ "1": 0.16448116302490234,
5135
+ "2": 0.835304319858551
5136
  },
5137
  "sample": {
5138
  "messages": [
 
5167
  {
5168
  "metric": {
5169
  "acc": {
5170
+ "val": false,
5171
  "id": 63
5172
  }
5173
  },
5174
  "predict": {
5175
+ "1": 0.5620059370994568,
5176
+ "2": 0.4376906752586365
5177
  },
5178
  "sample": {
5179
  "messages": [
 
5202
  "prompt_len": 87,
5203
  "generated_len": 1,
5204
  "generated_cumulative_logprob": "TODO: calculate for hf model",
5205
+ "generated_token": "1"
5206
  }
5207
  }
5208
  {
5209
  "metric": {
5210
  "acc": {
5211
+ "val": true,
5212
  "id": 63
5213
  }
5214
  },
5215
  "predict": {
5216
+ "1": 0.5310174226760864,
5217
+ "2": 0.46862125396728516
5218
  },
5219
  "sample": {
5220
  "messages": [
 
5243
  "prompt_len": 87,
5244
  "generated_len": 1,
5245
  "generated_cumulative_logprob": "TODO: calculate for hf model",
5246
+ "generated_token": "1"
5247
  }
5248
  }
5249
  {
 
5254
  }
5255
  },
5256
  "predict": {
5257
+ "1": 0.4071462154388428,
5258
+ "2": 0.592394232749939
5259
  },
5260
  "sample": {
5261
  "messages": [
 
5295
  }
5296
  },
5297
  "predict": {
5298
+ "1": 0.20173293352127075,
5299
+ "2": 0.7978692054748535
5300
  },
5301
  "sample": {
5302
  "messages": [
 
5336
  }
5337
  },
5338
  "predict": {
5339
+ "1": 0.46734532713890076,
5340
+ "2": 0.5295716524124146
5341
  },
5342
  "sample": {
5343
  "messages": [
 
5377
  }
5378
  },
5379
  "predict": {
5380
+ "1": 0.40638577938079834,
5381
+ "2": 0.5912877917289734
5382
  },
5383
  "sample": {
5384
  "messages": [
 
5418
  }
5419
  },
5420
  "predict": {
5421
+ "1": 0.46866509318351746,
5422
+ "2": 0.5310671329498291
5423
  },
5424
  "sample": {
5425
  "messages": [
 
5459
  }
5460
  },
5461
  "predict": {
5462
+ "1": 0.3774324953556061,
5463
+ "2": 0.622281014919281
5464
  },
5465
  "sample": {
5466
  "messages": [
 
5500
  }
5501
  },
5502
  "predict": {
5503
+ "1": 0.5924903750419617,
5504
+ "2": 0.4072122871875763
5505
  },
5506
  "sample": {
5507
  "messages": [
 
5541
  }
5542
  },
5543
  "predict": {
5544
+ "1": 0.7308381795883179,
5545
+ "2": 0.2688603699207306
5546
  },
5547
  "sample": {
5548
  "messages": [
 
5577
  {
5578
  "metric": {
5579
  "acc": {
5580
+ "val": true,
5581
  "id": 68
5582
  }
5583
  },
5584
  "predict": {
5585
+ "1": 0.4996921420097351,
5586
+ "2": 0.4996921420097351
5587
  },
5588
  "sample": {
5589
  "messages": [
 
5612
  "prompt_len": 87,
5613
  "generated_len": 1,
5614
  "generated_cumulative_logprob": "TODO: calculate for hf model",
5615
+ "generated_token": "1"
5616
  }
5617
  }
5618
  {
 
5623
  }
5624
  },
5625
  "predict": {
5626
+ "1": 0.37733572721481323,
5627
+ "2": 0.6221213936805725
5628
  },
5629
  "sample": {
5630
  "messages": [
 
5664
  }
5665
  },
5666
  "predict": {
5667
+ "1": 0.6790658235549927,
5668
+ "2": 0.32076799869537354
5669
  },
5670
  "sample": {
5671
  "messages": [
 
5705
  }
5706
  },
5707
  "predict": {
5708
+ "1": 0.40725547075271606,
5709
+ "2": 0.5925531983375549
5710
  },
5711
  "sample": {
5712
  "messages": [
 
5746
  }
5747
  },
5748
  "predict": {
5749
+ "1": 0.43762415647506714,
5750
+ "2": 0.5619205236434937
5751
  },
5752
  "sample": {
5753
  "messages": [
 
5787
  }
5788
  },
5789
  "predict": {
5790
+ "1": 0.4685721695423126,
5791
+ "2": 0.530961811542511
5792
  },
5793
  "sample": {
5794
  "messages": [
 
5828
  }
5829
  },
5830
  "predict": {
5831
+ "1": 0.2449629306793213,
5832
+ "2": 0.7545389533042908
5833
  },
5834
  "sample": {
5835
  "messages": [
 
5869
  }
5870
  },
5871
  "predict": {
5872
+ "1": 0.14798715710639954,
5873
+ "2": 0.8516073226928711
5874
  },
5875
  "sample": {
5876
  "messages": [
 
5910
  }
5911
  },
5912
  "predict": {
5913
+ "1": 0.5309675335884094,
5914
+ "2": 0.46857720613479614
5915
  },
5916
  "sample": {
5917
  "messages": [
 
5951
  }
5952
  },
5953
  "predict": {
5954
+ "1": 0.32067590951919556,
5955
+ "2": 0.6788709163665771
5956
  },
5957
  "sample": {
5958
  "messages": [
 
5992
  }
5993
  },
5994
  "predict": {
5995
+ "1": 0.24464736878871918,
5996
+ "2": 0.7535669803619385
5997
  },
5998
  "sample": {
5999
  "messages": [
 
6028
  {
6029
  "metric": {
6030
  "acc": {
6031
+ "val": true,
6032
  "id": 73
6033
  }
6034
  },
6035
  "predict": {
6036
+ "1": 0.4990968406200409,
6037
+ "2": 0.4990968406200409
6038
  },
6039
  "sample": {
6040
  "messages": [
 
6063
  "prompt_len": 70,
6064
  "generated_len": 1,
6065
  "generated_cumulative_logprob": "TODO: calculate for hf model",
6066
+ "generated_token": "1"
6067
  }
6068
  }
6069
  {
 
6074
  }
6075
  },
6076
  "predict": {
6077
+ "1": 0.29412874579429626,
6078
+ "2": 0.7055782079696655
6079
  },
6080
  "sample": {
6081
  "messages": [
 
6115
  }
6116
  },
6117
  "predict": {
6118
+ "1": 0.6511863470077515,
6119
+ "2": 0.34855493903160095
6120
  },
6121
  "sample": {
6122
  "messages": [
 
6151
  {
6152
  "metric": {
6153
  "acc": {
6154
+ "val": false,
6155
  "id": 75
6156
  }
6157
  },
6158
  "predict": {
6159
+ "1": 0.49970299005508423,
6160
+ "2": 0.49970299005508423
6161
  },
6162
  "sample": {
6163
  "messages": [
 
6186
  "prompt_len": 90,
6187
  "generated_len": 1,
6188
  "generated_cumulative_logprob": "TODO: calculate for hf model",
6189
+ "generated_token": "1"
6190
  }
6191
  }
6192
  {
 
6197
  }
6198
  },
6199
  "predict": {
6200
+ "1": 0.4685269892215729,
6201
+ "2": 0.5309106707572937
6202
  },
6203
  "sample": {
6204
  "messages": [
 
6238
  }
6239
  },
6240
  "predict": {
6241
+ "1": 0.08508796244859695,
6242
+ "2": 0.9147818088531494
6243
  },
6244
  "sample": {
6245
  "messages": [
 
6279
  }
6280
  },
6281
  "predict": {
6282
+ "1": 0.6223557591438293,
6283
+ "2": 0.37747785449028015
6284
  },
6285
  "sample": {
6286
  "messages": [
 
6320
  }
6321
  },
6322
  "predict": {
6323
+ "1": 0.37738266587257385,
6324
+ "2": 0.6221988201141357
6325
  },
6326
  "sample": {
6327
  "messages": [
 
6361
  }
6362
  },
6363
  "predict": {
6364
+ "1": 0.29406994581222534,
6365
+ "2": 0.705437183380127
6366
  },
6367
  "sample": {
6368
  "messages": [
 
6402
  }
6403
  },
6404
  "predict": {
6405
+ "1": 0.18236002326011658,
6406
+ "2": 0.8172808885574341
6407
  },
6408
  "sample": {
6409
  "messages": [
 
6443
  }
6444
  },
6445
  "predict": {
6446
+ "1": 0.46857959032058716,
6447
+ "2": 0.5309702157974243
6448
  },
6449
  "sample": {
6450
  "messages": [
 
6479
  {
6480
  "metric": {
6481
  "acc": {
6482
+ "val": true,
6483
  "id": 79
6484
  }
6485
  },
6486
  "predict": {
6487
+ "1": 0.49989816546440125,
6488
+ "2": 0.49989816546440125
6489
  },
6490
  "sample": {
6491
  "messages": [
 
6514
  "prompt_len": 95,
6515
  "generated_len": 1,
6516
  "generated_cumulative_logprob": "TODO: calculate for hf model",
6517
+ "generated_token": "1"
6518
  }
6519
  }
6520
  {
 
6525
  }
6526
  },
6527
  "predict": {
6528
+ "1": 0.10667160898447037,
6529
+ "2": 0.8931503891944885
6530
  },
6531
  "sample": {
6532
  "messages": [
 
6566
  }
6567
  },
6568
  "predict": {
6569
+ "1": 0.40714889764785767,
6570
+ "2": 0.5923981070518494
6571
  },
6572
  "sample": {
6573
  "messages": [
 
6607
  }
6608
  },
6609
  "predict": {
6610
+ "1": 0.20172545313835144,
6611
+ "2": 0.7978396415710449
6612
  },
6613
  "sample": {
6614
  "messages": [
 
6648
  }
6649
  },
6650
  "predict": {
6651
+ "1": 0.2688082754611969,
6652
+ "2": 0.7306965589523315
6653
  },
6654
  "sample": {
6655
  "messages": [
 
6689
  }
6690
  },
6691
  "predict": {
6692
+ "1": 0.4376171827316284,
6693
+ "2": 0.5619115829467773
6694
  },
6695
  "sample": {
6696
  "messages": [
 
6725
  {
6726
  "metric": {
6727
  "acc": {
6728
+ "val": true,
6729
  "id": 82
6730
  }
6731
  },
6732
  "predict": {
6733
+ "1": 0.5290731191635132,
6734
+ "2": 0.4669053554534912
6735
  },
6736
  "sample": {
6737
  "messages": [
 
6760
  "prompt_len": 85,
6761
  "generated_len": 1,
6762
  "generated_cumulative_logprob": "TODO: calculate for hf model",
6763
+ "generated_token": "1"
6764
  }
6765
  }
6766
  {
 
6771
  }
6772
  },
6773
  "predict": {
6774
+ "1": 0.29339078068733215,
6775
+ "2": 0.7038079500198364
6776
  },
6777
  "sample": {
6778
  "messages": [
 
6812
  }
6813
  },
6814
  "predict": {
6815
+ "1": 0.40722301602363586,
6816
+ "2": 0.5925059914588928
6817
  },
6818
  "sample": {
6819
  "messages": [
 
6853
  }
6854
  },
6855
  "predict": {
6856
+ "1": 0.4686603844165802,
6857
+ "2": 0.5310617685317993
6858
  },
6859
  "sample": {
6860
  "messages": [
 
6894
  }
6895
  },
6896
  "predict": {
6897
+ "1": 0.376331090927124,
6898
+ "2": 0.620465099811554
6899
  },
6900
  "sample": {
6901
  "messages": [
 
6935
  }
6936
  },
6937
  "predict": {
6938
+ "1": 0.4365803003311157,
6939
+ "2": 0.5605801939964294
6940
  },
6941
  "sample": {
6942
  "messages": [
 
6976
  }
6977
  },
6978
  "predict": {
6979
+ "1": 0.4375999867916107,
6980
+ "2": 0.5618895292282104
6981
  },
6982
  "sample": {
6983
  "messages": [
 
7012
  {
7013
  "metric": {
7014
  "acc": {
7015
+ "val": false,
7016
  "id": 85
7017
  }
7018
  },
7019
  "predict": {
7020
+ "1": 0.5309422612190247,
7021
+ "2": 0.46855491399765015
7022
  },
7023
  "sample": {
7024
  "messages": [
 
7047
  "prompt_len": 77,
7048
  "generated_len": 1,
7049
  "generated_cumulative_logprob": "TODO: calculate for hf model",
7050
+ "generated_token": "1"
7051
  }
7052
  }
7053
  {
 
7058
  }
7059
  },
7060
  "predict": {
7061
+ "1": 0.40724000334739685,
7062
+ "2": 0.5925306677818298
7063
  },
7064
  "sample": {
7065
  "messages": [
 
7099
  }
7100
  },
7101
  "predict": {
7102
+ "1": 0.46869274973869324,
7103
+ "2": 0.531098484992981
7104
  },
7105
  "sample": {
7106
  "messages": [
 
7140
  }
7141
  },
7142
  "predict": {
7143
+ "1": 0.2687942385673523,
7144
+ "2": 0.7306584119796753
7145
  },
7146
  "sample": {
7147
  "messages": [
 
7176
  {
7177
  "metric": {
7178
  "acc": {
7179
+ "val": false,
7180
  "id": 87
7181
  }
7182
  },
7183
  "predict": {
7184
+ "1": 0.4997122287750244,
7185
+ "2": 0.4997122287750244
7186
  },
7187
  "sample": {
7188
  "messages": [
 
7211
  "prompt_len": 84,
7212
  "generated_len": 1,
7213
  "generated_cumulative_logprob": "TODO: calculate for hf model",
7214
+ "generated_token": "1"
7215
  }
7216
  }
7217
  {
 
7222
  }
7223
  },
7224
  "predict": {
7225
+ "1": 0.5310584902763367,
7226
+ "2": 0.4686574637889862
7227
  },
7228
  "sample": {
7229
  "messages": [
 
7263
  }
7264
  },
7265
  "predict": {
7266
+ "1": 0.22263851761817932,
7267
+ "2": 0.7770847678184509
7268
  },
7269
  "sample": {
7270
  "messages": [
 
7304
  }
7305
  },
7306
  "predict": {
7307
+ "1": 0.7307744026184082,
7308
+ "2": 0.2688368856906891
7309
  },
7310
  "sample": {
7311
  "messages": [
 
7345
  }
7346
  },
7347
  "predict": {
7348
+ "1": 0.06006486713886261,
7349
+ "2": 0.9395726323127747
7350
  },
7351
  "sample": {
7352
  "messages": [
 
7386
  }
7387
  },
7388
  "predict": {
7389
+ "1": 0.3480902314186096,
7390
+ "2": 0.6503181457519531
7391
  },
7392
  "sample": {
7393
  "messages": [
 
7427
  }
7428
  },
7429
  "predict": {
7430
+ "1": 0.37690505385398865,
7431
+ "2": 0.6214113831520081
7432
  },
7433
  "sample": {
7434
  "messages": [
 
7468
  }
7469
  },
7470
  "predict": {
7471
+ "1": 0.6222553253173828,
7472
+ "2": 0.3774169385433197
7473
  },
7474
  "sample": {
7475
  "messages": [
 
7504
  {
7505
  "metric": {
7506
  "acc": {
7507
+ "val": true,
7508
  "id": 91
7509
  }
7510
  },
7511
  "predict": {
7512
+ "1": 0.562000036239624,
7513
+ "2": 0.4376860558986664
7514
  },
7515
  "sample": {
7516
  "messages": [
 
7539
  "prompt_len": 100,
7540
  "generated_len": 1,
7541
  "generated_cumulative_logprob": "TODO: calculate for hf model",
7542
+ "generated_token": "1"
7543
  }
7544
  }
7545
  {
 
7550
  }
7551
  },
7552
  "predict": {
7553
+ "1": 0.29406145215034485,
7554
+ "2": 0.705416738986969
7555
  },
7556
  "sample": {
7557
  "messages": [
 
7586
  {
7587
  "metric": {
7588
  "acc": {
7589
+ "val": true,
7590
  "id": 92
7591
  }
7592
  },
7593
  "predict": {
7594
+ "1": 0.5923555493354797,
7595
+ "2": 0.40711963176727295
7596
  },
7597
  "sample": {
7598
  "messages": [
 
7621
  "prompt_len": 90,
7622
  "generated_len": 1,
7623
  "generated_cumulative_logprob": "TODO: calculate for hf model",
7624
+ "generated_token": "1"
7625
  }
7626
  }
7627
  {
 
7632
  }
7633
  },
7634
  "predict": {
7635
+ "1": 0.5922176837921143,
7636
+ "2": 0.4070248603820801
7637
  },
7638
  "sample": {
7639
  "messages": [
 
7673
  }
7674
  },
7675
  "predict": {
7676
+ "1": 0.2225605547428131,
7677
+ "2": 0.7768126726150513
7678
  },
7679
  "sample": {
7680
  "messages": [
 
7714
  }
7715
  },
7716
  "predict": {
7717
+ "1": 0.6512224078178406,
7718
+ "2": 0.3485742509365082
7719
  },
7720
  "sample": {
7721
  "messages": [
 
7755
  }
7756
  },
7757
  "predict": {
7758
+ "1": 0.3485764265060425,
7759
+ "2": 0.6512265205383301
7760
  },
7761
  "sample": {
7762
  "messages": [
 
7796
  }
7797
  },
7798
  "predict": {
7799
+ "1": 0.4072412848472595,
7800
+ "2": 0.5925325751304626
7801
  },
7802
  "sample": {
7803
  "messages": [
 
7832
  {
7833
  "metric": {
7834
  "acc": {
7835
+ "val": true,
7836
  "id": 95
7837
  }
7838
  },
7839
  "predict": {
7840
+ "1": 0.5925514698028564,
7841
+ "2": 0.40725430846214294
7842
  },
7843
  "sample": {
7844
  "messages": [
 
7867
  "prompt_len": 85,
7868
  "generated_len": 1,
7869
  "generated_cumulative_logprob": "TODO: calculate for hf model",
7870
+ "generated_token": "1"
7871
  }
7872
  }
7873
  {
 
7878
  }
7879
  },
7880
  "predict": {
7881
+ "1": 0.561974287033081,
7882
+ "2": 0.43766599893569946
7883
  },
7884
  "sample": {
7885
  "messages": [
 
7919
  }
7920
  },
7921
  "predict": {
7922
+ "1": 0.22262296080589294,
7923
+ "2": 0.7770304679870605
7924
  },
7925
  "sample": {
7926
  "messages": [
 
7960
  }
7961
  },
7962
  "predict": {
7963
+ "1": 0.294090211391449,
7964
+ "2": 0.7054857611656189
7965
  },
7966
  "sample": {
7967
  "messages": [
 
7996
  {
7997
  "metric": {
7998
  "acc": {
7999
+ "val": true,
8000
  "id": 97
8001
  }
8002
  },
8003
  "predict": {
8004
+ "1": 0.407183438539505,
8005
+ "2": 0.5924484133720398
8006
  },
8007
  "sample": {
8008
  "messages": [
 
8031
  "prompt_len": 86,
8032
  "generated_len": 1,
8033
  "generated_cumulative_logprob": "TODO: calculate for hf model",
8034
+ "generated_token": "2"
8035
  }
8036
  }
8037
  {
 
8042
  }
8043
  },
8044
  "predict": {
8045
+ "1": 0.4685804843902588,
8046
+ "2": 0.5309712886810303
8047
  },
8048
  "sample": {
8049
  "messages": [
 
8083
  }
8084
  },
8085
  "predict": {
8086
+ "1": 0.4070993661880493,
8087
+ "2": 0.5923260450363159
8088
  },
8089
  "sample": {
8090
  "messages": [
 
8124
  }
8125
  },
8126
  "predict": {
8127
+ "1": 0.32016804814338684,
8128
+ "2": 0.6777957081794739
8129
  },
8130
  "sample": {
8131
  "messages": [
 
8165
  }
8166
  },
8167
  "predict": {
8168
+ "1": 0.4063665270805359,
8169
+ "2": 0.591259777545929
8170
  },
8171
  "sample": {
8172
  "messages": [
llmtf_eval/darumeru_PARus_params.jsonl CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "custom_generation_config": null,
3
  "model_params": {
4
- "model_name_or_path": "/scratch/tikhomirov/workdir/data/models/ruadapt_qwen2.5_1.5B_ext_u48_part1-2_lr5e4_bs128_reinit_peft_mlp_32_128_lm_head_mts_1e4_as1.5_nm_pv21/simpo2",
5
  "generation_config": {
6
  "bos_token_id": 145109,
7
  "do_sample": true,
@@ -36,7 +36,7 @@
36
  },
37
  "load_in_8bit": false,
38
  "torch_dtype": "auto",
39
- "use_flash_attention_2": true,
40
  "device_map": "cuda:0",
41
  "use_fast_tokenizer": true,
42
  "leading_space": false,
@@ -47,7 +47,7 @@
47
  "task_params": {
48
  "max_len": 4000,
49
  "few_shot_count": 0,
50
- "batch_size": 2,
51
  "max_sample_per_dataset": 10000000000000,
52
  "method": "calculate_tokens_proba"
53
  }
 
1
  {
2
  "custom_generation_config": null,
3
  "model_params": {
4
+ "model_name_or_path": "/scratch/tikhomirov/workdir/data/models/ruadapt_qwen2.5_1.5B_ext_u48_part1-2_lr5e4_bs128_reinit_peft_mlp_32_128_lm_head_mts_1e4_as1.75_v1/simpo2",
5
  "generation_config": {
6
  "bos_token_id": 145109,
7
  "do_sample": true,
 
36
  },
37
  "load_in_8bit": false,
38
  "torch_dtype": "auto",
39
+ "attn_implementation": "flash_attention_2",
40
  "device_map": "cuda:0",
41
  "use_fast_tokenizer": true,
42
  "leading_space": false,
 
47
  "task_params": {
48
  "max_len": 4000,
49
  "few_shot_count": 0,
50
+ "batch_size": 16,
51
  "max_sample_per_dataset": 10000000000000,
52
  "method": "calculate_tokens_proba"
53
  }
llmtf_eval/darumeru_PARus_total.jsonl CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "task_name": "darumeru/PARus",
3
  "results": {
4
- "acc": 0.24
5
  },
6
- "leaderboard_result": 0.24
7
  }
 
1
  {
2
  "task_name": "darumeru/PARus",
3
  "results": {
4
+ "acc": 0.44
5
  },
6
+ "leaderboard_result": 0.44
7
  }
llmtf_eval/darumeru_RCB.jsonl CHANGED
@@ -7,9 +7,9 @@
7
  ]
8
  },
9
  "predict": {
10
- "1": 0.9949106574058533,
11
- "2": 0.0019206294091418386,
12
- "3": 0.0031665826682001352
13
  },
14
  "sample": {
15
  "messages": [
@@ -51,9 +51,9 @@
51
  ]
52
  },
53
  "predict": {
54
- "1": 2.4222761567216367e-05,
55
- "2": 0.996787428855896,
56
- "3": 0.0031725559383630753
57
  },
58
  "sample": {
59
  "messages": [
@@ -95,9 +95,9 @@
95
  ]
96
  },
97
  "predict": {
98
- "1": 0.8311020731925964,
99
- "2": 0.12745362520217896,
100
- "3": 0.04137813299894333
101
  },
102
  "sample": {
103
  "messages": [
@@ -139,9 +139,9 @@
139
  ]
140
  },
141
  "predict": {
142
- "1": 0.6286662220954895,
143
- "2": 0.262067049741745,
144
- "3": 0.10924579203128815
145
  },
146
  "sample": {
147
  "messages": [
@@ -183,9 +183,9 @@
183
  ]
184
  },
185
  "predict": {
186
- "1": 0.980975866317749,
187
- "2": 0.012348663993179798,
188
- "3": 0.0066097634844481945
189
  },
190
  "sample": {
191
  "messages": [
@@ -227,9 +227,9 @@
227
  ]
228
  },
229
  "predict": {
230
- "1": 0.03251832723617554,
231
- "2": 0.8386587500572205,
232
- "3": 0.1286124885082245
233
  },
234
  "sample": {
235
  "messages": [
@@ -271,9 +271,9 @@
271
  ]
272
  },
273
  "predict": {
274
- "1": 0.359849750995636,
275
- "2": 0.28025126457214355,
276
- "3": 0.359849750995636
277
  },
278
  "sample": {
279
  "messages": [
@@ -315,9 +315,9 @@
315
  ]
316
  },
317
  "predict": {
318
- "1": 0.02847575768828392,
319
- "2": 0.9429875612258911,
320
- "3": 0.02847575768828392
321
  },
322
  "sample": {
323
  "messages": [
@@ -359,9 +359,9 @@
359
  ]
360
  },
361
  "predict": {
362
- "1": 0.7960770130157471,
363
- "2": 0.10773730278015137,
364
- "3": 0.09507784247398376
365
  },
366
  "sample": {
367
  "messages": [
@@ -403,9 +403,9 @@
403
  ]
404
  },
405
  "predict": {
406
- "1": 0.6739814281463623,
407
- "2": 0.007487257942557335,
408
- "3": 0.31836628913879395
409
  },
410
  "sample": {
411
  "messages": [
@@ -447,9 +447,9 @@
447
  ]
448
  },
449
  "predict": {
450
- "1": 0.9806196689605713,
451
- "2": 0.010893701575696468,
452
- "3": 0.008484022691845894
453
  },
454
  "sample": {
455
  "messages": [
@@ -491,9 +491,9 @@
491
  ]
492
  },
493
  "predict": {
494
- "1": 1.88568501471309e-05,
495
- "2": 0.9963723421096802,
496
- "3": 0.003593479748815298
497
  },
498
  "sample": {
499
  "messages": [
@@ -535,9 +535,9 @@
535
  ]
536
  },
537
  "predict": {
538
- "1": 0.9927825927734375,
539
- "2": 0.0040572755970060825,
540
- "3": 0.003159809624776244
541
  },
542
  "sample": {
543
  "messages": [
@@ -579,9 +579,9 @@
579
  ]
580
  },
581
  "predict": {
582
- "1": 0.9989213943481445,
583
- "2": 4.535096013569273e-05,
584
- "3": 0.0010321830632165074
585
  },
586
  "sample": {
587
  "messages": [
@@ -623,9 +623,9 @@
623
  ]
624
  },
625
  "predict": {
626
- "1": 0.8579575419425964,
627
- "2": 0.06215023994445801,
628
- "3": 0.0798024833202362
629
  },
630
  "sample": {
631
  "messages": [
@@ -660,16 +660,16 @@
660
  }
661
  {
662
  "metric": {
663
- "acc": true,
664
  "f1_macro": [
665
  "2",
666
- "2"
667
  ]
668
  },
669
  "predict": {
670
- "1": 0.14122389256954193,
671
- "2": 0.717194139957428,
672
- "3": 0.14122389256954193
673
  },
674
  "sample": {
675
  "messages": [
@@ -699,7 +699,7 @@
699
  "prompt_len": 101,
700
  "generated_len": 1,
701
  "generated_cumulative_logprob": "TODO: calculate for hf model",
702
- "generated_token": "2"
703
  }
704
  }
705
  {
@@ -711,9 +711,9 @@
711
  ]
712
  },
713
  "predict": {
714
- "1": 0.6938971281051636,
715
- "2": 0.05026574432849884,
716
- "3": 0.25527051091194153
717
  },
718
  "sample": {
719
  "messages": [
@@ -755,9 +755,9 @@
755
  ]
756
  },
757
  "predict": {
758
- "1": 0.928261399269104,
759
- "2": 0.019265450537204742,
760
- "3": 0.052368927747011185
761
  },
762
  "sample": {
763
  "messages": [
@@ -795,13 +795,13 @@
795
  "acc": false,
796
  "f1_macro": [
797
  "3",
798
- "2"
799
  ]
800
  },
801
  "predict": {
802
- "1": 0.103329136967659,
803
- "2": 0.7635048031806946,
804
- "3": 0.13267722725868225
805
  },
806
  "sample": {
807
  "messages": [
@@ -831,7 +831,7 @@
831
  "prompt_len": 106,
832
  "generated_len": 1,
833
  "generated_cumulative_logprob": "TODO: calculate for hf model",
834
- "generated_token": "2"
835
  }
836
  }
837
  {
@@ -843,9 +843,9 @@
843
  ]
844
  },
845
  "predict": {
846
- "1": 0.9307658076286316,
847
- "2": 0.0017968007596209645,
848
- "3": 0.06742445379495621
849
  },
850
  "sample": {
851
  "messages": [
@@ -887,9 +887,9 @@
887
  ]
888
  },
889
  "predict": {
890
- "1": 0.0007993357139639556,
891
- "2": 0.993293046951294,
892
- "3": 0.005906336009502411
893
  },
894
  "sample": {
895
  "messages": [
@@ -931,9 +931,9 @@
931
  ]
932
  },
933
  "predict": {
934
- "1": 0.9381154775619507,
935
- "2": 0.015163224190473557,
936
- "3": 0.04670601710677147
937
  },
938
  "sample": {
939
  "messages": [
@@ -975,9 +975,9 @@
975
  ]
976
  },
977
  "predict": {
978
- "1": 0.5897971987724304,
979
- "2": 0.13160154223442078,
980
- "3": 0.2786004841327667
981
  },
982
  "sample": {
983
  "messages": [
@@ -1019,9 +1019,9 @@
1019
  ]
1020
  },
1021
  "predict": {
1022
- "1": 0.5378125309944153,
1023
- "2": 0.3261997699737549,
1024
- "3": 0.1359802931547165
1025
  },
1026
  "sample": {
1027
  "messages": [
@@ -1056,16 +1056,16 @@
1056
  }
1057
  {
1058
  "metric": {
1059
- "acc": false,
1060
  "f1_macro": [
1061
  "1",
1062
- "2"
1063
  ]
1064
  },
1065
  "predict": {
1066
- "1": 0.19808343052864075,
1067
- "2": 0.47517746686935425,
1068
- "3": 0.3265843689441681
1069
  },
1070
  "sample": {
1071
  "messages": [
@@ -1095,7 +1095,7 @@
1095
  "prompt_len": 89,
1096
  "generated_len": 1,
1097
  "generated_cumulative_logprob": "TODO: calculate for hf model",
1098
- "generated_token": "2"
1099
  }
1100
  }
1101
  {
@@ -1103,13 +1103,13 @@
1103
  "acc": false,
1104
  "f1_macro": [
1105
  "3",
1106
- "2"
1107
  ]
1108
  },
1109
  "predict": {
1110
- "1": 0.23351947963237762,
1111
- "2": 0.560184121131897,
1112
- "3": 0.20608024299144745
1113
  },
1114
  "sample": {
1115
  "messages": [
@@ -1139,7 +1139,7 @@
1139
  "prompt_len": 97,
1140
  "generated_len": 1,
1141
  "generated_cumulative_logprob": "TODO: calculate for hf model",
1142
- "generated_token": "2"
1143
  }
1144
  }
1145
  {
@@ -1151,9 +1151,9 @@
1151
  ]
1152
  },
1153
  "predict": {
1154
- "1": 0.7419307231903076,
1155
- "2": 0.12892821431159973,
1156
- "3": 0.12892821431159973
1157
  },
1158
  "sample": {
1159
  "messages": [
@@ -1195,9 +1195,9 @@
1195
  ]
1196
  },
1197
  "predict": {
1198
- "1": 0.7944108247756958,
1199
- "2": 0.08373028039932251,
1200
- "3": 0.12182684987783432
1201
  },
1202
  "sample": {
1203
  "messages": [
@@ -1239,9 +1239,9 @@
1239
  ]
1240
  },
1241
  "predict": {
1242
- "1": 0.0012944919290021062,
1243
- "2": 0.9756640195846558,
1244
- "3": 0.02294541895389557
1245
  },
1246
  "sample": {
1247
  "messages": [
@@ -1283,9 +1283,9 @@
1283
  ]
1284
  },
1285
  "predict": {
1286
- "1": 0.628616988658905,
1287
- "2": 0.10923724621534348,
1288
- "3": 0.26204654574394226
1289
  },
1290
  "sample": {
1291
  "messages": [
@@ -1327,9 +1327,9 @@
1327
  ]
1328
  },
1329
  "predict": {
1330
- "1": 0.9346874952316284,
1331
- "2": 0.005557854659855366,
1332
- "3": 0.05975256860256195
1333
  },
1334
  "sample": {
1335
  "messages": [
@@ -1364,16 +1364,16 @@
1364
  }
1365
  {
1366
  "metric": {
1367
- "acc": false,
1368
  "f1_macro": [
1369
  "1",
1370
- "2"
1371
  ]
1372
  },
1373
  "predict": {
1374
- "1": 0.19619548320770264,
1375
- "2": 0.6847895383834839,
1376
- "3": 0.11899856477975845
1377
  },
1378
  "sample": {
1379
  "messages": [
@@ -1403,7 +1403,7 @@
1403
  "prompt_len": 164,
1404
  "generated_len": 1,
1405
  "generated_cumulative_logprob": "TODO: calculate for hf model",
1406
- "generated_token": "2"
1407
  }
1408
  }
1409
  {
@@ -1415,9 +1415,9 @@
1415
  ]
1416
  },
1417
  "predict": {
1418
- "1": 0.9973662495613098,
1419
- "2": 0.0007083039381541312,
1420
- "3": 0.0019253698410466313
1421
  },
1422
  "sample": {
1423
  "messages": [
@@ -1459,9 +1459,9 @@
1459
  ]
1460
  },
1461
  "predict": {
1462
- "1": 0.6337988376617432,
1463
- "2": 0.06680189818143845,
1464
- "3": 0.2993853688240051
1465
  },
1466
  "sample": {
1467
  "messages": [
@@ -1503,9 +1503,9 @@
1503
  ]
1504
  },
1505
  "predict": {
1506
- "1": 0.6846823692321777,
1507
- "2": 0.1961647868156433,
1508
- "3": 0.11897994577884674
1509
  },
1510
  "sample": {
1511
  "messages": [
@@ -1547,9 +1547,9 @@
1547
  ]
1548
  },
1549
  "predict": {
1550
- "1": 0.8949613571166992,
1551
- "2": 0.021047474816441536,
1552
- "3": 0.08324436843395233
1553
  },
1554
  "sample": {
1555
  "messages": [
@@ -1587,13 +1587,13 @@
1587
  "acc": false,
1588
  "f1_macro": [
1589
  "3",
1590
- "2"
1591
  ]
1592
  },
1593
  "predict": {
1594
- "1": 0.331316202878952,
1595
- "2": 0.3754304349422455,
1596
- "3": 0.2923855185508728
1597
  },
1598
  "sample": {
1599
  "messages": [
@@ -1623,7 +1623,7 @@
1623
  "prompt_len": 128,
1624
  "generated_len": 1,
1625
  "generated_cumulative_logprob": "TODO: calculate for hf model",
1626
- "generated_token": "2"
1627
  }
1628
  }
1629
  {
@@ -1635,9 +1635,9 @@
1635
  ]
1636
  },
1637
  "predict": {
1638
- "1": 0.7969197630882263,
1639
- "2": 0.10785136371850967,
1640
- "3": 0.0951784998178482
1641
  },
1642
  "sample": {
1643
  "messages": [
@@ -1679,9 +1679,9 @@
1679
  ]
1680
  },
1681
  "predict": {
1682
- "1": 0.4051330089569092,
1683
- "2": 0.3155179023742676,
1684
- "3": 0.2784435749053955
1685
  },
1686
  "sample": {
1687
  "messages": [
@@ -1723,9 +1723,9 @@
1723
  ]
1724
  },
1725
  "predict": {
1726
- "1": 0.26278576254844666,
1727
- "2": 0.1806098222732544,
1728
- "3": 0.5563174486160278
1729
  },
1730
  "sample": {
1731
  "messages": [
@@ -1760,16 +1760,16 @@
1760
  }
1761
  {
1762
  "metric": {
1763
- "acc": false,
1764
  "f1_macro": [
1765
  "1",
1766
- "3"
1767
  ]
1768
  },
1769
  "predict": {
1770
- "1": 0.22247056663036346,
1771
- "2": 0.0010302431182935834,
1772
- "3": 0.776498556137085
1773
  },
1774
  "sample": {
1775
  "messages": [
@@ -1799,7 +1799,7 @@
1799
  "prompt_len": 164,
1800
  "generated_len": 1,
1801
  "generated_cumulative_logprob": "TODO: calculate for hf model",
1802
- "generated_token": "3"
1803
  }
1804
  }
1805
  {
@@ -1811,9 +1811,9 @@
1811
  ]
1812
  },
1813
  "predict": {
1814
- "1": 0.9556446075439453,
1815
- "2": 0.037054359912872314,
1816
- "3": 0.007296436000615358
1817
  },
1818
  "sample": {
1819
  "messages": [
@@ -1855,9 +1855,9 @@
1855
  ]
1856
  },
1857
  "predict": {
1858
- "1": 8.937624443206005e-06,
1859
- "2": 0.9997592568397522,
1860
- "3": 0.00023050435993354768
1861
  },
1862
  "sample": {
1863
  "messages": [
@@ -1899,9 +1899,9 @@
1899
  ]
1900
  },
1901
  "predict": {
1902
- "1": 0.9998630285263062,
1903
- "2": 2.7532680178410374e-05,
1904
- "3": 0.00010889385885093361
1905
  },
1906
  "sample": {
1907
  "messages": [
@@ -1943,9 +1943,9 @@
1943
  ]
1944
  },
1945
  "predict": {
1946
- "1": 0.99820876121521,
1947
- "2": 0.0017005683621391654,
1948
- "3": 7.471775461453944e-05
1949
  },
1950
  "sample": {
1951
  "messages": [
@@ -1987,9 +1987,9 @@
1987
  ]
1988
  },
1989
  "predict": {
1990
- "1": 5.134141611051746e-05,
1991
- "2": 0.9979891777038574,
1992
- "3": 0.0019265724113211036
1993
  },
1994
  "sample": {
1995
  "messages": [
@@ -2031,9 +2031,9 @@
2031
  ]
2032
  },
2033
  "predict": {
2034
- "1": 0.007122146897017956,
2035
- "2": 0.9328172206878662,
2036
- "3": 0.059633009135723114
2037
  },
2038
  "sample": {
2039
  "messages": [
@@ -2075,9 +2075,9 @@
2075
  ]
2076
  },
2077
  "predict": {
2078
- "1": 0.8834777474403381,
2079
- "2": 0.07252027094364166,
2080
- "3": 0.043985765427351
2081
  },
2082
  "sample": {
2083
  "messages": [
@@ -2119,9 +2119,9 @@
2119
  ]
2120
  },
2121
  "predict": {
2122
- "1": 0.0007971510640345514,
2123
- "2": 0.9905782341957092,
2124
- "3": 0.008570181205868721
2125
  },
2126
  "sample": {
2127
  "messages": [
@@ -2163,9 +2163,9 @@
2163
  ]
2164
  },
2165
  "predict": {
2166
- "1": 0.5709625482559204,
2167
- "2": 0.036500412970781326,
2168
- "3": 0.39241644740104675
2169
  },
2170
  "sample": {
2171
  "messages": [
@@ -2207,9 +2207,9 @@
2207
  ]
2208
  },
2209
  "predict": {
2210
- "1": 0.9990153312683105,
2211
- "2": 0.00017938339442480356,
2212
- "3": 0.0008039406384341419
2213
  },
2214
  "sample": {
2215
  "messages": [
@@ -2251,9 +2251,9 @@
2251
  ]
2252
  },
2253
  "predict": {
2254
- "1": 0.0011647538049146533,
2255
- "2": 0.9947682619094849,
2256
- "3": 0.0040653906762599945
2257
  },
2258
  "sample": {
2259
  "messages": [
@@ -2295,9 +2295,9 @@
2295
  ]
2296
  },
2297
  "predict": {
2298
- "1": 0.8055086135864258,
2299
- "2": 0.014753405936062336,
2300
- "3": 0.1797332763671875
2301
  },
2302
  "sample": {
2303
  "messages": [
@@ -2339,9 +2339,9 @@
2339
  ]
2340
  },
2341
  "predict": {
2342
- "1": 0.959823489189148,
2343
- "2": 0.007328342646360397,
2344
- "3": 0.03284335508942604
2345
  },
2346
  "sample": {
2347
  "messages": [
@@ -2383,9 +2383,9 @@
2383
  ]
2384
  },
2385
  "predict": {
2386
- "1": 0.4015943109989166,
2387
- "2": 0.24357926845550537,
2388
- "3": 0.3544057309627533
2389
  },
2390
  "sample": {
2391
  "messages": [
@@ -2420,16 +2420,16 @@
2420
  }
2421
  {
2422
  "metric": {
2423
- "acc": true,
2424
  "f1_macro": [
2425
  "2",
2426
- "2"
2427
  ]
2428
  },
2429
  "predict": {
2430
- "1": 0.2965918481349945,
2431
- "2": 0.6278849244117737,
2432
- "3": 0.0749901607632637
2433
  },
2434
  "sample": {
2435
  "messages": [
@@ -2459,7 +2459,7 @@
2459
  "prompt_len": 111,
2460
  "generated_len": 1,
2461
  "generated_cumulative_logprob": "TODO: calculate for hf model",
2462
- "generated_token": "2"
2463
  }
2464
  }
2465
  {
@@ -2471,9 +2471,9 @@
2471
  ]
2472
  },
2473
  "predict": {
2474
- "1": 0.6848835945129395,
2475
- "2": 0.2223491370677948,
2476
- "3": 0.0926889106631279
2477
  },
2478
  "sample": {
2479
  "messages": [
@@ -2515,9 +2515,9 @@
2515
  ]
2516
  },
2517
  "predict": {
2518
- "1": 0.9814509153366089,
2519
- "2": 0.013999644666910172,
2520
- "3": 0.004545019473880529
2521
  },
2522
  "sample": {
2523
  "messages": [
@@ -2559,9 +2559,9 @@
2559
  ]
2560
  },
2561
  "predict": {
2562
- "1": 0.0002600970328785479,
2563
- "2": 0.9955541491508484,
2564
- "3": 0.004068602342158556
2565
  },
2566
  "sample": {
2567
  "messages": [
@@ -2603,9 +2603,9 @@
2603
  ]
2604
  },
2605
  "predict": {
2606
- "1": 0.8998140096664429,
2607
- "2": 0.016480669379234314,
2608
- "3": 0.08369573950767517
2609
  },
2610
  "sample": {
2611
  "messages": [
@@ -2647,9 +2647,9 @@
2647
  ]
2648
  },
2649
  "predict": {
2650
- "1": 0.9987452030181885,
2651
- "2": 8.47118062665686e-05,
2652
- "3": 0.0011694104177877307
2653
  },
2654
  "sample": {
2655
  "messages": [
@@ -2691,9 +2691,9 @@
2691
  ]
2692
  },
2693
  "predict": {
2694
- "1": 0.9984468817710876,
2695
- "2": 5.1364961109356955e-05,
2696
- "3": 0.0015011040959507227
2697
  },
2698
  "sample": {
2699
  "messages": [
@@ -2735,9 +2735,9 @@
2735
  ]
2736
  },
2737
  "predict": {
2738
- "1": 0.8783456087112427,
2739
- "2": 0.0720990002155304,
2740
- "3": 0.04955286905169487
2741
  },
2742
  "sample": {
2743
  "messages": [
@@ -2779,9 +2779,9 @@
2779
  ]
2780
  },
2781
  "predict": {
2782
- "1": 0.9068573117256165,
2783
- "2": 0.03516267240047455,
2784
- "3": 0.05797344818711281
2785
  },
2786
  "sample": {
2787
  "messages": [
@@ -2823,9 +2823,9 @@
2823
  ]
2824
  },
2825
  "predict": {
2826
- "1": 0.8776419758796692,
2827
- "2": 0.04951317235827446,
2828
- "3": 0.07204123586416245
2829
  },
2830
  "sample": {
2831
  "messages": [
@@ -2867,9 +2867,9 @@
2867
  ]
2868
  },
2869
  "predict": {
2870
- "1": 0.7253900170326233,
2871
- "2": 0.11124216020107269,
2872
- "3": 0.16185639798641205
2873
  },
2874
  "sample": {
2875
  "messages": [
@@ -2911,9 +2911,9 @@
2911
  ]
2912
  },
2913
  "predict": {
2914
- "1": 0.45502758026123047,
2915
- "2": 0.35437583923339844,
2916
- "3": 0.1896837055683136
2917
  },
2918
  "sample": {
2919
  "messages": [
@@ -2955,9 +2955,9 @@
2955
  ]
2956
  },
2957
  "predict": {
2958
- "1": 0.9575343728065491,
2959
- "2": 0.022519050166010857,
2960
- "3": 0.019872991368174553
2961
  },
2962
  "sample": {
2963
  "messages": [
@@ -2999,9 +2999,9 @@
2999
  ]
3000
  },
3001
  "predict": {
3002
- "1": 0.9337077736854553,
3003
- "2": 0.013318625278770924,
3004
- "3": 0.05267618969082832
3005
  },
3006
  "sample": {
3007
  "messages": [
@@ -3036,16 +3036,16 @@
3036
  }
3037
  {
3038
  "metric": {
3039
- "acc": true,
3040
  "f1_macro": [
3041
  "2",
3042
- "2"
3043
  ]
3044
  },
3045
  "predict": {
3046
- "1": 0.2582869827747345,
3047
- "2": 0.6195982694625854,
3048
- "3": 0.1220061257481575
3049
  },
3050
  "sample": {
3051
  "messages": [
@@ -3075,7 +3075,7 @@
3075
  "prompt_len": 170,
3076
  "generated_len": 1,
3077
  "generated_cumulative_logprob": "TODO: calculate for hf model",
3078
- "generated_token": "2"
3079
  }
3080
  }
3081
  {
@@ -3083,13 +3083,13 @@
3083
  "acc": false,
3084
  "f1_macro": [
3085
  "3",
3086
- "1"
3087
  ]
3088
  },
3089
  "predict": {
3090
- "1": 0.4955792725086212,
3091
- "2": 0.4373471736907959,
3092
- "3": 0.06706935912370682
3093
  },
3094
  "sample": {
3095
  "messages": [
@@ -3119,7 +3119,7 @@
3119
  "prompt_len": 152,
3120
  "generated_len": 1,
3121
  "generated_cumulative_logprob": "TODO: calculate for hf model",
3122
- "generated_token": "1"
3123
  }
3124
  }
3125
  {
@@ -3131,9 +3131,9 @@
3131
  ]
3132
  },
3133
  "predict": {
3134
- "1": 0.0001386207150062546,
3135
- "2": 0.9912692308425903,
3136
- "3": 0.008576159365475178
3137
  },
3138
  "sample": {
3139
  "messages": [
@@ -3175,9 +3175,9 @@
3175
  ]
3176
  },
3177
  "predict": {
3178
- "1": 0.9124384522438049,
3179
- "2": 0.06609682738780975,
3180
- "3": 0.021458497270941734
3181
  },
3182
  "sample": {
3183
  "messages": [
@@ -3219,9 +3219,9 @@
3219
  ]
3220
  },
3221
  "predict": {
3222
- "1": 0.9824451804161072,
3223
- "2": 0.0016737132100388408,
3224
- "3": 0.01587974652647972
3225
  },
3226
  "sample": {
3227
  "messages": [
@@ -3263,9 +3263,9 @@
3263
  ]
3264
  },
3265
  "predict": {
3266
- "1": 0.5119481086730957,
3267
- "2": 0.3987056016921997,
3268
- "3": 0.08896324038505554
3269
  },
3270
  "sample": {
3271
  "messages": [
@@ -3300,16 +3300,16 @@
3300
  }
3301
  {
3302
  "metric": {
3303
- "acc": false,
3304
  "f1_macro": [
3305
  "1",
3306
- "2"
3307
  ]
3308
  },
3309
  "predict": {
3310
- "1": 0.24962082505226135,
3311
- "2": 0.5988092422485352,
3312
- "3": 0.15140268206596375
3313
  },
3314
  "sample": {
3315
  "messages": [
@@ -3339,21 +3339,21 @@
3339
  "prompt_len": 221,
3340
  "generated_len": 1,
3341
  "generated_cumulative_logprob": "TODO: calculate for hf model",
3342
- "generated_token": "2"
3343
  }
3344
  }
3345
  {
3346
  "metric": {
3347
- "acc": false,
3348
  "f1_macro": [
3349
  "1",
3350
- "2"
3351
  ]
3352
  },
3353
  "predict": {
3354
- "1": 0.11569686979055405,
3355
- "2": 0.8548907041549683,
3356
- "3": 0.029252750799059868
3357
  },
3358
  "sample": {
3359
  "messages": [
@@ -3383,7 +3383,7 @@
3383
  "prompt_len": 91,
3384
  "generated_len": 1,
3385
  "generated_cumulative_logprob": "TODO: calculate for hf model",
3386
- "generated_token": "2"
3387
  }
3388
  }
3389
  {
@@ -3395,9 +3395,9 @@
3395
  ]
3396
  },
3397
  "predict": {
3398
- "1": 0.9447677731513977,
3399
- "2": 0.047037214040756226,
3400
- "3": 0.008173842914402485
3401
  },
3402
  "sample": {
3403
  "messages": [
@@ -3439,9 +3439,9 @@
3439
  ]
3440
  },
3441
  "predict": {
3442
- "1": 0.6938379406929016,
3443
- "2": 0.10640349239110947,
3444
- "3": 0.1987878978252411
3445
  },
3446
  "sample": {
3447
  "messages": [
@@ -3483,9 +3483,9 @@
3483
  ]
3484
  },
3485
  "predict": {
3486
- "1": 0.7288783192634583,
3487
- "2": 0.0029787591192871332,
3488
- "3": 0.2681393623352051
3489
  },
3490
  "sample": {
3491
  "messages": [
@@ -3527,9 +3527,9 @@
3527
  ]
3528
  },
3529
  "predict": {
3530
- "1": 3.7148015508137178e-06,
3531
- "2": 0.9968197345733643,
3532
- "3": 0.0031726588495075703
3533
  },
3534
  "sample": {
3535
  "messages": [
@@ -3571,9 +3571,9 @@
3571
  ]
3572
  },
3573
  "predict": {
3574
- "1": 0.715835452079773,
3575
- "2": 0.15972448885440826,
3576
- "3": 0.1243935376405716
3577
  },
3578
  "sample": {
3579
  "messages": [
@@ -3615,9 +3615,9 @@
3615
  ]
3616
  },
3617
  "predict": {
3618
- "1": 0.9993292093276978,
3619
- "2": 0.00033523759339004755,
3620
- "3": 0.00033523759339004755
3621
  },
3622
  "sample": {
3623
  "messages": [
@@ -3659,9 +3659,9 @@
3659
  ]
3660
  },
3661
  "predict": {
3662
- "1": 0.9202016592025757,
3663
- "2": 0.004261379595845938,
3664
- "3": 0.07553475350141525
3665
  },
3666
  "sample": {
3667
  "messages": [
@@ -3703,9 +3703,9 @@
3703
  ]
3704
  },
3705
  "predict": {
3706
- "1": 0.9761813879013062,
3707
- "2": 0.017879387363791466,
3708
- "3": 0.005804586689919233
3709
  },
3710
  "sample": {
3711
  "messages": [
@@ -3747,9 +3747,9 @@
3747
  ]
3748
  },
3749
  "predict": {
3750
- "1": 5.1404014811851084e-05,
3751
- "2": 0.999206006526947,
3752
- "3": 0.0007096104673109949
3753
  },
3754
  "sample": {
3755
  "messages": [
@@ -3791,9 +3791,9 @@
3791
  ]
3792
  },
3793
  "predict": {
3794
- "1": 0.5863784551620483,
3795
- "2": 0.24443891644477844,
3796
- "3": 0.1680002510547638
3797
  },
3798
  "sample": {
3799
  "messages": [
@@ -3835,9 +3835,9 @@
3835
  ]
3836
  },
3837
  "predict": {
3838
- "1": 0.7358587980270386,
3839
- "2": 0.05330543592572212,
3840
- "3": 0.2108270823955536
3841
  },
3842
  "sample": {
3843
  "messages": [
@@ -3879,9 +3879,9 @@
3879
  ]
3880
  },
3881
  "predict": {
3882
- "1": 0.0007938354974612594,
3883
- "2": 0.9864582419395447,
3884
- "3": 0.012417676858603954
3885
  },
3886
  "sample": {
3887
  "messages": [
@@ -3923,9 +3923,9 @@
3923
  ]
3924
  },
3925
  "predict": {
3926
- "1": 0.9663904309272766,
3927
- "2": 0.0005344954552128911,
3928
- "3": 0.033068060874938965
3929
  },
3930
  "sample": {
3931
  "messages": [
@@ -3967,9 +3967,9 @@
3967
  ]
3968
  },
3969
  "predict": {
3970
- "1": 6.595286686206236e-05,
3971
- "2": 0.9984310269355774,
3972
- "3": 0.00150108034722507
3973
  },
3974
  "sample": {
3975
  "messages": [
@@ -4007,13 +4007,13 @@
4007
  "acc": false,
4008
  "f1_macro": [
4009
  "3",
4010
- "2"
4011
  ]
4012
  },
4013
  "predict": {
4014
- "1": 6.58664430375211e-05,
4015
- "2": 0.9971227049827576,
4016
- "3": 0.0028007125947624445
4017
  },
4018
  "sample": {
4019
  "messages": [
@@ -4043,7 +4043,7 @@
4043
  "prompt_len": 114,
4044
  "generated_len": 1,
4045
  "generated_cumulative_logprob": "TODO: calculate for hf model",
4046
- "generated_token": "2"
4047
  }
4048
  }
4049
  {
@@ -4055,9 +4055,9 @@
4055
  ]
4056
  },
4057
  "predict": {
4058
- "1": 0.9701799750328064,
4059
- "2": 0.003964903764426708,
4060
- "3": 0.02585442177951336
4061
  },
4062
  "sample": {
4063
  "messages": [
@@ -4099,9 +4099,9 @@
4099
  ]
4100
  },
4101
  "predict": {
4102
- "1": 0.9929752349853516,
4103
- "2": 0.0003331060870550573,
4104
- "3": 0.006690614391118288
4105
  },
4106
  "sample": {
4107
  "messages": [
@@ -4143,9 +4143,9 @@
4143
  ]
4144
  },
4145
  "predict": {
4146
- "1": 0.9844814538955688,
4147
- "2": 0.014042872935533524,
4148
- "3": 0.0013061907375231385
4149
  },
4150
  "sample": {
4151
  "messages": [
@@ -4187,9 +4187,9 @@
4187
  ]
4188
  },
4189
  "predict": {
4190
- "1": 0.000625559245236218,
4191
- "2": 0.9981372356414795,
4192
- "3": 0.001168698538094759
4193
  },
4194
  "sample": {
4195
  "messages": [
@@ -4231,9 +4231,9 @@
4231
  ]
4232
  },
4233
  "predict": {
4234
- "1": 0.00037962975329719484,
4235
- "2": 0.9986867308616638,
4236
- "3": 0.0009106844081543386
4237
  },
4238
  "sample": {
4239
  "messages": [
@@ -4275,9 +4275,9 @@
4275
  ]
4276
  },
4277
  "predict": {
4278
- "1": 0.9979950189590454,
4279
- "2": 7.470175478374586e-05,
4280
- "3": 0.001926583587191999
4281
  },
4282
  "sample": {
4283
  "messages": [
@@ -4319,9 +4319,9 @@
4319
  ]
4320
  },
4321
  "predict": {
4322
- "1": 0.9946058392524719,
4323
- "2": 0.0016944303642958403,
4324
- "3": 0.0035871088039129972
4325
  },
4326
  "sample": {
4327
  "messages": [
@@ -4363,9 +4363,9 @@
4363
  ]
4364
  },
4365
  "predict": {
4366
- "1": 0.9959226846694946,
4367
- "2": 5.4001402531866916e-06,
4368
- "3": 0.004070108290761709
4369
  },
4370
  "sample": {
4371
  "messages": [
@@ -4407,9 +4407,9 @@
4407
  ]
4408
  },
4409
  "predict": {
4410
- "1": 0.9714142680168152,
4411
- "2": 0.017792074009776115,
4412
- "3": 0.01079143863171339
4413
  },
4414
  "sample": {
4415
  "messages": [
@@ -4451,9 +4451,9 @@
4451
  ]
4452
  },
4453
  "predict": {
4454
- "1": 0.8966901302337646,
4455
- "2": 0.008790841326117516,
4456
- "3": 0.09451044350862503
4457
  },
4458
  "sample": {
4459
  "messages": [
@@ -4488,16 +4488,16 @@
4488
  }
4489
  {
4490
  "metric": {
4491
- "acc": true,
4492
  "f1_macro": [
4493
  "2",
4494
- "2"
4495
  ]
4496
  },
4497
  "predict": {
4498
- "1": 0.0021758012007921934,
4499
- "2": 0.994655966758728,
4500
- "3": 0.0031657719518989325
4501
  },
4502
  "sample": {
4503
  "messages": [
@@ -4527,7 +4527,7 @@
4527
  "prompt_len": 103,
4528
  "generated_len": 1,
4529
  "generated_cumulative_logprob": "TODO: calculate for hf model",
4530
- "generated_token": "2"
4531
  }
4532
  }
4533
  {
@@ -4539,9 +4539,9 @@
4539
  ]
4540
  },
4541
  "predict": {
4542
- "1": 0.6097345352172852,
4543
- "2": 0.2541751563549042,
4544
- "3": 0.13605016469955444
4545
  },
4546
  "sample": {
4547
  "messages": [
@@ -4583,9 +4583,9 @@
4583
  ]
4584
  },
4585
  "predict": {
4586
- "1": 0.6989836692810059,
4587
- "2": 0.2269267737865448,
4588
- "3": 0.0736723318696022
4589
  },
4590
  "sample": {
4591
  "messages": [
@@ -4620,16 +4620,16 @@
4620
  }
4621
  {
4622
  "metric": {
4623
- "acc": false,
4624
  "f1_macro": [
4625
  "1",
4626
- "2"
4627
  ]
4628
  },
4629
  "predict": {
4630
- "1": 0.1931619942188263,
4631
- "2": 0.5250687003135681,
4632
- "3": 0.2810490131378174
4633
  },
4634
  "sample": {
4635
  "messages": [
@@ -4659,21 +4659,21 @@
4659
  "prompt_len": 114,
4660
  "generated_len": 1,
4661
  "generated_cumulative_logprob": "TODO: calculate for hf model",
4662
- "generated_token": "2"
4663
  }
4664
  }
4665
  {
4666
  "metric": {
4667
- "acc": false,
4668
  "f1_macro": [
4669
  "1",
4670
- "2"
4671
  ]
4672
  },
4673
  "predict": {
4674
- "1": 0.16358569264411926,
4675
- "2": 0.8307567834854126,
4676
- "3": 0.005597595125436783
4677
  },
4678
  "sample": {
4679
  "messages": [
@@ -4703,21 +4703,21 @@
4703
  "prompt_len": 101,
4704
  "generated_len": 1,
4705
  "generated_cumulative_logprob": "TODO: calculate for hf model",
4706
- "generated_token": "2"
4707
  }
4708
  }
4709
  {
4710
  "metric": {
4711
- "acc": true,
4712
  "f1_macro": [
4713
  "2",
4714
- "2"
4715
  ]
4716
  },
4717
  "predict": {
4718
- "1": 0.0016701437998563051,
4719
- "2": 0.9803500175476074,
4720
- "3": 0.017955737188458443
4721
  },
4722
  "sample": {
4723
  "messages": [
@@ -4747,7 +4747,7 @@
4747
  "prompt_len": 145,
4748
  "generated_len": 1,
4749
  "generated_cumulative_logprob": "TODO: calculate for hf model",
4750
- "generated_token": "2"
4751
  }
4752
  }
4753
  {
@@ -4755,13 +4755,13 @@
4755
  "acc": false,
4756
  "f1_macro": [
4757
  "3",
4758
- "2"
4759
  ]
4760
  },
4761
  "predict": {
4762
- "1": 0.26198387145996094,
4763
- "2": 0.6284666657447815,
4764
- "3": 0.10921111702919006
4765
  },
4766
  "sample": {
4767
  "messages": [
@@ -4791,7 +4791,7 @@
4791
  "prompt_len": 101,
4792
  "generated_len": 1,
4793
  "generated_cumulative_logprob": "TODO: calculate for hf model",
4794
- "generated_token": "2"
4795
  }
4796
  }
4797
  {
@@ -4803,9 +4803,9 @@
4803
  ]
4804
  },
4805
  "predict": {
4806
- "1": 0.8508579134941101,
4807
- "2": 0.001279213116504252,
4808
- "3": 0.1478569209575653
4809
  },
4810
  "sample": {
4811
  "messages": [
@@ -4847,9 +4847,9 @@
4847
  ]
4848
  },
4849
  "predict": {
4850
- "1": 8.93759624887025e-06,
4851
- "2": 0.9997561573982239,
4852
- "3": 0.00023050364688970149
4853
  },
4854
  "sample": {
4855
  "messages": [
@@ -4891,9 +4891,9 @@
4891
  ]
4892
  },
4893
  "predict": {
4894
- "1": 0.988469660282135,
4895
- "2": 0.0005467070732265711,
4896
- "3": 0.010980906896293163
4897
  },
4898
  "sample": {
4899
  "messages": [
@@ -4935,9 +4935,9 @@
4935
  ]
4936
  },
4937
  "predict": {
4938
- "1": 0.9697875380516052,
4939
- "2": 0.007404418662190437,
4940
- "3": 0.02280721813440323
4941
  },
4942
  "sample": {
4943
  "messages": [
@@ -4979,9 +4979,9 @@
4979
  ]
4980
  },
4981
  "predict": {
4982
- "1": 0.5741863250732422,
4983
- "2": 0.23935647308826447,
4984
- "3": 0.18641100823879242
4985
  },
4986
  "sample": {
4987
  "messages": [
@@ -5023,9 +5023,9 @@
5023
  ]
5024
  },
5025
  "predict": {
5026
- "1": 0.9745687246322632,
5027
- "2": 0.015752436593174934,
5028
- "3": 0.009554335847496986
5029
  },
5030
  "sample": {
5031
  "messages": [
@@ -5067,9 +5067,9 @@
5067
  ]
5068
  },
5069
  "predict": {
5070
- "1": 0.006572623271495104,
5071
- "2": 0.9754637479782104,
5072
- "3": 0.017866242676973343
5073
  },
5074
  "sample": {
5075
  "messages": [
@@ -5104,16 +5104,16 @@
5104
  }
5105
  {
5106
  "metric": {
5107
- "acc": true,
5108
  "f1_macro": [
5109
  "2",
5110
- "2"
5111
  ]
5112
  },
5113
  "predict": {
5114
- "1": 0.35423046350479126,
5115
- "2": 0.5154022574424744,
5116
- "3": 0.13031409680843353
5117
  },
5118
  "sample": {
5119
  "messages": [
@@ -5143,7 +5143,7 @@
5143
  "prompt_len": 95,
5144
  "generated_len": 1,
5145
  "generated_cumulative_logprob": "TODO: calculate for hf model",
5146
- "generated_token": "2"
5147
  }
5148
  }
5149
  {
@@ -5155,9 +5155,9 @@
5155
  ]
5156
  },
5157
  "predict": {
5158
- "1": 0.9688199758529663,
5159
- "2": 0.008381934836506844,
5160
- "3": 0.022784462198615074
5161
  },
5162
  "sample": {
5163
  "messages": [
@@ -5199,9 +5199,9 @@
5199
  ]
5200
  },
5201
  "predict": {
5202
- "1": 0.996656060218811,
5203
- "2": 0.0008020420791581273,
5204
- "3": 0.002470463514328003
5205
  },
5206
  "sample": {
5207
  "messages": [
@@ -5243,9 +5243,9 @@
5243
  ]
5244
  },
5245
  "predict": {
5246
- "1": 0.7234998941421509,
5247
- "2": 0.01032017171382904,
5248
- "3": 0.2661607563495636
5249
  },
5250
  "sample": {
5251
  "messages": [
@@ -5287,9 +5287,9 @@
5287
  ]
5288
  },
5289
  "predict": {
5290
- "1": 0.9639463424682617,
5291
- "2": 0.0030680301133543253,
5292
- "3": 0.032984428107738495
5293
  },
5294
  "sample": {
5295
  "messages": [
@@ -5331,9 +5331,9 @@
5331
  ]
5332
  },
5333
  "predict": {
5334
- "1": 0.4442080557346344,
5335
- "2": 0.20982903242111206,
5336
- "3": 0.34594959020614624
5337
  },
5338
  "sample": {
5339
  "messages": [
@@ -5375,9 +5375,9 @@
5375
  ]
5376
  },
5377
  "predict": {
5378
- "1": 0.9747047424316406,
5379
- "2": 0.007441961672157049,
5380
- "3": 0.017852339893579483
5381
  },
5382
  "sample": {
5383
  "messages": [
@@ -5419,9 +5419,9 @@
5419
  ]
5420
  },
5421
  "predict": {
5422
- "1": 0.3836464583873749,
5423
- "2": 0.3836464583873749,
5424
- "3": 0.23269334435462952
5425
  },
5426
  "sample": {
5427
  "messages": [
@@ -5463,9 +5463,9 @@
5463
  ]
5464
  },
5465
  "predict": {
5466
- "1": 0.8900082111358643,
5467
- "2": 0.10629633069038391,
5468
- "3": 0.003637260291725397
5469
  },
5470
  "sample": {
5471
  "messages": [
@@ -5507,9 +5507,9 @@
5507
  ]
5508
  },
5509
  "predict": {
5510
- "1": 0.8241075277328491,
5511
- "2": 0.07665394246578217,
5512
- "3": 0.09842561185359955
5513
  },
5514
  "sample": {
5515
  "messages": [
@@ -5551,9 +5551,9 @@
5551
  ]
5552
  },
5553
  "predict": {
5554
- "1": 0.02890811860561371,
5555
- "2": 0.9573054313659668,
5556
- "3": 0.013655227608978748
5557
  },
5558
  "sample": {
5559
  "messages": [
@@ -5595,9 +5595,9 @@
5595
  ]
5596
  },
5597
  "predict": {
5598
- "1": 0.023534899577498436,
5599
- "2": 0.7793688774108887,
5600
- "3": 0.19705531001091003
5601
  },
5602
  "sample": {
5603
  "messages": [
@@ -5639,9 +5639,9 @@
5639
  ]
5640
  },
5641
  "predict": {
5642
- "1": 0.8651604652404785,
5643
- "2": 0.043073803186416626,
5644
- "3": 0.0911872386932373
5645
  },
5646
  "sample": {
5647
  "messages": [
@@ -5683,9 +5683,9 @@
5683
  ]
5684
  },
5685
  "predict": {
5686
- "1": 0.8635392785072327,
5687
- "2": 0.003998980391770601,
5688
- "3": 0.13242805004119873
5689
  },
5690
  "sample": {
5691
  "messages": [
@@ -5727,9 +5727,9 @@
5727
  ]
5728
  },
5729
  "predict": {
5730
- "1": 0.43489623069763184,
5731
- "2": 0.18129171431064606,
5732
- "3": 0.3837945759296417
5733
  },
5734
  "sample": {
5735
  "messages": [
@@ -5771,9 +5771,9 @@
5771
  ]
5772
  },
5773
  "predict": {
5774
- "1": 0.9524779915809631,
5775
- "2": 0.010581075213849545,
5776
- "3": 0.03693157806992531
5777
  },
5778
  "sample": {
5779
  "messages": [
@@ -5815,9 +5815,9 @@
5815
  ]
5816
  },
5817
  "predict": {
5818
- "1": 0.9840602874755859,
5819
- "2": 0.014036864973604679,
5820
- "3": 0.0018996832659468055
5821
  },
5822
  "sample": {
5823
  "messages": [
@@ -5859,9 +5859,9 @@
5859
  ]
5860
  },
5861
  "predict": {
5862
- "1": 0.8148273229598999,
5863
- "2": 0.024605652317404747,
5864
- "3": 0.16044899821281433
5865
  },
5866
  "sample": {
5867
  "messages": [
@@ -5896,16 +5896,16 @@
5896
  }
5897
  {
5898
  "metric": {
5899
- "acc": true,
5900
  "f1_macro": [
5901
  "2",
5902
- "2"
5903
  ]
5904
  },
5905
  "predict": {
5906
- "1": 0.32032930850982666,
5907
- "2": 0.528133749961853,
5908
- "3": 0.15131285786628723
5909
  },
5910
  "sample": {
5911
  "messages": [
@@ -5935,7 +5935,7 @@
5935
  "prompt_len": 90,
5936
  "generated_len": 1,
5937
  "generated_cumulative_logprob": "TODO: calculate for hf model",
5938
- "generated_token": "2"
5939
  }
5940
  }
5941
  {
@@ -5943,13 +5943,13 @@
5943
  "acc": false,
5944
  "f1_macro": [
5945
  "3",
5946
- "2"
5947
  ]
5948
  },
5949
  "predict": {
5950
- "1": 0.01377924345433712,
5951
- "2": 0.9659996032714844,
5952
- "3": 0.020048681646585464
5953
  },
5954
  "sample": {
5955
  "messages": [
@@ -5979,7 +5979,7 @@
5979
  "prompt_len": 97,
5980
  "generated_len": 1,
5981
  "generated_cumulative_logprob": "TODO: calculate for hf model",
5982
- "generated_token": "2"
5983
  }
5984
  }
5985
  {
@@ -5991,9 +5991,9 @@
5991
  ]
5992
  },
5993
  "predict": {
5994
- "1": 0.9683293104171753,
5995
- "2": 0.0057578966952860355,
5996
- "3": 0.025805102661252022
5997
  },
5998
  "sample": {
5999
  "messages": [
@@ -6035,9 +6035,9 @@
6035
  ]
6036
  },
6037
  "predict": {
6038
- "1": 0.8739377856254578,
6039
- "2": 0.033886246383190155,
6040
- "3": 0.09211236238479614
6041
  },
6042
  "sample": {
6043
  "messages": [
@@ -6079,9 +6079,9 @@
6079
  ]
6080
  },
6081
  "predict": {
6082
- "1": 0.8764171004295349,
6083
- "2": 0.029989343136548996,
6084
- "3": 0.09237368404865265
6085
  },
6086
  "sample": {
6087
  "messages": [
@@ -6123,9 +6123,9 @@
6123
  ]
6124
  },
6125
  "predict": {
6126
- "1": 0.9496769905090332,
6127
- "2": 0.003022613935172558,
6128
- "3": 0.04728163033723831
6129
  },
6130
  "sample": {
6131
  "messages": [
@@ -6167,9 +6167,9 @@
6167
  ]
6168
  },
6169
  "predict": {
6170
- "1": 6.1392297538986895e-06,
6171
- "2": 0.9991890788078308,
6172
- "3": 0.000804080453235656
6173
  },
6174
  "sample": {
6175
  "messages": [
@@ -6207,13 +6207,13 @@
6207
  "acc": false,
6208
  "f1_macro": [
6209
  "3",
6210
- "2"
6211
  ]
6212
  },
6213
  "predict": {
6214
- "1": 0.07572191208600998,
6215
- "2": 0.7184295058250427,
6216
- "3": 0.20583350956439972
6217
  },
6218
  "sample": {
6219
  "messages": [
@@ -6243,21 +6243,21 @@
6243
  "prompt_len": 230,
6244
  "generated_len": 1,
6245
  "generated_cumulative_logprob": "TODO: calculate for hf model",
6246
- "generated_token": "2"
6247
  }
6248
  }
6249
  {
6250
  "metric": {
6251
- "acc": false,
6252
  "f1_macro": [
6253
  "1",
6254
- "3"
6255
  ]
6256
  },
6257
  "predict": {
6258
- "1": 0.22240525484085083,
6259
- "2": 0.0013224700232967734,
6260
- "3": 0.7762705683708191
6261
  },
6262
  "sample": {
6263
  "messages": [
@@ -6287,21 +6287,21 @@
6287
  "prompt_len": 126,
6288
  "generated_len": 1,
6289
  "generated_cumulative_logprob": "TODO: calculate for hf model",
6290
- "generated_token": "3"
6291
  }
6292
  }
6293
  {
6294
  "metric": {
6295
- "acc": true,
6296
  "f1_macro": [
6297
  "2",
6298
- "2"
6299
  ]
6300
  },
6301
  "predict": {
6302
- "1": 0.003063197946175933,
6303
- "2": 0.8493398427963257,
6304
- "3": 0.14759312570095062
6305
  },
6306
  "sample": {
6307
  "messages": [
@@ -6331,7 +6331,7 @@
6331
  "prompt_len": 108,
6332
  "generated_len": 1,
6333
  "generated_cumulative_logprob": "TODO: calculate for hf model",
6334
- "generated_token": "2"
6335
  }
6336
  }
6337
  {
@@ -6343,9 +6343,9 @@
6343
  ]
6344
  },
6345
  "predict": {
6346
- "1": 0.8214573264122009,
6347
- "2": 0.12597456574440002,
6348
- "3": 0.052514009177684784
6349
  },
6350
  "sample": {
6351
  "messages": [
@@ -6387,9 +6387,9 @@
6387
  ]
6388
  },
6389
  "predict": {
6390
- "1": 0.7958472371101379,
6391
- "2": 0.0950504019856453,
6392
- "3": 0.1077062115073204
6393
  },
6394
  "sample": {
6395
  "messages": [
@@ -6431,9 +6431,9 @@
6431
  ]
6432
  },
6433
  "predict": {
6434
- "1": 0.7490409016609192,
6435
- "2": 0.06148502230644226,
6436
- "3": 0.18938720226287842
6437
  },
6438
  "sample": {
6439
  "messages": [
@@ -6475,9 +6475,9 @@
6475
  ]
6476
  },
6477
  "predict": {
6478
- "1": 0.7041969299316406,
6479
- "2": 0.1386645883321762,
6480
- "3": 0.15712757408618927
6481
  },
6482
  "sample": {
6483
  "messages": [
@@ -6519,9 +6519,9 @@
6519
  ]
6520
  },
6521
  "predict": {
6522
- "1": 0.008256481029093266,
6523
- "2": 0.954319417476654,
6524
- "3": 0.03700298070907593
6525
  },
6526
  "sample": {
6527
  "messages": [
@@ -6563,9 +6563,9 @@
6563
  ]
6564
  },
6565
  "predict": {
6566
- "1": 0.6154287457466125,
6567
- "2": 0.01127197127789259,
6568
- "3": 0.37327641248703003
6569
  },
6570
  "sample": {
6571
  "messages": [
@@ -6607,9 +6607,9 @@
6607
  ]
6608
  },
6609
  "predict": {
6610
- "1": 0.9862869381904602,
6611
- "2": 0.008533054031431675,
6612
- "3": 0.005175558850169182
6613
  },
6614
  "sample": {
6615
  "messages": [
@@ -6651,9 +6651,9 @@
6651
  ]
6652
  },
6653
  "predict": {
6654
- "1": 0.970179557800293,
6655
- "2": 0.003964902367442846,
6656
- "3": 0.025854408740997314
6657
  },
6658
  "sample": {
6659
  "messages": [
@@ -6695,9 +6695,9 @@
6695
  ]
6696
  },
6697
  "predict": {
6698
- "1": 0.9508241415023804,
6699
- "2": 0.001835522474721074,
6700
- "3": 0.04733874648809433
6701
  },
6702
  "sample": {
6703
  "messages": [
@@ -6739,9 +6739,9 @@
6739
  ]
6740
  },
6741
  "predict": {
6742
- "1": 0.9564642906188965,
6743
- "2": 0.006444605998694897,
6744
- "3": 0.03708614408969879
6745
  },
6746
  "sample": {
6747
  "messages": [
@@ -6776,16 +6776,16 @@
6776
  }
6777
  {
6778
  "metric": {
6779
- "acc": true,
6780
  "f1_macro": [
6781
  "3",
6782
- "3"
6783
  ]
6784
  },
6785
  "predict": {
6786
- "1": 0.3263939917087555,
6787
- "2": 0.19796794652938843,
6788
- "3": 0.47490042448043823
6789
  },
6790
  "sample": {
6791
  "messages": [
@@ -6815,7 +6815,7 @@
6815
  "prompt_len": 87,
6816
  "generated_len": 1,
6817
  "generated_cumulative_logprob": "TODO: calculate for hf model",
6818
- "generated_token": "3"
6819
  }
6820
  }
6821
  {
@@ -6827,9 +6827,9 @@
6827
  ]
6828
  },
6829
  "predict": {
6830
- "1": 0.0021637813188135624,
6831
- "2": 0.9891611933708191,
6832
- "3": 0.008557921275496483
6833
  },
6834
  "sample": {
6835
  "messages": [
@@ -6871,9 +6871,9 @@
6871
  ]
6872
  },
6873
  "predict": {
6874
- "1": 0.3720066547393799,
6875
- "2": 0.25567618012428284,
6876
- "3": 0.3720066547393799
6877
  },
6878
  "sample": {
6879
  "messages": [
@@ -6915,9 +6915,9 @@
6915
  ]
6916
  },
6917
  "predict": {
6918
- "1": 0.5375309586524963,
6919
- "2": 0.28771960735321045,
6920
- "3": 0.17451074719429016
6921
  },
6922
  "sample": {
6923
  "messages": [
@@ -6959,9 +6959,9 @@
6959
  ]
6960
  },
6961
  "predict": {
6962
- "1": 0.9285780787467957,
6963
- "2": 0.011689072474837303,
6964
- "3": 0.059362009167671204
6965
  },
6966
  "sample": {
6967
  "messages": [
@@ -7003,9 +7003,9 @@
7003
  ]
7004
  },
7005
  "predict": {
7006
- "1": 0.6845002174377441,
7007
- "2": 0.030074842274188995,
7008
- "3": 0.28534215688705444
7009
  },
7010
  "sample": {
7011
  "messages": [
@@ -7047,9 +7047,9 @@
7047
  ]
7048
  },
7049
  "predict": {
7050
- "1": 0.996752917766571,
7051
- "2": 7.460878987330943e-05,
7052
- "3": 0.0031724462751299143
7053
  },
7054
  "sample": {
7055
  "messages": [
@@ -7091,9 +7091,9 @@
7091
  ]
7092
  },
7093
  "predict": {
7094
- "1": 0.0034236812498420477,
7095
- "2": 0.9492919445037842,
7096
- "3": 0.04726245999336243
7097
  },
7098
  "sample": {
7099
  "messages": [
@@ -7131,13 +7131,13 @@
7131
  "acc": false,
7132
  "f1_macro": [
7133
  "3",
7134
- "2"
7135
  ]
7136
  },
7137
  "predict": {
7138
- "1": 0.34595102071762085,
7139
- "2": 0.44420990347862244,
7140
- "3": 0.2098299115896225
7141
  },
7142
  "sample": {
7143
  "messages": [
@@ -7167,7 +7167,7 @@
7167
  "prompt_len": 106,
7168
  "generated_len": 1,
7169
  "generated_cumulative_logprob": "TODO: calculate for hf model",
7170
- "generated_token": "2"
7171
  }
7172
  }
7173
  {
@@ -7179,9 +7179,9 @@
7179
  ]
7180
  },
7181
  "predict": {
7182
- "1": 0.9988577365875244,
7183
- "2": 0.00010878437024075538,
7184
- "3": 0.0010321172885596752
7185
  },
7186
  "sample": {
7187
  "messages": [
@@ -7223,9 +7223,9 @@
7223
  ]
7224
  },
7225
  "predict": {
7226
- "1": 0.6895111203193665,
7227
- "2": 0.13577277958393097,
7228
- "3": 0.17433570325374603
7229
  },
7230
  "sample": {
7231
  "messages": [
@@ -7267,9 +7267,9 @@
7267
  ]
7268
  },
7269
  "predict": {
7270
- "1": 0.7882348299026489,
7271
- "2": 0.03463262319564819,
7272
- "3": 0.17587897181510925
7273
  },
7274
  "sample": {
7275
  "messages": [
@@ -7311,9 +7311,9 @@
7311
  ]
7312
  },
7313
  "predict": {
7314
- "1": 0.8739826679229736,
7315
- "2": 0.09211709350347519,
7316
- "3": 0.03388798609375954
7317
  },
7318
  "sample": {
7319
  "messages": [
@@ -7355,9 +7355,9 @@
7355
  ]
7356
  },
7357
  "predict": {
7358
- "1": 0.005791477393358946,
7359
- "2": 0.9739767909049988,
7360
- "3": 0.020214242860674858
7361
  },
7362
  "sample": {
7363
  "messages": [
@@ -7399,9 +7399,9 @@
7399
  ]
7400
  },
7401
  "predict": {
7402
- "1": 0.916553258895874,
7403
- "2": 0.024425316601991653,
7404
- "3": 0.05859328806400299
7405
  },
7406
  "sample": {
7407
  "messages": [
@@ -7443,9 +7443,9 @@
7443
  ]
7444
  },
7445
  "predict": {
7446
- "1": 0.6508558392524719,
7447
- "2": 0.07773364335298538,
7448
- "3": 0.2713170647621155
7449
  },
7450
  "sample": {
7451
  "messages": [
@@ -7487,9 +7487,9 @@
7487
  ]
7488
  },
7489
  "predict": {
7490
- "1": 0.38364943861961365,
7491
- "2": 0.38364943861961365,
7492
- "3": 0.23269516229629517
7493
  },
7494
  "sample": {
7495
  "messages": [
@@ -7531,9 +7531,9 @@
7531
  ]
7532
  },
7533
  "predict": {
7534
- "1": 0.009539642371237278,
7535
- "2": 0.8587311506271362,
7536
- "3": 0.13169069588184357
7537
  },
7538
  "sample": {
7539
  "messages": [
@@ -7575,9 +7575,9 @@
7575
  ]
7576
  },
7577
  "predict": {
7578
- "1": 0.9666678309440613,
7579
- "2": 0.01770513877272606,
7580
- "3": 0.015624729916453362
7581
  },
7582
  "sample": {
7583
  "messages": [
@@ -7619,9 +7619,9 @@
7619
  ]
7620
  },
7621
  "predict": {
7622
- "1": 0.9132862091064453,
7623
- "2": 0.0017630571965128183,
7624
- "3": 0.08494885265827179
7625
  },
7626
  "sample": {
7627
  "messages": [
@@ -7663,9 +7663,9 @@
7663
  ]
7664
  },
7665
  "predict": {
7666
- "1": 0.9931216835975647,
7667
- "2": 0.0059053171426057816,
7668
- "3": 0.0009056097478605807
7669
  },
7670
  "sample": {
7671
  "messages": [
@@ -7700,16 +7700,16 @@
7700
  }
7701
  {
7702
  "metric": {
7703
- "acc": false,
7704
  "f1_macro": [
7705
  "1",
7706
- "2"
7707
  ]
7708
  },
7709
  "predict": {
7710
- "1": 0.3257984519004822,
7711
- "2": 0.5371508002281189,
7712
- "3": 0.13581299781799316
7713
  },
7714
  "sample": {
7715
  "messages": [
@@ -7739,7 +7739,7 @@
7739
  "prompt_len": 126,
7740
  "generated_len": 1,
7741
  "generated_cumulative_logprob": "TODO: calculate for hf model",
7742
- "generated_token": "2"
7743
  }
7744
  }
7745
  {
@@ -7751,9 +7751,9 @@
7751
  ]
7752
  },
7753
  "predict": {
7754
- "1": 0.46300792694091797,
7755
- "2": 0.21870946884155273,
7756
- "3": 0.3182203769683838
7757
  },
7758
  "sample": {
7759
  "messages": [
@@ -7795,9 +7795,9 @@
7795
  ]
7796
  },
7797
  "predict": {
7798
- "1": 0.4980303943157196,
7799
- "2": 0.26657646894454956,
7800
- "3": 0.23525291681289673
7801
  },
7802
  "sample": {
7803
  "messages": [
@@ -7832,16 +7832,16 @@
7832
  }
7833
  {
7834
  "metric": {
7835
- "acc": true,
7836
  "f1_macro": [
7837
  "2",
7838
- "2"
7839
  ]
7840
  },
7841
  "predict": {
7842
- "1": 0.15115134418010712,
7843
- "2": 0.7676099538803101,
7844
- "3": 0.0809054896235466
7845
  },
7846
  "sample": {
7847
  "messages": [
@@ -7871,7 +7871,7 @@
7871
  "prompt_len": 114,
7872
  "generated_len": 1,
7873
  "generated_cumulative_logprob": "TODO: calculate for hf model",
7874
- "generated_token": "2"
7875
  }
7876
  }
7877
  {
@@ -7883,9 +7883,9 @@
7883
  ]
7884
  },
7885
  "predict": {
7886
- "1": 0.9888356328010559,
7887
- "2": 0.0001775555283529684,
7888
- "3": 0.01098497211933136
7889
  },
7890
  "sample": {
7891
  "messages": [
@@ -7927,9 +7927,9 @@
7927
  ]
7928
  },
7929
  "predict": {
7930
- "1": 0.8344322443008423,
7931
- "2": 0.07761428505182266,
7932
- "3": 0.0879485085606575
7933
  },
7934
  "sample": {
7935
  "messages": [
@@ -7967,13 +7967,13 @@
7967
  "acc": false,
7968
  "f1_macro": [
7969
  "3",
7970
- "2"
7971
  ]
7972
  },
7973
  "predict": {
7974
- "1": 0.010796502232551575,
7975
- "2": 0.857672393321991,
7976
- "3": 0.1315283179283142
7977
  },
7978
  "sample": {
7979
  "messages": [
@@ -8003,7 +8003,7 @@
8003
  "prompt_len": 116,
8004
  "generated_len": 1,
8005
  "generated_cumulative_logprob": "TODO: calculate for hf model",
8006
- "generated_token": "2"
8007
  }
8008
  }
8009
  {
@@ -8011,13 +8011,13 @@
8011
  "acc": false,
8012
  "f1_macro": [
8013
  "3",
8014
- "2"
8015
  ]
8016
  },
8017
  "predict": {
8018
- "1": 0.1164684072136879,
8019
- "2": 0.8605915904045105,
8020
- "3": 0.022933989763259888
8021
  },
8022
  "sample": {
8023
  "messages": [
@@ -8047,21 +8047,21 @@
8047
  "prompt_len": 145,
8048
  "generated_len": 1,
8049
  "generated_cumulative_logprob": "TODO: calculate for hf model",
8050
- "generated_token": "2"
8051
  }
8052
  }
8053
  {
8054
  "metric": {
8055
- "acc": true,
8056
  "f1_macro": [
8057
  "2",
8058
- "2"
8059
  ]
8060
  },
8061
  "predict": {
8062
- "1": 0.22721615433692932,
8063
- "2": 0.4810165762901306,
8064
- "3": 0.29175132513046265
8065
  },
8066
  "sample": {
8067
  "messages": [
@@ -8091,7 +8091,7 @@
8091
  "prompt_len": 129,
8092
  "generated_len": 1,
8093
  "generated_cumulative_logprob": "TODO: calculate for hf model",
8094
- "generated_token": "2"
8095
  }
8096
  }
8097
  {
@@ -8103,9 +8103,9 @@
8103
  ]
8104
  },
8105
  "predict": {
8106
- "1": 0.9846836924552917,
8107
- "2": 0.014045758172869682,
8108
- "3": 0.001152946031652391
8109
  },
8110
  "sample": {
8111
  "messages": [
@@ -8147,9 +8147,9 @@
8147
  ]
8148
  },
8149
  "predict": {
8150
- "1": 0.700525164604187,
8151
- "2": 0.17712049186229706,
8152
- "3": 0.12173300981521606
8153
  },
8154
  "sample": {
8155
  "messages": [
@@ -8191,9 +8191,9 @@
8191
  ]
8192
  },
8193
  "predict": {
8194
- "1": 0.8698589205741882,
8195
- "2": 0.11772260069847107,
8196
- "3": 0.012407870963215828
8197
  },
8198
  "sample": {
8199
  "messages": [
@@ -8235,9 +8235,9 @@
8235
  ]
8236
  },
8237
  "predict": {
8238
- "1": 0.994073212146759,
8239
- "2": 0.004603472538292408,
8240
- "3": 0.0013189169112592936
8241
  },
8242
  "sample": {
8243
  "messages": [
@@ -8279,9 +8279,9 @@
8279
  ]
8280
  },
8281
  "predict": {
8282
- "1": 0.06515555083751678,
8283
- "2": 0.899444580078125,
8284
- "3": 0.03487525135278702
8285
  },
8286
  "sample": {
8287
  "messages": [
@@ -8323,9 +8323,9 @@
8323
  ]
8324
  },
8325
  "predict": {
8326
- "1": 0.8127654790878296,
8327
- "2": 0.04585309326648712,
8328
- "3": 0.14123745262622833
8329
  },
8330
  "sample": {
8331
  "messages": [
@@ -8367,9 +8367,9 @@
8367
  ]
8368
  },
8369
  "predict": {
8370
- "1": 0.48418858647346497,
8371
- "2": 0.37708646059036255,
8372
- "3": 0.13872236013412476
8373
  },
8374
  "sample": {
8375
  "messages": [
@@ -8411,9 +8411,9 @@
8411
  ]
8412
  },
8413
  "predict": {
8414
- "1": 0.9524831771850586,
8415
- "2": 0.01058113295584917,
8416
- "3": 0.036931779235601425
8417
  },
8418
  "sample": {
8419
  "messages": [
@@ -8448,16 +8448,16 @@
8448
  }
8449
  {
8450
  "metric": {
8451
- "acc": false,
8452
  "f1_macro": [
8453
  "1",
8454
- "2"
8455
  ]
8456
  },
8457
  "predict": {
8458
- "1": 0.02530047483742237,
8459
- "2": 0.9493933320045471,
8460
- "3": 0.02530047483742237
8461
  },
8462
  "sample": {
8463
  "messages": [
@@ -8487,21 +8487,21 @@
8487
  "prompt_len": 161,
8488
  "generated_len": 1,
8489
  "generated_cumulative_logprob": "TODO: calculate for hf model",
8490
- "generated_token": "2"
8491
  }
8492
  }
8493
  {
8494
  "metric": {
8495
- "acc": true,
8496
  "f1_macro": [
8497
  "2",
8498
- "2"
8499
  ]
8500
  },
8501
  "predict": {
8502
- "1": 0.09962064027786255,
8503
- "2": 0.7361025214195251,
8504
- "3": 0.16424667835235596
8505
  },
8506
  "sample": {
8507
  "messages": [
@@ -8531,7 +8531,7 @@
8531
  "prompt_len": 114,
8532
  "generated_len": 1,
8533
  "generated_cumulative_logprob": "TODO: calculate for hf model",
8534
- "generated_token": "2"
8535
  }
8536
  }
8537
  {
@@ -8543,9 +8543,9 @@
8543
  ]
8544
  },
8545
  "predict": {
8546
- "1": 0.7294971346855164,
8547
- "2": 0.1436464935541153,
8548
- "3": 0.1267675906419754
8549
  },
8550
  "sample": {
8551
  "messages": [
@@ -8587,9 +8587,9 @@
8587
  ]
8588
  },
8589
  "predict": {
8590
- "1": 0.000977099291048944,
8591
- "2": 0.9456126093864441,
8592
- "3": 0.053347814828157425
8593
  },
8594
  "sample": {
8595
  "messages": [
@@ -8627,13 +8627,13 @@
8627
  "acc": false,
8628
  "f1_macro": [
8629
  "3",
8630
- "2"
8631
  ]
8632
  },
8633
  "predict": {
8634
- "1": 0.13583430647850037,
8635
- "2": 0.7816725373268127,
8636
- "3": 0.08238767832517624
8637
  },
8638
  "sample": {
8639
  "messages": [
@@ -8663,7 +8663,7 @@
8663
  "prompt_len": 115,
8664
  "generated_len": 1,
8665
  "generated_cumulative_logprob": "TODO: calculate for hf model",
8666
- "generated_token": "2"
8667
  }
8668
  }
8669
  {
@@ -8675,9 +8675,9 @@
8675
  ]
8676
  },
8677
  "predict": {
8678
- "1": 0.8978238701820374,
8679
- "2": 0.05739595741033554,
8680
- "3": 0.0447000153362751
8681
  },
8682
  "sample": {
8683
  "messages": [
@@ -8719,9 +8719,9 @@
8719
  ]
8720
  },
8721
  "predict": {
8722
- "1": 0.6564561724662781,
8723
- "2": 0.12926387786865234,
8724
- "3": 0.21312011778354645
8725
  },
8726
  "sample": {
8727
  "messages": [
@@ -8763,9 +8763,9 @@
8763
  ]
8764
  },
8765
  "predict": {
8766
- "1": 0.9665891528129578,
8767
- "2": 0.00032425453537143767,
8768
- "3": 0.033074863255023956
8769
  },
8770
  "sample": {
8771
  "messages": [
@@ -8807,9 +8807,9 @@
8807
  ]
8808
  },
8809
  "predict": {
8810
- "1": 3.2874963835638482e-06,
8811
- "2": 0.9996157884597778,
8812
- "3": 0.0003799829282797873
8813
  },
8814
  "sample": {
8815
  "messages": [
@@ -8851,9 +8851,9 @@
8851
  ]
8852
  },
8853
  "predict": {
8854
- "1": 0.0017215069383382797,
8855
- "2": 0.8917626738548279,
8856
- "3": 0.1065058633685112
8857
  },
8858
  "sample": {
8859
  "messages": [
@@ -8895,9 +8895,9 @@
8895
  ]
8896
  },
8897
  "predict": {
8898
- "1": 0.8058937788009644,
8899
- "2": 0.15868988633155823,
8900
- "3": 0.03540850058197975
8901
  },
8902
  "sample": {
8903
  "messages": [
@@ -8939,9 +8939,9 @@
8939
  ]
8940
  },
8941
  "predict": {
8942
- "1": 0.9398670792579651,
8943
- "2": 4.2669900722103193e-05,
8944
- "3": 0.06008369103074074
8945
  },
8946
  "sample": {
8947
  "messages": [
@@ -8983,9 +8983,9 @@
8983
  ]
8984
  },
8985
  "predict": {
8986
- "1": 0.7890399694442749,
8987
- "2": 0.13711456954479218,
8988
- "3": 0.07339214533567429
8989
  },
8990
  "sample": {
8991
  "messages": [
@@ -9027,9 +9027,9 @@
9027
  ]
9028
  },
9029
  "predict": {
9030
- "1": 0.02442534640431404,
9031
- "2": 0.9165543913841248,
9032
- "3": 0.05859335884451866
9033
  },
9034
  "sample": {
9035
  "messages": [
@@ -9071,9 +9071,9 @@
9071
  ]
9072
  },
9073
  "predict": {
9074
- "1": 0.9783619046211243,
9075
- "2": 0.005817552097141743,
9076
- "3": 0.01581374742090702
9077
  },
9078
  "sample": {
9079
  "messages": [
@@ -9115,9 +9115,9 @@
9115
  ]
9116
  },
9117
  "predict": {
9118
- "1": 0.9993676543235779,
9119
- "2": 0.00029585754964500666,
9120
- "3": 0.0003352504863869399
9121
  },
9122
  "sample": {
9123
  "messages": [
@@ -9152,16 +9152,16 @@
9152
  }
9153
  {
9154
  "metric": {
9155
- "acc": true,
9156
  "f1_macro": [
9157
  "2",
9158
- "2"
9159
  ]
9160
  },
9161
  "predict": {
9162
- "1": 0.2168908417224884,
9163
- "2": 0.7570233941078186,
9164
- "3": 0.025903915986418724
9165
  },
9166
  "sample": {
9167
  "messages": [
@@ -9191,7 +9191,7 @@
9191
  "prompt_len": 97,
9192
  "generated_len": 1,
9193
  "generated_cumulative_logprob": "TODO: calculate for hf model",
9194
- "generated_token": "2"
9195
  }
9196
  }
9197
  {
@@ -9203,9 +9203,9 @@
9203
  ]
9204
  },
9205
  "predict": {
9206
- "1": 0.9699215292930603,
9207
- "2": 0.0007805278874002397,
9208
- "3": 0.029289092868566513
9209
  },
9210
  "sample": {
9211
  "messages": [
@@ -9243,13 +9243,13 @@
9243
  "acc": false,
9244
  "f1_macro": [
9245
  "3",
9246
- "2"
9247
  ]
9248
  },
9249
  "predict": {
9250
- "1": 0.12808334827423096,
9251
- "2": 0.8352083563804626,
9252
- "3": 0.03669649362564087
9253
  },
9254
  "sample": {
9255
  "messages": [
@@ -9279,21 +9279,21 @@
9279
  "prompt_len": 187,
9280
  "generated_len": 1,
9281
  "generated_cumulative_logprob": "TODO: calculate for hf model",
9282
- "generated_token": "2"
9283
  }
9284
  }
9285
  {
9286
  "metric": {
9287
- "acc": false,
9288
  "f1_macro": [
9289
  "1",
9290
- "3"
9291
  ]
9292
  },
9293
  "predict": {
9294
- "1": 0.1864180713891983,
9295
- "2": 0.23936554789543152,
9296
- "3": 0.5742080807685852
9297
  },
9298
  "sample": {
9299
  "messages": [
@@ -9323,21 +9323,21 @@
9323
  "prompt_len": 128,
9324
  "generated_len": 1,
9325
  "generated_cumulative_logprob": "TODO: calculate for hf model",
9326
- "generated_token": "3"
9327
  }
9328
  }
9329
  {
9330
  "metric": {
9331
- "acc": true,
9332
  "f1_macro": [
9333
  "2",
9334
- "2"
9335
  ]
9336
  },
9337
  "predict": {
9338
- "1": 0.11355941742658615,
9339
- "2": 0.8390969038009644,
9340
- "3": 0.047338612377643585
9341
  },
9342
  "sample": {
9343
  "messages": [
@@ -9367,7 +9367,7 @@
9367
  "prompt_len": 145,
9368
  "generated_len": 1,
9369
  "generated_cumulative_logprob": "TODO: calculate for hf model",
9370
- "generated_token": "2"
9371
  }
9372
  }
9373
  {
@@ -9379,9 +9379,9 @@
9379
  ]
9380
  },
9381
  "predict": {
9382
- "1": 0.9668984413146973,
9383
- "2": 1.2576736480696127e-05,
9384
- "3": 0.03308544680476189
9385
  },
9386
  "sample": {
9387
  "messages": [
@@ -9423,9 +9423,9 @@
9423
  ]
9424
  },
9425
  "predict": {
9426
- "1": 0.44953903555870056,
9427
- "2": 0.44953903555870056,
9428
- "3": 0.1003057211637497
9429
  },
9430
  "sample": {
9431
  "messages": [
@@ -9467,9 +9467,9 @@
9467
  ]
9468
  },
9469
  "predict": {
9470
- "1": 0.8252539038658142,
9471
- "2": 0.08698112517595291,
9472
- "3": 0.08698112517595291
9473
  },
9474
  "sample": {
9475
  "messages": [
@@ -9511,9 +9511,9 @@
9511
  ]
9512
  },
9513
  "predict": {
9514
- "1": 0.017948845401406288,
9515
- "2": 0.7632042765617371,
9516
- "3": 0.21866169571876526
9517
  },
9518
  "sample": {
9519
  "messages": [
@@ -9555,9 +9555,9 @@
9555
  ]
9556
  },
9557
  "predict": {
9558
- "1": 0.43593066930770874,
9559
- "2": 0.26440533995628357,
9560
- "3": 0.2996104955673218
9561
  },
9562
  "sample": {
9563
  "messages": [
@@ -9599,9 +9599,9 @@
9599
  ]
9600
  },
9601
  "predict": {
9602
- "1": 0.8239489197731018,
9603
- "2": 0.07663918286561966,
9604
- "3": 0.09840666502714157
9605
  },
9606
  "sample": {
9607
  "messages": [
@@ -9643,9 +9643,9 @@
9643
  ]
9644
  },
9645
  "predict": {
9646
- "1": 0.8247462511062622,
9647
- "2": 0.09850189834833145,
9648
- "3": 0.07671334594488144
9649
  },
9650
  "sample": {
9651
  "messages": [
 
7
  ]
8
  },
9
  "predict": {
10
+ "1": 0.8427440524101257,
11
+ "2": 0.07838740944862366,
12
+ "3": 0.07838740944862366
13
  },
14
  "sample": {
15
  "messages": [
 
51
  ]
52
  },
53
  "predict": {
54
+ "1": 0.24942168593406677,
55
+ "2": 0.5280256867408752,
56
+ "3": 0.2201138585805893
57
  },
58
  "sample": {
59
  "messages": [
 
95
  ]
96
  },
97
  "predict": {
98
+ "1": 0.7282425165176392,
99
+ "2": 0.184128537774086,
100
+ "3": 0.08697616308927536
101
  },
102
  "sample": {
103
  "messages": [
 
139
  ]
140
  },
141
  "predict": {
142
+ "1": 0.5425964593887329,
143
+ "2": 0.25630444288253784,
144
+ "3": 0.19961009919643402
145
  },
146
  "sample": {
147
  "messages": [
 
183
  ]
184
  },
185
  "predict": {
186
+ "1": 0.904815137386322,
187
+ "2": 0.05104617774486542,
188
+ "3": 0.039754804223775864
189
  },
190
  "sample": {
191
  "messages": [
 
227
  ]
228
  },
229
  "predict": {
230
+ "1": 0.26357001066207886,
231
+ "2": 0.43455347418785095,
232
+ "3": 0.2986639440059662
233
  },
234
  "sample": {
235
  "messages": [
 
271
  ]
272
  },
273
  "predict": {
274
+ "1": 0.51088547706604,
275
+ "2": 0.27345728874206543,
276
+ "3": 0.21296873688697815
277
  },
278
  "sample": {
279
  "messages": [
 
315
  ]
316
  },
317
  "predict": {
318
+ "1": 0.3646199405193329,
319
+ "2": 0.4131685495376587,
320
+ "3": 0.22115319967269897
321
  },
322
  "sample": {
323
  "messages": [
 
359
  ]
360
  },
361
  "predict": {
362
+ "1": 0.7268967032432556,
363
+ "2": 0.12631569802761078,
364
+ "3": 0.14313443005084991
365
  },
366
  "sample": {
367
  "messages": [
 
403
  ]
404
  },
405
  "predict": {
406
+ "1": 0.5728665590286255,
407
+ "2": 0.07752905786037445,
408
+ "3": 0.34746116399765015
409
  },
410
  "sample": {
411
  "messages": [
 
447
  ]
448
  },
449
  "predict": {
450
+ "1": 0.8923271894454956,
451
+ "2": 0.0503416582942009,
452
+ "3": 0.05704456567764282
453
  },
454
  "sample": {
455
  "messages": [
 
491
  ]
492
  },
493
  "predict": {
494
+ "1": 0.10636842995882034,
495
+ "2": 0.6936092376708984,
496
+ "3": 0.19872237741947174
497
  },
498
  "sample": {
499
  "messages": [
 
535
  ]
536
  },
537
  "predict": {
538
+ "1": 0.8976678252220154,
539
+ "2": 0.04469224810600281,
540
+ "3": 0.05738598108291626
541
  },
542
  "sample": {
543
  "messages": [
 
579
  ]
580
  },
581
  "predict": {
582
+ "1": 0.9036626219749451,
583
+ "2": 0.04499071091413498,
584
+ "3": 0.05098116025328636
585
  },
586
  "sample": {
587
  "messages": [
 
623
  ]
624
  },
625
  "predict": {
626
+ "1": 0.7032796144485474,
627
+ "2": 0.17781692743301392,
628
+ "3": 0.10785142332315445
629
  },
630
  "sample": {
631
  "messages": [
 
660
  }
661
  {
662
  "metric": {
663
+ "acc": false,
664
  "f1_macro": [
665
  "2",
666
+ "1"
667
  ]
668
  },
669
  "predict": {
670
+ "1": 0.40004345774650574,
671
+ "2": 0.35303711891174316,
672
+ "3": 0.24263861775398254
673
  },
674
  "sample": {
675
  "messages": [
 
699
  "prompt_len": 101,
700
  "generated_len": 1,
701
  "generated_cumulative_logprob": "TODO: calculate for hf model",
702
+ "generated_token": "1"
703
  }
704
  }
705
  {
 
711
  ]
712
  },
713
  "predict": {
714
+ "1": 0.600358784198761,
715
+ "2": 0.17200568318367004,
716
+ "3": 0.22085967659950256
717
  },
718
  "sample": {
719
  "messages": [
 
755
  ]
756
  },
757
  "predict": {
758
+ "1": 0.784460723400116,
759
+ "2": 0.12030095607042313,
760
+ "3": 0.09369047731161118
761
  },
762
  "sample": {
763
  "messages": [
 
795
  "acc": false,
796
  "f1_macro": [
797
  "3",
798
+ "1"
799
  ]
800
  },
801
  "predict": {
802
+ "1": 0.5102749466896057,
803
+ "2": 0.27313050627708435,
804
+ "3": 0.21271422505378723
805
  },
806
  "sample": {
807
  "messages": [
 
831
  "prompt_len": 106,
832
  "generated_len": 1,
833
  "generated_cumulative_logprob": "TODO: calculate for hf model",
834
+ "generated_token": "1"
835
  }
836
  }
837
  {
 
843
  ]
844
  },
845
  "predict": {
846
+ "1": 0.7484683394432068,
847
+ "2": 0.061438024044036865,
848
+ "3": 0.1892424374818802
849
  },
850
  "sample": {
851
  "messages": [
 
887
  ]
888
  },
889
  "predict": {
890
+ "1": 0.28307968378067017,
891
+ "2": 0.46671947836875916,
892
+ "3": 0.24981695413589478
893
  },
894
  "sample": {
895
  "messages": [
 
931
  ]
932
  },
933
  "predict": {
934
+ "1": 0.7887781262397766,
935
+ "2": 0.07336778938770294,
936
+ "3": 0.13706907629966736
937
  },
938
  "sample": {
939
  "messages": [
 
975
  ]
976
  },
977
  "predict": {
978
+ "1": 0.6943702697753906,
979
+ "2": 0.19894041121006012,
980
+ "3": 0.10648513585329056
981
  },
982
  "sample": {
983
  "messages": [
 
1019
  ]
1020
  },
1021
  "predict": {
1022
+ "1": 0.48075437545776367,
1023
+ "2": 0.2915922701358795,
1024
+ "3": 0.22709229588508606
1025
  },
1026
  "sample": {
1027
  "messages": [
 
1056
  }
1057
  {
1058
  "metric": {
1059
+ "acc": true,
1060
  "f1_macro": [
1061
  "1",
1062
+ "1"
1063
  ]
1064
  },
1065
  "predict": {
1066
+ "1": 0.4402084946632385,
1067
+ "2": 0.3428347408771515,
1068
+ "3": 0.20793978869915009
1069
  },
1070
  "sample": {
1071
  "messages": [
 
1095
  "prompt_len": 89,
1096
  "generated_len": 1,
1097
  "generated_cumulative_logprob": "TODO: calculate for hf model",
1098
+ "generated_token": "1"
1099
  }
1100
  }
1101
  {
 
1103
  "acc": false,
1104
  "f1_macro": [
1105
  "3",
1106
+ "1"
1107
  ]
1108
  },
1109
  "predict": {
1110
+ "1": 0.41778889298439026,
1111
+ "2": 0.3253743350505829,
1112
+ "3": 0.2534017860889435
1113
  },
1114
  "sample": {
1115
  "messages": [
 
1139
  "prompt_len": 97,
1140
  "generated_len": 1,
1141
  "generated_cumulative_logprob": "TODO: calculate for hf model",
1142
+ "generated_token": "1"
1143
  }
1144
  }
1145
  {
 
1151
  ]
1152
  },
1153
  "predict": {
1154
+ "1": 0.5712354183197021,
1155
+ "2": 0.21014578640460968,
1156
+ "3": 0.21014578640460968
1157
  },
1158
  "sample": {
1159
  "messages": [
 
1195
  ]
1196
  },
1197
  "predict": {
1198
+ "1": 0.6559047102928162,
1199
+ "2": 0.21294108033180237,
1200
+ "3": 0.12915529310703278
1201
  },
1202
  "sample": {
1203
  "messages": [
 
1239
  ]
1240
  },
1241
  "predict": {
1242
+ "1": 0.06279128789901733,
1243
+ "2": 0.7649545073509216,
1244
+ "3": 0.17068442702293396
1245
  },
1246
  "sample": {
1247
  "messages": [
 
1283
  ]
1284
  },
1285
  "predict": {
1286
+ "1": 0.4659345746040344,
1287
+ "2": 0.24939681589603424,
1288
+ "3": 0.2826036214828491
1289
  },
1290
  "sample": {
1291
  "messages": [
 
1327
  ]
1328
  },
1329
  "predict": {
1330
+ "1": 0.8445973992347717,
1331
+ "2": 0.05399330332875252,
1332
+ "3": 0.10087277740240097
1333
  },
1334
  "sample": {
1335
  "messages": [
 
1364
  }
1365
  {
1366
  "metric": {
1367
+ "acc": true,
1368
  "f1_macro": [
1369
  "1",
1370
+ "1"
1371
  ]
1372
  },
1373
  "predict": {
1374
+ "1": 0.7727751135826111,
1375
+ "2": 0.09229482710361481,
1376
+ "3": 0.13428816199302673
1377
  },
1378
  "sample": {
1379
  "messages": [
 
1403
  "prompt_len": 164,
1404
  "generated_len": 1,
1405
  "generated_cumulative_logprob": "TODO: calculate for hf model",
1406
+ "generated_token": "1"
1407
  }
1408
  }
1409
  {
 
1415
  ]
1416
  },
1417
  "predict": {
1418
+ "1": 0.9634820222854614,
1419
+ "2": 0.013743331655859947,
1420
+ "3": 0.022658925503492355
1421
  },
1422
  "sample": {
1423
  "messages": [
 
1459
  ]
1460
  },
1461
  "predict": {
1462
+ "1": 0.35959210991859436,
1463
+ "2": 0.35959210991859436,
1464
+ "3": 0.2800506353378296
1465
  },
1466
  "sample": {
1467
  "messages": [
 
1503
  ]
1504
  },
1505
  "predict": {
1506
+ "1": 0.5241402983665466,
1507
+ "2": 0.24758636951446533,
1508
+ "3": 0.21849419176578522
1509
  },
1510
  "sample": {
1511
  "messages": [
 
1547
  ]
1548
  },
1549
  "predict": {
1550
+ "1": 0.7656006813049316,
1551
+ "2": 0.08069371432065964,
1552
+ "3": 0.15075570344924927
1553
  },
1554
  "sample": {
1555
  "messages": [
 
1587
  "acc": false,
1588
  "f1_macro": [
1589
  "3",
1590
+ "1"
1591
  ]
1592
  },
1593
  "predict": {
1594
+ "1": 0.4284827709197998,
1595
+ "2": 0.3337027430534363,
1596
+ "3": 0.22935031354427338
1597
  },
1598
  "sample": {
1599
  "messages": [
 
1623
  "prompt_len": 128,
1624
  "generated_len": 1,
1625
  "generated_cumulative_logprob": "TODO: calculate for hf model",
1626
+ "generated_token": "1"
1627
  }
1628
  }
1629
  {
 
1635
  ]
1636
  },
1637
  "predict": {
1638
+ "1": 0.5787113308906555,
1639
+ "2": 0.2733638882637024,
1640
+ "3": 0.1463211327791214
1641
  },
1642
  "sample": {
1643
  "messages": [
 
1679
  ]
1680
  },
1681
  "predict": {
1682
+ "1": 0.5426521301269531,
1683
+ "2": 0.2262110561132431,
1684
+ "3": 0.2262110561132431
1685
  },
1686
  "sample": {
1687
  "messages": [
 
1723
  ]
1724
  },
1725
  "predict": {
1726
+ "1": 0.34257927536964417,
1727
+ "2": 0.26680102944374084,
1728
+ "3": 0.38819319009780884
1729
  },
1730
  "sample": {
1731
  "messages": [
 
1760
  }
1761
  {
1762
  "metric": {
1763
+ "acc": true,
1764
  "f1_macro": [
1765
  "1",
1766
+ "1"
1767
  ]
1768
  },
1769
  "predict": {
1770
+ "1": 0.8387932181358337,
1771
+ "2": 0.04732147604227066,
1772
+ "3": 0.11351831257343292
1773
  },
1774
  "sample": {
1775
  "messages": [
 
1799
  "prompt_len": 164,
1800
  "generated_len": 1,
1801
  "generated_cumulative_logprob": "TODO: calculate for hf model",
1802
+ "generated_token": "1"
1803
  }
1804
  }
1805
  {
 
1811
  ]
1812
  },
1813
  "predict": {
1814
+ "1": 0.8853741884231567,
1815
+ "2": 0.04994939640164375,
1816
+ "3": 0.06413629651069641
1817
  },
1818
  "sample": {
1819
  "messages": [
 
1855
  ]
1856
  },
1857
  "predict": {
1858
+ "1": 0.13088563084602356,
1859
+ "2": 0.7531948089599609,
1860
+ "3": 0.11550617218017578
1861
  },
1862
  "sample": {
1863
  "messages": [
 
1899
  ]
1900
  },
1901
  "predict": {
1902
+ "1": 0.9762156009674072,
1903
+ "2": 0.00957048125565052,
1904
+ "3": 0.013924967497587204
1905
  },
1906
  "sample": {
1907
  "messages": [
 
1943
  ]
1944
  },
1945
  "predict": {
1946
+ "1": 0.9316370487213135,
1947
+ "2": 0.052559368312358856,
1948
+ "3": 0.013289088383316994
1949
  },
1950
  "sample": {
1951
  "messages": [
 
1987
  ]
1988
  },
1989
  "predict": {
1990
+ "1": 0.02406180277466774,
1991
+ "2": 0.796817421913147,
1992
+ "3": 0.17779400944709778
1993
  },
1994
  "sample": {
1995
  "messages": [
 
2031
  ]
2032
  },
2033
  "predict": {
2034
+ "1": 0.1850671023130417,
2035
+ "2": 0.5700468420982361,
2036
+ "3": 0.23763087391853333
2037
  },
2038
  "sample": {
2039
  "messages": [
 
2075
  ]
2076
  },
2077
  "predict": {
2078
+ "1": 0.6467720866203308,
2079
+ "2": 0.2379341721534729,
2080
+ "3": 0.1123921275138855
2081
  },
2082
  "sample": {
2083
  "messages": [
 
2119
  ]
2120
  },
2121
  "predict": {
2122
+ "1": 0.22013789415359497,
2123
+ "2": 0.5280833840370178,
2124
+ "3": 0.24944892525672913
2125
  },
2126
  "sample": {
2127
  "messages": [
 
2163
  ]
2164
  },
2165
  "predict": {
2166
+ "1": 0.3714810311794281,
2167
+ "2": 0.2553149461746216,
2168
+ "3": 0.3714810311794281
2169
  },
2170
  "sample": {
2171
  "messages": [
 
2207
  ]
2208
  },
2209
  "predict": {
2210
+ "1": 0.9313455820083618,
2211
+ "2": 0.02190314792096615,
2212
+ "3": 0.04636896401643753
2213
  },
2214
  "sample": {
2215
  "messages": [
 
2251
  ]
2252
  },
2253
  "predict": {
2254
+ "1": 0.3544420301914215,
2255
+ "2": 0.45511260628700256,
2256
+ "3": 0.18971915543079376
2257
  },
2258
  "sample": {
2259
  "messages": [
 
2295
  ]
2296
  },
2297
  "predict": {
2298
+ "1": 0.6566400527954102,
2299
+ "2": 0.1293000876903534,
2300
+ "3": 0.213179811835289
2301
  },
2302
  "sample": {
2303
  "messages": [
 
2339
  ]
2340
  },
2341
  "predict": {
2342
+ "1": 0.8832191824913025,
2343
+ "2": 0.043972890824079514,
2344
+ "3": 0.07249904423952103
2345
  },
2346
  "sample": {
2347
  "messages": [
 
2383
  ]
2384
  },
2385
  "predict": {
2386
+ "1": 0.5693199634552002,
2387
+ "2": 0.18483112752437592,
2388
+ "3": 0.23732785880565643
2389
  },
2390
  "sample": {
2391
  "messages": [
 
2420
  }
2421
  {
2422
  "metric": {
2423
+ "acc": false,
2424
  "f1_macro": [
2425
  "2",
2426
+ "1"
2427
  ]
2428
  },
2429
  "predict": {
2430
+ "1": 0.5541829466819763,
2431
+ "2": 0.26177749037742615,
2432
+ "3": 0.1799168586730957
2433
  },
2434
  "sample": {
2435
  "messages": [
 
2459
  "prompt_len": 111,
2460
  "generated_len": 1,
2461
  "generated_cumulative_logprob": "TODO: calculate for hf model",
2462
+ "generated_token": "1"
2463
  }
2464
  }
2465
  {
 
2471
  ]
2472
  },
2473
  "predict": {
2474
+ "1": 0.5337602496147156,
2475
+ "2": 0.28570127487182617,
2476
+ "3": 0.173286572098732
2477
  },
2478
  "sample": {
2479
  "messages": [
 
2515
  ]
2516
  },
2517
  "predict": {
2518
+ "1": 0.7345989942550659,
2519
+ "2": 0.16391119360923767,
2520
+ "3": 0.09941716492176056
2521
  },
2522
  "sample": {
2523
  "messages": [
 
2559
  ]
2560
  },
2561
  "predict": {
2562
+ "1": 0.044881440699100494,
2563
+ "2": 0.7955425381660461,
2564
+ "3": 0.1566516011953354
2565
  },
2566
  "sample": {
2567
  "messages": [
 
2603
  ]
2604
  },
2605
  "predict": {
2606
+ "1": 0.7258000373840332,
2607
+ "2": 0.11130504310131073,
2608
+ "3": 0.16194787621498108
2609
  },
2610
  "sample": {
2611
  "messages": [
 
2647
  ]
2648
  },
2649
  "predict": {
2650
+ "1": 0.9448137283325195,
2651
+ "2": 0.013477043248713017,
2652
+ "3": 0.04151221737265587
2653
  },
2654
  "sample": {
2655
  "messages": [
 
2691
  ]
2692
  },
2693
  "predict": {
2694
+ "1": 0.9257425665855408,
2695
+ "2": 0.021771378815174103,
2696
+ "3": 0.052226826548576355
2697
  },
2698
  "sample": {
2699
  "messages": [
 
2735
  ]
2736
  },
2737
  "predict": {
2738
+ "1": 0.8161900043487549,
2739
+ "2": 0.09747999161481857,
2740
+ "3": 0.08602578938007355
2741
  },
2742
  "sample": {
2743
  "messages": [
 
2779
  ]
2780
  },
2781
  "predict": {
2782
+ "1": 0.5138590335845947,
2783
+ "2": 0.2427298128604889,
2784
+ "3": 0.2427298128604889
2785
  },
2786
  "sample": {
2787
  "messages": [
 
2823
  ]
2824
  },
2825
  "predict": {
2826
+ "1": 0.6807488203048706,
2827
+ "2": 0.17212025821208954,
2828
+ "3": 0.13404738903045654
2829
  },
2830
  "sample": {
2831
  "messages": [
 
2867
  ]
2868
  },
2869
  "predict": {
2870
+ "1": 0.8128647208213806,
2871
+ "2": 0.09708285331726074,
2872
+ "3": 0.08567530661821365
2873
  },
2874
  "sample": {
2875
  "messages": [
 
2911
  ]
2912
  },
2913
  "predict": {
2914
+ "1": 0.4602404236793518,
2915
+ "2": 0.3163183033466339,
2916
+ "3": 0.21740218997001648
2917
  },
2918
  "sample": {
2919
  "messages": [
 
2955
  ]
2956
  },
2957
  "predict": {
2958
+ "1": 0.6141341924667358,
2959
+ "2": 0.22592736780643463,
2960
+ "3": 0.15527743101119995
2961
  },
2962
  "sample": {
2963
  "messages": [
 
2999
  ]
3000
  },
3001
  "predict": {
3002
+ "1": 0.8113856315612793,
3003
+ "2": 0.07547061890363693,
3004
+ "3": 0.10980910807847977
3005
  },
3006
  "sample": {
3007
  "messages": [
 
3036
  }
3037
  {
3038
  "metric": {
3039
+ "acc": false,
3040
  "f1_macro": [
3041
  "2",
3042
+ "1"
3043
  ]
3044
  },
3045
  "predict": {
3046
+ "1": 0.40465742349624634,
3047
+ "2": 0.3151475191116333,
3048
+ "3": 0.27811670303344727
3049
  },
3050
  "sample": {
3051
  "messages": [
 
3075
  "prompt_len": 170,
3076
  "generated_len": 1,
3077
  "generated_cumulative_logprob": "TODO: calculate for hf model",
3078
+ "generated_token": "1"
3079
  }
3080
  }
3081
  {
 
3083
  "acc": false,
3084
  "f1_macro": [
3085
  "3",
3086
+ "2"
3087
  ]
3088
  },
3089
  "predict": {
3090
+ "1": 0.3544919788837433,
3091
+ "2": 0.4551767110824585,
3092
+ "3": 0.18974587321281433
3093
  },
3094
  "sample": {
3095
  "messages": [
 
3119
  "prompt_len": 152,
3120
  "generated_len": 1,
3121
  "generated_cumulative_logprob": "TODO: calculate for hf model",
3122
+ "generated_token": "2"
3123
  }
3124
  }
3125
  {
 
3131
  ]
3132
  },
3133
  "predict": {
3134
+ "1": 0.14761339128017426,
3135
+ "2": 0.6615573167800903,
3136
+ "3": 0.1895393431186676
3137
  },
3138
  "sample": {
3139
  "messages": [
 
3175
  ]
3176
  },
3177
  "predict": {
3178
+ "1": 0.8731182217597961,
3179
+ "2": 0.06324847042560577,
3180
+ "3": 0.06324847042560577
3181
  },
3182
  "sample": {
3183
  "messages": [
 
3219
  ]
3220
  },
3221
  "predict": {
3222
+ "1": 0.8067973256111145,
3223
+ "2": 0.09635820239782333,
3224
+ "3": 0.09635820239782333
3225
  },
3226
  "sample": {
3227
  "messages": [
 
3263
  ]
3264
  },
3265
  "predict": {
3266
+ "1": 0.6339033842086792,
3267
+ "2": 0.2332000583410263,
3268
+ "3": 0.12482297420501709
3269
  },
3270
  "sample": {
3271
  "messages": [
 
3300
  }
3301
  {
3302
  "metric": {
3303
+ "acc": true,
3304
  "f1_macro": [
3305
  "1",
3306
+ "1"
3307
  ]
3308
  },
3309
  "predict": {
3310
+ "1": 0.6332080364227295,
3311
+ "2": 0.18141713738441467,
3312
+ "3": 0.18141713738441467
3313
  },
3314
  "sample": {
3315
  "messages": [
 
3339
  "prompt_len": 221,
3340
  "generated_len": 1,
3341
  "generated_cumulative_logprob": "TODO: calculate for hf model",
3342
+ "generated_token": "1"
3343
  }
3344
  }
3345
  {
3346
  "metric": {
3347
+ "acc": true,
3348
  "f1_macro": [
3349
  "1",
3350
+ "1"
3351
  ]
3352
  },
3353
  "predict": {
3354
+ "1": 0.42262449860572815,
3355
+ "2": 0.3729648292064667,
3356
+ "3": 0.19963368773460388
3357
  },
3358
  "sample": {
3359
  "messages": [
 
3383
  "prompt_len": 91,
3384
  "generated_len": 1,
3385
  "generated_cumulative_logprob": "TODO: calculate for hf model",
3386
+ "generated_token": "1"
3387
  }
3388
  }
3389
  {
 
3395
  ]
3396
  },
3397
  "predict": {
3398
+ "1": 0.6752980351448059,
3399
+ "2": 0.21923716366291046,
3400
+ "3": 0.10356030613183975
3401
  },
3402
  "sample": {
3403
  "messages": [
 
3439
  ]
3440
  },
3441
  "predict": {
3442
+ "1": 0.6745449304580688,
3443
+ "2": 0.15051132440567017,
3444
+ "3": 0.17055167257785797
3445
  },
3446
  "sample": {
3447
  "messages": [
 
3483
  ]
3484
  },
3485
  "predict": {
3486
+ "1": 0.6213181614875793,
3487
+ "2": 0.08408626914024353,
3488
+ "3": 0.293489933013916
3489
  },
3490
  "sample": {
3491
  "messages": [
 
3527
  ]
3528
  },
3529
  "predict": {
3530
+ "1": 0.09423360228538513,
3531
+ "2": 0.6144802570343018,
3532
+ "3": 0.2902599275112152
3533
  },
3534
  "sample": {
3535
  "messages": [
 
3571
  ]
3572
  },
3573
  "predict": {
3574
+ "1": 0.5732656121253967,
3575
+ "2": 0.23897266387939453,
3576
+ "3": 0.18611210584640503
3577
  },
3578
  "sample": {
3579
  "messages": [
 
3615
  ]
3616
  },
3617
  "predict": {
3618
+ "1": 0.9318123459815979,
3619
+ "2": 0.03188486397266388,
3620
+ "3": 0.036130283027887344
3621
  },
3622
  "sample": {
3623
  "messages": [
 
3659
  ]
3660
  },
3661
  "predict": {
3662
+ "1": 0.616383969783783,
3663
+ "2": 0.1558462679386139,
3664
+ "3": 0.22675500810146332
3665
  },
3666
  "sample": {
3667
  "messages": [
 
3703
  ]
3704
  },
3705
  "predict": {
3706
+ "1": 0.7337620854377747,
3707
+ "2": 0.1855241060256958,
3708
+ "3": 0.07733795046806335
3709
  },
3710
  "sample": {
3711
  "messages": [
 
3747
  ]
3748
  },
3749
  "predict": {
3750
+ "1": 0.04653485119342804,
3751
+ "2": 0.8248499035835266,
3752
+ "3": 0.12649483978748322
3753
  },
3754
  "sample": {
3755
  "messages": [
 
3791
  ]
3792
  },
3793
  "predict": {
3794
+ "1": 0.5722522139549255,
3795
+ "2": 0.21051985025405884,
3796
+ "3": 0.21051985025405884
3797
  },
3798
  "sample": {
3799
  "messages": [
 
3835
  ]
3836
  },
3837
  "predict": {
3838
+ "1": 0.5559288859367371,
3839
+ "2": 0.18048366904258728,
3840
+ "3": 0.26260221004486084
3841
  },
3842
  "sample": {
3843
  "messages": [
 
3879
  ]
3880
  },
3881
  "predict": {
3882
+ "1": 0.08464287221431732,
3883
+ "2": 0.7087060809135437,
3884
+ "3": 0.20304769277572632
3885
  },
3886
  "sample": {
3887
  "messages": [
 
3923
  ]
3924
  },
3925
  "predict": {
3926
+ "1": 0.7104495763778687,
3927
+ "2": 0.08485110849142075,
3928
+ "3": 0.20354722440242767
3929
  },
3930
  "sample": {
3931
  "messages": [
 
3967
  ]
3968
  },
3969
  "predict": {
3970
+ "1": 0.13575875759124756,
3971
+ "2": 0.6894398927688599,
3972
+ "3": 0.17431770265102386
3973
  },
3974
  "sample": {
3975
  "messages": [
 
4007
  "acc": false,
4008
  "f1_macro": [
4009
  "3",
4010
+ "1"
4011
  ]
4012
  },
4013
  "predict": {
4014
+ "1": 0.6457290053367615,
4015
+ "2": 0.14408162236213684,
4016
+ "3": 0.20963750779628754
4017
  },
4018
  "sample": {
4019
  "messages": [
 
4043
  "prompt_len": 114,
4044
  "generated_len": 1,
4045
  "generated_cumulative_logprob": "TODO: calculate for hf model",
4046
+ "generated_token": "1"
4047
  }
4048
  }
4049
  {
 
4055
  ]
4056
  },
4057
  "predict": {
4058
+ "1": 0.9141652584075928,
4059
+ "2": 0.04551360756158829,
4060
+ "3": 0.04016561806201935
4061
  },
4062
  "sample": {
4063
  "messages": [
 
4099
  ]
4100
  },
4101
  "predict": {
4102
+ "1": 0.7747328877449036,
4103
+ "2": 0.07206138223409653,
4104
+ "3": 0.15255394577980042
4105
  },
4106
  "sample": {
4107
  "messages": [
 
4143
  ]
4144
  },
4145
  "predict": {
4146
+ "1": 0.8732091784477234,
4147
+ "2": 0.09203556925058365,
4148
+ "3": 0.029879573732614517
4149
  },
4150
  "sample": {
4151
  "messages": [
 
4187
  ]
4188
  },
4189
  "predict": {
4190
+ "1": 0.09942677617073059,
4191
+ "2": 0.7346700429916382,
4192
+ "3": 0.16392704844474792
4193
  },
4194
  "sample": {
4195
  "messages": [
 
4231
  ]
4232
  },
4233
  "predict": {
4234
+ "1": 0.15539857745170593,
4235
+ "2": 0.6146132946014404,
4236
+ "3": 0.22610361874103546
4237
  },
4238
  "sample": {
4239
  "messages": [
 
4275
  ]
4276
  },
4277
  "predict": {
4278
+ "1": 0.841286301612854,
4279
+ "2": 0.08867092430591583,
4280
+ "3": 0.06905698776245117
4281
  },
4282
  "sample": {
4283
  "messages": [
 
4319
  ]
4320
  },
4321
  "predict": {
4322
+ "1": 0.8838312029838562,
4323
+ "2": 0.06402452290058136,
4324
+ "3": 0.049862347543239594
4325
  },
4326
  "sample": {
4327
  "messages": [
 
4363
  ]
4364
  },
4365
  "predict": {
4366
+ "1": 0.8444580435752869,
4367
+ "2": 0.025500424206256866,
4368
+ "3": 0.1295018345117569
4369
  },
4370
  "sample": {
4371
  "messages": [
 
4407
  ]
4408
  },
4409
  "predict": {
4410
+ "1": 0.8658605813980103,
4411
+ "2": 0.07107416540384293,
4412
+ "3": 0.06272273510694504
4413
  },
4414
  "sample": {
4415
  "messages": [
 
4451
  ]
4452
  },
4453
  "predict": {
4454
+ "1": 0.8546397686004639,
4455
+ "2": 0.04255000501871109,
4456
+ "3": 0.10207216441631317
4457
  },
4458
  "sample": {
4459
  "messages": [
 
4488
  }
4489
  {
4490
  "metric": {
4491
+ "acc": false,
4492
  "f1_macro": [
4493
  "2",
4494
+ "1"
4495
  ]
4496
  },
4497
  "predict": {
4498
+ "1": 0.4656330645084381,
4499
+ "2": 0.36263540387153625,
4500
+ "3": 0.1712968498468399
4501
  },
4502
  "sample": {
4503
  "messages": [
 
4527
  "prompt_len": 103,
4528
  "generated_len": 1,
4529
  "generated_cumulative_logprob": "TODO: calculate for hf model",
4530
+ "generated_token": "1"
4531
  }
4532
  }
4533
  {
 
4539
  ]
4540
  },
4541
  "predict": {
4542
+ "1": 0.449106901884079,
4543
+ "2": 0.3086663484573364,
4544
+ "3": 0.2403896003961563
4545
  },
4546
  "sample": {
4547
  "messages": [
 
4583
  ]
4584
  },
4585
  "predict": {
4586
+ "1": 0.6791367530822754,
4587
+ "2": 0.1945759356021881,
4588
+ "3": 0.11801625788211823
4589
  },
4590
  "sample": {
4591
  "messages": [
 
4620
  }
4621
  {
4622
  "metric": {
4623
+ "acc": true,
4624
  "f1_macro": [
4625
  "1",
4626
+ "1"
4627
  ]
4628
  },
4629
  "predict": {
4630
+ "1": 0.40359026193618774,
4631
+ "2": 0.31431639194488525,
4632
+ "3": 0.2773832678794861
4633
  },
4634
  "sample": {
4635
  "messages": [
 
4659
  "prompt_len": 114,
4660
  "generated_len": 1,
4661
  "generated_cumulative_logprob": "TODO: calculate for hf model",
4662
+ "generated_token": "1"
4663
  }
4664
  }
4665
  {
4666
  "metric": {
4667
+ "acc": true,
4668
  "f1_macro": [
4669
  "1",
4670
+ "1"
4671
  ]
4672
  },
4673
  "predict": {
4674
+ "1": 0.4426115155220032,
4675
+ "2": 0.39060327410697937,
4676
+ "3": 0.16282770037651062
4677
  },
4678
  "sample": {
4679
  "messages": [
 
4703
  "prompt_len": 101,
4704
  "generated_len": 1,
4705
  "generated_cumulative_logprob": "TODO: calculate for hf model",
4706
+ "generated_token": "1"
4707
  }
4708
  }
4709
  {
4710
  "metric": {
4711
+ "acc": false,
4712
  "f1_macro": [
4713
  "2",
4714
+ "1"
4715
  ]
4716
  },
4717
  "predict": {
4718
+ "1": 0.48508769273757935,
4719
+ "2": 0.3333955705165863,
4720
+ "3": 0.17845381796360016
4721
  },
4722
  "sample": {
4723
  "messages": [
 
4747
  "prompt_len": 145,
4748
  "generated_len": 1,
4749
  "generated_cumulative_logprob": "TODO: calculate for hf model",
4750
+ "generated_token": "1"
4751
  }
4752
  }
4753
  {
 
4755
  "acc": false,
4756
  "f1_macro": [
4757
  "3",
4758
+ "1"
4759
  ]
4760
  },
4761
  "predict": {
4762
+ "1": 0.5238513946533203,
4763
+ "2": 0.2803974747657776,
4764
+ "3": 0.19271418452262878
4765
  },
4766
  "sample": {
4767
  "messages": [
 
4791
  "prompt_len": 101,
4792
  "generated_len": 1,
4793
  "generated_cumulative_logprob": "TODO: calculate for hf model",
4794
+ "generated_token": "1"
4795
  }
4796
  }
4797
  {
 
4803
  ]
4804
  },
4805
  "predict": {
4806
+ "1": 0.5673630833625793,
4807
+ "2": 0.08700795471668243,
4808
+ "3": 0.34412312507629395
4809
  },
4810
  "sample": {
4811
  "messages": [
 
4847
  ]
4848
  },
4849
  "predict": {
4850
+ "1": 0.20961983501911163,
4851
+ "2": 0.6456745862960815,
4852
+ "3": 0.14406947791576385
4853
  },
4854
  "sample": {
4855
  "messages": [
 
4891
  ]
4892
  },
4893
  "predict": {
4894
+ "1": 0.9462574124336243,
4895
+ "2": 0.011911623179912567,
4896
+ "3": 0.04157565161585808
4897
  },
4898
  "sample": {
4899
  "messages": [
 
4935
  ]
4936
  },
4937
  "predict": {
4938
+ "1": 0.879802942276001,
4939
+ "2": 0.06373271346092224,
4940
+ "3": 0.05624391883611679
4941
  },
4942
  "sample": {
4943
  "messages": [
 
4979
  ]
4980
  },
4981
  "predict": {
4982
+ "1": 0.5246575474739075,
4983
+ "2": 0.28082895278930664,
4984
+ "3": 0.19301073253154755
4985
  },
4986
  "sample": {
4987
  "messages": [
 
5023
  ]
5024
  },
5025
  "predict": {
5026
+ "1": 0.8045055270195007,
5027
+ "2": 0.13980208337306976,
5028
+ "3": 0.051430314779281616
5029
  },
5030
  "sample": {
5031
  "messages": [
 
5067
  ]
5068
  },
5069
  "predict": {
5070
+ "1": 0.14742936193943024,
5071
+ "2": 0.6607325077056885,
5072
+ "3": 0.18930304050445557
5073
  },
5074
  "sample": {
5075
  "messages": [
 
5104
  }
5105
  {
5106
  "metric": {
5107
+ "acc": false,
5108
  "f1_macro": [
5109
  "2",
5110
+ "1"
5111
  ]
5112
  },
5113
  "predict": {
5114
+ "1": 0.37023016810417175,
5115
+ "2": 0.37023016810417175,
5116
+ "3": 0.25445523858070374
5117
  },
5118
  "sample": {
5119
  "messages": [
 
5143
  "prompt_len": 95,
5144
  "generated_len": 1,
5145
  "generated_cumulative_logprob": "TODO: calculate for hf model",
5146
+ "generated_token": "1"
5147
  }
5148
  }
5149
  {
 
5155
  ]
5156
  },
5157
  "predict": {
5158
+ "1": 0.6989871263504028,
5159
+ "2": 0.17673161625862122,
5160
+ "3": 0.12146573513746262
5161
  },
5162
  "sample": {
5163
  "messages": [
 
5199
  ]
5200
  },
5201
  "predict": {
5202
+ "1": 0.9553428888320923,
5203
+ "2": 0.01749771647155285,
5204
+ "3": 0.02545902505517006
5205
  },
5206
  "sample": {
5207
  "messages": [
 
5243
  ]
5244
  },
5245
  "predict": {
5246
+ "1": 0.6160777807235718,
5247
+ "2": 0.15576885640621185,
5248
+ "3": 0.22664237022399902
5249
  },
5250
  "sample": {
5251
  "messages": [
 
5287
  ]
5288
  },
5289
  "predict": {
5290
+ "1": 0.8549993634223938,
5291
+ "2": 0.0425679087638855,
5292
+ "3": 0.10211510956287384
5293
  },
5294
  "sample": {
5295
  "messages": [
 
5331
  ]
5332
  },
5333
  "predict": {
5334
+ "1": 0.542923629283905,
5335
+ "2": 0.19973047077655792,
5336
+ "3": 0.25645899772644043
5337
  },
5338
  "sample": {
5339
  "messages": [
 
5375
  ]
5376
  },
5377
  "predict": {
5378
+ "1": 0.9360924363136292,
5379
+ "2": 0.017145130783319473,
5380
+ "3": 0.046605296432971954
5381
  },
5382
  "sample": {
5383
  "messages": [
 
5419
  ]
5420
  },
5421
  "predict": {
5422
+ "1": 0.48258522152900696,
5423
+ "2": 0.2583092749118805,
5424
+ "3": 0.2583092749118805
5425
  },
5426
  "sample": {
5427
  "messages": [
 
5463
  ]
5464
  },
5465
  "predict": {
5466
+ "1": 0.5264328122138977,
5467
+ "2": 0.3618116080760956,
5468
+ "3": 0.10366075485944748
5469
  },
5470
  "sample": {
5471
  "messages": [
 
5507
  ]
5508
  },
5509
  "predict": {
5510
+ "1": 0.5575301647186279,
5511
+ "2": 0.20510391891002655,
5512
+ "3": 0.2324131578207016
5513
  },
5514
  "sample": {
5515
  "messages": [
 
5551
  ]
5552
  },
5553
  "predict": {
5554
+ "1": 0.3523302376270294,
5555
+ "2": 0.39924246072769165,
5556
+ "3": 0.24215279519557953
5557
  },
5558
  "sample": {
5559
  "messages": [
 
5595
  ]
5596
  },
5597
  "predict": {
5598
+ "1": 0.29909470677375793,
5599
+ "2": 0.4351802170276642,
5600
+ "3": 0.2639501392841339
5601
  },
5602
  "sample": {
5603
  "messages": [
 
5639
  ]
5640
  },
5641
  "predict": {
5642
+ "1": 0.8119111061096191,
5643
+ "2": 0.07551949471235275,
5644
+ "3": 0.10988021641969681
5645
  },
5646
  "sample": {
5647
  "messages": [
 
5683
  ]
5684
  },
5685
  "predict": {
5686
+ "1": 0.5787488222122192,
5687
+ "2": 0.14633060991764069,
5688
+ "3": 0.2733815908432007
5689
  },
5690
  "sample": {
5691
  "messages": [
 
5727
  ]
5728
  },
5729
  "predict": {
5730
+ "1": 0.4806901216506958,
5731
+ "2": 0.2270619422197342,
5732
+ "3": 0.2915532886981964
5733
  },
5734
  "sample": {
5735
  "messages": [
 
5771
  ]
5772
  },
5773
  "predict": {
5774
+ "1": 0.8475168347358704,
5775
+ "2": 0.03723729029297829,
5776
+ "3": 0.11469893157482147
5777
  },
5778
  "sample": {
5779
  "messages": [
 
5815
  ]
5816
  },
5817
  "predict": {
5818
+ "1": 0.8955039978027344,
5819
+ "2": 0.07350744307041168,
5820
+ "3": 0.030642462894320488
5821
  },
5822
  "sample": {
5823
  "messages": [
 
5859
  ]
5860
  },
5861
  "predict": {
5862
+ "1": 0.7252457737922668,
5863
+ "2": 0.11122004687786102,
5864
+ "3": 0.16182421147823334
5865
  },
5866
  "sample": {
5867
  "messages": [
 
5896
  }
5897
  {
5898
  "metric": {
5899
+ "acc": false,
5900
  "f1_macro": [
5901
  "2",
5902
+ "1"
5903
  ]
5904
  },
5905
  "predict": {
5906
+ "1": 0.5111352801322937,
5907
+ "2": 0.31001922488212585,
5908
+ "3": 0.16594134271144867
5909
  },
5910
  "sample": {
5911
  "messages": [
 
5935
  "prompt_len": 90,
5936
  "generated_len": 1,
5937
  "generated_cumulative_logprob": "TODO: calculate for hf model",
5938
+ "generated_token": "1"
5939
  }
5940
  }
5941
  {
 
5943
  "acc": false,
5944
  "f1_macro": [
5945
  "3",
5946
+ "1"
5947
  ]
5948
  },
5949
  "predict": {
5950
+ "1": 0.4119296967983246,
5951
+ "2": 0.36352667212486267,
5952
+ "3": 0.22049008309841156
5953
  },
5954
  "sample": {
5955
  "messages": [
 
5979
  "prompt_len": 97,
5980
  "generated_len": 1,
5981
  "generated_cumulative_logprob": "TODO: calculate for hf model",
5982
+ "generated_token": "1"
5983
  }
5984
  }
5985
  {
 
5991
  ]
5992
  },
5993
  "predict": {
5994
+ "1": 0.8117878437042236,
5995
+ "2": 0.09695423394441605,
5996
+ "3": 0.08556180447340012
5997
  },
5998
  "sample": {
5999
  "messages": [
 
6035
  ]
6036
  },
6037
  "predict": {
6038
+ "1": 0.6461917757987976,
6039
+ "2": 0.11229129135608673,
6040
+ "3": 0.23772069811820984
6041
  },
6042
  "sample": {
6043
  "messages": [
 
6079
  ]
6080
  },
6081
  "predict": {
6082
+ "1": 0.8521672487258911,
6083
+ "2": 0.05447722598910332,
6084
+ "3": 0.08981776237487793
6085
  },
6086
  "sample": {
6087
  "messages": [
 
6123
  ]
6124
  },
6125
  "predict": {
6126
+ "1": 0.6733472943305969,
6127
+ "2": 0.1325899362564087,
6128
+ "3": 0.19291724264621735
6129
  },
6130
  "sample": {
6131
  "messages": [
 
6167
  ]
6168
  },
6169
  "predict": {
6170
+ "1": 0.15595495700836182,
6171
+ "2": 0.6168138384819031,
6172
+ "3": 0.22691313922405243
6173
  },
6174
  "sample": {
6175
  "messages": [
 
6207
  "acc": false,
6208
  "f1_macro": [
6209
  "3",
6210
+ "1"
6211
  ]
6212
  },
6213
  "predict": {
6214
+ "1": 0.5484347343444824,
6215
+ "2": 0.1571291983127594,
6216
+ "3": 0.29355597496032715
6217
  },
6218
  "sample": {
6219
  "messages": [
 
6243
  "prompt_len": 230,
6244
  "generated_len": 1,
6245
  "generated_cumulative_logprob": "TODO: calculate for hf model",
6246
+ "generated_token": "1"
6247
  }
6248
  }
6249
  {
6250
  "metric": {
6251
+ "acc": true,
6252
  "f1_macro": [
6253
  "1",
6254
+ "1"
6255
  ]
6256
  },
6257
  "predict": {
6258
+ "1": 0.7922748327255249,
6259
+ "2": 0.030719827860593796,
6260
+ "3": 0.17678040266036987
6261
  },
6262
  "sample": {
6263
  "messages": [
 
6287
  "prompt_len": 126,
6288
  "generated_len": 1,
6289
  "generated_cumulative_logprob": "TODO: calculate for hf model",
6290
+ "generated_token": "1"
6291
  }
6292
  }
6293
  {
6294
  "metric": {
6295
+ "acc": false,
6296
  "f1_macro": [
6297
  "2",
6298
+ "3"
6299
  ]
6300
  },
6301
  "predict": {
6302
+ "1": 0.24063578248023987,
6303
+ "2": 0.3089824616909027,
6304
+ "3": 0.4495667815208435
6305
  },
6306
  "sample": {
6307
  "messages": [
 
6331
  "prompt_len": 108,
6332
  "generated_len": 1,
6333
  "generated_cumulative_logprob": "TODO: calculate for hf model",
6334
+ "generated_token": "3"
6335
  }
6336
  }
6337
  {
 
6343
  ]
6344
  },
6345
  "predict": {
6346
+ "1": 0.5717660784721375,
6347
+ "2": 0.21034102141857147,
6348
+ "3": 0.21034102141857147
6349
  },
6350
  "sample": {
6351
  "messages": [
 
6387
  ]
6388
  },
6389
  "predict": {
6390
+ "1": 0.9258212447166443,
6391
+ "2": 0.035897985100746155,
6392
+ "3": 0.035897985100746155
6393
  },
6394
  "sample": {
6395
  "messages": [
 
6431
  ]
6432
  },
6433
  "predict": {
6434
+ "1": 0.36907562613487244,
6435
+ "2": 0.36907562613487244,
6436
+ "3": 0.25366172194480896
6437
  },
6438
  "sample": {
6439
  "messages": [
 
6475
  ]
6476
  },
6477
  "predict": {
6478
+ "1": 0.55531245470047,
6479
+ "2": 0.26231104135513306,
6480
+ "3": 0.1802835613489151
6481
  },
6482
  "sample": {
6483
  "messages": [
 
6519
  ]
6520
  },
6521
  "predict": {
6522
+ "1": 0.24863293766975403,
6523
+ "2": 0.46450746059417725,
6524
+ "3": 0.2817380130290985
6525
  },
6526
  "sample": {
6527
  "messages": [
 
6563
  ]
6564
  },
6565
  "predict": {
6566
+ "1": 0.5585140585899353,
6567
+ "2": 0.14121447503566742,
6568
+ "3": 0.29895102977752686
6569
  },
6570
  "sample": {
6571
  "messages": [
 
6607
  ]
6608
  },
6609
  "predict": {
6610
+ "1": 0.850590705871582,
6611
+ "2": 0.06982073932886124,
6612
+ "3": 0.07911725342273712
6613
  },
6614
  "sample": {
6615
  "messages": [
 
6651
  ]
6652
  },
6653
  "predict": {
6654
+ "1": 0.8748275637626648,
6655
+ "2": 0.04355509951710701,
6656
+ "3": 0.08137163519859314
6657
  },
6658
  "sample": {
6659
  "messages": [
 
6695
  ]
6696
  },
6697
  "predict": {
6698
+ "1": 0.7593733072280884,
6699
+ "2": 0.09069421142339706,
6700
+ "3": 0.14952945709228516
6701
  },
6702
  "sample": {
6703
  "messages": [
 
6739
  ]
6740
  },
6741
  "predict": {
6742
+ "1": 0.6766987442970276,
6743
+ "2": 0.15099190175533295,
6744
+ "3": 0.17109623551368713
6745
  },
6746
  "sample": {
6747
  "messages": [
 
6776
  }
6777
  {
6778
  "metric": {
6779
+ "acc": false,
6780
  "f1_macro": [
6781
  "3",
6782
+ "1"
6783
  ]
6784
  },
6785
  "predict": {
6786
+ "1": 0.5178438425064087,
6787
+ "2": 0.27718183398246765,
6788
+ "3": 0.19050411880016327
6789
  },
6790
  "sample": {
6791
  "messages": [
 
6815
  "prompt_len": 87,
6816
  "generated_len": 1,
6817
  "generated_cumulative_logprob": "TODO: calculate for hf model",
6818
+ "generated_token": "1"
6819
  }
6820
  }
6821
  {
 
6827
  ]
6828
  },
6829
  "predict": {
6830
+ "1": 0.3446602523326874,
6831
+ "2": 0.4425525367259979,
6832
+ "3": 0.20904701948165894
6833
  },
6834
  "sample": {
6835
  "messages": [
 
6871
  ]
6872
  },
6873
  "predict": {
6874
+ "1": 0.5550274848937988,
6875
+ "2": 0.23136986792087555,
6876
+ "3": 0.20418322086334229
6877
  },
6878
  "sample": {
6879
  "messages": [
 
6915
  ]
6916
  },
6917
  "predict": {
6918
+ "1": 0.5104585289955139,
6919
+ "2": 0.2732287645339966,
6920
+ "3": 0.21279077231884003
6921
  },
6922
  "sample": {
6923
  "messages": [
 
6959
  ]
6960
  },
6961
  "predict": {
6962
+ "1": 0.8524977564811707,
6963
+ "2": 0.05449835956096649,
6964
+ "3": 0.08985260128974915
6965
  },
6966
  "sample": {
6967
  "messages": [
 
7003
  ]
7004
  },
7005
  "predict": {
7006
+ "1": 0.5477773547172546,
7007
+ "2": 0.15694084763526917,
7008
+ "3": 0.293204128742218
7009
  },
7010
  "sample": {
7011
  "messages": [
 
7047
  ]
7048
  },
7049
  "predict": {
7050
+ "1": 0.9256893396377563,
7051
+ "2": 0.014962374232709408,
7052
+ "3": 0.05917733907699585
7053
  },
7054
  "sample": {
7055
  "messages": [
 
7091
  ]
7092
  },
7093
  "predict": {
7094
+ "1": 0.25393006205558777,
7095
+ "2": 0.41865989565849304,
7096
+ "3": 0.3260526657104492
7097
  },
7098
  "sample": {
7099
  "messages": [
 
7131
  "acc": false,
7132
  "f1_macro": [
7133
  "3",
7134
+ "1"
7135
  ]
7136
  },
7137
  "predict": {
7138
+ "1": 0.6281933784484863,
7139
+ "2": 0.2310994565486908,
7140
+ "3": 0.14016889035701752
7141
  },
7142
  "sample": {
7143
  "messages": [
 
7167
  "prompt_len": 106,
7168
  "generated_len": 1,
7169
  "generated_cumulative_logprob": "TODO: calculate for hf model",
7170
+ "generated_token": "1"
7171
  }
7172
  }
7173
  {
 
7179
  ]
7180
  },
7181
  "predict": {
7182
+ "1": 0.8853259086608887,
7183
+ "2": 0.04994667321443558,
7184
+ "3": 0.06413279473781586
7185
  },
7186
  "sample": {
7187
  "messages": [
 
7223
  ]
7224
  },
7225
  "predict": {
7226
+ "1": 0.5514762997627258,
7227
+ "2": 0.260498970746994,
7228
+ "3": 0.1790381371974945
7229
  },
7230
  "sample": {
7231
  "messages": [
 
7267
  ]
7268
  },
7269
  "predict": {
7270
+ "1": 0.7609981298446655,
7271
+ "2": 0.10298989713191986,
7272
+ "3": 0.1322416365146637
7273
  },
7274
  "sample": {
7275
  "messages": [
 
7311
  ]
7312
  },
7313
  "predict": {
7314
+ "1": 0.5548850893974304,
7315
+ "2": 0.2621091604232788,
7316
+ "3": 0.18014481663703918
7317
  },
7318
  "sample": {
7319
  "messages": [
 
7355
  ]
7356
  },
7357
  "predict": {
7358
+ "1": 0.20582358539104462,
7359
+ "2": 0.4937451183795929,
7360
+ "3": 0.29947155714035034
7361
  },
7362
  "sample": {
7363
  "messages": [
 
7399
  ]
7400
  },
7401
  "predict": {
7402
+ "1": 0.6618460416793823,
7403
+ "2": 0.16734088957309723,
7404
+ "3": 0.16734088957309723
7405
  },
7406
  "sample": {
7407
  "messages": [
 
7443
  ]
7444
  },
7445
  "predict": {
7446
+ "1": 0.5680178999900818,
7447
+ "2": 0.1267419308423996,
7448
+ "3": 0.3040381073951721
7449
  },
7450
  "sample": {
7451
  "messages": [
 
7487
  ]
7488
  },
7489
  "predict": {
7490
+ "1": 0.6336524486541748,
7491
+ "2": 0.1602124273777008,
7492
+ "3": 0.2057168334722519
7493
  },
7494
  "sample": {
7495
  "messages": [
 
7531
  ]
7532
  },
7533
  "predict": {
7534
+ "1": 0.2308199256658554,
7535
+ "2": 0.431228369474411,
7536
+ "3": 0.33584100008010864
7537
  },
7538
  "sample": {
7539
  "messages": [
 
7575
  ]
7576
  },
7577
  "predict": {
7578
+ "1": 0.8781532645225525,
7579
+ "2": 0.07208320498466492,
7580
+ "3": 0.04954201728105545
7581
  },
7582
  "sample": {
7583
  "messages": [
 
7619
  ]
7620
  },
7621
  "predict": {
7622
+ "1": 0.7593562006950378,
7623
+ "2": 0.07063112407922745,
7624
+ "3": 0.1694352775812149
7625
  },
7626
  "sample": {
7627
  "messages": [
 
7663
  ]
7664
  },
7665
  "predict": {
7666
+ "1": 0.8923550844192505,
7667
+ "2": 0.0732489675283432,
7668
+ "3": 0.030534710735082626
7669
  },
7670
  "sample": {
7671
  "messages": [
 
7700
  }
7701
  {
7702
  "metric": {
7703
+ "acc": true,
7704
  "f1_macro": [
7705
  "1",
7706
+ "1"
7707
  ]
7708
  },
7709
  "predict": {
7710
+ "1": 0.6177648901939392,
7711
+ "2": 0.20055890083312988,
7712
+ "3": 0.17699261009693146
7713
  },
7714
  "sample": {
7715
  "messages": [
 
7739
  "prompt_len": 126,
7740
  "generated_len": 1,
7741
  "generated_cumulative_logprob": "TODO: calculate for hf model",
7742
+ "generated_token": "1"
7743
  }
7744
  }
7745
  {
 
7751
  ]
7752
  },
7753
  "predict": {
7754
+ "1": 0.4607653319835663,
7755
+ "2": 0.31667909026145935,
7756
+ "3": 0.2176501452922821
7757
  },
7758
  "sample": {
7759
  "messages": [
 
7795
  ]
7796
  },
7797
  "predict": {
7798
+ "1": 0.40412601828575134,
7799
+ "2": 0.40412601828575134,
7800
+ "3": 0.19089563190937042
7801
  },
7802
  "sample": {
7803
  "messages": [
 
7832
  }
7833
  {
7834
  "metric": {
7835
+ "acc": false,
7836
  "f1_macro": [
7837
  "2",
7838
+ "1"
7839
  ]
7840
  },
7841
  "predict": {
7842
+ "1": 0.8761624693870544,
7843
+ "2": 0.07191979140043259,
7844
+ "3": 0.04942970722913742
7845
  },
7846
  "sample": {
7847
  "messages": [
 
7871
  "prompt_len": 114,
7872
  "generated_len": 1,
7873
  "generated_cumulative_logprob": "TODO: calculate for hf model",
7874
+ "generated_token": "1"
7875
  }
7876
  }
7877
  {
 
7883
  ]
7884
  },
7885
  "predict": {
7886
+ "1": 0.7257519364356995,
7887
+ "2": 0.11129766702651978,
7888
+ "3": 0.1619371473789215
7889
  },
7890
  "sample": {
7891
  "messages": [
 
7927
  ]
7928
  },
7929
  "predict": {
7930
+ "1": 0.6352789998054504,
7931
+ "2": 0.1820105016231537,
7932
+ "3": 0.1820105016231537
7933
  },
7934
  "sample": {
7935
  "messages": [
 
7967
  "acc": false,
7968
  "f1_macro": [
7969
  "3",
7970
+ "1"
7971
  ]
7972
  },
7973
  "predict": {
7974
+ "1": 0.7201007008552551,
7975
+ "2": 0.09745503216981888,
7976
+ "3": 0.18206997215747833
7977
  },
7978
  "sample": {
7979
  "messages": [
 
8003
  "prompt_len": 116,
8004
  "generated_len": 1,
8005
  "generated_cumulative_logprob": "TODO: calculate for hf model",
8006
+ "generated_token": "1"
8007
  }
8008
  }
8009
  {
 
8011
  "acc": false,
8012
  "f1_macro": [
8013
  "3",
8014
+ "1"
8015
  ]
8016
  },
8017
  "predict": {
8018
+ "1": 0.6284369230270386,
8019
+ "2": 0.26197147369384766,
8020
+ "3": 0.1092059537768364
8021
  },
8022
  "sample": {
8023
  "messages": [
 
8047
  "prompt_len": 145,
8048
  "generated_len": 1,
8049
  "generated_cumulative_logprob": "TODO: calculate for hf model",
8050
+ "generated_token": "1"
8051
  }
8052
  }
8053
  {
8054
  "metric": {
8055
+ "acc": false,
8056
  "f1_macro": [
8057
  "2",
8058
+ "1"
8059
  ]
8060
  },
8061
  "predict": {
8062
+ "1": 0.5598638653755188,
8063
+ "2": 0.2059624344110489,
8064
+ "3": 0.23338598012924194
8065
  },
8066
  "sample": {
8067
  "messages": [
 
8091
  "prompt_len": 129,
8092
  "generated_len": 1,
8093
  "generated_cumulative_logprob": "TODO: calculate for hf model",
8094
+ "generated_token": "1"
8095
  }
8096
  }
8097
  {
 
8103
  ]
8104
  },
8105
  "predict": {
8106
+ "1": 0.8510578274726868,
8107
+ "2": 0.07916070520877838,
8108
+ "3": 0.06165042519569397
8109
  },
8110
  "sample": {
8111
  "messages": [
 
8147
  ]
8148
  },
8149
  "predict": {
8150
+ "1": 0.7012267112731934,
8151
+ "2": 0.1564648300409317,
8152
+ "3": 0.1380797177553177
8153
  },
8154
  "sample": {
8155
  "messages": [
 
8191
  ]
8192
  },
8193
  "predict": {
8194
+ "1": 0.5885195136070251,
8195
+ "2": 0.2779969573020935,
8196
+ "3": 0.13131646811962128
8197
  },
8198
  "sample": {
8199
  "messages": [
 
8235
  ]
8236
  },
8237
  "predict": {
8238
+ "1": 0.901430606842041,
8239
+ "2": 0.05762653052806854,
8240
+ "3": 0.03960609808564186
8241
  },
8242
  "sample": {
8243
  "messages": [
 
8279
  ]
8280
  },
8281
  "predict": {
8282
+ "1": 0.3171122670173645,
8283
+ "2": 0.46139562129974365,
8284
+ "3": 0.21794787049293518
8285
  },
8286
  "sample": {
8287
  "messages": [
 
8323
  ]
8324
  },
8325
  "predict": {
8326
+ "1": 0.6369884610176086,
8327
+ "2": 0.12543044984340668,
8328
+ "3": 0.23433497548103333
8329
  },
8330
  "sample": {
8331
  "messages": [
 
8367
  ]
8368
  },
8369
  "predict": {
8370
+ "1": 0.6911755204200745,
8371
+ "2": 0.15422211587429047,
8372
+ "3": 0.15422211587429047
8373
  },
8374
  "sample": {
8375
  "messages": [
 
8411
  ]
8412
  },
8413
  "predict": {
8414
+ "1": 0.9095896482467651,
8415
+ "2": 0.024239743128418922,
8416
+ "3": 0.06589046120643616
8417
  },
8418
  "sample": {
8419
  "messages": [
 
8448
  }
8449
  {
8450
  "metric": {
8451
+ "acc": true,
8452
  "f1_macro": [
8453
  "1",
8454
+ "1"
8455
  ]
8456
  },
8457
  "predict": {
8458
+ "1": 0.42432355880737305,
8459
+ "2": 0.3744642436504364,
8460
+ "3": 0.20043626427650452
8461
  },
8462
  "sample": {
8463
  "messages": [
 
8487
  "prompt_len": 161,
8488
  "generated_len": 1,
8489
  "generated_cumulative_logprob": "TODO: calculate for hf model",
8490
+ "generated_token": "1"
8491
  }
8492
  }
8493
  {
8494
  "metric": {
8495
+ "acc": false,
8496
  "f1_macro": [
8497
  "2",
8498
+ "1"
8499
  ]
8500
  },
8501
  "predict": {
8502
+ "1": 0.4509470462799072,
8503
+ "2": 0.27351319789886475,
8504
+ "3": 0.27351319789886475
8505
  },
8506
  "sample": {
8507
  "messages": [
 
8531
  "prompt_len": 114,
8532
  "generated_len": 1,
8533
  "generated_cumulative_logprob": "TODO: calculate for hf model",
8534
+ "generated_token": "1"
8535
  }
8536
  }
8537
  {
 
8543
  ]
8544
  },
8545
  "predict": {
8546
+ "1": 0.5937157273292542,
8547
+ "2": 0.2474975287914276,
8548
+ "3": 0.15011483430862427
8549
  },
8550
  "sample": {
8551
  "messages": [
 
8587
  ]
8588
  },
8589
  "predict": {
8590
+ "1": 0.07914919406175613,
8591
+ "2": 0.5161175727844238,
8592
+ "3": 0.4019527733325958
8593
  },
8594
  "sample": {
8595
  "messages": [
 
8627
  "acc": false,
8628
  "f1_macro": [
8629
  "3",
8630
+ "1"
8631
  ]
8632
  },
8633
  "predict": {
8634
+ "1": 0.4203770160675049,
8635
+ "2": 0.4203770160675049,
8636
+ "3": 0.1546480804681778
8637
  },
8638
  "sample": {
8639
  "messages": [
 
8663
  "prompt_len": 115,
8664
  "generated_len": 1,
8665
  "generated_cumulative_logprob": "TODO: calculate for hf model",
8666
+ "generated_token": "1"
8667
  }
8668
  }
8669
  {
 
8675
  ]
8676
  },
8677
  "predict": {
8678
+ "1": 0.6712876558303833,
8679
+ "2": 0.19232714176177979,
8680
+ "3": 0.13218437135219574
8681
  },
8682
  "sample": {
8683
  "messages": [
 
8719
  ]
8720
  },
8721
  "predict": {
8722
+ "1": 0.6324385404586792,
8723
+ "2": 0.1811966896057129,
8724
+ "3": 0.1811966896057129
8725
  },
8726
  "sample": {
8727
  "messages": [
 
8763
  ]
8764
  },
8765
  "predict": {
8766
+ "1": 0.7968239784240723,
8767
+ "2": 0.044953733682632446,
8768
+ "3": 0.1569039225578308
8769
  },
8770
  "sample": {
8771
  "messages": [
 
8807
  ]
8808
  },
8809
  "predict": {
8810
+ "1": 0.14410072565078735,
8811
+ "2": 0.6458145976066589,
8812
+ "3": 0.20966529846191406
8813
  },
8814
  "sample": {
8815
  "messages": [
 
8851
  ]
8852
  },
8853
  "predict": {
8854
+ "1": 0.1411750763654709,
8855
+ "2": 0.5583582520484924,
8856
+ "3": 0.2988676428794861
8857
  },
8858
  "sample": {
8859
  "messages": [
 
8895
  ]
8896
  },
8897
  "predict": {
8898
+ "1": 0.7890230417251587,
8899
+ "2": 0.137111634016037,
8900
+ "3": 0.07339057326316833
8901
  },
8902
  "sample": {
8903
  "messages": [
 
8939
  ]
8940
  },
8941
  "predict": {
8942
+ "1": 0.7645320296287537,
8943
+ "2": 0.015867354348301888,
8944
+ "3": 0.2190420925617218
8945
  },
8946
  "sample": {
8947
  "messages": [
 
8983
  ]
8984
  },
8985
  "predict": {
8986
+ "1": 0.8434564471244812,
8987
+ "2": 0.0784536674618721,
8988
+ "3": 0.06923512369394302
8989
  },
8990
  "sample": {
8991
  "messages": [
 
9027
  ]
9028
  },
9029
  "predict": {
9030
+ "1": 0.35281944274902344,
9031
+ "2": 0.39979681372642517,
9032
+ "3": 0.2424890249967575
9033
  },
9034
  "sample": {
9035
  "messages": [
 
9071
  ]
9072
  },
9073
  "predict": {
9074
+ "1": 0.7395572662353516,
9075
+ "2": 0.12851576507091522,
9076
+ "3": 0.12851576507091522
9077
  },
9078
  "sample": {
9079
  "messages": [
 
9115
  ]
9116
  },
9117
  "predict": {
9118
+ "1": 0.9013769030570984,
9119
+ "2": 0.05762309208512306,
9120
+ "3": 0.039603736251592636
9121
  },
9122
  "sample": {
9123
  "messages": [
 
9152
  }
9153
  {
9154
  "metric": {
9155
+ "acc": false,
9156
  "f1_macro": [
9157
  "2",
9158
+ "1"
9159
  ]
9160
  },
9161
  "predict": {
9162
+ "1": 0.4002441465854645,
9163
+ "2": 0.35321420431137085,
9164
+ "3": 0.24276034533977509
9165
  },
9166
  "sample": {
9167
  "messages": [
 
9191
  "prompt_len": 97,
9192
  "generated_len": 1,
9193
  "generated_cumulative_logprob": "TODO: calculate for hf model",
9194
+ "generated_token": "1"
9195
  }
9196
  }
9197
  {
 
9203
  ]
9204
  },
9205
  "predict": {
9206
+ "1": 0.788357138633728,
9207
+ "2": 0.07332863658666611,
9208
+ "3": 0.13699591159820557
9209
  },
9210
  "sample": {
9211
  "messages": [
 
9243
  "acc": false,
9244
  "f1_macro": [
9245
  "3",
9246
+ "1"
9247
  ]
9248
  },
9249
  "predict": {
9250
+ "1": 0.7169543504714966,
9251
+ "2": 0.14117667078971863,
9252
+ "3": 0.14117667078971863
9253
  },
9254
  "sample": {
9255
  "messages": [
 
9279
  "prompt_len": 187,
9280
  "generated_len": 1,
9281
  "generated_cumulative_logprob": "TODO: calculate for hf model",
9282
+ "generated_token": "1"
9283
  }
9284
  }
9285
  {
9286
  "metric": {
9287
+ "acc": true,
9288
  "f1_macro": [
9289
  "1",
9290
+ "1"
9291
  ]
9292
  },
9293
  "predict": {
9294
+ "1": 0.6092379093170166,
9295
+ "2": 0.1359393447637558,
9296
+ "3": 0.25396811962127686
9297
  },
9298
  "sample": {
9299
  "messages": [
 
9323
  "prompt_len": 128,
9324
  "generated_len": 1,
9325
  "generated_cumulative_logprob": "TODO: calculate for hf model",
9326
+ "generated_token": "1"
9327
  }
9328
  }
9329
  {
9330
  "metric": {
9331
+ "acc": false,
9332
  "f1_macro": [
9333
  "2",
9334
+ "1"
9335
  ]
9336
  },
9337
  "predict": {
9338
+ "1": 0.6041527390480042,
9339
+ "2": 0.2222554087638855,
9340
+ "3": 0.17309266328811646
9341
  },
9342
  "sample": {
9343
  "messages": [
 
9367
  "prompt_len": 145,
9368
  "generated_len": 1,
9369
  "generated_cumulative_logprob": "TODO: calculate for hf model",
9370
+ "generated_token": "1"
9371
  }
9372
  }
9373
  {
 
9379
  ]
9380
  },
9381
  "predict": {
9382
+ "1": 0.792116105556488,
9383
+ "2": 0.030713675543665886,
9384
+ "3": 0.1767449975013733
9385
  },
9386
  "sample": {
9387
  "messages": [
 
9423
  ]
9424
  },
9425
  "predict": {
9426
+ "1": 0.5611218810081482,
9427
+ "2": 0.30034691095352173,
9428
+ "3": 0.1252032220363617
9429
  },
9430
  "sample": {
9431
  "messages": [
 
9467
  ]
9468
  },
9469
  "predict": {
9470
+ "1": 0.6750280857086182,
9471
+ "2": 0.17067383229732513,
9472
+ "3": 0.15061913430690765
9473
  },
9474
  "sample": {
9475
  "messages": [
 
9511
  ]
9512
  },
9513
  "predict": {
9514
+ "1": 0.3177637457847595,
9515
+ "2": 0.3600735068321228,
9516
+ "3": 0.3177637457847595
9517
  },
9518
  "sample": {
9519
  "messages": [
 
9555
  ]
9556
  },
9557
  "predict": {
9558
+ "1": 0.4433843195438385,
9559
+ "2": 0.3453080654144287,
9560
+ "3": 0.2094399333000183
9561
  },
9562
  "sample": {
9563
  "messages": [
 
9599
  ]
9600
  },
9601
  "predict": {
9602
+ "1": 0.8568756580352783,
9603
+ "2": 0.07033663988113403,
9604
+ "3": 0.07033663988113403
9605
  },
9606
  "sample": {
9607
  "messages": [
 
9643
  ]
9644
  },
9645
  "predict": {
9646
+ "1": 0.49714383482933044,
9647
+ "2": 0.2661019265651703,
9648
+ "3": 0.23483411967754364
9649
  },
9650
  "sample": {
9651
  "messages": [
llmtf_eval/darumeru_RCB_params.jsonl CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "custom_generation_config": null,
3
  "model_params": {
4
- "model_name_or_path": "/scratch/tikhomirov/workdir/data/models/ruadapt_qwen2.5_1.5B_ext_u48_part1-2_lr5e4_bs128_reinit_peft_mlp_32_128_lm_head_mts_1e4_as1.5_nm_pv21/simpo2",
5
  "generation_config": {
6
  "bos_token_id": 145109,
7
  "do_sample": true,
@@ -36,7 +36,7 @@
36
  },
37
  "load_in_8bit": false,
38
  "torch_dtype": "auto",
39
- "use_flash_attention_2": true,
40
  "device_map": "cuda:0",
41
  "use_fast_tokenizer": true,
42
  "leading_space": false,
@@ -47,7 +47,7 @@
47
  "task_params": {
48
  "max_len": 4000,
49
  "few_shot_count": 0,
50
- "batch_size": 2,
51
  "max_sample_per_dataset": 10000000000000,
52
  "method": "calculate_tokens_proba"
53
  }
 
1
  {
2
  "custom_generation_config": null,
3
  "model_params": {
4
+ "model_name_or_path": "/scratch/tikhomirov/workdir/data/models/ruadapt_qwen2.5_1.5B_ext_u48_part1-2_lr5e4_bs128_reinit_peft_mlp_32_128_lm_head_mts_1e4_as1.75_v1/simpo2",
5
  "generation_config": {
6
  "bos_token_id": 145109,
7
  "do_sample": true,
 
36
  },
37
  "load_in_8bit": false,
38
  "torch_dtype": "auto",
39
+ "attn_implementation": "flash_attention_2",
40
  "device_map": "cuda:0",
41
  "use_fast_tokenizer": true,
42
  "leading_space": false,
 
47
  "task_params": {
48
  "max_len": 4000,
49
  "few_shot_count": 0,
50
+ "batch_size": 16,
51
  "max_sample_per_dataset": 10000000000000,
52
  "method": "calculate_tokens_proba"
53
  }
llmtf_eval/darumeru_RCB_total.jsonl CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "task_name": "darumeru/RCB",
3
  "results": {
4
- "acc": 0.4727272727272727,
5
- "f1_macro": 0.39356669305497743
6
  },
7
- "leaderboard_result": 0.4331469828911251
8
  }
 
1
  {
2
  "task_name": "darumeru/RCB",
3
  "results": {
4
+ "acc": 0.4590909090909091,
5
+ "f1_macro": 0.36910715356478985
6
  },
7
+ "leaderboard_result": 0.4140990313278495
8
  }
llmtf_eval/darumeru_RWSD.jsonl CHANGED
@@ -3,8 +3,8 @@
3
  "acc": true
4
  },
5
  "predict": {
6
- "Да": 0.002472260966897011,
7
- "Нет": 0.9973811507225037
8
  },
9
  "sample": {
10
  "messages": [
@@ -42,8 +42,8 @@
42
  "acc": false
43
  },
44
  "predict": {
45
- "Да": 0.7760170102119446,
46
- "Нет": 0.22233259677886963
47
  },
48
  "sample": {
49
  "messages": [
@@ -81,8 +81,8 @@
81
  "acc": false
82
  },
83
  "predict": {
84
- "Да": 9.608079562895e-05,
85
- "Нет": 0.9996789693832397
86
  },
87
  "sample": {
88
  "messages": [
@@ -120,8 +120,8 @@
120
  "acc": false
121
  },
122
  "predict": {
123
- "Да": 0.0028004839550703764,
124
- "Нет": 0.9970412850379944
125
  },
126
  "sample": {
127
  "messages": [
@@ -159,8 +159,8 @@
159
  "acc": true
160
  },
161
  "predict": {
162
- "Да": 0.0330502949655056,
163
- "Нет": 0.9658712148666382
164
  },
165
  "sample": {
166
  "messages": [
@@ -195,11 +195,11 @@
195
  }
196
  {
197
  "metric": {
198
- "acc": true
199
  },
200
  "predict": {
201
- "Да": 0.00029593578074127436,
202
- "Нет": 0.9996318817138672
203
  },
204
  "sample": {
205
  "messages": [
@@ -229,7 +229,7 @@
229
  "prompt_len": 172,
230
  "generated_len": 1,
231
  "generated_cumulative_logprob": "TODO: calculate for hf model",
232
- "generated_token": " Нет"
233
  }
234
  }
235
  {
@@ -237,8 +237,8 @@
237
  "acc": false
238
  },
239
  "predict": {
240
- "Да": 0.07581356167793274,
241
- "Нет": 0.9235982894897461
242
  },
243
  "sample": {
244
  "messages": [
@@ -273,11 +273,11 @@
273
  }
274
  {
275
  "metric": {
276
- "acc": true
277
  },
278
  "predict": {
279
- "Да": 0.4002552032470703,
280
- "Нет": 0.5139378905296326
281
  },
282
  "sample": {
283
  "messages": [
@@ -307,7 +307,7 @@
307
  "prompt_len": 183,
308
  "generated_len": 1,
309
  "generated_cumulative_logprob": "TODO: calculate for hf model",
310
- "generated_token": " Нет"
311
  }
312
  }
313
  {
@@ -315,8 +315,8 @@
315
  "acc": false
316
  },
317
  "predict": {
318
- "Да": 0.8652609586715698,
319
- "Нет": 0.13269206881523132
320
  },
321
  "sample": {
322
  "messages": [
@@ -354,8 +354,8 @@
354
  "acc": false
355
  },
356
  "predict": {
357
- "Да": 0.0850556269288063,
358
- "Нет": 0.9144341945648193
359
  },
360
  "sample": {
361
  "messages": [
@@ -393,8 +393,8 @@
393
  "acc": false
394
  },
395
  "predict": {
396
- "Да": 0.001324817189015448,
397
- "Нет": 0.9985203146934509
398
  },
399
  "sample": {
400
  "messages": [
@@ -432,8 +432,8 @@
432
  "acc": false
433
  },
434
  "predict": {
435
- "Да": 0.8660338521003723,
436
- "Нет": 0.1328105926513672
437
  },
438
  "sample": {
439
  "messages": [
@@ -468,11 +468,11 @@
468
  }
469
  {
470
  "metric": {
471
- "acc": true
472
  },
473
  "predict": {
474
- "Да": 0.015893418341875076,
475
- "Нет": 0.9832910299301147
476
  },
477
  "sample": {
478
  "messages": [
@@ -502,7 +502,7 @@
502
  "prompt_len": 136,
503
  "generated_len": 1,
504
  "generated_cumulative_logprob": "TODO: calculate for hf model",
505
- "generated_token": " Нет"
506
  }
507
  }
508
  {
@@ -510,8 +510,8 @@
510
  "acc": false
511
  },
512
  "predict": {
513
- "Да": 0.0533820241689682,
514
- "Нет": 0.9462189674377441
515
  },
516
  "sample": {
517
  "messages": [
@@ -549,8 +549,8 @@
549
  "acc": false
550
  },
551
  "predict": {
552
- "Да": 0.9315302968025208,
553
- "Нет": 0.06747982650995255
554
  },
555
  "sample": {
556
  "messages": [
@@ -588,8 +588,8 @@
588
  "acc": false
589
  },
590
  "predict": {
591
- "Да": 0.00043051555985584855,
592
- "Нет": 0.9994730353355408
593
  },
594
  "sample": {
595
  "messages": [
@@ -627,8 +627,8 @@
627
  "acc": true
628
  },
629
  "predict": {
630
- "Да": 0.0015009645139798522,
631
- "Нет": 0.9983540773391724
632
  },
633
  "sample": {
634
  "messages": [
@@ -663,11 +663,11 @@
663
  }
664
  {
665
  "metric": {
666
- "acc": true
667
  },
668
  "predict": {
669
- "Да": 0.040754951536655426,
670
- "Нет": 0.9275783896446228
671
  },
672
  "sample": {
673
  "messages": [
@@ -697,7 +697,7 @@
697
  "prompt_len": 157,
698
  "generated_len": 1,
699
  "generated_cumulative_logprob": "TODO: calculate for hf model",
700
- "generated_token": " Нет"
701
  }
702
  }
703
  {
@@ -705,8 +705,8 @@
705
  "acc": true
706
  },
707
  "predict": {
708
- "Да": 0.7037021517753601,
709
- "Нет": 0.29334670305252075
710
  },
711
  "sample": {
712
  "messages": [
@@ -741,11 +741,11 @@
741
  }
742
  {
743
  "metric": {
744
- "acc": false
745
  },
746
  "predict": {
747
- "Да": 0.40695691108703613,
748
- "Нет": 0.5921187996864319
749
  },
750
  "sample": {
751
  "messages": [
@@ -775,7 +775,7 @@
775
  "prompt_len": 265,
776
  "generated_len": 1,
777
  "generated_cumulative_logprob": "TODO: calculate for hf model",
778
- "generated_token": " Нет"
779
  }
780
  }
781
  {
@@ -783,8 +783,8 @@
783
  "acc": false
784
  },
785
  "predict": {
786
- "Да": 0.001032112049870193,
787
- "Нет": 0.9988526105880737
788
  },
789
  "sample": {
790
  "messages": [
@@ -822,8 +822,8 @@
822
  "acc": true
823
  },
824
  "predict": {
825
- "Да": 0.16420209407806396,
826
- "Нет": 0.8338871002197266
827
  },
828
  "sample": {
829
  "messages": [
@@ -861,8 +861,8 @@
861
  "acc": true
862
  },
863
  "predict": {
864
- "Да": 4.5386968849925324e-05,
865
- "Нет": 0.9997144341468811
866
  },
867
  "sample": {
868
  "messages": [
@@ -900,8 +900,8 @@
900
  "acc": true
901
  },
902
  "predict": {
903
- "Да": 0.0004877749306615442,
904
- "Нет": 0.9993436932563782
905
  },
906
  "sample": {
907
  "messages": [
@@ -939,8 +939,8 @@
939
  "acc": true
940
  },
941
  "predict": {
942
- "Да": 0.009703288786113262,
943
- "Нет": 0.9897623658180237
944
  },
945
  "sample": {
946
  "messages": [
@@ -978,8 +978,8 @@
978
  "acc": true
979
  },
980
  "predict": {
981
- "Да": 0.00048780557699501514,
982
- "Нет": 0.9994064569473267
983
  },
984
  "sample": {
985
  "messages": [
@@ -1017,8 +1017,8 @@
1017
  "acc": true
1018
  },
1019
  "predict": {
1020
- "Да": 0.000709479849319905,
1021
- "Нет": 0.9990220069885254
1022
  },
1023
  "sample": {
1024
  "messages": [
@@ -1056,8 +1056,8 @@
1056
  "acc": true
1057
  },
1058
  "predict": {
1059
- "Да": 0.6570025682449341,
1060
- "Нет": 0.213297501206398
1061
  },
1062
  "sample": {
1063
  "messages": [
@@ -1095,8 +1095,8 @@
1095
  "acc": true
1096
  },
1097
  "predict": {
1098
- "Да": 0.7527139782905579,
1099
- "Нет": 0.2443704456090927
1100
  },
1101
  "sample": {
1102
  "messages": [
@@ -1134,8 +1134,8 @@
1134
  "acc": true
1135
  },
1136
  "predict": {
1137
- "Да": 0.10662366449832916,
1138
- "Нет": 0.892749011516571
1139
  },
1140
  "sample": {
1141
  "messages": [
@@ -1173,8 +1173,8 @@
1173
  "acc": true
1174
  },
1175
  "predict": {
1176
- "Да": 0.00029592280043289065,
1177
- "Нет": 0.9995880722999573
1178
  },
1179
  "sample": {
1180
  "messages": [
@@ -1212,8 +1212,8 @@
1212
  "acc": true
1213
  },
1214
  "predict": {
1215
- "Да": 0.009701339527964592,
1216
- "Нет": 0.989563524723053
1217
  },
1218
  "sample": {
1219
  "messages": [
@@ -1251,8 +1251,8 @@
1251
  "acc": false
1252
  },
1253
  "predict": {
1254
- "Да": 0.00020335696171969175,
1255
- "Нет": 0.9994524121284485
1256
  },
1257
  "sample": {
1258
  "messages": [
@@ -1287,11 +1287,11 @@
1287
  }
1288
  {
1289
  "metric": {
1290
- "acc": false
1291
  },
1292
  "predict": {
1293
- "Да": 0.0008039596723392606,
1294
- "Нет": 0.9990390539169312
1295
  },
1296
  "sample": {
1297
  "messages": [
@@ -1321,7 +1321,7 @@
1321
  "prompt_len": 140,
1322
  "generated_len": 1,
1323
  "generated_cumulative_logprob": "TODO: calculate for hf model",
1324
- "generated_token": " Нет"
1325
  }
1326
  }
1327
  {
@@ -1329,8 +1329,8 @@
1329
  "acc": false
1330
  },
1331
  "predict": {
1332
- "Да": 0.7302190661430359,
1333
- "Нет": 0.2686326205730438
1334
  },
1335
  "sample": {
1336
  "messages": [
@@ -1368,8 +1368,8 @@
1368
  "acc": true
1369
  },
1370
  "predict": {
1371
- "Да": 0.000803975504823029,
1372
- "Нет": 0.9990586638450623
1373
  },
1374
  "sample": {
1375
  "messages": [
@@ -1407,8 +1407,8 @@
1407
  "acc": false
1408
  },
1409
  "predict": {
1410
- "Да": 0.014056859537959099,
1411
- "Нет": 0.985461950302124
1412
  },
1413
  "sample": {
1414
  "messages": [
@@ -1446,8 +1446,8 @@
1446
  "acc": false
1447
  },
1448
  "predict": {
1449
- "Да": 0.7435306906700134,
1450
- "Нет": 0.21302512288093567
1451
  },
1452
  "sample": {
1453
  "messages": [
@@ -1485,8 +1485,8 @@
1485
  "acc": false
1486
  },
1487
  "predict": {
1488
- "Да": 0.9825862646102905,
1489
- "Нет": 0.01588202640414238
1490
  },
1491
  "sample": {
1492
  "messages": [
@@ -1524,8 +1524,8 @@
1524
  "acc": true
1525
  },
1526
  "predict": {
1527
- "Да": 0.11913560330867767,
1528
- "Нет": 0.8802996873855591
1529
  },
1530
  "sample": {
1531
  "messages": [
@@ -1563,8 +1563,8 @@
1563
  "acc": true
1564
  },
1565
  "predict": {
1566
- "Да": 0.029305994510650635,
1567
- "Нет": 0.970481276512146
1568
  },
1569
  "sample": {
1570
  "messages": [
@@ -1602,8 +1602,8 @@
1602
  "acc": true
1603
  },
1604
  "predict": {
1605
- "Да": 0.222540944814682,
1606
- "Нет": 0.7767441868782043
1607
  },
1608
  "sample": {
1609
  "messages": [
@@ -1638,11 +1638,11 @@
1638
  }
1639
  {
1640
  "metric": {
1641
- "acc": false
1642
  },
1643
  "predict": {
1644
- "Да": 0.0015003508888185024,
1645
- "Нет": 0.9979458451271057
1646
  },
1647
  "sample": {
1648
  "messages": [
@@ -1672,16 +1672,16 @@
1672
  "prompt_len": 139,
1673
  "generated_len": 1,
1674
  "generated_cumulative_logprob": "TODO: calculate for hf model",
1675
- "generated_token": " Нет"
1676
  }
1677
  }
1678
  {
1679
  "metric": {
1680
- "acc": false
1681
  },
1682
  "predict": {
1683
- "Да": 0.03731626644730568,
1684
- "Нет": 0.9623991847038269
1685
  },
1686
  "sample": {
1687
  "messages": [
@@ -1711,7 +1711,7 @@
1711
  "prompt_len": 129,
1712
  "generated_len": 1,
1713
  "generated_cumulative_logprob": "TODO: calculate for hf model",
1714
- "generated_token": " Нет"
1715
  }
1716
  }
1717
  {
@@ -1719,8 +1719,8 @@
1719
  "acc": false
1720
  },
1721
  "predict": {
1722
- "Да": 0.5304835438728333,
1723
- "Нет": 0.4681500494480133
1724
  },
1725
  "sample": {
1726
  "messages": [
@@ -1758,8 +1758,8 @@
1758
  "acc": true
1759
  },
1760
  "predict": {
1761
- "Да": 0.001032113330438733,
1762
- "Нет": 0.9988539218902588
1763
  },
1764
  "sample": {
1765
  "messages": [
@@ -1797,8 +1797,8 @@
1797
  "acc": false
1798
  },
1799
  "predict": {
1800
- "Да": 0.468466579914093,
1801
- "Нет": 0.5308421850204468
1802
  },
1803
  "sample": {
1804
  "messages": [
@@ -1833,11 +1833,11 @@
1833
  }
1834
  {
1835
  "metric": {
1836
- "acc": true
1837
  },
1838
  "predict": {
1839
- "Да": 0.2149808406829834,
1840
- "Нет": 0.6621876358985901
1841
  },
1842
  "sample": {
1843
  "messages": [
@@ -1867,7 +1867,7 @@
1867
  "prompt_len": 147,
1868
  "generated_len": 1,
1869
  "generated_cumulative_logprob": "TODO: calculate for hf model",
1870
- "generated_token": " Нет"
1871
  }
1872
  }
1873
  {
@@ -1875,8 +1875,8 @@
1875
  "acc": false
1876
  },
1877
  "predict": {
1878
- "Да": 0.9988813996315002,
1879
- "Нет": 0.0007093799649737775
1880
  },
1881
  "sample": {
1882
  "messages": [
@@ -1914,8 +1914,8 @@
1914
  "acc": true
1915
  },
1916
  "predict": {
1917
- "Да": 0.060060080140829086,
1918
- "Нет": 0.939497709274292
1919
  },
1920
  "sample": {
1921
  "messages": [
@@ -1953,8 +1953,8 @@
1953
  "acc": false
1954
  },
1955
  "predict": {
1956
- "Да": 0.0004305129696149379,
1957
- "Нет": 0.9994670748710632
1958
  },
1959
  "sample": {
1960
  "messages": [
@@ -1992,8 +1992,8 @@
1992
  "acc": true
1993
  },
1994
  "predict": {
1995
- "Да": 0.06003950163722038,
1996
- "Нет": 0.9391758441925049
1997
  },
1998
  "sample": {
1999
  "messages": [
@@ -2031,8 +2031,8 @@
2031
  "acc": true
2032
  },
2033
  "predict": {
2034
- "Да": 0.0001794979179976508,
2035
- "Нет": 0.9996531009674072
2036
  },
2037
  "sample": {
2038
  "messages": [
@@ -2070,8 +2070,8 @@
2070
  "acc": false
2071
  },
2072
  "predict": {
2073
- "Да": 0.000626234570518136,
2074
- "Нет": 0.999214768409729
2075
  },
2076
  "sample": {
2077
  "messages": [
@@ -2109,8 +2109,8 @@
2109
  "acc": true
2110
  },
2111
  "predict": {
2112
- "Да": 0.01590224727988243,
2113
- "Нет": 0.9838371872901917
2114
  },
2115
  "sample": {
2116
  "messages": [
@@ -2148,8 +2148,8 @@
2148
  "acc": false
2149
  },
2150
  "predict": {
2151
- "Да": 0.00359282735735178,
2152
- "Нет": 0.9961913824081421
2153
  },
2154
  "sample": {
2155
  "messages": [
@@ -2187,8 +2187,8 @@
2187
  "acc": false
2188
  },
2189
  "predict": {
2190
- "Да": 0.00857413001358509,
2191
- "Нет": 0.9910346865653992
2192
  },
2193
  "sample": {
2194
  "messages": [
@@ -2226,8 +2226,8 @@
2226
  "acc": true
2227
  },
2228
  "predict": {
2229
- "Да": 0.8524937033653259,
2230
- "Нет": 0.1018158569931984
2231
  },
2232
  "sample": {
2233
  "messages": [
@@ -2262,11 +2262,11 @@
2262
  }
2263
  {
2264
  "metric": {
2265
- "acc": true
2266
  },
2267
  "predict": {
2268
- "Да": 0.07573917508125305,
2269
- "Нет": 0.922692060470581
2270
  },
2271
  "sample": {
2272
  "messages": [
@@ -2296,7 +2296,7 @@
2296
  "prompt_len": 86,
2297
  "generated_len": 1,
2298
  "generated_cumulative_logprob": "TODO: calculate for hf model",
2299
- "generated_token": " Нет"
2300
  }
2301
  }
2302
  {
@@ -2304,8 +2304,8 @@
2304
  "acc": true
2305
  },
2306
  "predict": {
2307
- "Да": 0.02594723552465439,
2308
- "Нет": 0.9736628532409668
2309
  },
2310
  "sample": {
2311
  "messages": [
@@ -2343,8 +2343,8 @@
2343
  "acc": false
2344
  },
2345
  "predict": {
2346
- "Да": 0.008576461113989353,
2347
- "Нет": 0.9913041591644287
2348
  },
2349
  "sample": {
2350
  "messages": [
@@ -2382,8 +2382,8 @@
2382
  "acc": false
2383
  },
2384
  "predict": {
2385
- "Да": 0.49879026412963867,
2386
- "Нет": 0.49879026412963867
2387
  },
2388
  "sample": {
2389
  "messages": [
@@ -2421,8 +2421,8 @@
2421
  "acc": true
2422
  },
2423
  "predict": {
2424
- "Да": 0.0003351966734044254,
2425
- "Нет": 0.9992071986198425
2426
  },
2427
  "sample": {
2428
  "messages": [
@@ -2460,8 +2460,8 @@
2460
  "acc": false
2461
  },
2462
  "predict": {
2463
- "Да": 1.8922029994428158e-05,
2464
- "Нет": 0.999816358089447
2465
  },
2466
  "sample": {
2467
  "messages": [
@@ -2496,11 +2496,11 @@
2496
  }
2497
  {
2498
  "metric": {
2499
- "acc": true
2500
  },
2501
  "predict": {
2502
- "Да": 0.053379327058792114,
2503
- "Нет": 0.9461711645126343
2504
  },
2505
  "sample": {
2506
  "messages": [
@@ -2530,16 +2530,16 @@
2530
  "prompt_len": 204,
2531
  "generated_len": 1,
2532
  "generated_cumulative_logprob": "TODO: calculate for hf model",
2533
- "generated_token": " Нет"
2534
  }
2535
  }
2536
  {
2537
  "metric": {
2538
- "acc": true
2539
  },
2540
  "predict": {
2541
- "Да": 0.0017005337867885828,
2542
- "Нет": 0.9981884360313416
2543
  },
2544
  "sample": {
2545
  "messages": [
@@ -2569,7 +2569,7 @@
2569
  "prompt_len": 157,
2570
  "generated_len": 1,
2571
  "generated_cumulative_logprob": "TODO: calculate for hf model",
2572
- "generated_token": " Нет"
2573
  }
2574
  }
2575
  {
@@ -2577,8 +2577,8 @@
2577
  "acc": false
2578
  },
2579
  "predict": {
2580
- "Да": 0.017960019409656525,
2581
- "Нет": 0.9805837869644165
2582
  },
2583
  "sample": {
2584
  "messages": [
@@ -2613,11 +2613,11 @@
2613
  }
2614
  {
2615
  "metric": {
2616
- "acc": true
2617
  },
2618
  "predict": {
2619
- "Да": 0.4208765923976898,
2620
- "Нет": 0.5404162406921387
2621
  },
2622
  "sample": {
2623
  "messages": [
@@ -2647,7 +2647,7 @@
2647
  "prompt_len": 168,
2648
  "generated_len": 1,
2649
  "generated_cumulative_logprob": "TODO: calculate for hf model",
2650
- "generated_token": " Нет"
2651
  }
2652
  }
2653
  {
@@ -2655,8 +2655,8 @@
2655
  "acc": false
2656
  },
2657
  "predict": {
2658
- "Да": 0.8503593802452087,
2659
- "Нет": 0.14777028560638428
2660
  },
2661
  "sample": {
2662
  "messages": [
@@ -2694,8 +2694,8 @@
2694
  "acc": true
2695
  },
2696
  "predict": {
2697
- "Да": 0.05337907001376152,
2698
- "Нет": 0.9461665749549866
2699
  },
2700
  "sample": {
2701
  "messages": [
@@ -2733,8 +2733,8 @@
2733
  "acc": false
2734
  },
2735
  "predict": {
2736
- "Да": 0.0019263619324192405,
2737
- "Нет": 0.997880220413208
2738
  },
2739
  "sample": {
2740
  "messages": [
@@ -2769,11 +2769,11 @@
2769
  }
2770
  {
2771
  "metric": {
2772
- "acc": false
2773
  },
2774
  "predict": {
2775
- "Да": 0.4366450011730194,
2776
- "Нет": 0.5606632828712463
2777
  },
2778
  "sample": {
2779
  "messages": [
@@ -2803,7 +2803,7 @@
2803
  "prompt_len": 115,
2804
  "generated_len": 1,
2805
  "generated_cumulative_logprob": "TODO: calculate for hf model",
2806
- "generated_token": " Нет"
2807
  }
2808
  }
2809
  {
@@ -2811,8 +2811,8 @@
2811
  "acc": true
2812
  },
2813
  "predict": {
2814
- "Да": 0.9451661705970764,
2815
- "Нет": 0.05332263186573982
2816
  },
2817
  "sample": {
2818
  "messages": [
@@ -2847,11 +2847,11 @@
2847
  }
2848
  {
2849
  "metric": {
2850
- "acc": true
2851
  },
2852
  "predict": {
2853
- "Да": 0.0019265249138697982,
2854
- "Нет": 0.99796462059021
2855
  },
2856
  "sample": {
2857
  "messages": [
@@ -2881,7 +2881,7 @@
2881
  "prompt_len": 126,
2882
  "generated_len": 1,
2883
  "generated_cumulative_logprob": "TODO: calculate for hf model",
2884
- "generated_token": " Нет"
2885
  }
2886
  }
2887
  {
@@ -2889,8 +2889,8 @@
2889
  "acc": false
2890
  },
2891
  "predict": {
2892
- "Да": 0.9129877090454102,
2893
- "Нет": 0.08492108434438705
2894
  },
2895
  "sample": {
2896
  "messages": [
@@ -2928,8 +2928,8 @@
2928
  "acc": false
2929
  },
2930
  "predict": {
2931
- "Да": 0.002800603397190571,
2932
- "Нет": 0.997083842754364
2933
  },
2934
  "sample": {
2935
  "messages": [
@@ -2967,8 +2967,8 @@
2967
  "acc": false
2968
  },
2969
  "predict": {
2970
- "Да": 0.0019256919622421265,
2971
- "Нет": 0.9975330829620361
2972
  },
2973
  "sample": {
2974
  "messages": [
@@ -3003,11 +3003,11 @@
3003
  }
3004
  {
3005
  "metric": {
3006
- "acc": true
3007
  },
3008
  "predict": {
3009
- "Да": 0.008415551856160164,
3010
- "Нет": 0.972705602645874
3011
  },
3012
  "sample": {
3013
  "messages": [
@@ -3037,7 +3037,7 @@
3037
  "prompt_len": 185,
3038
  "generated_len": 1,
3039
  "generated_cumulative_logprob": "TODO: calculate for hf model",
3040
- "generated_token": " Нет"
3041
  }
3042
  }
3043
  {
@@ -3045,8 +3045,8 @@
3045
  "acc": true
3046
  },
3047
  "predict": {
3048
- "Да": 0.9229567646980286,
3049
- "Нет": 0.07576090097427368
3050
  },
3051
  "sample": {
3052
  "messages": [
@@ -3081,11 +3081,11 @@
3081
  }
3082
  {
3083
  "metric": {
3084
- "acc": false
3085
  },
3086
  "predict": {
3087
- "Да": 0.3771263360977173,
3088
- "Нет": 0.6217762231826782
3089
  },
3090
  "sample": {
3091
  "messages": [
@@ -3115,7 +3115,7 @@
3115
  "prompt_len": 256,
3116
  "generated_len": 1,
3117
  "generated_cumulative_logprob": "TODO: calculate for hf model",
3118
- "generated_token": " Нет"
3119
  }
3120
  }
3121
  {
@@ -3123,8 +3123,8 @@
3123
  "acc": false
3124
  },
3125
  "predict": {
3126
- "Да": 0.00043048226507380605,
3127
- "Нет": 0.9993957281112671
3128
  },
3129
  "sample": {
3130
  "messages": [
@@ -3162,8 +3162,8 @@
3162
  "acc": false
3163
  },
3164
  "predict": {
3165
- "Да": 0.7047868371009827,
3166
- "Нет": 0.2937988340854645
3167
  },
3168
  "sample": {
3169
  "messages": [
@@ -3198,11 +3198,11 @@
3198
  }
3199
  {
3200
  "metric": {
3201
- "acc": false
3202
  },
3203
  "predict": {
3204
- "Да": 0.010979406535625458,
3205
- "Нет": 0.9883346557617188
3206
  },
3207
  "sample": {
3208
  "messages": [
@@ -3232,7 +3232,7 @@
3232
  "prompt_len": 109,
3233
  "generated_len": 1,
3234
  "generated_cumulative_logprob": "TODO: calculate for hf model",
3235
- "generated_token": " Нет"
3236
  }
3237
  }
3238
  {
@@ -3240,8 +3240,8 @@
3240
  "acc": false
3241
  },
3242
  "predict": {
3243
- "Да": 0.0010320731671527028,
3244
- "Нет": 0.9988150596618652
3245
  },
3246
  "sample": {
3247
  "messages": [
@@ -3279,8 +3279,8 @@
3279
  "acc": false
3280
  },
3281
  "predict": {
3282
- "Да": 0.9235284924507141,
3283
- "Нет": 0.0758078396320343
3284
  },
3285
  "sample": {
3286
  "messages": [
@@ -3318,8 +3318,8 @@
3318
  "acc": false
3319
  },
3320
  "predict": {
3321
- "Да": 6.60449659335427e-05,
3322
- "Нет": 0.9998252987861633
3323
  },
3324
  "sample": {
3325
  "messages": [
@@ -3357,8 +3357,8 @@
3357
  "acc": true
3358
  },
3359
  "predict": {
3360
- "Да": 0.08504106849431992,
3361
- "Нет": 0.9142776131629944
3362
  },
3363
  "sample": {
3364
  "messages": [
@@ -3396,8 +3396,8 @@
3396
  "acc": false
3397
  },
3398
  "predict": {
3399
- "Да": 0.9325262308120728,
3400
- "Нет": 0.024850983172655106
3401
  },
3402
  "sample": {
3403
  "messages": [
@@ -3432,11 +3432,11 @@
3432
  }
3433
  {
3434
  "metric": {
3435
- "acc": true
3436
  },
3437
  "predict": {
3438
- "Да": 0.18190623819828033,
3439
- "Нет": 0.8152471780776978
3440
  },
3441
  "sample": {
3442
  "messages": [
@@ -3466,7 +3466,7 @@
3466
  "prompt_len": 134,
3467
  "generated_len": 1,
3468
  "generated_cumulative_logprob": "TODO: calculate for hf model",
3469
- "generated_token": " Нет"
3470
  }
3471
  }
3472
  {
@@ -3474,8 +3474,8 @@
3474
  "acc": false
3475
  },
3476
  "predict": {
3477
- "Да": 0.11914443224668503,
3478
- "Нет": 0.8803648948669434
3479
  },
3480
  "sample": {
3481
  "messages": [
@@ -3513,8 +3513,8 @@
3513
  "acc": true
3514
  },
3515
  "predict": {
3516
- "Да": 0.0002611565578263253,
3517
- "Нет": 0.999609649181366
3518
  },
3519
  "sample": {
3520
  "messages": [
@@ -3552,8 +3552,8 @@
3552
  "acc": true
3553
  },
3554
  "predict": {
3555
- "Да": 0.025938672944903374,
3556
- "Нет": 0.9733415842056274
3557
  },
3558
  "sample": {
3559
  "messages": [
@@ -3591,8 +3591,8 @@
3591
  "acc": false
3592
  },
3593
  "predict": {
3594
- "Да": 0.0002611354284454137,
3595
- "Нет": 0.9995287656784058
3596
  },
3597
  "sample": {
3598
  "messages": [
@@ -3630,8 +3630,8 @@
3630
  "acc": true
3631
  },
3632
  "predict": {
3633
- "Да": 0.0019264277070760727,
3634
- "Нет": 0.9979142546653748
3635
  },
3636
  "sample": {
3637
  "messages": [
@@ -3669,8 +3669,8 @@
3669
  "acc": true
3670
  },
3671
  "predict": {
3672
- "Да": 0.9939780235290527,
3673
- "Нет": 0.005215918179601431
3674
  },
3675
  "sample": {
3676
  "messages": [
@@ -3708,8 +3708,8 @@
3708
  "acc": false
3709
  },
3710
  "predict": {
3711
- "Да": 0.0021823798306286335,
3712
- "Нет": 0.9976633787155151
3713
  },
3714
  "sample": {
3715
  "messages": [
@@ -3747,8 +3747,8 @@
3747
  "acc": false
3748
  },
3749
  "predict": {
3750
- "Да": 0.0008039618842303753,
3751
- "Нет": 0.999041736125946
3752
  },
3753
  "sample": {
3754
  "messages": [
@@ -3786,8 +3786,8 @@
3786
  "acc": false
3787
  },
3788
  "predict": {
3789
- "Да": 0.9750506281852722,
3790
- "Нет": 0.002738716546446085
3791
  },
3792
  "sample": {
3793
  "messages": [
@@ -3822,11 +3822,11 @@
3822
  }
3823
  {
3824
  "metric": {
3825
- "acc": false
3826
  },
3827
  "predict": {
3828
- "Да": 0.4672533869743347,
3829
- "Нет": 0.5294674634933472
3830
  },
3831
  "sample": {
3832
  "messages": [
@@ -3856,7 +3856,7 @@
3856
  "prompt_len": 127,
3857
  "generated_len": 1,
3858
  "generated_cumulative_logprob": "TODO: calculate for hf model",
3859
- "generated_token": " Нет"
3860
  }
3861
  }
3862
  {
@@ -3864,8 +3864,8 @@
3864
  "acc": false
3865
  },
3866
  "predict": {
3867
- "Да": 0.24489443004131317,
3868
- "Нет": 0.7543279528617859
3869
  },
3870
  "sample": {
3871
  "messages": [
@@ -3903,8 +3903,8 @@
3903
  "acc": true
3904
  },
3905
  "predict": {
3906
- "Да": 0.004069507587701082,
3907
- "Нет": 0.9957756400108337
3908
  },
3909
  "sample": {
3910
  "messages": [
@@ -3942,8 +3942,8 @@
3942
  "acc": false
3943
  },
3944
  "predict": {
3945
- "Да": 0.7040597200393677,
3946
- "Нет": 0.2934957444667816
3947
  },
3948
  "sample": {
3949
  "messages": [
@@ -3981,8 +3981,8 @@
3981
  "acc": true
3982
  },
3983
  "predict": {
3984
- "Да": 2.7526933990884572e-05,
3985
- "Нет": 0.9996544122695923
3986
  },
3987
  "sample": {
3988
  "messages": [
@@ -4020,8 +4020,8 @@
4020
  "acc": true
4021
  },
4022
  "predict": {
4023
- "Да": 0.002800086047500372,
4024
- "Нет": 0.9968996047973633
4025
  },
4026
  "sample": {
4027
  "messages": [
@@ -4059,8 +4059,8 @@
4059
  "acc": true
4060
  },
4061
  "predict": {
4062
- "Да": 0.816844642162323,
4063
- "Нет": 0.18226267397403717
4064
  },
4065
  "sample": {
4066
  "messages": [
@@ -4098,8 +4098,8 @@
4098
  "acc": false
4099
  },
4100
  "predict": {
4101
- "Да": 0.0011693844571709633,
4102
- "Нет": 0.9987230896949768
4103
  },
4104
  "sample": {
4105
  "messages": [
@@ -4137,8 +4137,8 @@
4137
  "acc": true
4138
  },
4139
  "predict": {
4140
- "Да": 0.000626144465059042,
4141
- "Нет": 0.9990710020065308
4142
  },
4143
  "sample": {
4144
  "messages": [
@@ -4176,8 +4176,8 @@
4176
  "acc": true
4177
  },
4178
  "predict": {
4179
- "Да": 0.688129723072052,
4180
- "Нет": 0.25314879417419434
4181
  },
4182
  "sample": {
4183
  "messages": [
@@ -4215,8 +4215,8 @@
4215
  "acc": true
4216
  },
4217
  "predict": {
4218
- "Да": 0.997204065322876,
4219
- "Нет": 0.0021813749335706234
4220
  },
4221
  "sample": {
4222
  "messages": [
@@ -4254,8 +4254,8 @@
4254
  "acc": true
4255
  },
4256
  "predict": {
4257
- "Да": 0.13287986814975739,
4258
- "Нет": 0.866485595703125
4259
  },
4260
  "sample": {
4261
  "messages": [
@@ -4293,8 +4293,8 @@
4293
  "acc": false
4294
  },
4295
  "predict": {
4296
- "Да": 0.04741421714425087,
4297
- "Нет": 0.9523400068283081
4298
  },
4299
  "sample": {
4300
  "messages": [
@@ -4329,11 +4329,11 @@
4329
  }
4330
  {
4331
  "metric": {
4332
- "acc": true
4333
  },
4334
  "predict": {
4335
- "Да": 0.20164957642555237,
4336
- "Нет": 0.7975395321846008
4337
  },
4338
  "sample": {
4339
  "messages": [
@@ -4363,7 +4363,7 @@
4363
  "prompt_len": 117,
4364
  "generated_len": 1,
4365
  "generated_cumulative_logprob": "TODO: calculate for hf model",
4366
- "generated_token": " Нет"
4367
  }
4368
  }
4369
  {
@@ -4371,8 +4371,8 @@
4371
  "acc": false
4372
  },
4373
  "predict": {
4374
- "Да": 0.00023044571571517736,
4375
- "Нет": 0.9995049238204956
4376
  },
4377
  "sample": {
4378
  "messages": [
@@ -4410,8 +4410,8 @@
4410
  "acc": true
4411
  },
4412
  "predict": {
4413
- "Да": 0.01798146963119507,
4414
- "Нет": 0.9817549586296082
4415
  },
4416
  "sample": {
4417
  "messages": [
@@ -4446,11 +4446,11 @@
4446
  }
4447
  {
4448
  "metric": {
4449
- "acc": false
4450
  },
4451
  "predict": {
4452
- "Да": 0.07579170912504196,
4453
- "Нет": 0.9233320355415344
4454
  },
4455
  "sample": {
4456
  "messages": [
@@ -4480,7 +4480,7 @@
4480
  "prompt_len": 111,
4481
  "generated_len": 1,
4482
  "generated_cumulative_logprob": "TODO: calculate for hf model",
4483
- "generated_token": " Нет"
4484
  }
4485
  }
4486
  {
@@ -4488,8 +4488,8 @@
4488
  "acc": true
4489
  },
4490
  "predict": {
4491
- "Да": 0.00317213824018836,
4492
- "Нет": 0.9966561794281006
4493
  },
4494
  "sample": {
4495
  "messages": [
@@ -4527,8 +4527,8 @@
4527
  "acc": false
4528
  },
4529
  "predict": {
4530
- "Да": 0.16443610191345215,
4531
- "Нет": 0.8350754976272583
4532
  },
4533
  "sample": {
4534
  "messages": [
@@ -4566,8 +4566,8 @@
4566
  "acc": false
4567
  },
4568
  "predict": {
4569
- "Да": 0.9623715877532959,
4570
- "Нет": 0.005050062667578459
4571
  },
4572
  "sample": {
4573
  "messages": [
@@ -4602,11 +4602,11 @@
4602
  }
4603
  {
4604
  "metric": {
4605
- "acc": true
4606
  },
4607
  "predict": {
4608
- "Да": 0.05997878313064575,
4609
- "Нет": 0.938226044178009
4610
  },
4611
  "sample": {
4612
  "messages": [
@@ -4636,7 +4636,7 @@
4636
  "prompt_len": 106,
4637
  "generated_len": 1,
4638
  "generated_cumulative_logprob": "TODO: calculate for hf model",
4639
- "generated_token": " Нет"
4640
  }
4641
  }
4642
  {
@@ -4644,8 +4644,8 @@
4644
  "acc": true
4645
  },
4646
  "predict": {
4647
- "Да": 0.0850648358464241,
4648
- "Нет": 0.9145331978797913
4649
  },
4650
  "sample": {
4651
  "messages": [
@@ -4680,11 +4680,11 @@
4680
  }
4681
  {
4682
  "metric": {
4683
- "acc": true
4684
  },
4685
  "predict": {
4686
- "Да": 0.007576117757707834,
4687
- "Нет": 0.9922756552696228
4688
  },
4689
  "sample": {
4690
  "messages": [
@@ -4714,7 +4714,7 @@
4714
  "prompt_len": 104,
4715
  "generated_len": 1,
4716
  "generated_cumulative_logprob": "TODO: calculate for hf model",
4717
- "generated_token": " Нет"
4718
  }
4719
  }
4720
  {
@@ -4722,8 +4722,8 @@
4722
  "acc": true
4723
  },
4724
  "predict": {
4725
- "Да": 0.0259372740983963,
4726
- "Нет": 0.9732890725135803
4727
  },
4728
  "sample": {
4729
  "messages": [
@@ -4761,8 +4761,8 @@
4761
  "acc": true
4762
  },
4763
  "predict": {
4764
- "Да": 0.00020336236048024148,
4765
- "Нет": 0.9994789958000183
4766
  },
4767
  "sample": {
4768
  "messages": [
@@ -4797,11 +4797,11 @@
4797
  }
4798
  {
4799
  "metric": {
4800
- "acc": false
4801
  },
4802
  "predict": {
4803
- "Да": 0.047405000776052475,
4804
- "Нет": 0.9521549344062805
4805
  },
4806
  "sample": {
4807
  "messages": [
@@ -4831,7 +4831,7 @@
4831
  "prompt_len": 105,
4832
  "generated_len": 1,
4833
  "generated_cumulative_logprob": "TODO: calculate for hf model",
4834
- "generated_token": " Нет"
4835
  }
4836
  }
4837
  {
@@ -4839,8 +4839,8 @@
4839
  "acc": true
4840
  },
4841
  "predict": {
4842
- "Да": 0.007575278170406818,
4843
- "Нет": 0.9921656847000122
4844
  },
4845
  "sample": {
4846
  "messages": [
@@ -4878,8 +4878,8 @@
4878
  "acc": true
4879
  },
4880
  "predict": {
4881
- "Да": 0.00043050554813817143,
4882
- "Нет": 0.9994497895240784
4883
  },
4884
  "sample": {
4885
  "messages": [
@@ -4917,8 +4917,8 @@
4917
  "acc": true
4918
  },
4919
  "predict": {
4920
- "Да": 0.005217640660703182,
4921
- "Нет": 0.9943062663078308
4922
  },
4923
  "sample": {
4924
  "messages": [
@@ -4956,8 +4956,8 @@
4956
  "acc": false
4957
  },
4958
  "predict": {
4959
- "Да": 0.8910460472106934,
4960
- "Нет": 0.06454716622829437
4961
  },
4962
  "sample": {
4963
  "messages": [
@@ -4995,8 +4995,8 @@
4995
  "acc": false
4996
  },
4997
  "predict": {
4998
- "Да": 0.0040669688023626804,
4999
- "Нет": 0.9951545000076294
5000
  },
5001
  "sample": {
5002
  "messages": [
@@ -5031,11 +5031,11 @@
5031
  }
5032
  {
5033
  "metric": {
5034
- "acc": false
5035
  },
5036
  "predict": {
5037
- "Да": 0.26880720257759094,
5038
- "Нет": 0.7306936383247375
5039
  },
5040
  "sample": {
5041
  "messages": [
@@ -5065,16 +5065,16 @@
5065
  "prompt_len": 228,
5066
  "generated_len": 1,
5067
  "generated_cumulative_logprob": "TODO: calculate for hf model",
5068
- "generated_token": " Нет"
5069
  }
5070
  }
5071
  {
5072
  "metric": {
5073
- "acc": false
5074
  },
5075
  "predict": {
5076
- "Да": 0.022972356528043747,
5077
- "Нет": 0.9768094420433044
5078
  },
5079
  "sample": {
5080
  "messages": [
@@ -5104,7 +5104,7 @@
5104
  "prompt_len": 101,
5105
  "generated_len": 1,
5106
  "generated_cumulative_logprob": "TODO: calculate for hf model",
5107
- "generated_token": " Нет"
5108
  }
5109
  }
5110
  {
@@ -5112,8 +5112,8 @@
5112
  "acc": true
5113
  },
5114
  "predict": {
5115
- "Да": 0.02594570256769657,
5116
- "Нет": 0.9736053347587585
5117
  },
5118
  "sample": {
5119
  "messages": [
@@ -5151,8 +5151,8 @@
5151
  "acc": true
5152
  },
5153
  "predict": {
5154
- "Да": 0.0007094802567735314,
5155
- "Нет": 0.9990226030349731
5156
  },
5157
  "sample": {
5158
  "messages": [
@@ -5190,8 +5190,8 @@
5190
  "acc": false
5191
  },
5192
  "predict": {
5193
- "Да": 0.0035929649602621794,
5194
- "Нет": 0.9962295889854431
5195
  },
5196
  "sample": {
5197
  "messages": [
@@ -5229,8 +5229,8 @@
5229
  "acc": false
5230
  },
5231
  "predict": {
5232
- "Да": 0.9318119287490845,
5233
- "Нет": 0.06750023365020752
5234
  },
5235
  "sample": {
5236
  "messages": [
@@ -5268,8 +5268,8 @@
5268
  "acc": true
5269
  },
5270
  "predict": {
5271
- "Да": 0.0024723317474126816,
5272
- "Нет": 0.9974097609519958
5273
  },
5274
  "sample": {
5275
  "messages": [
@@ -5307,8 +5307,8 @@
5307
  "acc": false
5308
  },
5309
  "predict": {
5310
- "Да": 0.004607855807989836,
5311
- "Нет": 0.9950196743011475
5312
  },
5313
  "sample": {
5314
  "messages": [
@@ -5346,8 +5346,8 @@
5346
  "acc": true
5347
  },
5348
  "predict": {
5349
- "Да": 0.8451831340789795,
5350
- "Нет": 0.0693768560886383
5351
  },
5352
  "sample": {
5353
  "messages": [
@@ -5382,11 +5382,11 @@
5382
  }
5383
  {
5384
  "metric": {
5385
- "acc": true
5386
  },
5387
  "predict": {
5388
- "Да": 0.025922434404492378,
5389
- "Нет": 0.9727321863174438
5390
  },
5391
  "sample": {
5392
  "messages": [
@@ -5416,16 +5416,16 @@
5416
  "prompt_len": 78,
5417
  "generated_len": 1,
5418
  "generated_cumulative_logprob": "TODO: calculate for hf model",
5419
- "generated_token": " Нет"
5420
  }
5421
  }
5422
  {
5423
  "metric": {
5424
- "acc": true
5425
  },
5426
  "predict": {
5427
- "Да": 0.13289140164852142,
5428
- "Нет": 0.8665607571601868
5429
  },
5430
  "sample": {
5431
  "messages": [
@@ -5455,7 +5455,7 @@
5455
  "prompt_len": 224,
5456
  "generated_len": 1,
5457
  "generated_cumulative_logprob": "TODO: calculate for hf model",
5458
- "generated_token": " Нет"
5459
  }
5460
  }
5461
  {
@@ -5463,8 +5463,8 @@
5463
  "acc": true
5464
  },
5465
  "predict": {
5466
- "Да": 0.0004305104666855186,
5467
- "Нет": 0.9994612336158752
5468
  },
5469
  "sample": {
5470
  "messages": [
@@ -5502,8 +5502,8 @@
5502
  "acc": false
5503
  },
5504
  "predict": {
5505
- "Да": 0.468307226896286,
5506
- "Нет": 0.5306615829467773
5507
  },
5508
  "sample": {
5509
  "messages": [
@@ -5541,8 +5541,8 @@
5541
  "acc": false
5542
  },
5543
  "predict": {
5544
- "Да": 0.0002304443478351459,
5545
- "Нет": 0.9994989633560181
5546
  },
5547
  "sample": {
5548
  "messages": [
@@ -5580,8 +5580,8 @@
5580
  "acc": false
5581
  },
5582
  "predict": {
5583
- "Да": 0.0003799199184868485,
5584
- "Нет": 0.9994500279426575
5585
  },
5586
  "sample": {
5587
  "messages": [
@@ -5616,11 +5616,11 @@
5616
  }
5617
  {
5618
  "metric": {
5619
- "acc": false
5620
  },
5621
  "predict": {
5622
- "Да": 0.2447185069322586,
5623
- "Нет": 0.7537860870361328
5624
  },
5625
  "sample": {
5626
  "messages": [
@@ -5650,7 +5650,7 @@
5650
  "prompt_len": 105,
5651
  "generated_len": 1,
5652
  "generated_cumulative_logprob": "TODO: calculate for hf model",
5653
- "generated_token": " Нет"
5654
  }
5655
  }
5656
  {
@@ -5658,8 +5658,8 @@
5658
  "acc": false
5659
  },
5660
  "predict": {
5661
- "Да": 0.0015010004863142967,
5662
- "Нет": 0.9983779191970825
5663
  },
5664
  "sample": {
5665
  "messages": [
@@ -5697,8 +5697,8 @@
5697
  "acc": false
5698
  },
5699
  "predict": {
5700
- "Да": 0.012427574954926968,
5701
- "Нет": 0.9872445464134216
5702
  },
5703
  "sample": {
5704
  "messages": [
@@ -5736,8 +5736,8 @@
5736
  "acc": false
5737
  },
5738
  "predict": {
5739
- "Да": 0.7317188382148743,
5740
- "Нет": 0.23755432665348053
5741
  },
5742
  "sample": {
5743
  "messages": [
@@ -5772,11 +5772,11 @@
5772
  }
5773
  {
5774
  "metric": {
5775
- "acc": false
5776
  },
5777
  "predict": {
5778
- "Да": 0.4057359993457794,
5779
- "Нет": 0.5903423428535461
5780
  },
5781
  "sample": {
5782
  "messages": [
@@ -5806,7 +5806,7 @@
5806
  "prompt_len": 128,
5807
  "generated_len": 1,
5808
  "generated_cumulative_logprob": "TODO: calculate for hf model",
5809
- "generated_token": " Нет"
5810
  }
5811
  }
5812
  {
@@ -5814,8 +5814,8 @@
5814
  "acc": false
5815
  },
5816
  "predict": {
5817
- "Да": 0.16444909572601318,
5818
- "Нет": 0.8351414799690247
5819
  },
5820
  "sample": {
5821
  "messages": [
@@ -5853,8 +5853,8 @@
5853
  "acc": true
5854
  },
5855
  "predict": {
5856
- "Да": 0.002182485768571496,
5857
- "Нет": 0.9977117776870728
5858
  },
5859
  "sample": {
5860
  "messages": [
@@ -5892,8 +5892,8 @@
5892
  "acc": false
5893
  },
5894
  "predict": {
5895
- "Да": 0.4990127682685852,
5896
- "Нет": 0.4990127682685852
5897
  },
5898
  "sample": {
5899
  "messages": [
@@ -5931,8 +5931,8 @@
5931
  "acc": false
5932
  },
5933
  "predict": {
5934
- "Да": 0.09520400315523148,
5935
- "Нет": 0.9032704830169678
5936
  },
5937
  "sample": {
5938
  "messages": [
@@ -5970,8 +5970,8 @@
5970
  "acc": true
5971
  },
5972
  "predict": {
5973
- "Да": 0.0006262222304940224,
5974
- "Нет": 0.9991951584815979
5975
  },
5976
  "sample": {
5977
  "messages": [
@@ -6006,11 +6006,11 @@
6006
  }
6007
  {
6008
  "metric": {
6009
- "acc": true
6010
  },
6011
  "predict": {
6012
- "Да": 0.4373700022697449,
6013
- "Нет": 0.5615941882133484
6014
  },
6015
  "sample": {
6016
  "messages": [
@@ -6040,16 +6040,16 @@
6040
  "prompt_len": 206,
6041
  "generated_len": 1,
6042
  "generated_cumulative_logprob": "TODO: calculate for hf model",
6043
- "generated_token": " Нет"
6044
  }
6045
  }
6046
  {
6047
  "metric": {
6048
- "acc": false
6049
  },
6050
  "predict": {
6051
- "Да": 0.020326819270849228,
6052
- "Нет": 0.9794010519981384
6053
  },
6054
  "sample": {
6055
  "messages": [
@@ -6079,7 +6079,7 @@
6079
  "prompt_len": 174,
6080
  "generated_len": 1,
6081
  "generated_cumulative_logprob": "TODO: calculate for hf model",
6082
- "generated_token": " Нет"
6083
  }
6084
  }
6085
  {
@@ -6087,8 +6087,8 @@
6087
  "acc": false
6088
  },
6089
  "predict": {
6090
- "Да": 0.2687530517578125,
6091
- "Нет": 0.7305464744567871
6092
  },
6093
  "sample": {
6094
  "messages": [
@@ -6126,8 +6126,8 @@
6126
  "acc": false
6127
  },
6128
  "predict": {
6129
- "Да": 0.6943596601486206,
6130
- "Нет": 0.17556162178516388
6131
  },
6132
  "sample": {
6133
  "messages": [
@@ -6162,11 +6162,11 @@
6162
  }
6163
  {
6164
  "metric": {
6165
- "acc": false
6166
  },
6167
  "predict": {
6168
- "Да": 0.16413532197475433,
6169
- "Нет": 0.8335480093955994
6170
  },
6171
  "sample": {
6172
  "messages": [
@@ -6196,7 +6196,7 @@
6196
  "prompt_len": 105,
6197
  "generated_len": 1,
6198
  "generated_cumulative_logprob": "TODO: calculate for hf model",
6199
- "generated_token": " Нет"
6200
  }
6201
  }
6202
  {
@@ -6204,8 +6204,8 @@
6204
  "acc": false
6205
  },
6206
  "predict": {
6207
- "Да": 0.08501418679952621,
6208
- "Нет": 0.9139886498451233
6209
  },
6210
  "sample": {
6211
  "messages": [
@@ -6243,8 +6243,8 @@
6243
  "acc": true
6244
  },
6245
  "predict": {
6246
- "Да": 0.0013248433824628592,
6247
- "Нет": 0.9985400438308716
6248
  },
6249
  "sample": {
6250
  "messages": [
@@ -6279,11 +6279,11 @@
6279
  }
6280
  {
6281
  "metric": {
6282
- "acc": true
6283
  },
6284
  "predict": {
6285
- "Да": 0.22250330448150635,
6286
- "Нет": 0.7766128182411194
6287
  },
6288
  "sample": {
6289
  "messages": [
@@ -6313,7 +6313,7 @@
6313
  "prompt_len": 114,
6314
  "generated_len": 1,
6315
  "generated_cumulative_logprob": "TODO: calculate for hf model",
6316
- "generated_token": " Нет"
6317
  }
6318
  }
6319
  {
@@ -6321,8 +6321,8 @@
6321
  "acc": false
6322
  },
6323
  "predict": {
6324
- "Да": 1.8911578081315383e-05,
6325
- "Нет": 0.9992640614509583
6326
  },
6327
  "sample": {
6328
  "messages": [
@@ -6360,8 +6360,8 @@
6360
  "acc": true
6361
  },
6362
  "predict": {
6363
- "Да": 0.001169274328276515,
6364
- "Нет": 0.998629093170166
6365
  },
6366
  "sample": {
6367
  "messages": [
@@ -6399,8 +6399,8 @@
6399
  "acc": true
6400
  },
6401
  "predict": {
6402
- "Да": 0.9763138890266418,
6403
- "Нет": 0.02296070195734501
6404
  },
6405
  "sample": {
6406
  "messages": [
@@ -6438,8 +6438,8 @@
6438
  "acc": false
6439
  },
6440
  "predict": {
6441
- "Да": 0.00017950989422388375,
6442
- "Нет": 0.9997197985649109
6443
  },
6444
  "sample": {
6445
  "messages": [
@@ -6477,8 +6477,8 @@
6477
  "acc": true
6478
  },
6479
  "predict": {
6480
- "Да": 0.0019262604182586074,
6481
- "Нет": 0.9978275895118713
6482
  },
6483
  "sample": {
6484
  "messages": [
@@ -6516,8 +6516,8 @@
6516
  "acc": true
6517
  },
6518
  "predict": {
6519
- "Да": 0.7125749588012695,
6520
- "Нет": 0.2313392162322998
6521
  },
6522
  "sample": {
6523
  "messages": [
@@ -6552,11 +6552,11 @@
6552
  }
6553
  {
6554
  "metric": {
6555
- "acc": false
6556
  },
6557
  "predict": {
6558
- "Да": 0.13258744776248932,
6559
- "Нет": 0.8645787239074707
6560
  },
6561
  "sample": {
6562
  "messages": [
@@ -6586,7 +6586,7 @@
6586
  "prompt_len": 104,
6587
  "generated_len": 1,
6588
  "generated_cumulative_logprob": "TODO: calculate for hf model",
6589
- "generated_token": " Нет"
6590
  }
6591
  }
6592
  {
@@ -6594,8 +6594,8 @@
6594
  "acc": false
6595
  },
6596
  "predict": {
6597
- "Да": 0.26871126890182495,
6598
- "Нет": 0.7304328680038452
6599
  },
6600
  "sample": {
6601
  "messages": [
@@ -6633,8 +6633,8 @@
6633
  "acc": false
6634
  },
6635
  "predict": {
6636
- "Да": 0.0003799440455622971,
6637
- "Нет": 0.9995135068893433
6638
  },
6639
  "sample": {
6640
  "messages": [
@@ -6672,8 +6672,8 @@
6672
  "acc": true
6673
  },
6674
  "predict": {
6675
- "Да": 0.11908745020627975,
6676
- "Нет": 0.8799439072608948
6677
  },
6678
  "sample": {
6679
  "messages": [
@@ -6711,8 +6711,8 @@
6711
  "acc": true
6712
  },
6713
  "predict": {
6714
- "Да": 0.0001233512011822313,
6715
- "Нет": 0.9995251893997192
6716
  },
6717
  "sample": {
6718
  "messages": [
@@ -6750,8 +6750,8 @@
6750
  "acc": true
6751
  },
6752
  "predict": {
6753
- "Да": 0.0075753238052129745,
6754
- "Нет": 0.9921716451644897
6755
  },
6756
  "sample": {
6757
  "messages": [
@@ -6789,8 +6789,8 @@
6789
  "acc": true
6790
  },
6791
  "predict": {
6792
- "Да": 0.8166061639785767,
6793
- "Нет": 0.18220946192741394
6794
  },
6795
  "sample": {
6796
  "messages": [
@@ -6828,8 +6828,8 @@
6828
  "acc": true
6829
  },
6830
  "predict": {
6831
- "Да": 0.0019264371367171407,
6832
- "Нет": 0.9979191422462463
6833
  },
6834
  "sample": {
6835
  "messages": [
@@ -6867,8 +6867,8 @@
6867
  "acc": true
6868
  },
6869
  "predict": {
6870
- "Да": 0.010980246588587761,
6871
- "Нет": 0.988410234451294
6872
  },
6873
  "sample": {
6874
  "messages": [
@@ -6906,8 +6906,8 @@
6906
  "acc": true
6907
  },
6908
  "predict": {
6909
- "Да": 0.8517078161239624,
6910
- "Нет": 0.06169750913977623
6911
  },
6912
  "sample": {
6913
  "messages": [
@@ -6942,11 +6942,11 @@
6942
  }
6943
  {
6944
  "metric": {
6945
- "acc": false
6946
  },
6947
  "predict": {
6948
- "Да": 0.20117761194705963,
6949
- "Нет": 0.7956728935241699
6950
  },
6951
  "sample": {
6952
  "messages": [
@@ -6976,7 +6976,7 @@
6976
  "prompt_len": 129,
6977
  "generated_len": 1,
6978
  "generated_cumulative_logprob": "TODO: calculate for hf model",
6979
- "generated_token": " Нет"
6980
  }
6981
  }
6982
  {
@@ -6984,8 +6984,8 @@
6984
  "acc": true
6985
  },
6986
  "predict": {
6987
- "Да": 0.40705427527427673,
6988
- "Нет": 0.592260479927063
6989
  },
6990
  "sample": {
6991
  "messages": [
@@ -7023,8 +7023,8 @@
7023
  "acc": true
7024
  },
7025
  "predict": {
7026
- "Да": 9.60883335210383e-05,
7027
- "Нет": 0.9997573494911194
7028
  },
7029
  "sample": {
7030
  "messages": [
@@ -7059,11 +7059,11 @@
7059
  }
7060
  {
7061
  "metric": {
7062
- "acc": true
7063
  },
7064
  "predict": {
7065
- "Да": 0.43702271580696106,
7066
- "Нет": 0.5611482858657837
7067
  },
7068
  "sample": {
7069
  "messages": [
@@ -7093,7 +7093,7 @@
7093
  "prompt_len": 143,
7094
  "generated_len": 1,
7095
  "generated_cumulative_logprob": "TODO: calculate for hf model",
7096
- "generated_token": " Нет"
7097
  }
7098
  }
7099
  {
@@ -7101,8 +7101,8 @@
7101
  "acc": true
7102
  },
7103
  "predict": {
7104
- "Да": 0.005216446705162525,
7105
- "Нет": 0.9940787553787231
7106
  },
7107
  "sample": {
7108
  "messages": [
@@ -7140,8 +7140,8 @@
7140
  "acc": true
7141
  },
7142
  "predict": {
7143
- "Да": 0.0007095822365954518,
7144
- "Нет": 0.9991662502288818
7145
  },
7146
  "sample": {
7147
  "messages": [
@@ -7176,11 +7176,11 @@
7176
  }
7177
  {
7178
  "metric": {
7179
- "acc": true
7180
  },
7181
  "predict": {
7182
- "Да": 0.2448546439409256,
7183
- "Нет": 0.7542054057121277
7184
  },
7185
  "sample": {
7186
  "messages": [
@@ -7210,7 +7210,7 @@
7210
  "prompt_len": 141,
7211
  "generated_len": 1,
7212
  "generated_cumulative_logprob": "TODO: calculate for hf model",
7213
- "generated_token": " Нет"
7214
  }
7215
  }
7216
  {
@@ -7218,8 +7218,8 @@
7218
  "acc": false
7219
  },
7220
  "predict": {
7221
- "Да": 0.0011693498818203807,
7222
- "Нет": 0.9986935257911682
7223
  },
7224
  "sample": {
7225
  "messages": [
@@ -7257,8 +7257,8 @@
7257
  "acc": false
7258
  },
7259
  "predict": {
7260
- "Да": 0.06752399355173111,
7261
- "Нет": 0.9321399331092834
7262
  },
7263
  "sample": {
7264
  "messages": [
@@ -7296,8 +7296,8 @@
7296
  "acc": false
7297
  },
7298
  "predict": {
7299
- "Да": 0.7586944103240967,
7300
- "Нет": 0.11634956300258636
7301
  },
7302
  "sample": {
7303
  "messages": [
@@ -7335,8 +7335,8 @@
7335
  "acc": true
7336
  },
7337
  "predict": {
7338
- "Да": 0.7757806777954102,
7339
- "Нет": 0.22226488590240479
7340
  },
7341
  "sample": {
7342
  "messages": [
@@ -7371,11 +7371,11 @@
7371
  }
7372
  {
7373
  "metric": {
7374
- "acc": true
7375
  },
7376
  "predict": {
7377
- "Да": 0.37726110219955444,
7378
- "Нет": 0.6219983696937561
7379
  },
7380
  "sample": {
7381
  "messages": [
@@ -7405,7 +7405,7 @@
7405
  "prompt_len": 256,
7406
  "generated_len": 1,
7407
  "generated_cumulative_logprob": "TODO: calculate for hf model",
7408
- "generated_token": " Нет"
7409
  }
7410
  }
7411
  {
@@ -7413,8 +7413,8 @@
7413
  "acc": true
7414
  },
7415
  "predict": {
7416
- "Да": 0.0040694172494113445,
7417
- "Нет": 0.9957535266876221
7418
  },
7419
  "sample": {
7420
  "messages": [
@@ -7452,8 +7452,8 @@
7452
  "acc": true
7453
  },
7454
  "predict": {
7455
- "Да": 0.40673938393592834,
7456
- "Нет": 0.5918022990226746
7457
  },
7458
  "sample": {
7459
  "messages": [
@@ -7491,8 +7491,8 @@
7491
  "acc": true
7492
  },
7493
  "predict": {
7494
- "Да": 0.9224575757980347,
7495
- "Нет": 0.0757199302315712
7496
  },
7497
  "sample": {
7498
  "messages": [
@@ -7530,8 +7530,8 @@
7530
  "acc": false
7531
  },
7532
  "predict": {
7533
- "Да": 0.005219148471951485,
7534
- "Нет": 0.9945936799049377
7535
  },
7536
  "sample": {
7537
  "messages": [
@@ -7566,11 +7566,11 @@
7566
  }
7567
  {
7568
  "metric": {
7569
- "acc": true
7570
  },
7571
  "predict": {
7572
- "Да": 0.1479138880968094,
7573
- "Нет": 0.8511857390403748
7574
  },
7575
  "sample": {
7576
  "messages": [
@@ -7600,16 +7600,16 @@
7600
  "prompt_len": 115,
7601
  "generated_len": 1,
7602
  "generated_cumulative_logprob": "TODO: calculate for hf model",
7603
- "generated_token": " Нет"
7604
  }
7605
  }
7606
  {
7607
  "metric": {
7608
- "acc": false
7609
  },
7610
  "predict": {
7611
- "Да": 0.015902651473879814,
7612
- "Нет": 0.9838622212409973
7613
  },
7614
  "sample": {
7615
  "messages": [
@@ -7639,7 +7639,7 @@
7639
  "prompt_len": 175,
7640
  "generated_len": 1,
7641
  "generated_cumulative_logprob": "TODO: calculate for hf model",
7642
- "generated_token": " Нет"
7643
  }
7644
  }
7645
  {
@@ -7647,8 +7647,8 @@
7647
  "acc": true
7648
  },
7649
  "predict": {
7650
- "Да": 0.05337413400411606,
7651
- "Нет": 0.9460790753364563
7652
  },
7653
  "sample": {
7654
  "messages": [
@@ -7686,8 +7686,8 @@
7686
  "acc": true
7687
  },
7688
  "predict": {
7689
- "Да": 0.03931839391589165,
7690
- "Нет": 0.8948825001716614
7691
  },
7692
  "sample": {
7693
  "messages": [
@@ -7725,8 +7725,8 @@
7725
  "acc": false
7726
  },
7727
  "predict": {
7728
- "Да": 0.9851431250572205,
7729
- "Нет": 0.014052311889827251
7730
  },
7731
  "sample": {
7732
  "messages": [
@@ -7764,8 +7764,8 @@
7764
  "acc": false
7765
  },
7766
  "predict": {
7767
- "Да": 0.0013247027527540922,
7768
- "Нет": 0.9984340071678162
7769
  },
7770
  "sample": {
7771
  "messages": [
@@ -7803,8 +7803,8 @@
7803
  "acc": false
7804
  },
7805
  "predict": {
7806
- "Да": 0.0010320350993424654,
7807
- "Нет": 0.9987781643867493
7808
  },
7809
  "sample": {
7810
  "messages": [
@@ -7842,8 +7842,8 @@
7842
  "acc": false
7843
  },
7844
  "predict": {
7845
- "Да": 0.8922936916351318,
7846
- "Нет": 0.10656928271055222
7847
  },
7848
  "sample": {
7849
  "messages": [
@@ -7881,8 +7881,8 @@
7881
  "acc": false
7882
  },
7883
  "predict": {
7884
- "Да": 0.0005525772576220334,
7885
- "Нет": 0.9990831613540649
7886
  },
7887
  "sample": {
7888
  "messages": [
@@ -7920,8 +7920,8 @@
7920
  "acc": true
7921
  },
7922
  "predict": {
7923
- "Да": 0.00010887620010180399,
7924
- "Нет": 0.9997009038925171
7925
  },
7926
  "sample": {
7927
  "messages": [
 
3
  "acc": true
4
  },
5
  "predict": {
6
+ "Да": 0.3177708387374878,
7
+ "Нет": 0.6727208495140076
8
  },
9
  "sample": {
10
  "messages": [
 
42
  "acc": false
43
  },
44
  "predict": {
45
+ "Да": 0.5510627031326294,
46
+ "Нет": 0.42916804552078247
47
  },
48
  "sample": {
49
  "messages": [
 
81
  "acc": false
82
  },
83
  "predict": {
84
+ "Да": 0.1462704986333847,
85
+ "Нет": 0.8417286276817322
86
  },
87
  "sample": {
88
  "messages": [
 
120
  "acc": false
121
  },
122
  "predict": {
123
+ "Да": 0.16274045407772064,
124
+ "Нет": 0.8264642357826233
125
  },
126
  "sample": {
127
  "messages": [
 
159
  "acc": true
160
  },
161
  "predict": {
162
+ "Да": 0.40234827995300293,
163
+ "Нет": 0.5854132771492004
164
  },
165
  "sample": {
166
  "messages": [
 
195
  }
196
  {
197
  "metric": {
198
+ "acc": false
199
  },
200
  "predict": {
201
+ "Да": 0.5870495438575745,
202
+ "Нет": 0.4034728705883026
203
  },
204
  "sample": {
205
  "messages": [
 
229
  "prompt_len": 172,
230
  "generated_len": 1,
231
  "generated_cumulative_logprob": "TODO: calculate for hf model",
232
+ "generated_token": " Да"
233
  }
234
  }
235
  {
 
237
  "acc": false
238
  },
239
  "predict": {
240
+ "Да": 0.1812204122543335,
241
+ "Нет": 0.8121734857559204
242
  },
243
  "sample": {
244
  "messages": [
 
273
  }
274
  {
275
  "metric": {
276
+ "acc": false
277
  },
278
  "predict": {
279
+ "Да": 0.555451512336731,
280
+ "Нет": 0.26237672567367554
281
  },
282
  "sample": {
283
  "messages": [
 
307
  "prompt_len": 183,
308
  "generated_len": 1,
309
  "generated_cumulative_logprob": "TODO: calculate for hf model",
310
+ "generated_token": " Да"
311
  }
312
  }
313
  {
 
315
  "acc": false
316
  },
317
  "predict": {
318
+ "Да": 0.7572181224822998,
319
+ "Нет": 0.21694663166999817
320
  },
321
  "sample": {
322
  "messages": [
 
354
  "acc": false
355
  },
356
  "predict": {
357
+ "Да": 0.372804194688797,
358
+ "Нет": 0.6146501898765564
359
  },
360
  "sample": {
361
  "messages": [
 
393
  "acc": false
394
  },
395
  "predict": {
396
+ "Да": 0.199379563331604,
397
+ "Нет": 0.7885614633560181
398
  },
399
  "sample": {
400
  "messages": [
 
432
  "acc": false
433
  },
434
  "predict": {
435
+ "Да": 0.5521984696388245,
436
+ "Нет": 0.43005257844924927
437
  },
438
  "sample": {
439
  "messages": [
 
468
  }
469
  {
470
  "metric": {
471
+ "acc": false
472
  },
473
  "predict": {
474
+ "Да": 0.8397411704063416,
475
+ "Нет": 0.1459251195192337
476
  },
477
  "sample": {
478
  "messages": [
 
502
  "prompt_len": 136,
503
  "generated_len": 1,
504
  "generated_cumulative_logprob": "TODO: calculate for hf model",
505
+ "generated_token": " Да"
506
  }
507
  }
508
  {
 
510
  "acc": false
511
  },
512
  "predict": {
513
+ "Да": 0.26557374000549316,
514
+ "Нет": 0.7219041585922241
515
  },
516
  "sample": {
517
  "messages": [
 
549
  "acc": false
550
  },
551
  "predict": {
552
+ "Да": 0.8436118364334106,
553
+ "Нет": 0.1465977430343628
554
  },
555
  "sample": {
556
  "messages": [
 
588
  "acc": false
589
  },
590
  "predict": {
591
+ "Да": 0.13194012641906738,
592
+ "Нет": 0.8603577017784119
593
  },
594
  "sample": {
595
  "messages": [
 
627
  "acc": true
628
  },
629
  "predict": {
630
+ "Да": 0.3180491030216217,
631
+ "Нет": 0.6733099222183228
632
  },
633
  "sample": {
634
  "messages": [
 
663
  }
664
  {
665
  "metric": {
666
+ "acc": false
667
  },
668
  "predict": {
669
+ "Да": 0.6367422342300415,
670
+ "Нет": 0.1824297159910202
671
  },
672
  "sample": {
673
  "messages": [
 
697
  "prompt_len": 157,
698
  "generated_len": 1,
699
  "generated_cumulative_logprob": "TODO: calculate for hf model",
700
+ "generated_token": " Да"
701
  }
702
  }
703
  {
 
705
  "acc": true
706
  },
707
  "predict": {
708
+ "Да": 0.8310219645500183,
709
+ "Нет": 0.14440995454788208
710
  },
711
  "sample": {
712
  "messages": [
 
741
  }
742
  {
743
  "metric": {
744
+ "acc": true
745
  },
746
  "predict": {
747
+ "Да": 0.6694736480712891,
748
+ "Нет": 0.3162369728088379
749
  },
750
  "sample": {
751
  "messages": [
 
775
  "prompt_len": 265,
776
  "generated_len": 1,
777
  "generated_cumulative_logprob": "TODO: calculate for hf model",
778
+ "generated_token": " Да"
779
  }
780
  }
781
  {
 
783
  "acc": false
784
  },
785
  "predict": {
786
+ "Да": 0.1803991049528122,
787
+ "Нет": 0.8084926605224609
788
  },
789
  "sample": {
790
  "messages": [
 
822
  "acc": true
823
  },
824
  "predict": {
825
+ "Да": 0.39615458250045776,
826
+ "Нет": 0.5764015316963196
827
  },
828
  "sample": {
829
  "messages": [
 
861
  "acc": true
862
  },
863
  "predict": {
864
+ "Да": 0.05950615927577019,
865
+ "Нет": 0.9308329820632935
866
  },
867
  "sample": {
868
  "messages": [
 
900
  "acc": true
901
  },
902
  "predict": {
903
+ "Да": 0.290959894657135,
904
+ "Нет": 0.6979765295982361
905
  },
906
  "sample": {
907
  "messages": [
 
939
  "acc": true
940
  },
941
  "predict": {
942
+ "Да": 0.344249963760376,
943
+ "Нет": 0.6431435942649841
944
  },
945
  "sample": {
946
  "messages": [
 
978
  "acc": true
979
  },
980
  "predict": {
981
+ "Да": 0.31753966212272644,
982
+ "Нет": 0.6722314357757568
983
  },
984
  "sample": {
985
  "messages": [
 
1017
  "acc": true
1018
  },
1019
  "predict": {
1020
+ "Да": 0.047137245535850525,
1021
+ "Нет": 0.9467768669128418
1022
  },
1023
  "sample": {
1024
  "messages": [
 
1056
  "acc": true
1057
  },
1058
  "predict": {
1059
+ "Да": 0.6703915596008301,
1060
+ "Нет": 0.11649657040834427
1061
  },
1062
  "sample": {
1063
  "messages": [
 
1095
  "acc": true
1096
  },
1097
  "predict": {
1098
+ "Да": 0.6857701539993286,
1099
+ "Нет": 0.2858715355396271
1100
  },
1101
  "sample": {
1102
  "messages": [
 
1134
  "acc": true
1135
  },
1136
  "predict": {
1137
+ "Да": 0.3724825084209442,
1138
+ "Нет": 0.614119827747345
1139
  },
1140
  "sample": {
1141
  "messages": [
 
1173
  "acc": true
1174
  },
1175
  "predict": {
1176
+ "Да": 0.1318102777004242,
1177
+ "Нет": 0.8595109581947327
1178
  },
1179
  "sample": {
1180
  "messages": [
 
1212
  "acc": true
1213
  },
1214
  "predict": {
1215
+ "Да": 0.315209299325943,
1216
+ "Нет": 0.6672980785369873
1217
  },
1218
  "sample": {
1219
  "messages": [
 
1251
  "acc": false
1252
  },
1253
  "predict": {
1254
+ "Да": 0.34368154406547546,
1255
+ "Нет": 0.6420816779136658
1256
  },
1257
  "sample": {
1258
  "messages": [
 
1287
  }
1288
  {
1289
  "metric": {
1290
+ "acc": true
1291
  },
1292
  "predict": {
1293
+ "Да": 0.556214451789856,
1294
+ "Нет": 0.4331802427768707
1295
  },
1296
  "sample": {
1297
  "messages": [
 
1321
  "prompt_len": 140,
1322
  "generated_len": 1,
1323
  "generated_cumulative_logprob": "TODO: calculate for hf model",
1324
+ "generated_token": " Да"
1325
  }
1326
  }
1327
  {
 
1329
  "acc": false
1330
  },
1331
  "predict": {
1332
+ "Да": 0.5253308415412903,
1333
+ "Нет": 0.46360284090042114
1334
  },
1335
  "sample": {
1336
  "messages": [
 
1368
  "acc": true
1369
  },
1370
  "predict": {
1371
+ "Да": 0.19996023178100586,
1372
+ "Нет": 0.7908580899238586
1373
  },
1374
  "sample": {
1375
  "messages": [
 
1407
  "acc": false
1408
  },
1409
  "predict": {
1410
+ "Да": 0.14682213962078094,
1411
+ "Нет": 0.8449031114578247
1412
  },
1413
  "sample": {
1414
  "messages": [
 
1446
  "acc": false
1447
  },
1448
  "predict": {
1449
+ "Да": 0.6819260120391846,
1450
+ "Нет": 0.10457674413919449
1451
  },
1452
  "sample": {
1453
  "messages": [
 
1485
  "acc": false
1486
  },
1487
  "predict": {
1488
+ "Да": 0.7978547215461731,
1489
+ "Нет": 0.17802545428276062
1490
  },
1491
  "sample": {
1492
  "messages": [
 
1524
  "acc": true
1525
  },
1526
  "predict": {
1527
+ "Да": 0.43058961629867554,
1528
+ "Нет": 0.5528879761695862
1529
  },
1530
  "sample": {
1531
  "messages": [
 
1563
  "acc": true
1564
  },
1565
  "predict": {
1566
+ "Да": 0.2914990186691284,
1567
+ "Нет": 0.6992698311805725
1568
  },
1569
  "sample": {
1570
  "messages": [
 
1602
  "acc": true
1603
  },
1604
  "predict": {
1605
+ "Да": 0.39979875087738037,
1606
+ "Нет": 0.5817037224769592
1607
  },
1608
  "sample": {
1609
  "messages": [
 
1638
  }
1639
  {
1640
  "metric": {
1641
+ "acc": true
1642
  },
1643
  "predict": {
1644
+ "Да": 0.5226932168006897,
1645
+ "Нет": 0.4612751305103302
1646
  },
1647
  "sample": {
1648
  "messages": [
 
1672
  "prompt_len": 139,
1673
  "generated_len": 1,
1674
  "generated_cumulative_logprob": "TODO: calculate for hf model",
1675
+ "generated_token": " Да"
1676
  }
1677
  }
1678
  {
1679
  "metric": {
1680
+ "acc": true
1681
  },
1682
  "predict": {
1683
+ "Да": 0.644417941570282,
1684
+ "Нет": 0.34493207931518555
1685
  },
1686
  "sample": {
1687
  "messages": [
 
1711
  "prompt_len": 129,
1712
  "generated_len": 1,
1713
  "generated_cumulative_logprob": "TODO: calculate for hf model",
1714
+ "generated_token": " Да"
1715
  }
1716
  }
1717
  {
 
1719
  "acc": false
1720
  },
1721
  "predict": {
1722
+ "Да": 0.7686586380004883,
1723
+ "Нет": 0.22022439539432526
1724
  },
1725
  "sample": {
1726
  "messages": [
 
1758
  "acc": true
1759
  },
1760
  "predict": {
1761
+ "Да": 0.2667292058467865,
1762
+ "Нет": 0.7250450849533081
1763
  },
1764
  "sample": {
1765
  "messages": [
 
1797
  "acc": false
1798
  },
1799
  "predict": {
1800
+ "Да": 0.40373584628105164,
1801
+ "Нет": 0.5874322056770325
1802
  },
1803
  "sample": {
1804
  "messages": [
 
1833
  }
1834
  {
1835
  "metric": {
1836
+ "acc": false
1837
  },
1838
  "predict": {
1839
+ "Да": 0.6286691427230835,
1840
+ "Нет": 0.18011674284934998
1841
  },
1842
  "sample": {
1843
  "messages": [
 
1867
  "prompt_len": 147,
1868
  "generated_len": 1,
1869
  "generated_cumulative_logprob": "TODO: calculate for hf model",
1870
+ "generated_token": " Да"
1871
  }
1872
  }
1873
  {
 
1875
  "acc": false
1876
  },
1877
  "predict": {
1878
+ "Да": 0.8896501064300537,
1879
+ "Нет": 0.09376843273639679
1880
  },
1881
  "sample": {
1882
  "messages": [
 
1914
  "acc": true
1915
  },
1916
  "predict": {
1917
+ "Да": 0.3719407021999359,
1918
+ "Нет": 0.6132265329360962
1919
  },
1920
  "sample": {
1921
  "messages": [
 
1953
  "acc": false
1954
  },
1955
  "predict": {
1956
+ "Да": 0.180691197514534,
1957
+ "Нет": 0.8098016977310181
1958
  },
1959
  "sample": {
1960
  "messages": [
 
1992
  "acc": true
1993
  },
1994
  "predict": {
1995
+ "Да": 0.26445913314819336,
1996
+ "Нет": 0.7188743948936462
1997
  },
1998
  "sample": {
1999
  "messages": [
 
2031
  "acc": true
2032
  },
2033
  "predict": {
2034
+ "Да": 0.08422500640153885,
2035
+ "Нет": 0.9055041670799255
2036
  },
2037
  "sample": {
2038
  "messages": [
 
2070
  "acc": false
2071
  },
2072
  "predict": {
2073
+ "Да": 0.29040369391441345,
2074
+ "Нет": 0.696642279624939
2075
  },
2076
  "sample": {
2077
  "messages": [
 
2109
  "acc": true
2110
  },
2111
  "predict": {
2112
+ "Да": 0.40303125977516174,
2113
+ "Нет": 0.5864070057868958
2114
  },
2115
  "sample": {
2116
  "messages": [
 
2148
  "acc": false
2149
  },
2150
  "predict": {
2151
+ "Да": 0.34523308277130127,
2152
+ "Нет": 0.6449803113937378
2153
  },
2154
  "sample": {
2155
  "messages": [
 
2187
  "acc": false
2188
  },
2189
  "predict": {
2190
+ "Да": 0.22018398344516754,
2191
+ "Нет": 0.7685176134109497
2192
  },
2193
  "sample": {
2194
  "messages": [
 
2226
  "acc": true
2227
  },
2228
  "predict": {
2229
+ "Да": 0.7261016964912415,
2230
+ "Нет": 0.06753797829151154
2231
  },
2232
  "sample": {
2233
  "messages": [
 
2262
  }
2263
  {
2264
  "metric": {
2265
+ "acc": false
2266
  },
2267
  "predict": {
2268
+ "Да": 0.8162997364997864,
2269
+ "Нет": 0.16073894500732422
2270
  },
2271
  "sample": {
2272
  "messages": [
 
2296
  "prompt_len": 86,
2297
  "generated_len": 1,
2298
  "generated_cumulative_logprob": "TODO: calculate for hf model",
2299
+ "generated_token": " Да"
2300
  }
2301
  }
2302
  {
 
2304
  "acc": true
2305
  },
2306
  "predict": {
2307
+ "Да": 0.11767112463712692,
2308
+ "Нет": 0.8694785237312317
2309
  },
2310
  "sample": {
2311
  "messages": [
 
2343
  "acc": false
2344
  },
2345
  "predict": {
2346
+ "Да": 0.3737662136554718,
2347
+ "Нет": 0.6162363290786743
2348
  },
2349
  "sample": {
2350
  "messages": [
 
2382
  "acc": false
2383
  },
2384
  "predict": {
2385
+ "Да": 0.48837897181510925,
2386
+ "Нет": 0.48837897181510925
2387
  },
2388
  "sample": {
2389
  "messages": [
 
2421
  "acc": true
2422
  },
2423
  "predict": {
2424
+ "Да": 0.13113625347614288,
2425
+ "Нет": 0.8551157712936401
2426
  },
2427
  "sample": {
2428
  "messages": [
 
2460
  "acc": false
2461
  },
2462
  "predict": {
2463
+ "Да": 0.14641688764095306,
2464
+ "Нет": 0.8425710797309875
2465
  },
2466
  "sample": {
2467
  "messages": [
 
2496
  }
2497
  {
2498
  "metric": {
2499
+ "acc": false
2500
  },
2501
  "predict": {
2502
+ "Да": 0.5243616104125977,
2503
+ "Нет": 0.4627474844455719
2504
  },
2505
  "sample": {
2506
  "messages": [
 
2530
  "prompt_len": 204,
2531
  "generated_len": 1,
2532
  "generated_cumulative_logprob": "TODO: calculate for hf model",
2533
+ "generated_token": " Да"
2534
  }
2535
  }
2536
  {
2537
  "metric": {
2538
+ "acc": false
2539
  },
2540
  "predict": {
2541
+ "Да": 0.6734837889671326,
2542
+ "Нет": 0.3181312084197998
2543
  },
2544
  "sample": {
2545
  "messages": [
 
2569
  "prompt_len": 157,
2570
  "generated_len": 1,
2571
  "generated_cumulative_logprob": "TODO: calculate for hf model",
2572
+ "generated_token": " Да"
2573
  }
2574
  }
2575
  {
 
2577
  "acc": false
2578
  },
2579
  "predict": {
2580
+ "Да": 0.1466711312532425,
2581
+ "Нет": 0.8440341353416443
2582
  },
2583
  "sample": {
2584
  "messages": [
 
2613
  }
2614
  {
2615
  "metric": {
2616
+ "acc": false
2617
  },
2618
  "predict": {
2619
+ "Да": 0.6537999510765076,
2620
+ "Нет": 0.16530652344226837
2621
  },
2622
  "sample": {
2623
  "messages": [
 
2647
  "prompt_len": 168,
2648
  "generated_len": 1,
2649
  "generated_cumulative_logprob": "TODO: calculate for hf model",
2650
+ "generated_token": " Да"
2651
  }
2652
  }
2653
  {
 
2655
  "acc": false
2656
  },
2657
  "predict": {
2658
+ "Да": 0.6613848209381104,
2659
+ "Нет": 0.31241607666015625
2660
  },
2661
  "sample": {
2662
  "messages": [
 
2694
  "acc": true
2695
  },
2696
  "predict": {
2697
+ "Да": 0.2903401851654053,
2698
+ "Нет": 0.6964899301528931
2699
  },
2700
  "sample": {
2701
  "messages": [
 
2733
  "acc": false
2734
  },
2735
  "predict": {
2736
+ "Да": 0.2198316901922226,
2737
+ "Нет": 0.7672879695892334
2738
  },
2739
  "sample": {
2740
  "messages": [
 
2769
  }
2770
  {
2771
  "metric": {
2772
+ "acc": true
2773
  },
2774
  "predict": {
2775
+ "Да": 0.5178764462471008,
2776
+ "Нет": 0.45702433586120605
2777
  },
2778
  "sample": {
2779
  "messages": [
 
2803
  "prompt_len": 115,
2804
  "generated_len": 1,
2805
  "generated_cumulative_logprob": "TODO: calculate for hf model",
2806
+ "generated_token": " Да"
2807
  }
2808
  }
2809
  {
 
2811
  "acc": true
2812
  },
2813
  "predict": {
2814
+ "Да": 0.8425301909446716,
2815
+ "Нет": 0.1464097797870636
2816
  },
2817
  "sample": {
2818
  "messages": [
 
2847
  }
2848
  {
2849
  "metric": {
2850
+ "acc": false
2851
  },
2852
  "predict": {
2853
+ "Да": 0.524517834186554,
2854
+ "Нет": 0.4628853499889374
2855
  },
2856
  "sample": {
2857
  "messages": [
 
2881
  "prompt_len": 126,
2882
  "generated_len": 1,
2883
  "generated_cumulative_logprob": "TODO: calculate for hf model",
2884
+ "generated_token": " Да"
2885
  }
2886
  }
2887
  {
 
2889
  "acc": false
2890
  },
2891
  "predict": {
2892
+ "Да": 0.721961498260498,
2893
+ "Нет": 0.26559484004974365
2894
  },
2895
  "sample": {
2896
  "messages": [
 
2928
  "acc": false
2929
  },
2930
  "predict": {
2931
+ "Да": 0.05969366058707237,
2932
+ "Нет": 0.9337659478187561
2933
  },
2934
  "sample": {
2935
  "messages": [
 
2967
  "acc": false
2968
  },
2969
  "predict": {
2970
+ "Да": 0.11805005371570587,
2971
+ "Нет": 0.8722785115242004
2972
  },
2973
  "sample": {
2974
  "messages": [
 
3003
  }
3004
  {
3005
  "metric": {
3006
+ "acc": false
3007
  },
3008
  "predict": {
3009
+ "Да": 0.7121773958206177,
3010
+ "Нет": 0.12375786900520325
3011
  },
3012
  "sample": {
3013
  "messages": [
 
3037
  "prompt_len": 185,
3038
  "generated_len": 1,
3039
  "generated_cumulative_logprob": "TODO: calculate for hf model",
3040
+ "generated_token": " Да"
3041
  }
3042
  }
3043
  {
 
3045
  "acc": true
3046
  },
3047
  "predict": {
3048
+ "Да": 0.7993603944778442,
3049
+ "Нет": 0.1783614158630371
3050
  },
3051
  "sample": {
3052
  "messages": [
 
3081
  }
3082
  {
3083
  "metric": {
3084
+ "acc": true
3085
  },
3086
  "predict": {
3087
+ "Да": 0.4931306540966034,
3088
+ "Нет": 0.4931306540966034
3089
  },
3090
  "sample": {
3091
  "messages": [
 
3115
  "prompt_len": 256,
3116
  "generated_len": 1,
3117
  "generated_cumulative_logprob": "TODO: calculate for hf model",
3118
+ "generated_token": " Да"
3119
  }
3120
  }
3121
  {
 
3123
  "acc": false
3124
  },
3125
  "predict": {
3126
+ "Да": 0.24220842123031616,
3127
+ "Нет": 0.746054470539093
3128
  },
3129
  "sample": {
3130
  "messages": [
 
3162
  "acc": false
3163
  },
3164
  "predict": {
3165
+ "Да": 0.6129973530769348,
3166
+ "Нет": 0.3718017041683197
3167
  },
3168
  "sample": {
3169
  "messages": [
 
3198
  }
3199
  {
3200
  "metric": {
3201
+ "acc": true
3202
  },
3203
  "predict": {
3204
+ "Да": 0.6393494009971619,
3205
+ "Нет": 0.3422190845012665
3206
  },
3207
  "sample": {
3208
  "messages": [
 
3232
  "prompt_len": 109,
3233
  "generated_len": 1,
3234
  "generated_cumulative_logprob": "TODO: calculate for hf model",
3235
+ "generated_token": " Да"
3236
  }
3237
  }
3238
  {
 
3240
  "acc": false
3241
  },
3242
  "predict": {
3243
+ "Да": 0.29139018058776855,
3244
+ "Нет": 0.6990087032318115
3245
  },
3246
  "sample": {
3247
  "messages": [
 
3279
  "acc": false
3280
  },
3281
  "predict": {
3282
+ "Да": 0.859817624092102,
3283
+ "Нет": 0.13185730576515198
3284
  },
3285
  "sample": {
3286
  "messages": [
 
3318
  "acc": false
3319
  },
3320
  "predict": {
3321
+ "Да": 0.0671381726861,
3322
+ "Нет": 0.9268138408660889
3323
  },
3324
  "sample": {
3325
  "messages": [
 
3357
  "acc": true
3358
  },
3359
  "predict": {
3360
+ "Да": 0.2661048471927643,
3361
+ "Нет": 0.7233479022979736
3362
  },
3363
  "sample": {
3364
  "messages": [
 
3396
  "acc": false
3397
  },
3398
  "predict": {
3399
+ "Да": 0.7852900624275208,
3400
+ "Нет": 0.044303033500909805
3401
  },
3402
  "sample": {
3403
  "messages": [
 
3432
  }
3433
  {
3434
  "metric": {
3435
+ "acc": false
3436
  },
3437
  "predict": {
3438
+ "Да": 0.661439061164856,
3439
+ "Нет": 0.3124416768550873
3440
  },
3441
  "sample": {
3442
  "messages": [
 
3466
  "prompt_len": 134,
3467
  "generated_len": 1,
3468
  "generated_cumulative_logprob": "TODO: calculate for hf model",
3469
+ "generated_token": " Да"
3470
  }
3471
  }
3472
  {
 
3474
  "acc": false
3475
  },
3476
  "predict": {
3477
+ "Да": 0.4621798098087311,
3478
+ "Нет": 0.5237182974815369
3479
  },
3480
  "sample": {
3481
  "messages": [
 
3513
  "acc": true
3514
  },
3515
  "predict": {
3516
+ "Да": 0.13172535598278046,
3517
+ "Нет": 0.8589572310447693
3518
  },
3519
  "sample": {
3520
  "messages": [
 
3552
  "acc": true
3553
  },
3554
  "predict": {
3555
+ "Да": 0.19765295088291168,
3556
+ "Нет": 0.7817325592041016
3557
  },
3558
  "sample": {
3559
  "messages": [
 
3591
  "acc": false
3592
  },
3593
  "predict": {
3594
+ "Да": 0.26451557874679565,
3595
+ "Нет": 0.7190278172492981
3596
  },
3597
  "sample": {
3598
  "messages": [
 
3630
  "acc": true
3631
  },
3632
  "predict": {
3633
+ "Да": 0.43345609307289124,
3634
+ "Нет": 0.5565686225891113
3635
  },
3636
  "sample": {
3637
  "messages": [
 
3669
  "acc": true
3670
  },
3671
  "predict": {
3672
+ "Да": 0.8087101578712463,
3673
+ "Нет": 0.18044762313365936
3674
  },
3675
  "sample": {
3676
  "messages": [
 
3708
  "acc": false
3709
  },
3710
  "predict": {
3711
+ "Да": 0.18087710440158844,
3712
+ "Нет": 0.8106349110603333
3713
  },
3714
  "sample": {
3715
  "messages": [
 
3747
  "acc": false
3748
  },
3749
  "predict": {
3750
+ "Да": 0.09458442777395248,
3751
+ "Нет": 0.8973921537399292
3752
  },
3753
  "sample": {
3754
  "messages": [
 
3786
  "acc": false
3787
  },
3788
  "predict": {
3789
+ "Да": 0.8824678659439087,
3790
+ "Нет": 0.018315035849809647
3791
  },
3792
  "sample": {
3793
  "messages": [
 
3822
  }
3823
  {
3824
  "metric": {
3825
+ "acc": true
3826
  },
3827
  "predict": {
3828
+ "Да": 0.8161836266517639,
3829
+ "Нет": 0.16071607172489166
3830
  },
3831
  "sample": {
3832
  "messages": [
 
3856
  "prompt_len": 127,
3857
  "generated_len": 1,
3858
  "generated_cumulative_logprob": "TODO: calculate for hf model",
3859
+ "generated_token": " Да"
3860
  }
3861
  }
3862
  {
 
3864
  "acc": false
3865
  },
3866
  "predict": {
3867
+ "Да": 0.31640368700027466,
3868
+ "Нет": 0.6698265671730042
3869
  },
3870
  "sample": {
3871
  "messages": [
 
3903
  "acc": true
3904
  },
3905
  "predict": {
3906
+ "Да": 0.31690847873687744,
3907
+ "Нет": 0.6708952188491821
3908
  },
3909
  "sample": {
3910
  "messages": [
 
3942
  "acc": false
3943
  },
3944
  "predict": {
3945
+ "Да": 0.5515921115875244,
3946
+ "Нет": 0.42958036065101624
3947
  },
3948
  "sample": {
3949
  "messages": [
 
3981
  "acc": true
3982
  },
3983
  "predict": {
3984
+ "Да": 0.02903013676404953,
3985
+ "Нет": 0.9613460898399353
3986
  },
3987
  "sample": {
3988
  "messages": [
 
4020
  "acc": true
4021
  },
4022
  "predict": {
4023
+ "Да": 0.4634995758533478,
4024
+ "Нет": 0.5252138376235962
4025
  },
4026
  "sample": {
4027
  "messages": [
 
4059
  "acc": true
4060
  },
4061
  "predict": {
4062
+ "Да": 0.6726118922233582,
4063
+ "Нет": 0.31771937012672424
4064
  },
4065
  "sample": {
4066
  "messages": [
 
4098
  "acc": false
4099
  },
4100
  "predict": {
4101
+ "Да": 0.10598231852054596,
4102
+ "Нет": 0.8873790502548218
4103
  },
4104
  "sample": {
4105
  "messages": [
 
4137
  "acc": true
4138
  },
4139
  "predict": {
4140
+ "Да": 0.09457684308290482,
4141
+ "Нет": 0.8973201513290405
4142
  },
4143
  "sample": {
4144
  "messages": [
 
4176
  "acc": true
4177
  },
4178
  "predict": {
4179
+ "Да": 0.759200930595398,
4180
+ "Нет": 0.07061668485403061
4181
  },
4182
  "sample": {
4183
  "messages": [
 
4215
  "acc": true
4216
  },
4217
  "predict": {
4218
+ "Да": 0.9164130091667175,
4219
+ "Нет": 0.06638473272323608
4220
  },
4221
  "sample": {
4222
  "messages": [
 
4254
  "acc": true
4255
  },
4256
  "predict": {
4257
+ "Да": 0.34382012486457825,
4258
+ "Нет": 0.6423404812812805
4259
  },
4260
  "sample": {
4261
  "messages": [
 
4293
  "acc": false
4294
  },
4295
  "predict": {
4296
+ "Да": 0.40259450674057007,
4297
+ "Нет": 0.5857715606689453
4298
  },
4299
  "sample": {
4300
  "messages": [
 
4329
  }
4330
  {
4331
  "metric": {
4332
+ "acc": false
4333
  },
4334
  "predict": {
4335
+ "Да": 0.5218465924263,
4336
+ "Нет": 0.46052801609039307
4337
  },
4338
  "sample": {
4339
  "messages": [
 
4363
  "prompt_len": 117,
4364
  "generated_len": 1,
4365
  "generated_cumulative_logprob": "TODO: calculate for hf model",
4366
+ "generated_token": " Да"
4367
  }
4368
  }
4369
  {
 
4371
  "acc": false
4372
  },
4373
  "predict": {
4374
+ "Да": 0.3146660625934601,
4375
+ "Нет": 0.6661480665206909
4376
  },
4377
  "sample": {
4378
  "messages": [
 
4410
  "acc": true
4411
  },
4412
  "predict": {
4413
+ "Да": 0.402546763420105,
4414
+ "Нет": 0.5857020616531372
4415
  },
4416
  "sample": {
4417
  "messages": [
 
4446
  }
4447
  {
4448
  "metric": {
4449
+ "acc": true
4450
  },
4451
  "predict": {
4452
+ "Да": 0.4943414032459259,
4453
+ "Нет": 0.4943414032459259
4454
  },
4455
  "sample": {
4456
  "messages": [
 
4480
  "prompt_len": 111,
4481
  "generated_len": 1,
4482
  "generated_cumulative_logprob": "TODO: calculate for hf model",
4483
+ "generated_token": " Да"
4484
  }
4485
  }
4486
  {
 
4488
  "acc": true
4489
  },
4490
  "predict": {
4491
+ "Да": 0.16312016546726227,
4492
+ "Нет": 0.8283925652503967
4493
  },
4494
  "sample": {
4495
  "messages": [
 
4527
  "acc": false
4528
  },
4529
  "predict": {
4530
+ "Да": 0.4651126265525818,
4531
+ "Нет": 0.5270416736602783
4532
  },
4533
  "sample": {
4534
  "messages": [
 
4566
  "acc": false
4567
  },
4568
  "predict": {
4569
+ "Да": 0.8558553457260132,
4570
+ "Нет": 0.025844592601060867
4571
  },
4572
  "sample": {
4573
  "messages": [
 
4602
  }
4603
  {
4604
  "metric": {
4605
+ "acc": false
4606
  },
4607
  "predict": {
4608
+ "Да": 0.5476301908493042,
4609
+ "Нет": 0.4264948070049286
4610
  },
4611
  "sample": {
4612
  "messages": [
 
4636
  "prompt_len": 106,
4637
  "generated_len": 1,
4638
  "generated_cumulative_logprob": "TODO: calculate for hf model",
4639
+ "generated_token": " Да"
4640
  }
4641
  }
4642
  {
 
4644
  "acc": true
4645
  },
4646
  "predict": {
4647
+ "Да": 0.3729587197303772,
4648
+ "Нет": 0.6149049401283264
4649
  },
4650
  "sample": {
4651
  "messages": [
 
4680
  }
4681
  {
4682
  "metric": {
4683
+ "acc": false
4684
  },
4685
  "predict": {
4686
+ "Да": 0.5560380220413208,
4687
+ "Нет": 0.43304288387298584
4688
  },
4689
  "sample": {
4690
  "messages": [
 
4714
  "prompt_len": 104,
4715
  "generated_len": 1,
4716
  "generated_cumulative_logprob": "TODO: calculate for hf model",
4717
+ "generated_token": " Да"
4718
  }
4719
  }
4720
  {
 
4722
  "acc": true
4723
  },
4724
  "predict": {
4725
+ "Да": 0.26346248388290405,
4726
+ "Нет": 0.7161652445793152
4727
  },
4728
  "sample": {
4729
  "messages": [
 
4761
  "acc": true
4762
  },
4763
  "predict": {
4764
+ "Да": 0.24148209393024445,
4765
+ "Нет": 0.7438172698020935
4766
  },
4767
  "sample": {
4768
  "messages": [
 
4797
  }
4798
  {
4799
  "metric": {
4800
+ "acc": true
4801
  },
4802
  "predict": {
4803
+ "Да": 0.64447420835495,
4804
+ "Нет": 0.3449622094631195
4805
  },
4806
  "sample": {
4807
  "messages": [
 
4831
  "prompt_len": 105,
4832
  "generated_len": 1,
4833
  "generated_cumulative_logprob": "TODO: calculate for hf model",
4834
+ "generated_token": " Да"
4835
  }
4836
  }
4837
  {
 
4839
  "acc": true
4840
  },
4841
  "predict": {
4842
+ "Да": 0.14658458530902863,
4843
+ "Нет": 0.8435360789299011
4844
  },
4845
  "sample": {
4846
  "messages": [
 
4878
  "acc": true
4879
  },
4880
  "predict": {
4881
+ "Да": 0.16303664445877075,
4882
+ "Нет": 0.8279684782028198
4883
  },
4884
  "sample": {
4885
  "messages": [
 
4917
  "acc": true
4918
  },
4919
  "predict": {
4920
+ "Да": 0.14677007496356964,
4921
+ "Нет": 0.8446035385131836
4922
  },
4923
  "sample": {
4924
  "messages": [
 
4956
  "acc": false
4957
  },
4958
  "predict": {
4959
+ "Да": 0.7619085907936096,
4960
+ "Нет": 0.10311310738325119
4961
  },
4962
  "sample": {
4963
  "messages": [
 
4995
  "acc": false
4996
  },
4997
  "predict": {
4998
+ "Да": 0.26235565543174744,
4999
+ "Нет": 0.713156521320343
5000
  },
5001
  "sample": {
5002
  "messages": [
 
5031
  }
5032
  {
5033
  "metric": {
5034
+ "acc": true
5035
  },
5036
  "predict": {
5037
+ "Да": 0.4940703511238098,
5038
+ "Нет": 0.4940703511238098
5039
  },
5040
  "sample": {
5041
  "messages": [
 
5065
  "prompt_len": 228,
5066
  "generated_len": 1,
5067
  "generated_cumulative_logprob": "TODO: calculate for hf model",
5068
+ "generated_token": " Да"
5069
  }
5070
  }
5071
  {
5072
  "metric": {
5073
+ "acc": true
5074
  },
5075
  "predict": {
5076
+ "Да": 0.556427538394928,
5077
+ "Нет": 0.4333462119102478
5078
  },
5079
  "sample": {
5080
  "messages": [
 
5104
  "prompt_len": 101,
5105
  "generated_len": 1,
5106
  "generated_cumulative_logprob": "TODO: calculate for hf model",
5107
+ "generated_token": " Да"
5108
  }
5109
  }
5110
  {
 
5112
  "acc": true
5113
  },
5114
  "predict": {
5115
+ "Да": 0.2407470941543579,
5116
+ "Нет": 0.7415533065795898
5117
  },
5118
  "sample": {
5119
  "messages": [
 
5151
  "acc": true
5152
  },
5153
  "predict": {
5154
+ "Да": 0.2656715512275696,
5155
+ "Нет": 0.7221701145172119
5156
  },
5157
  "sample": {
5158
  "messages": [
 
5190
  "acc": false
5191
  },
5192
  "predict": {
5193
+ "Да": 0.46243879199028015,
5194
+ "Нет": 0.5240117907524109
5195
  },
5196
  "sample": {
5197
  "messages": [
 
5229
  "acc": false
5230
  },
5231
  "predict": {
5232
+ "Да": 0.7232219576835632,
5233
+ "Нет": 0.2660585045814514
5234
  },
5235
  "sample": {
5236
  "messages": [
 
5268
  "acc": true
5269
  },
5270
  "predict": {
5271
+ "Да": 0.4041346609592438,
5272
+ "Нет": 0.5880124568939209
5273
  },
5274
  "sample": {
5275
  "messages": [
 
5307
  "acc": false
5308
  },
5309
  "predict": {
5310
+ "Да": 0.1467008739709854,
5311
+ "Нет": 0.8442053198814392
5312
  },
5313
  "sample": {
5314
  "messages": [
 
5346
  "acc": true
5347
  },
5348
  "predict": {
5349
+ "Да": 0.6610932946205139,
5350
+ "Нет": 0.1894063949584961
5351
  },
5352
  "sample": {
5353
  "messages": [
 
5382
  }
5383
  {
5384
  "metric": {
5385
+ "acc": false
5386
  },
5387
  "predict": {
5388
+ "Да": 0.7113779783248901,
5389
+ "Нет": 0.2617013454437256
5390
  },
5391
  "sample": {
5392
  "messages": [
 
5416
  "prompt_len": 78,
5417
  "generated_len": 1,
5418
  "generated_cumulative_logprob": "TODO: calculate for hf model",
5419
+ "generated_token": " Да"
5420
  }
5421
  }
5422
  {
5423
  "metric": {
5424
+ "acc": false
5425
  },
5426
  "predict": {
5427
+ "Да": 0.4932982623577118,
5428
+ "Нет": 0.4932982623577118
5429
  },
5430
  "sample": {
5431
  "messages": [
 
5455
  "prompt_len": 224,
5456
  "generated_len": 1,
5457
  "generated_cumulative_logprob": "TODO: calculate for hf model",
5458
+ "generated_token": " Да"
5459
  }
5460
  }
5461
  {
 
5463
  "acc": true
5464
  },
5465
  "predict": {
5466
+ "Да": 0.291109561920166,
5467
+ "Нет": 0.698335587978363
5468
  },
5469
  "sample": {
5470
  "messages": [
 
5502
  "acc": false
5503
  },
5504
  "predict": {
5505
+ "Да": 0.4291605055332184,
5506
+ "Нет": 0.551052987575531
5507
  },
5508
  "sample": {
5509
  "messages": [
 
5541
  "acc": false
5542
  },
5543
  "predict": {
5544
+ "Да": 0.2900485098361969,
5545
+ "Нет": 0.69579017162323
5546
  },
5547
  "sample": {
5548
  "messages": [
 
5580
  "acc": false
5581
  },
5582
  "predict": {
5583
+ "Да": 0.2201392948627472,
5584
+ "Нет": 0.7683616280555725
5585
  },
5586
  "sample": {
5587
  "messages": [
 
5616
  }
5617
  {
5618
  "metric": {
5619
+ "acc": true
5620
  },
5621
  "predict": {
5622
+ "Да": 0.5840882062911987,
5623
+ "Нет": 0.40143755078315735
5624
  },
5625
  "sample": {
5626
  "messages": [
 
5650
  "prompt_len": 105,
5651
  "generated_len": 1,
5652
  "generated_cumulative_logprob": "TODO: calculate for hf model",
5653
+ "generated_token": " Да"
5654
  }
5655
  }
5656
  {
 
5658
  "acc": false
5659
  },
5660
  "predict": {
5661
+ "Да": 0.4643440246582031,
5662
+ "Нет": 0.5261707305908203
5663
  },
5664
  "sample": {
5665
  "messages": [
 
5697
  "acc": false
5698
  },
5699
  "predict": {
5700
+ "Да": 0.24299320578575134,
5701
+ "Нет": 0.748471736907959
5702
  },
5703
  "sample": {
5704
  "messages": [
 
5736
  "acc": false
5737
  },
5738
  "predict": {
5739
+ "Да": 0.6578083634376526,
5740
+ "Нет": 0.14677689969539642
5741
  },
5742
  "sample": {
5743
  "messages": [
 
5772
  }
5773
  {
5774
  "metric": {
5775
+ "acc": true
5776
  },
5777
  "predict": {
5778
+ "Да": 0.7556150555610657,
5779
+ "Нет": 0.2164873480796814
5780
  },
5781
  "sample": {
5782
  "messages": [
 
5806
  "prompt_len": 128,
5807
  "generated_len": 1,
5808
  "generated_cumulative_logprob": "TODO: calculate for hf model",
5809
+ "generated_token": " Да"
5810
  }
5811
  }
5812
  {
 
5814
  "acc": false
5815
  },
5816
  "predict": {
5817
+ "Да": 0.402601957321167,
5818
+ "Нет": 0.5857823491096497
5819
  },
5820
  "sample": {
5821
  "messages": [
 
5853
  "acc": true
5854
  },
5855
  "predict": {
5856
+ "Да": 0.19968147575855255,
5857
+ "Нет": 0.7897555232048035
5858
  },
5859
  "sample": {
5860
  "messages": [
 
5892
  "acc": false
5893
  },
5894
  "predict": {
5895
+ "Да": 0.48904749751091003,
5896
+ "Нет": 0.48904749751091003
5897
  },
5898
  "sample": {
5899
  "messages": [
 
5931
  "acc": false
5932
  },
5933
  "predict": {
5934
+ "Да": 0.46068769693374634,
5935
+ "Нет": 0.5220275521278381
5936
  },
5937
  "sample": {
5938
  "messages": [
 
5970
  "acc": true
5971
  },
5972
  "predict": {
5973
+ "Да": 0.3446996510028839,
5974
+ "Нет": 0.6439837217330933
5975
  },
5976
  "sample": {
5977
  "messages": [
 
6006
  }
6007
  {
6008
  "metric": {
6009
+ "acc": false
6010
  },
6011
  "predict": {
6012
+ "Да": 0.5551593899726868,
6013
+ "Нет": 0.43235859274864197
6014
  },
6015
  "sample": {
6016
  "messages": [
 
6040
  "prompt_len": 206,
6041
  "generated_len": 1,
6042
  "generated_cumulative_logprob": "TODO: calculate for hf model",
6043
+ "generated_token": " Да"
6044
  }
6045
  }
6046
  {
6047
  "metric": {
6048
+ "acc": true
6049
  },
6050
  "predict": {
6051
+ "Да": 0.6718888282775879,
6052
+ "Нет": 0.31737783551216125
6053
  },
6054
  "sample": {
6055
  "messages": [
 
6079
  "prompt_len": 174,
6080
  "generated_len": 1,
6081
  "generated_cumulative_logprob": "TODO: calculate for hf model",
6082
+ "generated_token": " Да"
6083
  }
6084
  }
6085
  {
 
6087
  "acc": false
6088
  },
6089
  "predict": {
6090
+ "Да": 0.29172345995903015,
6091
+ "Нет": 0.6998081803321838
6092
  },
6093
  "sample": {
6094
  "messages": [
 
6126
  "acc": false
6127
  },
6128
  "predict": {
6129
+ "Да": 0.7152285575866699,
6130
+ "Нет": 0.14083683490753174
6131
  },
6132
  "sample": {
6133
  "messages": [
 
6162
  }
6163
  {
6164
  "metric": {
6165
+ "acc": true
6166
  },
6167
  "predict": {
6168
+ "Да": 0.6622115969657898,
6169
+ "Нет": 0.312806636095047
6170
  },
6171
  "sample": {
6172
  "messages": [
 
6196
  "prompt_len": 105,
6197
  "generated_len": 1,
6198
  "generated_cumulative_logprob": "TODO: calculate for hf model",
6199
+ "generated_token": " Да"
6200
  }
6201
  }
6202
  {
 
6204
  "acc": false
6205
  },
6206
  "predict": {
6207
+ "Да": 0.3157346844673157,
6208
+ "Нет": 0.6684103012084961
6209
  },
6210
  "sample": {
6211
  "messages": [
 
6243
  "acc": true
6244
  },
6245
  "predict": {
6246
+ "Да": 0.316948264837265,
6247
+ "Нет": 0.6709794402122498
6248
  },
6249
  "sample": {
6250
  "messages": [
 
6279
  }
6280
  {
6281
  "metric": {
6282
+ "acc": false
6283
  },
6284
  "predict": {
6285
+ "Да": 0.4906494915485382,
6286
+ "Нет": 0.4906494915485382
6287
  },
6288
  "sample": {
6289
  "messages": [
 
6313
  "prompt_len": 114,
6314
  "generated_len": 1,
6315
  "generated_cumulative_logprob": "TODO: calculate for hf model",
6316
+ "generated_token": " Да"
6317
  }
6318
  }
6319
  {
 
6321
  "acc": false
6322
  },
6323
  "predict": {
6324
+ "Да": 0.032708365470170975,
6325
+ "Нет": 0.9558786153793335
6326
  },
6327
  "sample": {
6328
  "messages": [
 
6360
  "acc": true
6361
  },
6362
  "predict": {
6363
+ "Да": 0.4026472568511963,
6364
+ "Нет": 0.5858482718467712
6365
  },
6366
  "sample": {
6367
  "messages": [
 
6399
  "acc": true
6400
  },
6401
  "predict": {
6402
+ "Да": 0.7218838930130005,
6403
+ "Нет": 0.26556628942489624
6404
  },
6405
  "sample": {
6406
  "messages": [
 
6438
  "acc": false
6439
  },
6440
  "predict": {
6441
+ "Да": 0.16291382908821106,
6442
+ "Нет": 0.8273447155952454
6443
  },
6444
  "sample": {
6445
  "messages": [
 
6477
  "acc": true
6478
  },
6479
  "predict": {
6480
+ "Да": 0.06710854917764664,
6481
+ "Нет": 0.9264049530029297
6482
  },
6483
  "sample": {
6484
  "messages": [
 
6516
  "acc": true
6517
  },
6518
  "predict": {
6519
+ "Да": 0.7096846699714661,
6520
+ "Нет": 0.1583520472049713
6521
  },
6522
  "sample": {
6523
  "messages": [
 
6552
  }
6553
  {
6554
  "metric": {
6555
+ "acc": true
6556
  },
6557
  "predict": {
6558
+ "Да": 0.6868926286697388,
6559
+ "Нет": 0.2863394320011139
6560
  },
6561
  "sample": {
6562
  "messages": [
 
6586
  "prompt_len": 104,
6587
  "generated_len": 1,
6588
  "generated_cumulative_logprob": "TODO: calculate for hf model",
6589
+ "generated_token": " Да"
6590
  }
6591
  }
6592
  {
 
6594
  "acc": false
6595
  },
6596
  "predict": {
6597
+ "Да": 0.431057870388031,
6598
+ "Нет": 0.5534892678260803
6599
  },
6600
  "sample": {
6601
  "messages": [
 
6633
  "acc": false
6634
  },
6635
  "predict": {
6636
+ "Да": 0.11795960366725922,
6637
+ "Нет": 0.871610164642334
6638
  },
6639
  "sample": {
6640
  "messages": [
 
6672
  "acc": true
6673
  },
6674
  "predict": {
6675
+ "Да": 0.3427065312862396,
6676
+ "Нет": 0.6402600407600403
6677
  },
6678
  "sample": {
6679
  "messages": [
 
6711
  "acc": true
6712
  },
6713
  "predict": {
6714
+ "Да": 0.0841980129480362,
6715
+ "Нет": 0.9052139520645142
6716
  },
6717
  "sample": {
6718
  "messages": [
 
6750
  "acc": true
6751
  },
6752
  "predict": {
6753
+ "Да": 0.4628131091594696,
6754
+ "Нет": 0.5244359374046326
6755
  },
6756
  "sample": {
6757
  "messages": [
 
6789
  "acc": true
6790
  },
6791
  "predict": {
6792
+ "Да": 0.49528276920318604,
6793
+ "Нет": 0.49528276920318604
6794
  },
6795
  "sample": {
6796
  "messages": [
 
6828
  "acc": true
6829
  },
6830
  "predict": {
6831
+ "Да": 0.4035145938396454,
6832
+ "Нет": 0.5871102809906006
6833
  },
6834
  "sample": {
6835
  "messages": [
 
6867
  "acc": true
6868
  },
6869
  "predict": {
6870
+ "Да": 0.11797474324703217,
6871
+ "Нет": 0.8717220425605774
6872
  },
6873
  "sample": {
6874
  "messages": [
 
6906
  "acc": true
6907
  },
6908
  "predict": {
6909
+ "Да": 0.7332377433776855,
6910
+ "Нет": 0.12741760909557343
6911
  },
6912
  "sample": {
6913
  "messages": [
 
6942
  }
6943
  {
6944
  "metric": {
6945
+ "acc": true
6946
  },
6947
  "predict": {
6948
+ "Да": 0.8314090371131897,
6949
+ "Нет": 0.1444772183895111
6950
  },
6951
  "sample": {
6952
  "messages": [
 
6976
  "prompt_len": 129,
6977
  "generated_len": 1,
6978
  "generated_cumulative_logprob": "TODO: calculate for hf model",
6979
+ "generated_token": " Да"
6980
  }
6981
  }
6982
  {
 
6984
  "acc": true
6985
  },
6986
  "predict": {
6987
+ "Да": 0.4627135396003723,
6988
+ "Нет": 0.5243231058120728
6989
  },
6990
  "sample": {
6991
  "messages": [
 
7023
  "acc": true
7024
  },
7025
  "predict": {
7026
+ "Да": 0.06682266294956207,
7027
+ "Нет": 0.9224584102630615
7028
  },
7029
  "sample": {
7030
  "messages": [
 
7059
  }
7060
  {
7061
  "metric": {
7062
+ "acc": false
7063
  },
7064
  "predict": {
7065
+ "Да": 0.5513796210289001,
7066
+ "Нет": 0.42941489815711975
7067
  },
7068
  "sample": {
7069
  "messages": [
 
7093
  "prompt_len": 143,
7094
  "generated_len": 1,
7095
  "generated_cumulative_logprob": "TODO: calculate for hf model",
7096
+ "generated_token": " Да"
7097
  }
7098
  }
7099
  {
 
7101
  "acc": true
7102
  },
7103
  "predict": {
7104
+ "Да": 0.4604188799858093,
7105
+ "Нет": 0.5217229723930359
7106
  },
7107
  "sample": {
7108
  "messages": [
 
7140
  "acc": true
7141
  },
7142
  "predict": {
7143
+ "Да": 0.2908935546875,
7144
+ "Нет": 0.6978173851966858
7145
  },
7146
  "sample": {
7147
  "messages": [
 
7176
  }
7177
  {
7178
  "metric": {
7179
+ "acc": false
7180
  },
7181
  "predict": {
7182
+ "Да": 0.5554031729698181,
7183
+ "Нет": 0.4325484335422516
7184
  },
7185
  "sample": {
7186
  "messages": [
 
7210
  "prompt_len": 141,
7211
  "generated_len": 1,
7212
  "generated_cumulative_logprob": "TODO: calculate for hf model",
7213
+ "generated_token": " Да"
7214
  }
7215
  }
7216
  {
 
7218
  "acc": false
7219
  },
7220
  "predict": {
7221
+ "Да": 0.3448418080806732,
7222
+ "Нет": 0.6442492604255676
7223
  },
7224
  "sample": {
7225
  "messages": [
 
7257
  "acc": false
7258
  },
7259
  "predict": {
7260
+ "Да": 0.24322527647018433,
7261
+ "Нет": 0.749186635017395
7262
  },
7263
  "sample": {
7264
  "messages": [
 
7296
  "acc": false
7297
  },
7298
  "predict": {
7299
+ "Да": 0.6574149131774902,
7300
+ "Нет": 0.1142415776848793
7301
  },
7302
  "sample": {
7303
  "messages": [
 
7335
  "acc": true
7336
  },
7337
  "predict": {
7338
+ "Да": 0.8883598446846008,
7339
+ "Нет": 0.09363243728876114
7340
  },
7341
  "sample": {
7342
  "messages": [
 
7371
  }
7372
  {
7373
  "metric": {
7374
+ "acc": false
7375
  },
7376
  "predict": {
7377
+ "Да": 0.5544126033782959,
7378
+ "Нет": 0.4317769706249237
7379
  },
7380
  "sample": {
7381
  "messages": [
 
7405
  "prompt_len": 256,
7406
  "generated_len": 1,
7407
  "generated_cumulative_logprob": "TODO: calculate for hf model",
7408
+ "generated_token": " Да"
7409
  }
7410
  }
7411
  {
 
7413
  "acc": true
7414
  },
7415
  "predict": {
7416
+ "Да": 0.24236559867858887,
7417
+ "Нет": 0.7465386390686035
7418
  },
7419
  "sample": {
7420
  "messages": [
 
7452
  "acc": true
7453
  },
7454
  "predict": {
7455
+ "Да": 0.36993473768234253,
7456
+ "Нет": 0.6099192500114441
7457
  },
7458
  "sample": {
7459
  "messages": [
 
7491
  "acc": true
7492
  },
7493
  "predict": {
7494
+ "Да": 0.8060645461082458,
7495
+ "Нет": 0.1798573136329651
7496
  },
7497
  "sample": {
7498
  "messages": [
 
7530
  "acc": false
7531
  },
7532
  "predict": {
7533
+ "Да": 0.4632495641708374,
7534
+ "Нет": 0.5249305367469788
7535
  },
7536
  "sample": {
7537
  "messages": [
 
7566
  }
7567
  {
7568
  "metric": {
7569
+ "acc": false
7570
  },
7571
  "predict": {
7572
+ "Да": 0.4942798912525177,
7573
+ "Нет": 0.4942798912525177
7574
  },
7575
  "sample": {
7576
  "messages": [
 
7600
  "prompt_len": 115,
7601
  "generated_len": 1,
7602
  "generated_cumulative_logprob": "TODO: calculate for hf model",
7603
+ "generated_token": " Да"
7604
  }
7605
  }
7606
  {
7607
  "metric": {
7608
+ "acc": true
7609
  },
7610
  "predict": {
7611
+ "Да": 0.5872390866279602,
7612
+ "Нет": 0.4036031663417816
7613
  },
7614
  "sample": {
7615
  "messages": [
 
7639
  "prompt_len": 175,
7640
  "generated_len": 1,
7641
  "generated_cumulative_logprob": "TODO: calculate for hf model",
7642
+ "generated_token": " Да"
7643
  }
7644
  }
7645
  {
 
7647
  "acc": true
7648
  },
7649
  "predict": {
7650
+ "Да": 0.266113817691803,
7651
+ "Нет": 0.723372220993042
7652
  },
7653
  "sample": {
7654
  "messages": [
 
7686
  "acc": true
7687
  },
7688
  "predict": {
7689
+ "Да": 0.2134118527173996,
7690
+ "Нет": 0.5801134705543518
7691
  },
7692
  "sample": {
7693
  "messages": [
 
7725
  "acc": false
7726
  },
7727
  "predict": {
7728
+ "Да": 0.8618310689926147,
7729
+ "Нет": 0.11663615703582764
7730
  },
7731
  "sample": {
7732
  "messages": [
 
7764
  "acc": false
7765
  },
7766
  "predict": {
7767
+ "Да": 0.14564481377601624,
7768
+ "Нет": 0.8381280899047852
7769
  },
7770
  "sample": {
7771
  "messages": [
 
7803
  "acc": false
7804
  },
7805
  "predict": {
7806
+ "Да": 0.34413546323776245,
7807
+ "Нет": 0.6429296731948853
7808
  },
7809
  "sample": {
7810
  "messages": [
 
7842
  "acc": false
7843
  },
7844
  "predict": {
7845
+ "Да": 0.6388486623764038,
7846
+ "Нет": 0.34195107221603394
7847
  },
7848
  "sample": {
7849
  "messages": [
 
7881
  "acc": false
7882
  },
7883
  "predict": {
7884
+ "Да": 0.13097181916236877,
7885
+ "Нет": 0.8540434837341309
7886
  },
7887
  "sample": {
7888
  "messages": [
 
7920
  "acc": true
7921
  },
7922
  "predict": {
7923
+ "Да": 0.21951821446418762,
7924
+ "Нет": 0.7661938071250916
7925
  },
7926
  "sample": {
7927
  "messages": [
llmtf_eval/darumeru_RWSD_params.jsonl CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "custom_generation_config": null,
3
  "model_params": {
4
- "model_name_or_path": "/scratch/tikhomirov/workdir/data/models/ruadapt_qwen2.5_1.5B_ext_u48_part1-2_lr5e4_bs128_reinit_peft_mlp_32_128_lm_head_mts_1e4_as1.5_nm_pv21/simpo2",
5
  "generation_config": {
6
  "bos_token_id": 145109,
7
  "do_sample": true,
@@ -36,7 +36,7 @@
36
  },
37
  "load_in_8bit": false,
38
  "torch_dtype": "auto",
39
- "use_flash_attention_2": true,
40
  "device_map": "cuda:0",
41
  "use_fast_tokenizer": true,
42
  "leading_space": false,
@@ -47,7 +47,7 @@
47
  "task_params": {
48
  "max_len": 4000,
49
  "few_shot_count": 0,
50
- "batch_size": 2,
51
  "max_sample_per_dataset": 10000000000000,
52
  "method": "calculate_tokens_proba"
53
  }
 
1
  {
2
  "custom_generation_config": null,
3
  "model_params": {
4
+ "model_name_or_path": "/scratch/tikhomirov/workdir/data/models/ruadapt_qwen2.5_1.5B_ext_u48_part1-2_lr5e4_bs128_reinit_peft_mlp_32_128_lm_head_mts_1e4_as1.75_v1/simpo2",
5
  "generation_config": {
6
  "bos_token_id": 145109,
7
  "do_sample": true,
 
36
  },
37
  "load_in_8bit": false,
38
  "torch_dtype": "auto",
39
+ "attn_implementation": "flash_attention_2",
40
  "device_map": "cuda:0",
41
  "use_fast_tokenizer": true,
42
  "leading_space": false,
 
47
  "task_params": {
48
  "max_len": 4000,
49
  "few_shot_count": 0,
50
+ "batch_size": 16,
51
  "max_sample_per_dataset": 10000000000000,
52
  "method": "calculate_tokens_proba"
53
  }
llmtf_eval/darumeru_RWSD_total.jsonl CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "task_name": "darumeru/RWSD",
3
  "results": {
4
- "acc": 0.5098039215686274
5
  },
6
- "leaderboard_result": 0.5098039215686274
7
  }
 
1
  {
2
  "task_name": "darumeru/RWSD",
3
  "results": {
4
+ "acc": 0.49019607843137253
5
  },
6
+ "leaderboard_result": 0.49019607843137253
7
  }
llmtf_eval/darumeru_USE.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
llmtf_eval/darumeru_USE_params.jsonl ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "custom_generation_config": null,
3
+ "model_params": {
4
+ "model_name_or_path": "/scratch/tikhomirov/workdir/data/models/ruadapt_qwen2.5_1.5B_ext_u48_part1-2_lr5e4_bs128_reinit_peft_mlp_32_128_lm_head_mts_1e4_as1.75_v1/simpo2",
5
+ "generation_config": {
6
+ "bos_token_id": 145109,
7
+ "do_sample": true,
8
+ "eos_token_id": [
9
+ 145111
10
+ ],
11
+ "max_length": 32768,
12
+ "max_new_tokens": 64,
13
+ "pad_token_id": 145109,
14
+ "stop_strings": [
15
+ "<|im_end|>"
16
+ ],
17
+ "temperature": 0.1,
18
+ "top_k": 40,
19
+ "top_p": 0.9,
20
+ "transformers_version": "4.45.2",
21
+ "trust_remote_code": false
22
+ },
23
+ "conversation_template": {
24
+ "system_prompt": "",
25
+ "system_message_template": "<|im_start|>{role}\n{content}<|im_end|>\n",
26
+ "user_message_template": "<|im_start|>{role}\n{content}<|im_end|>\n",
27
+ "bot_message_template": "<|im_start|>{role}\n{content}<|im_end|>\n",
28
+ "bot_message_template_incomplete": "<|im_start|>{role}\n{content}",
29
+ "user_role": "user",
30
+ "bot_role": "assistant",
31
+ "system_role": "system",
32
+ "global_prefix": "",
33
+ "suffix": "<|im_start|>assistant\n",
34
+ "add_special_tokens": false,
35
+ "eos_token": "<|im_end|>"
36
+ },
37
+ "load_in_8bit": false,
38
+ "torch_dtype": "auto",
39
+ "attn_implementation": "flash_attention_2",
40
+ "device_map": "cuda:0",
41
+ "use_fast_tokenizer": true,
42
+ "leading_space": false,
43
+ "space_token": null,
44
+ "trust_remote_code": false,
45
+ "max_model_len": 32768
46
+ },
47
+ "task_params": {
48
+ "max_len": 4000,
49
+ "few_shot_count": 0,
50
+ "batch_size": 16,
51
+ "max_sample_per_dataset": 10000000000000,
52
+ "method": "generate"
53
+ }
54
+ }
llmtf_eval/darumeru_USE_total.jsonl ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "task_name": "darumeru/USE",
3
+ "results": {
4
+ "grade_norm": 0.06078431372549018
5
+ },
6
+ "leaderboard_result": 0.06078431372549018
7
+ }
llmtf_eval/darumeru_cp_para_ru.jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
llmtf_eval/darumeru_cp_para_ru_params.jsonl CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "custom_generation_config": null,
3
  "model_params": {
4
- "model_name_or_path": "/scratch/tikhomirov/workdir/data/models/ruadapt_qwen2.5_1.5B_ext_u48_part1-2_lr5e4_bs128_reinit_peft_mlp_32_128_lm_head_mts_1e4_as1.5_nm_pv21/simpo2",
5
  "generation_config": {
6
  "bos_token_id": 145109,
7
  "do_sample": true,
@@ -36,7 +36,7 @@
36
  },
37
  "load_in_8bit": false,
38
  "torch_dtype": "auto",
39
- "use_flash_attention_2": true,
40
  "device_map": "cuda:0",
41
  "use_fast_tokenizer": true,
42
  "leading_space": false,
@@ -47,7 +47,7 @@
47
  "task_params": {
48
  "max_len": 4000,
49
  "few_shot_count": 0,
50
- "batch_size": 2,
51
  "max_sample_per_dataset": 10000000000000,
52
  "method": "generate"
53
  }
 
1
  {
2
  "custom_generation_config": null,
3
  "model_params": {
4
+ "model_name_or_path": "/scratch/tikhomirov/workdir/data/models/ruadapt_qwen2.5_1.5B_ext_u48_part1-2_lr5e4_bs128_reinit_peft_mlp_32_128_lm_head_mts_1e4_as1.75_v1/simpo2",
5
  "generation_config": {
6
  "bos_token_id": 145109,
7
  "do_sample": true,
 
36
  },
37
  "load_in_8bit": false,
38
  "torch_dtype": "auto",
39
+ "attn_implementation": "flash_attention_2",
40
  "device_map": "cuda:0",
41
  "use_fast_tokenizer": true,
42
  "leading_space": false,
 
47
  "task_params": {
48
  "max_len": 4000,
49
  "few_shot_count": 0,
50
+ "batch_size": 16,
51
  "max_sample_per_dataset": 10000000000000,
52
  "method": "generate"
53
  }
llmtf_eval/darumeru_cp_para_ru_total.jsonl CHANGED
@@ -1,9 +1,10 @@
1
  {
2
  "task_name": "darumeru/cp_para_ru",
3
  "results": {
4
- "symbol_per_token": 3.8978256135744003,
5
- "len": 0.8764597602663033,
6
- "lcs": 0.05
 
7
  },
8
- "leaderboard_result": 0.05
9
  }
 
1
  {
2
  "task_name": "darumeru/cp_para_ru",
3
  "results": {
4
+ "tokens_per_word": 1.905314928817744,
5
+ "symbol_per_token": 3.913951487866651,
6
+ "len": 0.9904780330832407,
7
+ "lcs": 0.8
8
  },
9
+ "leaderboard_result": 0.8
10
  }
llmtf_eval/evaluation_log.txt CHANGED
@@ -1,251 +1,272 @@
1
- INFO: 2024-11-18 14:18:48,851: llmtf.base.evaluator: Starting eval on ['darumeru/multiq']
2
- INFO: 2024-11-18 14:18:48,852: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [145111]
3
- INFO: 2024-11-18 14:18:48,852: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['<|im_end|>']
4
- INFO: 2024-11-18 14:18:50,653: llmtf.base.evaluator: Starting eval on ['darumeru/parus']
5
- INFO: 2024-11-18 14:18:50,654: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [145111]
6
- INFO: 2024-11-18 14:18:50,654: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['<|im_end|>']
7
- INFO: 2024-11-18 14:18:52,696: llmtf.base.darumeru/MultiQ: Loading Dataset: 3.84s
8
- INFO: 2024-11-18 14:18:52,936: llmtf.base.evaluator: Starting eval on ['darumeru/rcb']
9
- INFO: 2024-11-18 14:18:52,936: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [145111]
10
- INFO: 2024-11-18 14:18:52,936: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['<|im_end|>']
11
- INFO: 2024-11-18 14:18:53,102: llmtf.base.darumeru/PARus: Loading Dataset: 2.45s
12
- INFO: 2024-11-18 14:18:54,811: llmtf.base.evaluator: Starting eval on ['darumeru/ruopenbookqa']
13
- INFO: 2024-11-18 14:18:54,811: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [145111]
14
- INFO: 2024-11-18 14:18:54,811: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['<|im_end|>']
15
- INFO: 2024-11-18 14:18:55,708: llmtf.base.darumeru/RCB: Loading Dataset: 2.77s
16
- INFO: 2024-11-18 14:18:56,455: llmtf.base.darumeru/PARus: Processing Dataset: 3.35s
17
- INFO: 2024-11-18 14:18:56,457: llmtf.base.darumeru/PARus: Results for darumeru/PARus:
18
- INFO: 2024-11-18 14:18:56,470: llmtf.base.darumeru/PARus: {'acc': 0.24}
19
- INFO: 2024-11-18 14:18:56,471: llmtf.base.evaluator: Ended eval
20
- INFO: 2024-11-18 14:18:56,474: llmtf.base.evaluator:
21
- mean darumeru/PARus
22
- 0.240 0.240
23
- INFO: 2024-11-18 14:18:56,487: llmtf.base.evaluator: Starting eval on ['darumeru/ruworldtree']
24
- INFO: 2024-11-18 14:18:56,488: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [145111]
25
- INFO: 2024-11-18 14:18:56,488: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['<|im_end|>']
26
- INFO: 2024-11-18 14:18:58,099: llmtf.base.darumeru/ruOpenBookQA: Loading Dataset: 3.29s
27
- INFO: 2024-11-18 14:18:58,743: llmtf.base.evaluator: Starting eval on ['darumeru/rwsd']
28
- INFO: 2024-11-18 14:18:58,744: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [145111]
29
- INFO: 2024-11-18 14:18:58,744: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['<|im_end|>']
30
- INFO: 2024-11-18 14:18:58,925: llmtf.base.darumeru/ruWorldTree: Loading Dataset: 2.44s
31
- INFO: 2024-11-18 14:19:00,968: llmtf.base.evaluator: Starting eval on ['daru/treewayextractive']
32
- INFO: 2024-11-18 14:19:00,968: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [145111]
33
- INFO: 2024-11-18 14:19:00,968: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['<|im_end|>']
34
- INFO: 2024-11-18 14:19:01,031: llmtf.base.darumeru/RCB: Processing Dataset: 5.32s
35
- INFO: 2024-11-18 14:19:01,033: llmtf.base.darumeru/RCB: Results for darumeru/RCB:
36
- INFO: 2024-11-18 14:19:01,040: llmtf.base.darumeru/RCB: {'acc': 0.4727272727272727, 'f1_macro': 0.39356669305497743}
37
- INFO: 2024-11-18 14:19:01,041: llmtf.base.evaluator: Ended eval
38
- INFO: 2024-11-18 14:19:01,044: llmtf.base.evaluator:
39
- mean darumeru/PARus darumeru/RCB
40
- 0.337 0.240 0.433
41
- INFO: 2024-11-18 14:19:01,497: llmtf.base.darumeru/RWSD: Loading Dataset: 2.75s
42
- INFO: 2024-11-18 14:19:01,851: llmtf.base.darumeru/ruWorldTree: Processing Dataset: 2.92s
43
- INFO: 2024-11-18 14:19:01,852: llmtf.base.darumeru/ruWorldTree: Results for darumeru/ruWorldTree:
44
- INFO: 2024-11-18 14:19:01,859: llmtf.base.darumeru/ruWorldTree: {'acc': 0.7714285714285715, 'f1_macro': 0.7726851851851853}
45
- INFO: 2024-11-18 14:19:01,859: llmtf.base.evaluator: Ended eval
46
- INFO: 2024-11-18 14:19:01,863: llmtf.base.evaluator:
47
- mean darumeru/PARus darumeru/RCB darumeru/ruWorldTree
48
- 0.482 0.240 0.433 0.772
49
- INFO: 2024-11-18 14:19:02,889: llmtf.base.evaluator: Starting eval on ['daru/treewayabstractive']
50
- INFO: 2024-11-18 14:19:02,890: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [145111]
51
- INFO: 2024-11-18 14:19:02,890: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['<|im_end|>']
52
- INFO: 2024-11-18 14:19:03,199: llmtf.base.evaluator: Starting eval on ['nlpcoreteam/rummlu']
53
- INFO: 2024-11-18 14:19:03,199: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [145111]
54
- INFO: 2024-11-18 14:19:03,199: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['<|im_end|>']
55
- INFO: 2024-11-18 14:19:06,629: llmtf.base.darumeru/RWSD: Processing Dataset: 5.13s
56
- INFO: 2024-11-18 14:19:06,631: llmtf.base.darumeru/RWSD: Results for darumeru/RWSD:
57
- INFO: 2024-11-18 14:19:06,635: llmtf.base.darumeru/RWSD: {'acc': 0.5098039215686274}
58
- INFO: 2024-11-18 14:19:06,636: llmtf.base.evaluator: Ended eval
59
- INFO: 2024-11-18 14:19:06,641: llmtf.base.evaluator:
60
- mean darumeru/PARus darumeru/RCB darumeru/RWSD darumeru/ruWorldTree
61
- 0.489 0.240 0.433 0.510 0.772
62
- INFO: 2024-11-18 14:19:06,885: llmtf.base.daru/treewayabstractive: Loading Dataset: 4.00s
63
- INFO: 2024-11-18 14:19:07,496: llmtf.base.evaluator: Starting eval on ['darumeru/cp_para_ru']
64
- INFO: 2024-11-18 14:19:07,497: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [145111]
65
- INFO: 2024-11-18 14:19:07,497: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['<|im_end|>']
66
- INFO: 2024-11-18 14:19:10,509: llmtf.base.darumeru/cp_para_ru: Loading Dataset: 3.01s
67
- INFO: 2024-11-18 14:19:13,909: llmtf.base.daru/treewayextractive: Loading Dataset: 12.94s
68
- INFO: 2024-11-18 14:19:44,800: llmtf.base.darumeru/ruOpenBookQA: Processing Dataset: 46.70s
69
- INFO: 2024-11-18 14:19:44,801: llmtf.base.darumeru/ruOpenBookQA: Results for darumeru/ruOpenBookQA:
70
- INFO: 2024-11-18 14:19:44,814: llmtf.base.darumeru/ruOpenBookQA: {'acc': 0.615979381443299, 'f1_macro': 0.6154023944317246}
71
- INFO: 2024-11-18 14:19:44,821: llmtf.base.evaluator: Ended eval
72
- INFO: 2024-11-18 14:19:44,826: llmtf.base.evaluator:
73
- mean darumeru/PARus darumeru/RCB darumeru/RWSD darumeru/ruOpenBookQA darumeru/ruWorldTree
74
- 0.514 0.240 0.433 0.510 0.616 0.772
75
- INFO: 2024-11-18 14:21:05,969: llmtf.base.nlpcoreteam/ruMMLU: Loading Dataset: 122.77s
76
- INFO: 2024-11-18 14:21:35,520: llmtf.base.daru/treewayextractive: Processing Dataset: 141.61s
77
- INFO: 2024-11-18 14:21:35,523: llmtf.base.daru/treewayextractive: Results for daru/treewayextractive:
78
- INFO: 2024-11-18 14:21:35,753: llmtf.base.daru/treewayextractive: {'r-prec': 0.3782488455988456}
79
- INFO: 2024-11-18 14:21:35,793: llmtf.base.evaluator: Ended eval
80
- INFO: 2024-11-18 14:21:35,799: llmtf.base.evaluator:
81
- mean daru/treewayextractive darumeru/PARus darumeru/RCB darumeru/RWSD darumeru/ruOpenBookQA darumeru/ruWorldTree
82
- 0.491 0.378 0.240 0.433 0.510 0.616 0.772
83
- INFO: 2024-11-18 14:24:06,030: llmtf.base.darumeru/cp_para_ru: Processing Dataset: 295.52s
84
- INFO: 2024-11-18 14:24:06,032: llmtf.base.darumeru/cp_para_ru: Results for darumeru/cp_para_ru:
85
- INFO: 2024-11-18 14:24:06,036: llmtf.base.darumeru/cp_para_ru: {'symbol_per_token': 3.8978256135744003, 'len': 0.8764597602663033, 'lcs': 0.05}
86
- INFO: 2024-11-18 14:24:06,036: llmtf.base.evaluator: Ended eval
87
- INFO: 2024-11-18 14:24:06,041: llmtf.base.evaluator:
88
- mean daru/treewayextractive darumeru/PARus darumeru/RCB darumeru/RWSD darumeru/cp_para_ru darumeru/ruOpenBookQA darumeru/ruWorldTree
89
- 0.428 0.378 0.240 0.433 0.510 0.050 0.616 0.772
90
- INFO: 2024-11-18 14:24:17,174: llmtf.base.daru/treewayabstractive: Processing Dataset: 310.29s
91
- INFO: 2024-11-18 14:24:17,190: llmtf.base.daru/treewayabstractive: Results for daru/treewayabstractive:
92
- INFO: 2024-11-18 14:24:17,208: llmtf.base.daru/treewayabstractive: {'rouge1': 0.31023763628891676, 'rouge2': 0.09443696323171702}
93
- INFO: 2024-11-18 14:24:17,210: llmtf.base.evaluator: Ended eval
94
- INFO: 2024-11-18 14:24:17,215: llmtf.base.evaluator:
95
- mean daru/treewayabstractive daru/treewayextractive darumeru/PARus darumeru/RCB darumeru/RWSD darumeru/cp_para_ru darumeru/ruOpenBookQA darumeru/ruWorldTree
96
- 0.400 0.202 0.378 0.240 0.433 0.510 0.050 0.616 0.772
97
- INFO: 2024-11-18 14:26:06,991: llmtf.base.nlpcoreteam/ruMMLU: Processing Dataset: 301.02s
98
- INFO: 2024-11-18 14:26:06,993: llmtf.base.nlpcoreteam/ruMMLU: Results for nlpcoreteam/ruMMLU:
99
- INFO: 2024-11-18 14:26:07,037: llmtf.base.nlpcoreteam/ruMMLU: metric
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  subject
101
- abstract_algebra 0.280000
102
- anatomy 0.400000
103
- astronomy 0.572368
104
- business_ethics 0.460000
105
- clinical_knowledge 0.494340
106
- college_biology 0.375000
107
- college_chemistry 0.290000
108
- college_computer_science 0.400000
109
- college_mathematics 0.400000
110
- college_medicine 0.491329
111
- college_physics 0.362745
112
- computer_security 0.500000
113
- conceptual_physics 0.421277
114
- econometrics 0.280702
115
- electrical_engineering 0.427586
116
- elementary_mathematics 0.391534
117
- formal_logic 0.373016
118
- global_facts 0.230000
119
- high_school_biology 0.496774
120
- high_school_chemistry 0.458128
121
- high_school_computer_science 0.500000
122
- high_school_european_history 0.600000
123
- high_school_geography 0.535354
124
- high_school_government_and_politics 0.518135
125
- high_school_macroeconomics 0.471795
126
- high_school_mathematics 0.400000
127
- high_school_microeconomics 0.462185
128
- high_school_physics 0.291391
129
- high_school_psychology 0.614679
130
- high_school_statistics 0.490741
131
- high_school_us_history 0.534314
132
- high_school_world_history 0.624473
133
- human_aging 0.520179
134
- human_sexuality 0.519084
135
- international_law 0.694215
136
- jurisprudence 0.537037
137
- logical_fallacies 0.472393
138
- machine_learning 0.258929
139
- management 0.640777
140
- marketing 0.700855
141
- medical_genetics 0.480000
142
- miscellaneous 0.533844
143
- moral_disputes 0.488439
144
- moral_scenarios 0.268156
145
- nutrition 0.526144
146
- philosophy 0.543408
147
- prehistory 0.475309
148
- professional_accounting 0.347518
149
- professional_law 0.345502
150
- professional_medicine 0.426471
151
- professional_psychology 0.411765
152
- public_relations 0.427273
153
- security_studies 0.542857
154
- sociology 0.686567
155
- us_foreign_policy 0.700000
156
- virology 0.379518
157
- world_religions 0.538012
158
- INFO: 2024-11-18 14:26:07,045: llmtf.base.nlpcoreteam/ruMMLU: metric
159
- subject
160
- STEM 0.406471
161
- humanities 0.499559
162
- other (business, health, misc.) 0.473641
163
- social sciences 0.514200
164
- INFO: 2024-11-18 14:26:07,053: llmtf.base.nlpcoreteam/ruMMLU: {'acc': 0.47346768524963356}
165
- INFO: 2024-11-18 14:26:07,087: llmtf.base.evaluator: Ended eval
166
- INFO: 2024-11-18 14:26:07,096: llmtf.base.evaluator:
167
- mean daru/treewayabstractive daru/treewayextractive darumeru/PARus darumeru/RCB darumeru/RWSD darumeru/cp_para_ru darumeru/ruOpenBookQA darumeru/ruWorldTree nlpcoreteam/ruMMLU
168
- 0.408 0.202 0.378 0.240 0.433 0.510 0.050 0.616 0.772 0.473
169
- INFO: 2024-11-18 14:29:49,600: llmtf.base.darumeru/MultiQ: Processing Dataset: 656.90s
170
- INFO: 2024-11-18 14:29:49,603: llmtf.base.darumeru/MultiQ: Results for darumeru/MultiQ:
171
- INFO: 2024-11-18 14:29:49,608: llmtf.base.darumeru/MultiQ: {'f1': 0.20613243758223346, 'em': 0.11281070745697896}
172
- INFO: 2024-11-18 14:29:49,612: llmtf.base.evaluator: Ended eval
173
- INFO: 2024-11-18 14:29:49,634: llmtf.base.evaluator:
174
- mean daru/treewayabstractive daru/treewayextractive darumeru/MultiQ darumeru/PARus darumeru/RCB darumeru/RWSD darumeru/cp_para_ru darumeru/ruOpenBookQA darumeru/ruWorldTree nlpcoreteam/ruMMLU
175
- 0.383 0.202 0.378 0.159 0.240 0.433 0.510 0.050 0.616 0.772 0.473
176
- INFO: 2024-11-18 14:29:55,578: llmtf.base.evaluator: Starting eval on ['nlpcoreteam/enmmlu']
177
- INFO: 2024-11-18 14:29:55,579: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [145111]
178
- INFO: 2024-11-18 14:29:55,579: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['<|im_end|>']
179
- INFO: 2024-11-18 14:31:56,928: llmtf.base.nlpcoreteam/enMMLU: Loading Dataset: 121.35s
180
- INFO: 2024-11-18 14:36:48,348: llmtf.base.nlpcoreteam/enMMLU: Processing Dataset: 291.42s
181
- INFO: 2024-11-18 14:36:48,352: llmtf.base.nlpcoreteam/enMMLU: Results for nlpcoreteam/enMMLU:
182
- INFO: 2024-11-18 14:36:48,396: llmtf.base.nlpcoreteam/enMMLU: metric
183
- subject
184
- abstract_algebra 0.350000
185
- anatomy 0.562963
186
- astronomy 0.664474
187
- business_ethics 0.640000
188
- clinical_knowledge 0.652830
189
- college_biology 0.652778
190
- college_chemistry 0.450000
191
- college_computer_science 0.490000
192
- college_mathematics 0.310000
193
  college_medicine 0.618497
194
- college_physics 0.500000
195
- computer_security 0.710000
196
- conceptual_physics 0.587234
197
- econometrics 0.429825
198
- electrical_engineering 0.531034
199
- elementary_mathematics 0.460317
200
- formal_logic 0.373016
201
- global_facts 0.270000
202
- high_school_biology 0.748387
203
- high_school_chemistry 0.522167
204
- high_school_computer_science 0.620000
205
- high_school_european_history 0.733333
206
- high_school_geography 0.747475
207
  high_school_government_and_politics 0.808290
208
- high_school_macroeconomics 0.658974
209
- high_school_mathematics 0.403704
210
- high_school_microeconomics 0.684874
211
- high_school_physics 0.390728
212
- high_school_psychology 0.822018
213
- high_school_statistics 0.550926
214
- high_school_us_history 0.720588
215
- high_school_world_history 0.742616
216
- human_aging 0.623318
217
- human_sexuality 0.687023
218
- international_law 0.710744
219
- jurisprudence 0.759259
220
- logical_fallacies 0.742331
221
- machine_learning 0.419643
222
- management 0.747573
223
- marketing 0.824786
224
  medical_genetics 0.690000
225
- miscellaneous 0.708812
226
- moral_disputes 0.641618
227
- moral_scenarios 0.252514
228
- nutrition 0.653595
229
  philosophy 0.668810
230
- prehistory 0.675926
231
- professional_accounting 0.510638
232
- professional_law 0.397653
233
- professional_medicine 0.602941
234
- professional_psychology 0.589869
235
- public_relations 0.609091
236
- security_studies 0.673469
237
- sociology 0.776119
238
- us_foreign_policy 0.740000
239
  virology 0.463855
240
- world_religions 0.783626
241
- INFO: 2024-11-18 14:36:48,403: llmtf.base.nlpcoreteam/enMMLU: metric
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
  subject
243
- STEM 0.520077
244
- humanities 0.630926
245
- other (business, health, misc.) 0.612129
246
- social sciences 0.685586
247
- INFO: 2024-11-18 14:36:48,425: llmtf.base.nlpcoreteam/enMMLU: {'acc': 0.6121795307919802}
248
- INFO: 2024-11-18 14:36:48,459: llmtf.base.evaluator: Ended eval
249
- INFO: 2024-11-18 14:36:48,480: llmtf.base.evaluator:
250
- mean daru/treewayabstractive daru/treewayextractive darumeru/MultiQ darumeru/PARus darumeru/RCB darumeru/RWSD darumeru/cp_para_ru darumeru/ruOpenBookQA darumeru/ruWorldTree nlpcoreteam/enMMLU nlpcoreteam/ruMMLU
251
- 0.404 0.202 0.378 0.159 0.240 0.433 0.510 0.050 0.616 0.772 0.612 0.473
 
1
+ INFO: 2024-11-26 20:17:25,790: llmtf.base.evaluator: Starting eval on ['darumeru/multiq', 'darumeru/parus', 'darumeru/rcb', 'darumeru/rwsd', 'darumeru/use']
2
+ INFO: 2024-11-26 20:17:25,791: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [145111]
3
+ INFO: 2024-11-26 20:17:25,791: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['<|im_end|>']
4
+ INFO: 2024-11-26 20:17:27,525: llmtf.base.evaluator: Starting eval on ['nlpcoreteam/rummlu']
5
+ INFO: 2024-11-26 20:17:27,525: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [145111]
6
+ INFO: 2024-11-26 20:17:27,525: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['<|im_end|>']
7
+ INFO: 2024-11-26 20:17:29,517: llmtf.base.evaluator: Starting eval on ['nlpcoreteam/enmmlu']
8
+ INFO: 2024-11-26 20:17:29,517: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [145111]
9
+ INFO: 2024-11-26 20:17:29,517: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['<|im_end|>']
10
+ INFO: 2024-11-26 20:17:30,060: llmtf.base.darumeru/MultiQ: Loading Dataset: 4.27s
11
+ INFO: 2024-11-26 20:17:31,597: llmtf.base.evaluator: Starting eval on ['daru/treewayabstractive']
12
+ INFO: 2024-11-26 20:17:31,597: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [145111]
13
+ INFO: 2024-11-26 20:17:31,597: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['<|im_end|>']
14
+ INFO: 2024-11-26 20:17:33,345: llmtf.base.evaluator: Starting eval on ['darumeru/cp_para_ru']
15
+ INFO: 2024-11-26 20:17:33,345: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [145111]
16
+ INFO: 2024-11-26 20:17:33,345: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['<|im_end|>']
17
+ INFO: 2024-11-26 20:17:35,521: llmtf.base.evaluator: Starting eval on ['vikhrmodels/habr_qa_sbs', 'ruparam', 'shlepa/moviesmc', 'shlepa/musicmc', 'shlepa/lawmc', 'shlepa/booksmc']
18
+ INFO: 2024-11-26 20:17:35,521: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [145111]
19
+ INFO: 2024-11-26 20:17:35,521: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['<|im_end|>']
20
+ INFO: 2024-11-26 20:17:36,308: llmtf.base.darumeru/cp_para_ru: Loading Dataset: 2.96s
21
+ INFO: 2024-11-26 20:17:36,742: llmtf.base.daru/treewayabstractive: Loading Dataset: 5.14s
22
+ INFO: 2024-11-26 20:17:37,659: llmtf.base.evaluator: Starting eval on ['ruopinionne']
23
+ INFO: 2024-11-26 20:17:37,659: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [145111]
24
+ INFO: 2024-11-26 20:17:37,660: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['<|im_end|>']
25
+ INFO: 2024-11-26 20:17:37,984: llmtf.base.ruopinionne: Loading Dataset: 0.32s
26
+ INFO: 2024-11-26 20:17:39,551: llmtf.base.evaluator: Starting eval on ['nerel']
27
+ INFO: 2024-11-26 20:17:39,551: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [145111]
28
+ INFO: 2024-11-26 20:17:39,551: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['<|im_end|>']
29
+ INFO: 2024-11-26 20:17:43,423: llmtf.base.NEREL: Loading Dataset: 3.87s
30
+ INFO: 2024-11-26 20:17:48,557: llmtf.base.vikhrmodels/habr_qa_sbs: Loading Dataset: 13.04s
31
+ INFO: 2024-11-26 20:18:44,135: llmtf.base.darumeru/cp_para_ru: Processing Dataset: 67.83s
32
+ INFO: 2024-11-26 20:18:44,145: llmtf.base.darumeru/cp_para_ru: Results for darumeru/cp_para_ru:
33
+ INFO: 2024-11-26 20:18:44,149: llmtf.base.darumeru/cp_para_ru: {'tokens_per_word': 1.905314928817744, 'symbol_per_token': 3.913951487866651, 'len': 0.9904780330832407, 'lcs': 0.8}
34
+ INFO: 2024-11-26 20:18:44,150: llmtf.base.evaluator: Ended eval
35
+ INFO: 2024-11-26 20:18:44,152: llmtf.base.evaluator:
36
+ mean darumeru/cp_para_ru
37
+ 0.800 0.800
38
+ INFO: 2024-11-26 20:18:50,857: llmtf.base.NEREL: Processing Dataset: 67.43s
39
+ INFO: 2024-11-26 20:18:50,860: llmtf.base.NEREL: Results for NEREL:
40
+ INFO: 2024-11-26 20:18:50,864: llmtf.base.NEREL: {'tp': 2.0, 'fp': 27.0, 'fn': 519.0, 'micro-f1': 0.00727272727272595}
41
+ INFO: 2024-11-26 20:18:50,865: llmtf.base.evaluator: Ended eval
42
+ INFO: 2024-11-26 20:18:50,869: llmtf.base.evaluator:
43
+ mean NEREL darumeru/cp_para_ru
44
+ 0.404 0.007 0.800
45
+ INFO: 2024-11-26 20:18:57,627: llmtf.base.daru/treewayabstractive: Processing Dataset: 80.88s
46
+ INFO: 2024-11-26 20:18:57,629: llmtf.base.daru/treewayabstractive: Results for daru/treewayabstractive:
47
+ INFO: 2024-11-26 20:18:57,632: llmtf.base.daru/treewayabstractive: {'rouge1': 0.3138417117532064, 'rouge2': 0.10462617373556911}
48
+ INFO: 2024-11-26 20:18:57,634: llmtf.base.evaluator: Ended eval
49
+ INFO: 2024-11-26 20:18:57,637: llmtf.base.evaluator:
50
+ mean NEREL daru/treewayabstractive darumeru/cp_para_ru
51
+ 0.339 0.007 0.209 0.800
52
+ INFO: 2024-11-26 20:19:13,379: llmtf.base.darumeru/MultiQ: Processing Dataset: 103.32s
53
+ INFO: 2024-11-26 20:19:13,384: llmtf.base.darumeru/MultiQ: Results for darumeru/MultiQ:
54
+ INFO: 2024-11-26 20:19:13,404: llmtf.base.darumeru/MultiQ: {'f1': 0.3016876852043635, 'em': 0.21319311663479923}
55
+ INFO: 2024-11-26 20:19:13,412: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [145111]
56
+ INFO: 2024-11-26 20:19:13,412: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['<|im_end|>']
57
+ INFO: 2024-11-26 20:19:15,573: llmtf.base.darumeru/PARus: Loading Dataset: 2.16s
58
+ INFO: 2024-11-26 20:19:19,861: llmtf.base.darumeru/PARus: Processing Dataset: 4.29s
59
+ INFO: 2024-11-26 20:19:19,867: llmtf.base.darumeru/PARus: Results for darumeru/PARus:
60
+ INFO: 2024-11-26 20:19:19,887: llmtf.base.darumeru/PARus: {'acc': 0.44}
61
+ INFO: 2024-11-26 20:19:19,888: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [145111]
62
+ INFO: 2024-11-26 20:19:19,888: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['<|im_end|>']
63
+ INFO: 2024-11-26 20:19:22,295: llmtf.base.darumeru/RCB: Loading Dataset: 2.40s
64
+ INFO: 2024-11-26 20:19:23,604: llmtf.base.ruopinionne: Processing Dataset: 105.62s
65
+ INFO: 2024-11-26 20:19:23,610: llmtf.base.ruopinionne: Results for ruopinionne:
66
+ INFO: 2024-11-26 20:19:23,639: llmtf.base.ruopinionne: {'f1': 0.02701209922104298}
67
+ INFO: 2024-11-26 20:19:23,640: llmtf.base.evaluator: Ended eval
68
+ INFO: 2024-11-26 20:19:23,656: llmtf.base.evaluator:
69
+ mean NEREL daru/treewayabstractive darumeru/MultiQ darumeru/PARus darumeru/cp_para_ru ruopinionne
70
+ 0.290 0.007 0.209 0.257 0.440 0.800 0.027
71
+ INFO: 2024-11-26 20:19:27,714: llmtf.base.darumeru/RCB: Processing Dataset: 5.42s
72
+ INFO: 2024-11-26 20:19:27,715: llmtf.base.darumeru/RCB: Results for darumeru/RCB:
73
+ INFO: 2024-11-26 20:19:27,722: llmtf.base.darumeru/RCB: {'acc': 0.4590909090909091, 'f1_macro': 0.36910715356478985}
74
+ INFO: 2024-11-26 20:19:27,724: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [145111]
75
+ INFO: 2024-11-26 20:19:27,724: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['<|im_end|>']
76
+ INFO: 2024-11-26 20:19:29,690: llmtf.base.darumeru/RWSD: Loading Dataset: 1.96s
77
+ INFO: 2024-11-26 20:19:34,850: llmtf.base.darumeru/RWSD: Processing Dataset: 5.16s
78
+ INFO: 2024-11-26 20:19:34,855: llmtf.base.darumeru/RWSD: Results for darumeru/RWSD:
79
+ INFO: 2024-11-26 20:19:34,858: llmtf.base.darumeru/RWSD: {'acc': 0.49019607843137253}
80
+ INFO: 2024-11-26 20:19:34,859: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [145111]
81
+ INFO: 2024-11-26 20:19:34,859: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['<|im_end|>']
82
+ INFO: 2024-11-26 20:19:38,002: llmtf.base.darumeru/USE: Loading Dataset: 3.14s
83
+ INFO: 2024-11-26 20:19:52,797: llmtf.base.nlpcoreteam/enMMLU: Loading Dataset: 143.28s
84
+ INFO: 2024-11-26 20:19:55,208: llmtf.base.nlpcoreteam/ruMMLU: Loading Dataset: 147.68s
85
+ INFO: 2024-11-26 20:20:33,495: llmtf.base.vikhrmodels/habr_qa_sbs: Processing Dataset: 164.94s
86
+ INFO: 2024-11-26 20:20:33,496: llmtf.base.vikhrmodels/habr_qa_sbs: Results for vikhrmodels/habr_qa_sbs:
87
+ INFO: 2024-11-26 20:20:33,533: llmtf.base.vikhrmodels/habr_qa_sbs: {'acc': 0.547, 'f1_macro': 0.5280856541414141}
88
+ INFO: 2024-11-26 20:20:33,546: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [145111]
89
+ INFO: 2024-11-26 20:20:33,547: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['<|im_end|>']
90
+ INFO: 2024-11-26 20:20:40,434: llmtf.base.ruparam: Loading Dataset: 6.89s
91
+ INFO: 2024-11-26 20:21:07,899: llmtf.base.darumeru/USE: Processing Dataset: 89.90s
92
+ INFO: 2024-11-26 20:21:07,900: llmtf.base.darumeru/USE: Results for darumeru/USE:
93
+ INFO: 2024-11-26 20:21:07,942: llmtf.base.darumeru/USE: {'grade_norm': 0.06078431372549018}
94
+ INFO: 2024-11-26 20:21:07,948: llmtf.base.evaluator: Ended eval
95
+ INFO: 2024-11-26 20:21:08,016: llmtf.base.evaluator:
96
+ mean NEREL daru/treewayabstractive darumeru/MultiQ darumeru/PARus darumeru/RCB darumeru/RWSD darumeru/USE darumeru/cp_para_ru ruopinionne vikhrmodels/habr_qa_sbs
97
+ 0.324 0.007 0.209 0.257 0.440 0.414 0.490 0.061 0.800 0.027 0.538
98
+ INFO: 2024-11-26 20:25:43,772: llmtf.base.ruparam: Processing Dataset: 303.34s
99
+ INFO: 2024-11-26 20:25:43,788: llmtf.base.ruparam: Results for ruparam:
100
+ INFO: 2024-11-26 20:25:44,038: llmtf.base.ruparam: {'acc': 0.21363220494053065}
101
+ INFO: 2024-11-26 20:25:44,053: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [145111]
102
+ INFO: 2024-11-26 20:25:44,053: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['<|im_end|>']
103
+ INFO: 2024-11-26 20:25:47,874: llmtf.base.shlepa/movie_mc: Loading Dataset: 3.82s
104
+ INFO: 2024-11-26 20:26:05,103: llmtf.base.shlepa/movie_mc: Processing Dataset: 17.22s
105
+ INFO: 2024-11-26 20:26:05,119: llmtf.base.shlepa/movie_mc: Results for shlepa/movie_mc:
106
+ INFO: 2024-11-26 20:26:05,122: llmtf.base.shlepa/movie_mc: {'acc': 0.22453703703703703}
107
+ INFO: 2024-11-26 20:26:05,130: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [145111]
108
+ INFO: 2024-11-26 20:26:05,130: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['<|im_end|>']
109
+ INFO: 2024-11-26 20:26:08,736: llmtf.base.shlepa/music_mc: Loading Dataset: 3.60s
110
+ INFO: 2024-11-26 20:26:26,413: llmtf.base.shlepa/music_mc: Processing Dataset: 17.68s
111
+ INFO: 2024-11-26 20:26:26,416: llmtf.base.shlepa/music_mc: Results for shlepa/music_mc:
112
+ INFO: 2024-11-26 20:26:26,435: llmtf.base.shlepa/music_mc: {'acc': 0.24468085106382978}
113
+ INFO: 2024-11-26 20:26:26,438: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [145111]
114
+ INFO: 2024-11-26 20:26:26,439: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['<|im_end|>']
115
+ INFO: 2024-11-26 20:26:31,058: llmtf.base.shlepa/law_mc: Loading Dataset: 4.62s
116
+ INFO: 2024-11-26 20:27:15,973: llmtf.base.shlepa/law_mc: Processing Dataset: 44.91s
117
+ INFO: 2024-11-26 20:27:15,980: llmtf.base.shlepa/law_mc: Results for shlepa/law_mc:
118
+ INFO: 2024-11-26 20:27:16,000: llmtf.base.shlepa/law_mc: {'acc': 0.537590113285273}
119
+ INFO: 2024-11-26 20:27:16,006: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [145111]
120
+ INFO: 2024-11-26 20:27:16,006: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['<|im_end|>']
121
+ INFO: 2024-11-26 20:27:19,668: llmtf.base.shlepa/books_mc: Loading Dataset: 3.66s
122
+ INFO: 2024-11-26 20:27:39,200: llmtf.base.shlepa/books_mc: Processing Dataset: 19.53s
123
+ INFO: 2024-11-26 20:27:39,204: llmtf.base.shlepa/books_mc: Results for shlepa/books_mc:
124
+ INFO: 2024-11-26 20:27:39,209: llmtf.base.shlepa/books_mc: {'acc': 0.3112033195020747}
125
+ INFO: 2024-11-26 20:27:39,212: llmtf.base.evaluator: Ended eval
126
+ INFO: 2024-11-26 20:27:39,231: llmtf.base.evaluator:
127
+ mean NEREL daru/treewayabstractive darumeru/MultiQ darumeru/PARus darumeru/RCB darumeru/RWSD darumeru/USE darumeru/cp_para_ru ruopinionne ruparam shlepa/books_mc shlepa/law_mc shlepa/movie_mc shlepa/music_mc vikhrmodels/habr_qa_sbs
128
+ 0.318 0.007 0.209 0.257 0.440 0.414 0.490 0.061 0.800 0.027 0.214 0.311 0.538 0.225 0.245 0.538
129
+ INFO: 2024-11-26 20:28:00,389: llmtf.base.nlpcoreteam/enMMLU: Processing Dataset: 487.59s
130
+ INFO: 2024-11-26 20:28:00,396: llmtf.base.nlpcoreteam/enMMLU: Results for nlpcoreteam/enMMLU:
131
+ INFO: 2024-11-26 20:28:00,445: llmtf.base.nlpcoreteam/enMMLU: metric
132
  subject
133
+ abstract_algebra 0.310000
134
+ anatomy 0.518519
135
+ astronomy 0.717105
136
+ business_ethics 0.630000
137
+ clinical_knowledge 0.667925
138
+ college_biology 0.680556
139
+ college_chemistry 0.400000
140
+ college_computer_science 0.510000
141
+ college_mathematics 0.270000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  college_medicine 0.618497
143
+ college_physics 0.558824
144
+ computer_security 0.750000
145
+ conceptual_physics 0.561702
146
+ econometrics 0.438596
147
+ electrical_engineering 0.579310
148
+ elementary_mathematics 0.455026
149
+ formal_logic 0.412698
150
+ global_facts 0.250000
151
+ high_school_biology 0.738710
152
+ high_school_chemistry 0.541872
153
+ high_school_computer_science 0.650000
154
+ high_school_european_history 0.751515
155
+ high_school_geography 0.767677
156
  high_school_government_and_politics 0.808290
157
+ high_school_macroeconomics 0.638462
158
+ high_school_mathematics 0.433333
159
+ high_school_microeconomics 0.676471
160
+ high_school_physics 0.377483
161
+ high_school_psychology 0.814679
162
+ high_school_statistics 0.527778
163
+ high_school_us_history 0.715686
164
+ high_school_world_history 0.767932
165
+ human_aging 0.641256
166
+ human_sexuality 0.679389
167
+ international_law 0.735537
168
+ jurisprudence 0.777778
169
+ logical_fallacies 0.760736
170
+ machine_learning 0.455357
171
+ management 0.757282
172
+ marketing 0.837607
173
  medical_genetics 0.690000
174
+ miscellaneous 0.713921
175
+ moral_disputes 0.650289
176
+ moral_scenarios 0.243575
177
+ nutrition 0.663399
178
  philosophy 0.668810
179
+ prehistory 0.682099
180
+ professional_accounting 0.489362
181
+ professional_law 0.418514
182
+ professional_medicine 0.599265
183
+ professional_psychology 0.588235
184
+ public_relations 0.600000
185
+ security_studies 0.689796
186
+ sociology 0.786070
187
+ us_foreign_policy 0.780000
188
  virology 0.463855
189
+ world_religions 0.789474
190
+ INFO: 2024-11-26 20:28:00,453: llmtf.base.nlpcoreteam/enMMLU: metric
191
+ subject
192
+ STEM 0.528725
193
+ humanities 0.644203
194
+ other (business, health, misc.) 0.610063
195
+ social sciences 0.688972
196
+ INFO: 2024-11-26 20:28:00,464: llmtf.base.nlpcoreteam/enMMLU: {'acc': 0.6179910195383496}
197
+ INFO: 2024-11-26 20:28:00,501: llmtf.base.evaluator: Ended eval
198
+ INFO: 2024-11-26 20:28:00,524: llmtf.base.evaluator:
199
+ mean NEREL daru/treewayabstractive darumeru/MultiQ darumeru/PARus darumeru/RCB darumeru/RWSD darumeru/USE darumeru/cp_para_ru nlpcoreteam/enMMLU ruopinionne ruparam shlepa/books_mc shlepa/law_mc shlepa/movie_mc shlepa/music_mc vikhrmodels/habr_qa_sbs
200
+ 0.337 0.007 0.209 0.257 0.440 0.414 0.490 0.061 0.800 0.618 0.027 0.214 0.311 0.538 0.225 0.245 0.538
201
+ INFO: 2024-11-26 20:28:16,900: llmtf.base.nlpcoreteam/ruMMLU: Processing Dataset: 501.69s
202
+ INFO: 2024-11-26 20:28:16,902: llmtf.base.nlpcoreteam/ruMMLU: Results for nlpcoreteam/ruMMLU:
203
+ INFO: 2024-11-26 20:28:16,950: llmtf.base.nlpcoreteam/ruMMLU: metric
204
+ subject
205
+ abstract_algebra 0.320000
206
+ anatomy 0.414815
207
+ astronomy 0.572368
208
+ business_ethics 0.450000
209
+ clinical_knowledge 0.505660
210
+ college_biology 0.375000
211
+ college_chemistry 0.310000
212
+ college_computer_science 0.380000
213
+ college_mathematics 0.350000
214
+ college_medicine 0.508671
215
+ college_physics 0.431373
216
+ computer_security 0.530000
217
+ conceptual_physics 0.429787
218
+ econometrics 0.298246
219
+ electrical_engineering 0.448276
220
+ elementary_mathematics 0.417989
221
+ formal_logic 0.365079
222
+ global_facts 0.240000
223
+ high_school_biology 0.487097
224
+ high_school_chemistry 0.443350
225
+ high_school_computer_science 0.530000
226
+ high_school_european_history 0.654545
227
+ high_school_geography 0.525253
228
+ high_school_government_and_politics 0.481865
229
+ high_school_macroeconomics 0.430769
230
+ high_school_mathematics 0.392593
231
+ high_school_microeconomics 0.441176
232
+ high_school_physics 0.291391
233
+ high_school_psychology 0.572477
234
+ high_school_statistics 0.416667
235
+ high_school_us_history 0.495098
236
+ high_school_world_history 0.632911
237
+ human_aging 0.488789
238
+ human_sexuality 0.496183
239
+ international_law 0.677686
240
+ jurisprudence 0.574074
241
+ logical_fallacies 0.441718
242
+ machine_learning 0.321429
243
+ management 0.563107
244
+ marketing 0.722222
245
+ medical_genetics 0.470000
246
+ miscellaneous 0.536398
247
+ moral_disputes 0.517341
248
+ moral_scenarios 0.237989
249
+ nutrition 0.526144
250
+ philosophy 0.546624
251
+ prehistory 0.478395
252
+ professional_accounting 0.365248
253
+ professional_law 0.331160
254
+ professional_medicine 0.367647
255
+ professional_psychology 0.415033
256
+ public_relations 0.454545
257
+ security_studies 0.595918
258
+ sociology 0.631841
259
+ us_foreign_policy 0.710000
260
+ virology 0.409639
261
+ world_religions 0.549708
262
+ INFO: 2024-11-26 20:28:16,959: llmtf.base.nlpcoreteam/ruMMLU: metric
263
  subject
264
+ STEM 0.413740
265
+ humanities 0.500179
266
+ other (business, health, misc.) 0.469167
267
+ social sciences 0.504442
268
+ INFO: 2024-11-26 20:28:16,967: llmtf.base.nlpcoreteam/ruMMLU: {'acc': 0.47188210698069283}
269
+ INFO: 2024-11-26 20:28:17,010: llmtf.base.evaluator: Ended eval
270
+ INFO: 2024-11-26 20:28:17,020: llmtf.base.evaluator:
271
+ mean NEREL daru/treewayabstractive darumeru/MultiQ darumeru/PARus darumeru/RCB darumeru/RWSD darumeru/USE darumeru/cp_para_ru nlpcoreteam/enMMLU nlpcoreteam/ruMMLU ruopinionne ruparam shlepa/books_mc shlepa/law_mc shlepa/movie_mc shlepa/music_mc vikhrmodels/habr_qa_sbs
272
+ 0.345 0.007 0.209 0.257 0.440 0.414 0.490 0.061 0.800 0.618 0.472 0.027 0.214 0.311 0.538 0.225 0.245 0.538
llmtf_eval/evaluation_results.txt CHANGED
@@ -1,2 +1,2 @@
1
- mean daru/treewayabstractive daru/treewayextractive darumeru/MultiQ darumeru/PARus darumeru/RCB darumeru/RWSD darumeru/cp_para_ru darumeru/ruOpenBookQA darumeru/ruWorldTree nlpcoreteam/enMMLU nlpcoreteam/ruMMLU
2
- 0.404 0.202 0.378 0.159 0.240 0.433 0.510 0.050 0.616 0.772 0.612 0.473
 
1
+ mean NEREL daru/treewayabstractive darumeru/MultiQ darumeru/PARus darumeru/RCB darumeru/RWSD darumeru/USE darumeru/cp_para_ru llm_as_judge nlpcoreteam/enMMLU nlpcoreteam/ruMMLU ruopinionne ruparam shlepa/books_mc shlepa/law_mc shlepa/movie_mc shlepa/music_mc vikhrmodels/habr_qa_sbs
2
+ 0.349 0.007 0.209 0.257 0.440 0.414 0.490 0.061 0.800 0.420 0.618 0.472 0.027 0.214 0.311 0.538 0.225 0.245 0.538
llmtf_eval/llm_as_judge.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
llmtf_eval/llm_as_judge_params.jsonl ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "custom_generation_config": null,
3
+ "model_params": {
4
+ "model_name_or_path": "/workdir/models/Qwen2.5-72B-Instruct",
5
+ "api_base": "http://10.36.60.73:3040",
6
+ "generation_config": {
7
+ "max_new_tokens": 200,
8
+ "temperature": 0.0,
9
+ "top_k": 40,
10
+ "top_p": 0.9,
11
+ "transformers_version": "4.45.2"
12
+ },
13
+ "max_model_len": 32000
14
+ },
15
+ "task_params": {
16
+ "max_len": 4000,
17
+ "few_shot_count": 0,
18
+ "batch_size": 256,
19
+ "max_sample_per_dataset": 10000000000,
20
+ "method": "calculate_tokens_proba"
21
+ }
22
+ }
llmtf_eval/llm_as_judge_total.jsonl ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "task_name": "llm_as_judge",
3
+ "results": {
4
+ "score": 0.42
5
+ },
6
+ "leaderboard_result": 0.42
7
+ }
llmtf_eval/nlpcoreteam_enMMLU.jsonl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fc82cd8c8f64da35f0862509ddfd17217e9fb107f450d2e3e2807f69ff6bc80c
3
- size 37142060
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d39638678b135a0ac575407993d8d413c727aa2d94d2de0aca2520744a2819ea
3
+ size 37122790
llmtf_eval/nlpcoreteam_enMMLU_params.jsonl CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "custom_generation_config": null,
3
  "model_params": {
4
- "model_name_or_path": "/scratch/tikhomirov/workdir/data/models/ruadapt_qwen2.5_1.5B_ext_u48_part1-2_lr5e4_bs128_reinit_peft_mlp_32_128_lm_head_mts_1e4_as1.5_nm_pv21/simpo2",
5
  "generation_config": {
6
  "bos_token_id": 145109,
7
  "do_sample": true,
@@ -36,7 +36,7 @@
36
  },
37
  "load_in_8bit": false,
38
  "torch_dtype": "auto",
39
- "use_flash_attention_2": true,
40
  "device_map": "cuda:0",
41
  "use_fast_tokenizer": true,
42
  "leading_space": false,
@@ -47,7 +47,7 @@
47
  "task_params": {
48
  "max_len": 4000,
49
  "few_shot_count": 0,
50
- "batch_size": 2,
51
  "max_sample_per_dataset": 10000000000000,
52
  "method": "calculate_tokens_proba"
53
  }
 
1
  {
2
  "custom_generation_config": null,
3
  "model_params": {
4
+ "model_name_or_path": "/scratch/tikhomirov/workdir/data/models/ruadapt_qwen2.5_1.5B_ext_u48_part1-2_lr5e4_bs128_reinit_peft_mlp_32_128_lm_head_mts_1e4_as1.75_v1/simpo2",
5
  "generation_config": {
6
  "bos_token_id": 145109,
7
  "do_sample": true,
 
36
  },
37
  "load_in_8bit": false,
38
  "torch_dtype": "auto",
39
+ "attn_implementation": "flash_attention_2",
40
  "device_map": "cuda:0",
41
  "use_fast_tokenizer": true,
42
  "leading_space": false,
 
47
  "task_params": {
48
  "max_len": 4000,
49
  "few_shot_count": 0,
50
+ "batch_size": 16,
51
  "max_sample_per_dataset": 10000000000000,
52
  "method": "calculate_tokens_proba"
53
  }
llmtf_eval/nlpcoreteam_enMMLU_total.jsonl CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "task_name": "nlpcoreteam/enMMLU",
3
  "results": {
4
- "acc": 0.6121795307919802
5
  },
6
- "leaderboard_result": 0.6121795307919802
7
  }
 
1
  {
2
  "task_name": "nlpcoreteam/enMMLU",
3
  "results": {
4
+ "acc": 0.6179910195383496
5
  },
6
+ "leaderboard_result": 0.6179910195383496
7
  }
llmtf_eval/nlpcoreteam_ruMMLU.jsonl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c07ebf724c9c50407ac459026f3c3514cc89efab52531b4f6c15c9002046f0ec
3
- size 43069976
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8a67a489efec02331bf36f2d61f632f8e982c340cfc1657e5ff3aa9ccc82849
3
+ size 43039512
llmtf_eval/nlpcoreteam_ruMMLU_params.jsonl CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "custom_generation_config": null,
3
  "model_params": {
4
- "model_name_or_path": "/scratch/tikhomirov/workdir/data/models/ruadapt_qwen2.5_1.5B_ext_u48_part1-2_lr5e4_bs128_reinit_peft_mlp_32_128_lm_head_mts_1e4_as1.5_nm_pv21/simpo2",
5
  "generation_config": {
6
  "bos_token_id": 145109,
7
  "do_sample": true,
@@ -36,7 +36,7 @@
36
  },
37
  "load_in_8bit": false,
38
  "torch_dtype": "auto",
39
- "use_flash_attention_2": true,
40
  "device_map": "cuda:0",
41
  "use_fast_tokenizer": true,
42
  "leading_space": false,
@@ -47,7 +47,7 @@
47
  "task_params": {
48
  "max_len": 4000,
49
  "few_shot_count": 0,
50
- "batch_size": 2,
51
  "max_sample_per_dataset": 10000000000000,
52
  "method": "calculate_tokens_proba"
53
  }
 
1
  {
2
  "custom_generation_config": null,
3
  "model_params": {
4
+ "model_name_or_path": "/scratch/tikhomirov/workdir/data/models/ruadapt_qwen2.5_1.5B_ext_u48_part1-2_lr5e4_bs128_reinit_peft_mlp_32_128_lm_head_mts_1e4_as1.75_v1/simpo2",
5
  "generation_config": {
6
  "bos_token_id": 145109,
7
  "do_sample": true,
 
36
  },
37
  "load_in_8bit": false,
38
  "torch_dtype": "auto",
39
+ "attn_implementation": "flash_attention_2",
40
  "device_map": "cuda:0",
41
  "use_fast_tokenizer": true,
42
  "leading_space": false,
 
47
  "task_params": {
48
  "max_len": 4000,
49
  "few_shot_count": 0,
50
+ "batch_size": 16,
51
  "max_sample_per_dataset": 10000000000000,
52
  "method": "calculate_tokens_proba"
53
  }
llmtf_eval/nlpcoreteam_ruMMLU_total.jsonl CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "task_name": "nlpcoreteam/ruMMLU",
3
  "results": {
4
- "acc": 0.47346768524963356
5
  },
6
- "leaderboard_result": 0.47346768524963356
7
  }
 
1
  {
2
  "task_name": "nlpcoreteam/ruMMLU",
3
  "results": {
4
+ "acc": 0.47188210698069283
5
  },
6
+ "leaderboard_result": 0.47188210698069283
7
  }
llmtf_eval/ruopinionne.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
llmtf_eval/ruopinionne_params.jsonl ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "custom_generation_config": null,
3
+ "model_params": {
4
+ "model_name_or_path": "/scratch/tikhomirov/workdir/data/models/ruadapt_qwen2.5_1.5B_ext_u48_part1-2_lr5e4_bs128_reinit_peft_mlp_32_128_lm_head_mts_1e4_as1.75_v1/simpo2",
5
+ "generation_config": {
6
+ "bos_token_id": 145109,
7
+ "do_sample": true,
8
+ "eos_token_id": [
9
+ 145111
10
+ ],
11
+ "max_length": 32768,
12
+ "max_new_tokens": 256,
13
+ "pad_token_id": 145109,
14
+ "stop_strings": [
15
+ "<|im_end|>"
16
+ ],
17
+ "temperature": 0.1,
18
+ "top_k": 40,
19
+ "top_p": 0.9,
20
+ "transformers_version": "4.45.2",
21
+ "trust_remote_code": false
22
+ },
23
+ "conversation_template": {
24
+ "system_prompt": "",
25
+ "system_message_template": "<|im_start|>{role}\n{content}<|im_end|>\n",
26
+ "user_message_template": "<|im_start|>{role}\n{content}<|im_end|>\n",
27
+ "bot_message_template": "<|im_start|>{role}\n{content}<|im_end|>\n",
28
+ "bot_message_template_incomplete": "<|im_start|>{role}\n{content}",
29
+ "user_role": "user",
30
+ "bot_role": "assistant",
31
+ "system_role": "system",
32
+ "global_prefix": "",
33
+ "suffix": "<|im_start|>assistant\n",
34
+ "add_special_tokens": false,
35
+ "eos_token": "<|im_end|>"
36
+ },
37
+ "load_in_8bit": false,
38
+ "torch_dtype": "auto",
39
+ "attn_implementation": "flash_attention_2",
40
+ "device_map": "cuda:0",
41
+ "use_fast_tokenizer": true,
42
+ "leading_space": false,
43
+ "space_token": null,
44
+ "trust_remote_code": false,
45
+ "max_model_len": 32768
46
+ },
47
+ "task_params": {
48
+ "max_len": 4000,
49
+ "few_shot_count": 0,
50
+ "batch_size": 16,
51
+ "max_sample_per_dataset": 200,
52
+ "method": "generate"
53
+ }
54
+ }
llmtf_eval/ruopinionne_total.jsonl ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "task_name": "ruopinionne",
3
+ "results": {
4
+ "f1": 0.02701209922104298
5
+ },
6
+ "leaderboard_result": 0.02701209922104298
7
+ }
llmtf_eval/ruparam.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d45cd9b4dd24149a1964646b1c3f49d2eae1fa1f73f313a0bb433aac98a22973
3
+ size 12306566
llmtf_eval/ruparam_params.jsonl ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "custom_generation_config": null,
3
+ "model_params": {
4
+ "model_name_or_path": "/scratch/tikhomirov/workdir/data/models/ruadapt_qwen2.5_1.5B_ext_u48_part1-2_lr5e4_bs128_reinit_peft_mlp_32_128_lm_head_mts_1e4_as1.75_v1/simpo2",
5
+ "generation_config": {
6
+ "bos_token_id": 145109,
7
+ "do_sample": true,
8
+ "eos_token_id": [
9
+ 145111
10
+ ],
11
+ "max_length": 32768,
12
+ "max_new_tokens": 1,
13
+ "pad_token_id": 145109,
14
+ "stop_strings": [
15
+ "<|im_end|>"
16
+ ],
17
+ "temperature": 0.1,
18
+ "top_k": 40,
19
+ "top_p": 0.9,
20
+ "transformers_version": "4.45.2",
21
+ "trust_remote_code": false
22
+ },
23
+ "conversation_template": {
24
+ "system_prompt": "",
25
+ "system_message_template": "<|im_start|>{role}\n{content}<|im_end|>\n",
26
+ "user_message_template": "<|im_start|>{role}\n{content}<|im_end|>\n",
27
+ "bot_message_template": "<|im_start|>{role}\n{content}<|im_end|>\n",
28
+ "bot_message_template_incomplete": "<|im_start|>{role}\n{content}",
29
+ "user_role": "user",
30
+ "bot_role": "assistant",
31
+ "system_role": "system",
32
+ "global_prefix": "",
33
+ "suffix": "<|im_start|>assistant\n",
34
+ "add_special_tokens": false,
35
+ "eos_token": "<|im_end|>"
36
+ },
37
+ "load_in_8bit": false,
38
+ "torch_dtype": "auto",
39
+ "attn_implementation": "flash_attention_2",
40
+ "device_map": "cuda:0",
41
+ "use_fast_tokenizer": true,
42
+ "leading_space": false,
43
+ "space_token": null,
44
+ "trust_remote_code": false,
45
+ "max_model_len": 32768
46
+ },
47
+ "task_params": {
48
+ "max_len": 4000,
49
+ "few_shot_count": 0,
50
+ "batch_size": 16,
51
+ "max_sample_per_dataset": 5000,
52
+ "method": "calculate_tokens_proba"
53
+ }
54
+ }
llmtf_eval/ruparam_total.jsonl ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "task_name": "ruparam",
3
+ "results": {
4
+ "acc": 0.21363220494053065
5
+ },
6
+ "leaderboard_result": 0.21363220494053065
7
+ }
llmtf_eval/shlepa_books_mc.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
llmtf_eval/shlepa_books_mc_params.jsonl ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "custom_generation_config": null,
3
+ "model_params": {
4
+ "model_name_or_path": "/scratch/tikhomirov/workdir/data/models/ruadapt_qwen2.5_1.5B_ext_u48_part1-2_lr5e4_bs128_reinit_peft_mlp_32_128_lm_head_mts_1e4_as1.75_v1/simpo2",
5
+ "generation_config": {
6
+ "bos_token_id": 145109,
7
+ "do_sample": true,
8
+ "eos_token_id": [
9
+ 145111
10
+ ],
11
+ "max_length": 32768,
12
+ "max_new_tokens": 1,
13
+ "pad_token_id": 145109,
14
+ "stop_strings": [
15
+ "<|im_end|>"
16
+ ],
17
+ "temperature": 0.1,
18
+ "top_k": 40,
19
+ "top_p": 0.9,
20
+ "transformers_version": "4.45.2",
21
+ "trust_remote_code": false
22
+ },
23
+ "conversation_template": {
24
+ "system_prompt": "",
25
+ "system_message_template": "<|im_start|>{role}\n{content}<|im_end|>\n",
26
+ "user_message_template": "<|im_start|>{role}\n{content}<|im_end|>\n",
27
+ "bot_message_template": "<|im_start|>{role}\n{content}<|im_end|>\n",
28
+ "bot_message_template_incomplete": "<|im_start|>{role}\n{content}",
29
+ "user_role": "user",
30
+ "bot_role": "assistant",
31
+ "system_role": "system",
32
+ "global_prefix": "",
33
+ "suffix": "<|im_start|>assistant\n",
34
+ "add_special_tokens": false,
35
+ "eos_token": "<|im_end|>"
36
+ },
37
+ "load_in_8bit": false,
38
+ "torch_dtype": "auto",
39
+ "attn_implementation": "flash_attention_2",
40
+ "device_map": "cuda:0",
41
+ "use_fast_tokenizer": true,
42
+ "leading_space": false,
43
+ "space_token": null,
44
+ "trust_remote_code": false,
45
+ "max_model_len": 32768
46
+ },
47
+ "task_params": {
48
+ "max_len": 4000,
49
+ "few_shot_count": 0,
50
+ "batch_size": 16,
51
+ "max_sample_per_dataset": 5000,
52
+ "method": "calculate_tokens_proba"
53
+ }
54
+ }
llmtf_eval/shlepa_books_mc_total.jsonl ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "task_name": "shlepa/books_mc",
3
+ "results": {
4
+ "acc": 0.3112033195020747
5
+ },
6
+ "leaderboard_result": 0.3112033195020747
7
+ }
llmtf_eval/shlepa_law_mc.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
llmtf_eval/shlepa_law_mc_params.jsonl ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "custom_generation_config": null,
3
+ "model_params": {
4
+ "model_name_or_path": "/scratch/tikhomirov/workdir/data/models/ruadapt_qwen2.5_1.5B_ext_u48_part1-2_lr5e4_bs128_reinit_peft_mlp_32_128_lm_head_mts_1e4_as1.75_v1/simpo2",
5
+ "generation_config": {
6
+ "bos_token_id": 145109,
7
+ "do_sample": true,
8
+ "eos_token_id": [
9
+ 145111
10
+ ],
11
+ "max_length": 32768,
12
+ "max_new_tokens": 1,
13
+ "pad_token_id": 145109,
14
+ "stop_strings": [
15
+ "<|im_end|>"
16
+ ],
17
+ "temperature": 0.1,
18
+ "top_k": 40,
19
+ "top_p": 0.9,
20
+ "transformers_version": "4.45.2",
21
+ "trust_remote_code": false
22
+ },
23
+ "conversation_template": {
24
+ "system_prompt": "",
25
+ "system_message_template": "<|im_start|>{role}\n{content}<|im_end|>\n",
26
+ "user_message_template": "<|im_start|>{role}\n{content}<|im_end|>\n",
27
+ "bot_message_template": "<|im_start|>{role}\n{content}<|im_end|>\n",
28
+ "bot_message_template_incomplete": "<|im_start|>{role}\n{content}",
29
+ "user_role": "user",
30
+ "bot_role": "assistant",
31
+ "system_role": "system",
32
+ "global_prefix": "",
33
+ "suffix": "<|im_start|>assistant\n",
34
+ "add_special_tokens": false,
35
+ "eos_token": "<|im_end|>"
36
+ },
37
+ "load_in_8bit": false,
38
+ "torch_dtype": "auto",
39
+ "attn_implementation": "flash_attention_2",
40
+ "device_map": "cuda:0",
41
+ "use_fast_tokenizer": true,
42
+ "leading_space": false,
43
+ "space_token": null,
44
+ "trust_remote_code": false,
45
+ "max_model_len": 32768
46
+ },
47
+ "task_params": {
48
+ "max_len": 4000,
49
+ "few_shot_count": 0,
50
+ "batch_size": 16,
51
+ "max_sample_per_dataset": 5000,
52
+ "method": "calculate_tokens_proba"
53
+ }
54
+ }
llmtf_eval/shlepa_law_mc_total.jsonl ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "task_name": "shlepa/law_mc",
3
+ "results": {
4
+ "acc": 0.537590113285273
5
+ },
6
+ "leaderboard_result": 0.537590113285273
7
+ }
llmtf_eval/shlepa_movie_mc.jsonl ADDED
The diff for this file is too large to render. See raw diff