Muennighoff committed
Commit
a1243c9
1 Parent(s): 6603402
This view is limited to 50 files because it contains too many changes. See the raw diff for the full changeset.
Files changed (50)
  1. 4b284b12bc4/evaluation/4b284b12bc4_0.json +87 -0
  2. 4b284b12bc4/evaluation/4b284b12bc4_0_lm-eval_global_step80108_2023-01-30-11-23-34_0shots_backup.json +87 -0
  3. 4b284b12bc4/evaluation/4b284b12bc4_1.json +87 -0
  4. 4b284b12bc4/evaluation/4b284b12bc4_1_lm-eval_global_step80108_2023-01-30-11-26-32_1shots_backup.json +87 -0
  5. 4b284b12bc4/evaluation/4b284b12bc4_2.json +87 -0
  6. 4b284b12bc4/evaluation/4b284b12bc4_2_lm-eval_global_step80108_2023-01-30-11-26-32_2shots_backup.json +87 -0
  7. 4b284b12bc4/evaluation/4b284b12bc4_3.json +87 -0
  8. 4b284b12bc4/evaluation/4b284b12bc4_3_lm-eval_global_step80108_2023-01-30-11-26-31_3shots_backup.json +87 -0
  9. 4b284b12bc4/evaluation/4b284b12bc4_4.json +87 -0
  10. 4b284b12bc4/evaluation/4b284b12bc4_4_lm-eval_global_step80108_2023-01-30-11-26-32_4shots_backup.json +87 -0
  11. 4b284b12bc4/evaluation/4b284b12bc4_5.json +66 -0
  12. 4b284b12bc4/evaluation/4b284b12bc4_5_lm-eval_global_step80108_2023-01-30-11-26-32_5shots_backup.json +66 -0
  13. 4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_0.json +133 -0
  14. 4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_1.json +133 -0
  15. 4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_2.json +133 -0
  16. 4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_3.json +133 -0
  17. 4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_4.json +133 -0
  18. 4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_5.json +133 -0
  19. 4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_0.json +133 -0
  20. 4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_1.json +133 -0
  21. 4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_2.json +133 -0
  22. 4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_3.json +133 -0
  23. 4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_4.json +133 -0
  24. 4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_5.json +133 -0
  25. 4b284b12bc4/evaluation/generation/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json +133 -0
  26. 4b284b12bc4/evaluation/generation/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json +133 -0
  27. 4b284b12bc4/evaluation/generation/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json +133 -0
  28. 4b284b12bc4/evaluation/generation/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json +133 -0
  29. 4b284b12bc4/evaluation/generation/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json +133 -0
  30. 4b284b12bc4/evaluation/generation/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json +133 -0
  31. 4b284b12bc4/evaluation/generation/slim.4b284b12bc4_gem_xsum_article_DOC_summary_0.json +133 -0
  32. 4b284b12bc4/evaluation/generation/slim.4b284b12bc4_gem_xsum_article_DOC_summary_1.json +133 -0
  33. 4b284b12bc4/evaluation/generation/slim.4b284b12bc4_gem_xsum_article_DOC_summary_2.json +133 -0
  34. 4b284b12bc4/evaluation/generation/slim.4b284b12bc4_gem_xsum_article_DOC_summary_3.json +133 -0
  35. 4b284b12bc4/evaluation/generation/slim.4b284b12bc4_gem_xsum_article_DOC_summary_4.json +133 -0
  36. 4b284b12bc4/evaluation/generation/slim.4b284b12bc4_gem_xsum_article_DOC_summary_5.json +133 -0
  37. 4b284b17bc4/evaluation/4b284b17bc4_0.json +87 -0
  38. 4b284b17bc4/evaluation/4b284b17bc4_0_lm-eval_global_step80108_2023-01-30-11-26-40_0shots_backup.json +87 -0
  39. 4b284b17bc4/evaluation/4b284b17bc4_1.json +87 -0
  40. 4b284b17bc4/evaluation/4b284b17bc4_1_lm-eval_global_step80108_2023-01-30-11-26-39_1shots_backup.json +87 -0
  41. 4b284b17bc4/evaluation/4b284b17bc4_2.json +87 -0
  42. 4b284b17bc4/evaluation/4b284b17bc4_2_lm-eval_global_step80108_2023-01-30-11-26-39_2shots_backup.json +87 -0
  43. 4b284b17bc4/evaluation/4b284b17bc4_3.json +87 -0
  44. 4b284b17bc4/evaluation/4b284b17bc4_3_lm-eval_global_step80108_2023-01-30-11-26-39_3shots_backup.json +87 -0
  45. 4b284b17bc4/evaluation/4b284b17bc4_4.json +87 -0
  46. 4b284b17bc4/evaluation/4b284b17bc4_4_lm-eval_global_step80108_2023-01-30-11-26-39_4shots_backup.json +87 -0
  47. 4b284b17bc4/evaluation/4b284b17bc4_5.json +73 -0
  48. 4b284b17bc4/evaluation/4b284b17bc4_5_lm-eval_global_step80108_2023-01-30-11-26-39_5shots_backup.json +73 -0
  49. 4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_0.json +133 -0
  50. 4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_1.json +133 -0
4b284b12bc4/evaluation/4b284b12bc4_0.json ADDED
@@ -0,0 +1,87 @@
+ {
+   "results": {
+     "anli_r1": {"acc": 0.335, "acc_stderr": 0.014933117490932575},
+     "anli_r2": {"acc": 0.334, "acc_stderr": 0.014922019523732961},
+     "anli_r3": {"acc": 0.3491666666666667, "acc_stderr": 0.013767075395077249},
+     "cb": {"acc": 0.39285714285714285, "acc_stderr": 0.0658538889806635, "f1": 0.23306878306878312},
+     "copa": {"acc": 0.77, "acc_stderr": 0.04229525846816506},
+     "hellaswag": {"acc": 0.4695279824736108, "acc_stderr": 0.0049805063294075845, "acc_norm": 0.6132244572794264, "acc_norm_stderr": 0.004860162076330956},
+     "rte": {"acc": 0.5812274368231047, "acc_stderr": 0.02969666108123484},
+     "winogrande": {"acc": 0.5753749013417522, "acc_stderr": 0.013891893150264218},
+     "storycloze_2016": {"acc": 0.711918760021379, "acc_stderr": 0.010472537019822578},
+     "boolq": {"acc": 0.5464831804281346, "acc_stderr": 0.008707182331111644},
+     "arc_easy": {"acc": 0.5538720538720538, "acc_stderr": 0.01020005782876501, "acc_norm": 0.4936868686868687, "acc_norm_stderr": 0.01025896566804443},
+     "arc_challenge": {"acc": 0.2636518771331058, "acc_stderr": 0.012875929151297049, "acc_norm": 0.2883959044368601, "acc_norm_stderr": 0.013238394422428175},
+     "sciq": {"acc": 0.82, "acc_stderr": 0.012155153135511965, "acc_norm": 0.749, "acc_norm_stderr": 0.013718133516888921},
+     "piqa": {"acc": 0.73449401523395, "acc_stderr": 0.010303308653024429, "acc_norm": 0.7475516866158868, "acc_norm_stderr": 0.010135665547362354}
+   },
+   "versions": {"anli_r1": 0, "anli_r2": 0, "anli_r3": 0, "cb": 1, "copa": 0, "hellaswag": 0, "rte": 0, "winogrande": 0, "storycloze_2016": 0, "boolq": 1, "arc_easy": 0, "arc_challenge": 0, "sciq": 0, "piqa": 0}
+ }
4b284b12bc4/evaluation/4b284b12bc4_0_lm-eval_global_step80108_2023-01-30-11-23-34_0shots_backup.json ADDED
@@ -0,0 +1,87 @@
+ {
+   "results": {
+     "anli_r1": {"acc": 0.335, "acc_stderr": 0.014933117490932575},
+     "anli_r2": {"acc": 0.334, "acc_stderr": 0.014922019523732961},
+     "anli_r3": {"acc": 0.3491666666666667, "acc_stderr": 0.013767075395077249},
+     "cb": {"acc": 0.39285714285714285, "acc_stderr": 0.0658538889806635, "f1": 0.23306878306878312},
+     "copa": {"acc": 0.77, "acc_stderr": 0.04229525846816506},
+     "hellaswag": {"acc": 0.4695279824736108, "acc_stderr": 0.0049805063294075845, "acc_norm": 0.6132244572794264, "acc_norm_stderr": 0.004860162076330956},
+     "rte": {"acc": 0.5812274368231047, "acc_stderr": 0.02969666108123484},
+     "winogrande": {"acc": 0.5753749013417522, "acc_stderr": 0.013891893150264218},
+     "storycloze_2016": {"acc": 0.711918760021379, "acc_stderr": 0.010472537019822578},
+     "boolq": {"acc": 0.5464831804281346, "acc_stderr": 0.008707182331111644},
+     "arc_easy": {"acc": 0.5538720538720538, "acc_stderr": 0.01020005782876501, "acc_norm": 0.4936868686868687, "acc_norm_stderr": 0.01025896566804443},
+     "arc_challenge": {"acc": 0.2636518771331058, "acc_stderr": 0.012875929151297049, "acc_norm": 0.2883959044368601, "acc_norm_stderr": 0.013238394422428175},
+     "sciq": {"acc": 0.82, "acc_stderr": 0.012155153135511965, "acc_norm": 0.749, "acc_norm_stderr": 0.013718133516888921},
+     "piqa": {"acc": 0.73449401523395, "acc_stderr": 0.010303308653024429, "acc_norm": 0.7475516866158868, "acc_norm_stderr": 0.010135665547362354}
+   },
+   "versions": {"anli_r1": 0, "anli_r2": 0, "anli_r3": 0, "cb": 1, "copa": 0, "hellaswag": 0, "rte": 0, "winogrande": 0, "storycloze_2016": 0, "boolq": 1, "arc_easy": 0, "arc_challenge": 0, "sciq": 0, "piqa": 0}
+ }
4b284b12bc4/evaluation/4b284b12bc4_1.json ADDED
@@ -0,0 +1,87 @@
+ {
+   "results": {
+     "anli_r1": {"acc": 0.333, "acc_stderr": 0.014910846164229868},
+     "anli_r2": {"acc": 0.326, "acc_stderr": 0.01483050720454104},
+     "anli_r3": {"acc": 0.3475, "acc_stderr": 0.013751753243291852},
+     "cb": {"acc": 0.5357142857142857, "acc_stderr": 0.06724777654937658, "f1": 0.37227304714989445},
+     "copa": {"acc": 0.79, "acc_stderr": 0.040936018074033256},
+     "hellaswag": {"acc": 0.47191794463254333, "acc_stderr": 0.004981905293878145, "acc_norm": 0.6139215295757817, "acc_norm_stderr": 0.004858539527872466},
+     "rte": {"acc": 0.5703971119133574, "acc_stderr": 0.029796668829124674},
+     "winogrande": {"acc": 0.5706393054459353, "acc_stderr": 0.013911537499969163},
+     "storycloze_2016": {"acc": 0.7151256012827365, "acc_stderr": 0.01043751398661172},
+     "boolq": {"acc": 0.5669724770642202, "acc_stderr": 0.00866625130551806},
+     "arc_easy": {"acc": 0.5913299663299664, "acc_stderr": 0.010087174498762883, "acc_norm": 0.5496632996632996, "acc_norm_stderr": 0.010209047724374145},
+     "arc_challenge": {"acc": 0.2627986348122867, "acc_stderr": 0.012862523175351333, "acc_norm": 0.30716723549488056, "acc_norm_stderr": 0.013481034054980943},
+     "sciq": {"acc": 0.836, "acc_stderr": 0.011715000693181331, "acc_norm": 0.781, "acc_norm_stderr": 0.013084731950262012},
+     "piqa": {"acc": 0.7448313384113167, "acc_stderr": 0.010171571592521822, "acc_norm": 0.7535364526659413, "acc_norm_stderr": 0.01005481078967181}
+   },
+   "versions": {"anli_r1": 0, "anli_r2": 0, "anli_r3": 0, "cb": 1, "copa": 0, "hellaswag": 0, "rte": 0, "winogrande": 0, "storycloze_2016": 0, "boolq": 1, "arc_easy": 0, "arc_challenge": 0, "sciq": 0, "piqa": 0}
+ }
4b284b12bc4/evaluation/4b284b12bc4_1_lm-eval_global_step80108_2023-01-30-11-26-32_1shots_backup.json ADDED
@@ -0,0 +1,87 @@
+ {
+   "results": {
+     "anli_r1": {"acc": 0.333, "acc_stderr": 0.014910846164229868},
+     "anli_r2": {"acc": 0.326, "acc_stderr": 0.01483050720454104},
+     "anli_r3": {"acc": 0.3475, "acc_stderr": 0.013751753243291852},
+     "cb": {"acc": 0.5357142857142857, "acc_stderr": 0.06724777654937658, "f1": 0.37227304714989445},
+     "copa": {"acc": 0.79, "acc_stderr": 0.040936018074033256},
+     "hellaswag": {"acc": 0.47191794463254333, "acc_stderr": 0.004981905293878145, "acc_norm": 0.6139215295757817, "acc_norm_stderr": 0.004858539527872466},
+     "rte": {"acc": 0.5703971119133574, "acc_stderr": 0.029796668829124674},
+     "winogrande": {"acc": 0.5706393054459353, "acc_stderr": 0.013911537499969163},
+     "storycloze_2016": {"acc": 0.7151256012827365, "acc_stderr": 0.01043751398661172},
+     "boolq": {"acc": 0.5669724770642202, "acc_stderr": 0.00866625130551806},
+     "arc_easy": {"acc": 0.5913299663299664, "acc_stderr": 0.010087174498762883, "acc_norm": 0.5496632996632996, "acc_norm_stderr": 0.010209047724374145},
+     "arc_challenge": {"acc": 0.2627986348122867, "acc_stderr": 0.012862523175351333, "acc_norm": 0.30716723549488056, "acc_norm_stderr": 0.013481034054980943},
+     "sciq": {"acc": 0.836, "acc_stderr": 0.011715000693181331, "acc_norm": 0.781, "acc_norm_stderr": 0.013084731950262012},
+     "piqa": {"acc": 0.7448313384113167, "acc_stderr": 0.010171571592521822, "acc_norm": 0.7535364526659413, "acc_norm_stderr": 0.01005481078967181}
+   },
+   "versions": {"anli_r1": 0, "anli_r2": 0, "anli_r3": 0, "cb": 1, "copa": 0, "hellaswag": 0, "rte": 0, "winogrande": 0, "storycloze_2016": 0, "boolq": 1, "arc_easy": 0, "arc_challenge": 0, "sciq": 0, "piqa": 0}
+ }
4b284b12bc4/evaluation/4b284b12bc4_2.json ADDED
@@ -0,0 +1,87 @@
+ {
+   "results": {
+     "anli_r1": {"acc": 0.329, "acc_stderr": 0.014865395385928354},
+     "anli_r2": {"acc": 0.336, "acc_stderr": 0.014944140233795027},
+     "anli_r3": {"acc": 0.3383333333333333, "acc_stderr": 0.013664144006618266},
+     "cb": {"acc": 0.48214285714285715, "acc_stderr": 0.06737697508644648, "f1": 0.3338011695906433},
+     "copa": {"acc": 0.79, "acc_stderr": 0.040936018074033256},
+     "hellaswag": {"acc": 0.4697271459868552, "acc_stderr": 0.004980627287147585, "acc_norm": 0.6141206930890261, "acc_norm_stderr": 0.004858074013443988},
+     "rte": {"acc": 0.5523465703971119, "acc_stderr": 0.02993107036293953},
+     "winogrande": {"acc": 0.574585635359116, "acc_stderr": 0.013895257666646378},
+     "storycloze_2016": {"acc": 0.7156600748262961, "acc_stderr": 0.010431614128665253},
+     "boolq": {"acc": 0.5660550458715596, "acc_stderr": 0.008668405003744129},
+     "arc_easy": {"acc": 0.5993265993265994, "acc_stderr": 0.01005530447425557, "acc_norm": 0.5576599326599326, "acc_norm_stderr": 0.01019133444422085},
+     "arc_challenge": {"acc": 0.2781569965870307, "acc_stderr": 0.013094469919538805, "acc_norm": 0.30887372013651876, "acc_norm_stderr": 0.013501770929344003},
+     "sciq": {"acc": 0.835, "acc_stderr": 0.011743632866916145, "acc_norm": 0.79, "acc_norm_stderr": 0.01288666233227453},
+     "piqa": {"acc": 0.7470076169749728, "acc_stderr": 0.01014288869886246, "acc_norm": 0.7519042437431991, "acc_norm_stderr": 0.010077118315574706}
+   },
+   "versions": {"anli_r1": 0, "anli_r2": 0, "anli_r3": 0, "cb": 1, "copa": 0, "hellaswag": 0, "rte": 0, "winogrande": 0, "storycloze_2016": 0, "boolq": 1, "arc_easy": 0, "arc_challenge": 0, "sciq": 0, "piqa": 0}
+ }
4b284b12bc4/evaluation/4b284b12bc4_2_lm-eval_global_step80108_2023-01-30-11-26-32_2shots_backup.json ADDED
@@ -0,0 +1,87 @@
+ {
+   "results": {
+     "anli_r1": {"acc": 0.329, "acc_stderr": 0.014865395385928354},
+     "anli_r2": {"acc": 0.336, "acc_stderr": 0.014944140233795027},
+     "anli_r3": {"acc": 0.3383333333333333, "acc_stderr": 0.013664144006618266},
+     "cb": {"acc": 0.48214285714285715, "acc_stderr": 0.06737697508644648, "f1": 0.3338011695906433},
+     "copa": {"acc": 0.79, "acc_stderr": 0.040936018074033256},
+     "hellaswag": {"acc": 0.4697271459868552, "acc_stderr": 0.004980627287147585, "acc_norm": 0.6141206930890261, "acc_norm_stderr": 0.004858074013443988},
+     "rte": {"acc": 0.5523465703971119, "acc_stderr": 0.02993107036293953},
+     "winogrande": {"acc": 0.574585635359116, "acc_stderr": 0.013895257666646378},
+     "storycloze_2016": {"acc": 0.7156600748262961, "acc_stderr": 0.010431614128665253},
+     "boolq": {"acc": 0.5660550458715596, "acc_stderr": 0.008668405003744129},
+     "arc_easy": {"acc": 0.5993265993265994, "acc_stderr": 0.01005530447425557, "acc_norm": 0.5576599326599326, "acc_norm_stderr": 0.01019133444422085},
+     "arc_challenge": {"acc": 0.2781569965870307, "acc_stderr": 0.013094469919538805, "acc_norm": 0.30887372013651876, "acc_norm_stderr": 0.013501770929344003},
+     "sciq": {"acc": 0.835, "acc_stderr": 0.011743632866916145, "acc_norm": 0.79, "acc_norm_stderr": 0.01288666233227453},
+     "piqa": {"acc": 0.7470076169749728, "acc_stderr": 0.01014288869886246, "acc_norm": 0.7519042437431991, "acc_norm_stderr": 0.010077118315574706}
+   },
+   "versions": {"anli_r1": 0, "anli_r2": 0, "anli_r3": 0, "cb": 1, "copa": 0, "hellaswag": 0, "rte": 0, "winogrande": 0, "storycloze_2016": 0, "boolq": 1, "arc_easy": 0, "arc_challenge": 0, "sciq": 0, "piqa": 0}
+ }
4b284b12bc4/evaluation/4b284b12bc4_3.json ADDED
@@ -0,0 +1,87 @@
+ {
+   "results": {
+     "anli_r1": {"acc": 0.332, "acc_stderr": 0.014899597242811485},
+     "anli_r2": {"acc": 0.334, "acc_stderr": 0.014922019523732963},
+     "anli_r3": {"acc": 0.35, "acc_stderr": 0.013774667009018554},
+     "cb": {"acc": 0.6071428571428571, "acc_stderr": 0.0658538889806635, "f1": 0.42400932400932395},
+     "copa": {"acc": 0.81, "acc_stderr": 0.03942772444036622},
+     "hellaswag": {"acc": 0.47241585341565423, "acc_stderr": 0.004982182323923561, "acc_norm": 0.6199960167297351, "acc_norm_stderr": 0.004843954338451449},
+     "rte": {"acc": 0.5379061371841155, "acc_stderr": 0.030009848912529113},
+     "winogrande": {"acc": 0.5737963693764798, "acc_stderr": 0.013898585965412338},
+     "storycloze_2016": {"acc": 0.7124532335649385, "acc_stderr": 0.010466744473098363},
+     "boolq": {"acc": 0.5587155963302752, "acc_stderr": 0.008684548127832637},
+     "arc_easy": {"acc": 0.5955387205387206, "acc_stderr": 0.010070746648278783, "acc_norm": 0.5740740740740741, "acc_norm_stderr": 0.010146568651002255},
+     "arc_challenge": {"acc": 0.2815699658703072, "acc_stderr": 0.013143376735009022, "acc_norm": 0.3122866894197952, "acc_norm_stderr": 0.013542598541688067},
+     "sciq": {"acc": 0.841, "acc_stderr": 0.01156947936827129, "acc_norm": 0.796, "acc_norm_stderr": 0.012749374359024384},
+     "piqa": {"acc": 0.7513601741022851, "acc_stderr": 0.01008451123429685, "acc_norm": 0.7578890097932536, "acc_norm_stderr": 0.009994371269104397}
+   },
+   "versions": {"anli_r1": 0, "anli_r2": 0, "anli_r3": 0, "cb": 1, "copa": 0, "hellaswag": 0, "rte": 0, "winogrande": 0, "storycloze_2016": 0, "boolq": 1, "arc_easy": 0, "arc_challenge": 0, "sciq": 0, "piqa": 0}
+ }
4b284b12bc4/evaluation/4b284b12bc4_3_lm-eval_global_step80108_2023-01-30-11-26-31_3shots_backup.json ADDED
@@ -0,0 +1,87 @@
+ {
+   "results": {
+     "anli_r1": {"acc": 0.332, "acc_stderr": 0.014899597242811485},
+     "anli_r2": {"acc": 0.334, "acc_stderr": 0.014922019523732963},
+     "anli_r3": {"acc": 0.35, "acc_stderr": 0.013774667009018554},
+     "cb": {"acc": 0.6071428571428571, "acc_stderr": 0.0658538889806635, "f1": 0.42400932400932395},
+     "copa": {"acc": 0.81, "acc_stderr": 0.03942772444036622},
+     "hellaswag": {"acc": 0.47241585341565423, "acc_stderr": 0.004982182323923561, "acc_norm": 0.6199960167297351, "acc_norm_stderr": 0.004843954338451449},
+     "rte": {"acc": 0.5379061371841155, "acc_stderr": 0.030009848912529113},
+     "winogrande": {"acc": 0.5737963693764798, "acc_stderr": 0.013898585965412338},
+     "storycloze_2016": {"acc": 0.7124532335649385, "acc_stderr": 0.010466744473098363},
+     "boolq": {"acc": 0.5587155963302752, "acc_stderr": 0.008684548127832637},
+     "arc_easy": {"acc": 0.5955387205387206, "acc_stderr": 0.010070746648278783, "acc_norm": 0.5740740740740741, "acc_norm_stderr": 0.010146568651002255},
+     "arc_challenge": {"acc": 0.2815699658703072, "acc_stderr": 0.013143376735009022, "acc_norm": 0.3122866894197952, "acc_norm_stderr": 0.013542598541688067},
+     "sciq": {"acc": 0.841, "acc_stderr": 0.01156947936827129, "acc_norm": 0.796, "acc_norm_stderr": 0.012749374359024384},
+     "piqa": {"acc": 0.7513601741022851, "acc_stderr": 0.01008451123429685, "acc_norm": 0.7578890097932536, "acc_norm_stderr": 0.009994371269104397}
+   },
+   "versions": {"anli_r1": 0, "anli_r2": 0, "anli_r3": 0, "cb": 1, "copa": 0, "hellaswag": 0, "rte": 0, "winogrande": 0, "storycloze_2016": 0, "boolq": 1, "arc_easy": 0, "arc_challenge": 0, "sciq": 0, "piqa": 0}
+ }
4b284b12bc4/evaluation/4b284b12bc4_4.json ADDED
@@ -0,0 +1,87 @@
+ {
+   "results": {
+     "anli_r1": {"acc": 0.337, "acc_stderr": 0.014955087918653603},
+     "anli_r2": {"acc": 0.349, "acc_stderr": 0.015080663991563102},
+     "anli_r3": {"acc": 0.36666666666666664, "acc_stderr": 0.013916893275819938},
+     "cb": {"acc": 0.44642857142857145, "acc_stderr": 0.067031892279424, "f1": 0.3176100628930817},
+     "copa": {"acc": 0.8, "acc_stderr": 0.040201512610368445},
+     "hellaswag": {"acc": 0.4722166899024099, "acc_stderr": 0.004982072108448081, "acc_norm": 0.6184027086237801, "acc_norm_stderr": 0.004847857546957481},
+     "rte": {"acc": 0.5379061371841155, "acc_stderr": 0.03000984891252911},
+     "winogrande": {"acc": 0.56353591160221, "acc_stderr": 0.013938569465677023},
+     "storycloze_2016": {"acc": 0.7194013896312133, "acc_stderr": 0.010389809647288821},
+     "boolq": {"acc": 0.5636085626911315, "acc_stderr": 0.008674000467432068},
+     "arc_easy": {"acc": 0.6039562289562289, "acc_stderr": 0.010035580962097942, "acc_norm": 0.5702861952861953, "acc_norm_stderr": 0.010157908005763674},
+     "arc_challenge": {"acc": 0.2790102389078498, "acc_stderr": 0.013106784883601346, "acc_norm": 0.3165529010238908, "acc_norm_stderr": 0.013592431519068077},
+     "sciq": {"acc": 0.842, "acc_stderr": 0.011539894677559568, "acc_norm": 0.789, "acc_norm_stderr": 0.012909130321042092},
+     "piqa": {"acc": 0.7431991294885746, "acc_stderr": 0.010192864802278045, "acc_norm": 0.7568008705114254, "acc_norm_stderr": 0.010009611953858915}
+   },
+   "versions": {"anli_r1": 0, "anli_r2": 0, "anli_r3": 0, "cb": 1, "copa": 0, "hellaswag": 0, "rte": 0, "winogrande": 0, "storycloze_2016": 0, "boolq": 1, "arc_easy": 0, "arc_challenge": 0, "sciq": 0, "piqa": 0}
+ }
4b284b12bc4/evaluation/4b284b12bc4_4_lm-eval_global_step80108_2023-01-30-11-26-32_4shots_backup.json ADDED
@@ -0,0 +1,87 @@
+ {
+   "results": {
+     "anli_r1": {"acc": 0.337, "acc_stderr": 0.014955087918653603},
+     "anli_r2": {"acc": 0.349, "acc_stderr": 0.015080663991563102},
+     "anli_r3": {"acc": 0.36666666666666664, "acc_stderr": 0.013916893275819938},
+     "cb": {"acc": 0.44642857142857145, "acc_stderr": 0.067031892279424, "f1": 0.3176100628930817},
+     "copa": {"acc": 0.8, "acc_stderr": 0.040201512610368445},
+     "hellaswag": {"acc": 0.4722166899024099, "acc_stderr": 0.004982072108448081, "acc_norm": 0.6184027086237801, "acc_norm_stderr": 0.004847857546957481},
+     "rte": {"acc": 0.5379061371841155, "acc_stderr": 0.03000984891252911},
+     "winogrande": {"acc": 0.56353591160221, "acc_stderr": 0.013938569465677023},
+     "storycloze_2016": {"acc": 0.7194013896312133, "acc_stderr": 0.010389809647288821},
+     "boolq": {"acc": 0.5636085626911315, "acc_stderr": 0.008674000467432068},
+     "arc_easy": {"acc": 0.6039562289562289, "acc_stderr": 0.010035580962097942, "acc_norm": 0.5702861952861953, "acc_norm_stderr": 0.010157908005763674},
+     "arc_challenge": {"acc": 0.2790102389078498, "acc_stderr": 0.013106784883601346, "acc_norm": 0.3165529010238908, "acc_norm_stderr": 0.013592431519068077},
+     "sciq": {"acc": 0.842, "acc_stderr": 0.011539894677559568, "acc_norm": 0.789, "acc_norm_stderr": 0.012909130321042092},
+     "piqa": {"acc": 0.7431991294885746, "acc_stderr": 0.010192864802278045, "acc_norm": 0.7568008705114254, "acc_norm_stderr": 0.010009611953858915}
+   },
+   "versions": {"anli_r1": 0, "anli_r2": 0, "anli_r3": 0, "cb": 1, "copa": 0, "hellaswag": 0, "rte": 0, "winogrande": 0, "storycloze_2016": 0, "boolq": 1, "arc_easy": 0, "arc_challenge": 0, "sciq": 0, "piqa": 0}
+ }
4b284b12bc4/evaluation/4b284b12bc4_5.json ADDED
@@ -0,0 +1,66 @@
+ {
+   "results": {
+     "anli_r1": {"acc": 0.332, "acc_stderr": 0.014899597242811487},
+     "anli_r2": {"acc": 0.329, "acc_stderr": 0.014865395385928357},
+     "anli_r3": {"acc": 0.3541666666666667, "acc_stderr": 0.013811933499570954},
+     "cb": {"acc": 0.5535714285714286, "acc_stderr": 0.06703189227942395, "f1": 0.38376730002345766},
+     "copa": {"acc": 0.81, "acc_stderr": 0.03942772444036623},
+     "hellaswag": {"acc": 0.47400916152160927, "acc_stderr": 0.004983035420235716, "acc_norm": 0.619896434973113, "acc_norm_stderr": 0.004844199910173026},
+     "rte": {"acc": 0.516245487364621, "acc_stderr": 0.030080573208738064},
+     "winogrande": {"acc": 0.5722178374112076, "acc_stderr": 0.013905134013839944},
+     "storycloze_2016": {"acc": 0.7177979690005345, "acc_stderr": 0.010407834479647675},
+     "boolq": {"acc": 0.5648318042813456, "acc_stderr": 0.008671229580582118},
+     "arc_easy": {"acc": 0.5997474747474747, "acc_stderr": 0.010053550119896127, "acc_norm": 0.569023569023569, "acc_norm_stderr": 0.010161552863493746}
+   },
+   "versions": {"anli_r1": 0, "anli_r2": 0, "anli_r3": 0, "cb": 1, "copa": 0, "hellaswag": 0, "rte": 0, "winogrande": 0, "storycloze_2016": 0, "boolq": 1, "arc_easy": 0}
+ }
4b284b12bc4/evaluation/4b284b12bc4_5_lm-eval_global_step80108_2023-01-30-11-26-32_5shots_backup.json ADDED
@@ -0,0 +1,66 @@
+ {
+   "results": {
+     "anli_r1": {"acc": 0.332, "acc_stderr": 0.014899597242811487},
+     "anli_r2": {"acc": 0.329, "acc_stderr": 0.014865395385928357},
+     "anli_r3": {"acc": 0.3541666666666667, "acc_stderr": 0.013811933499570954},
+     "cb": {"acc": 0.5535714285714286, "acc_stderr": 0.06703189227942395, "f1": 0.38376730002345766},
+     "copa": {"acc": 0.81, "acc_stderr": 0.03942772444036623},
+     "hellaswag": {"acc": 0.47400916152160927, "acc_stderr": 0.004983035420235716, "acc_norm": 0.619896434973113, "acc_norm_stderr": 0.004844199910173026},
+     "rte": {"acc": 0.516245487364621, "acc_stderr": 0.030080573208738064},
+     "winogrande": {"acc": 0.5722178374112076, "acc_stderr": 0.013905134013839944},
+     "storycloze_2016": {"acc": 0.7177979690005345, "acc_stderr": 0.010407834479647675},
+     "boolq": {"acc": 0.5648318042813456, "acc_stderr": 0.008671229580582118},
+     "arc_easy": {"acc": 0.5997474747474747, "acc_stderr": 0.010053550119896127, "acc_norm": 0.569023569023569, "acc_norm_stderr": 0.010161552863493746}
+   },
+   "versions": {"anli_r1": 0, "anli_r2": 0, "anli_r3": 0, "cb": 1, "copa": 0, "hellaswag": 0, "rte": 0, "winogrande": 0, "storycloze_2016": 0, "boolq": 1, "arc_easy": 0}
+ }
4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_0.json ADDED
@@ -0,0 +1,133 @@
+ {
+   "results": [
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.4070835356827751, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "bleu_stderr": 0.03514958095848397},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.0758536616906455, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rouge1_precision_stderr": 0.0015747064380670645},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3264375465319237, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rouge1_recall_stderr": 0.004888854445231445},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11509298027342854, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rouge1_fmeasure_stderr": 0.002040147114373331},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03493638633069714, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rouge2_precision_stderr": 0.0009342574915112234},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.15766160622381195, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rouge2_recall_stderr": 0.0033114573324024405},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.0532813862747049, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rouge2_fmeasure_stderr": 0.0012579627803205211},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07257604824526195, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rougeL_precision_stderr": 0.0014483785678009685},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.31637706878833355, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rougeL_recall_stderr": 0.004769735504597033},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.1105412242108245, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rougeL_fmeasure_stderr": 0.0019072286738988954},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.0714774644843108, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rougeLsum_precision_stderr": 0.0014699104009543759},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.307939556685913, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rougeLsum_recall_stderr": 0.004520814685280998},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10843545057843905, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rougeLsum_fmeasure_stderr": 0.0019083088150967664}
+   ],
+   "config": {
+     "model": "hf-causal",
+     "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+     "task_args": "",
+     "num_fewshot": 0,
+     "batch_size": 16,
+     "device": "cuda",
+     "use_cache": false,
+     "limit": 3000,
+     "bootstrap_iters": 10,
+     "seed": 1234
+   }
+ }
4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_1.json ADDED
@@ -0,0 +1,133 @@
+ {
+   "results": [
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.41914858834195134, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "bleu_stderr": 0.030279335876129},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07536633674836868, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rouge1_precision_stderr": 0.001620641410096321},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3290768382699901, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rouge1_recall_stderr": 0.00481767508183653},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11424698089656772, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rouge1_fmeasure_stderr": 0.001973221738343803},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03540467062379218, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rouge2_precision_stderr": 0.001074817084017668},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.16089821041540717, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rouge2_recall_stderr": 0.0033011630774406127},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05368591058094131, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rouge2_fmeasure_stderr": 0.0012551880063213156},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07231503158237214, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rougeL_precision_stderr": 0.0015163361416883465},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3189205930522712, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rougeL_recall_stderr": 0.004694857387684187},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10991123942051419, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rougeL_fmeasure_stderr": 0.0018557651460448018},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07148579673935408, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rougeLsum_precision_stderr": 0.0015357817111525064},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3110112645350247, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rougeLsum_recall_stderr": 0.00441643475943137},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.1082043807305256, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rougeLsum_fmeasure_stderr": 0.0018480349337665876}
+   ],
+   "config": {
+     "model": "hf-causal",
+     "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+     "task_args": "",
+     "num_fewshot": 1,
+     "batch_size": 16,
+     "device": "cuda",
+     "use_cache": false,
+     "limit": 3000,
+     "bootstrap_iters": 10,
+     "seed": 1234
+   }
+ }
4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_2.json ADDED
@@ -0,0 +1,133 @@
+ {
+   "results": [
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.4241874936612034, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "bleu_stderr": 0.03699728854949305},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07469786641233617, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rouge1_precision_stderr": 0.0015771153206732972},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.32891693541469197, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rouge1_recall_stderr": 0.004751520151482175},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11375522621692136, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rouge1_fmeasure_stderr": 0.0019642936162507533},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03462695918297652, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rouge2_precision_stderr": 0.0009079391487918842},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.16210166248343671, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rouge2_recall_stderr": 0.003411098262587952},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05344291957030947, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rouge2_fmeasure_stderr": 0.001233885317072834},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07161852924412807, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rougeL_precision_stderr": 0.0014761860684115284},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.31797917629392425, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rougeL_recall_stderr": 0.004598849198704314},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10934081987838024, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rougeL_fmeasure_stderr": 0.0018415550723111455},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.0711214967812572, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rougeLsum_precision_stderr": 0.00149377360051449},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3130870814045286, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rougeLsum_recall_stderr": 0.004421211065212564},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10823991385374933, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rougeLsum_fmeasure_stderr": 0.0018380668821100924}
+   ],
+   "config": {
+     "model": "hf-causal",
+     "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+     "task_args": "",
+     "num_fewshot": 2,
+     "batch_size": 16,
+     "device": "cuda",
+     "use_cache": false,
+     "limit": 3000,
+     "bootstrap_iters": 10,
+     "seed": 1234
+   }
+ }
4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_3.json ADDED
@@ -0,0 +1,133 @@
+ {
+   "results": [
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.3916994292697065, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "bleu_stderr": 0.02655023153261868},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07713695315738618, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rouge1_precision_stderr": 0.0018521617901133295},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.32641437991318506, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rouge1_recall_stderr": 0.004583689746653368},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11443103117633296, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rouge1_fmeasure_stderr": 0.0019845366723218495},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.036319480745632425, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rouge2_precision_stderr": 0.0012079439649413412},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.15985213856119682, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rouge2_recall_stderr": 0.003223582265695079},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05368996382308088, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rouge2_fmeasure_stderr": 0.0012403567348119643},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07333561421491072, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rougeL_precision_stderr": 0.0017170202297610163},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3129899723170166, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rougeL_recall_stderr": 0.004403504671395443},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10920281437234497, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rougeL_fmeasure_stderr": 0.0018472429946517301},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07332584684616669, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rougeLsum_precision_stderr": 0.0017553031622823821},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.31070372160179327, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rougeLsum_recall_stderr": 0.004314602758278112},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10882089385505, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rougeLsum_fmeasure_stderr": 0.0018674885337082484}
+   ],
+   "config": {
+     "model": "hf-causal",
+     "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+     "task_args": "",
+     "num_fewshot": 3,
+     "batch_size": 16,
+     "device": "cuda",
+     "use_cache": false,
+     "limit": 3000,
+     "bootstrap_iters": 10,
+     "seed": 1234
+   }
+ }
4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_4.json ADDED
@@ -0,0 +1,133 @@
+ {
+   "results": [
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.37875018794247045, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "bleu_stderr": 0.024296780304434905},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07231813177556075, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rouge1_precision_stderr": 0.0015118246585830762},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.31870699434574523, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rouge1_recall_stderr": 0.0046463072458484975},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.1102139471634063, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rouge1_fmeasure_stderr": 0.0019620155943445507},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.033695317164630666, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rouge2_precision_stderr": 0.0009163247572691914},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.15554105747469235, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rouge2_recall_stderr": 0.003236397171744527},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.0515680827205002, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rouge2_fmeasure_stderr": 0.001213141047008391},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.06901574750871842, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rougeL_precision_stderr": 0.0013947803448898716},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.30638147232578594, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rougeL_recall_stderr": 0.004472401624140904},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.1054595308766645, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rougeL_fmeasure_stderr": 0.0018290764447497754},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.06885871876103705, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rougeLsum_precision_stderr": 0.001420315845279487},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.30338100654345734, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rougeLsum_recall_stderr": 0.0043561963135752306},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10492767242713451, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rougeLsum_fmeasure_stderr": 0.001836832016012516}
+   ],
+   "config": {
+     "model": "hf-causal",
+     "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+     "task_args": "",
+     "num_fewshot": 4,
+     "batch_size": 16,
+     "device": "cuda",
+     "use_cache": false,
+     "limit": 3000,
+     "bootstrap_iters": 10,
+     "seed": 1234
+   }
+ }
4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_5.json ADDED
@@ -0,0 +1,133 @@
+ {
+   "results": [
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.3689406693649318, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "bleu_stderr": 0.01833284872989782},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.0725733890131515, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rouge1_precision_stderr": 0.0016541722828599028},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.31647681542290346, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rouge1_recall_stderr": 0.004649887369574888},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.10942321553706275, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rouge1_fmeasure_stderr": 0.001960578009336271},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.0340511038621137, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rouge2_precision_stderr": 0.001109810266658632},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.1539259113242296, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rouge2_recall_stderr": 0.003295678214536681},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05107734688924233, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rouge2_fmeasure_stderr": 0.00122669666906548},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.06920155517233274, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rougeL_precision_stderr": 0.0015510532870385023},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.30372126675172995, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rougeL_recall_stderr": 0.004472388579309833},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10453686352175766, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rougeL_fmeasure_stderr": 0.0018279045748061345},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.0688242602910231, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rougeLsum_precision_stderr": 0.0015744817424633028},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3000075619025587, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rougeLsum_recall_stderr": 0.004328528641012373},
+     {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.1036182486745027, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "rougeLsum_fmeasure_stderr": 0.0018321897232056268}
+   ],
+   "config": {
+     "model": "hf-causal",
+     "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+     "task_args": "",
+     "num_fewshot": 5,
+     "batch_size": 16,
+     "device": "cuda",
+     "use_cache": false,
+     "limit": 3000,
+     "bootstrap_iters": 10,
+     "seed": 1234
+   }
+ }
4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_0.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "GEM/wiki_lingua_en",
5
+ "prompt_name": "tldr_en",
6
+ "rouge1_precision": 0.0505257980847339,
7
+ "dataset_path": "GEM/wiki_lingua",
8
+ "dataset_name": "en",
9
+ "subset": null,
10
+ "rouge1_precision_stderr": 0.0009316535384306269
11
+ },
12
+ {
13
+ "task_name": "GEM/wiki_lingua_en",
14
+ "prompt_name": "tldr_en",
15
+ "rouge1_recall": 0.07944574030730557,
16
+ "dataset_path": "GEM/wiki_lingua",
17
+ "dataset_name": "en",
18
+ "subset": null,
19
+ "rouge1_recall_stderr": 0.0013436416711421135
20
+ },
21
+ {
22
+ "task_name": "GEM/wiki_lingua_en",
23
+ "prompt_name": "tldr_en",
24
+ "rouge1_fmeasure": 0.05739499438745971,
25
+ "dataset_path": "GEM/wiki_lingua",
26
+ "dataset_name": "en",
27
+ "subset": null,
28
+ "rouge1_fmeasure_stderr": 0.000959404112224303
29
+ },
30
+ {
31
+ "task_name": "GEM/wiki_lingua_en",
32
+ "prompt_name": "tldr_en",
33
+ "rouge2_precision": 0.0025522430280765624,
34
+ "dataset_path": "GEM/wiki_lingua",
35
+ "dataset_name": "en",
36
+ "subset": null,
37
+ "rouge2_precision_stderr": 0.00019188137819214202
38
+ },
39
+ {
40
+ "task_name": "GEM/wiki_lingua_en",
41
+ "prompt_name": "tldr_en",
42
+ "rouge2_recall": 0.003889530091650386,
43
+ "dataset_path": "GEM/wiki_lingua",
44
+ "dataset_name": "en",
45
+ "subset": null,
46
+ "rouge2_recall_stderr": 0.0003502492712853139
47
+ },
48
+ {
49
+ "task_name": "GEM/wiki_lingua_en",
50
+ "prompt_name": "tldr_en",
51
+ "rouge2_fmeasure": 0.002874313185982406,
52
+ "dataset_path": "GEM/wiki_lingua",
53
+ "dataset_name": "en",
54
+ "subset": null,
55
+ "rouge2_fmeasure_stderr": 0.00022791640812011725
56
+ },
57
+ {
58
+ "task_name": "GEM/wiki_lingua_en",
59
+ "prompt_name": "tldr_en",
60
+ "rougeL_precision": 0.04641940622808299,
61
+ "dataset_path": "GEM/wiki_lingua",
62
+ "dataset_name": "en",
63
+ "subset": null,
64
+ "rougeL_precision_stderr": 0.0007892141876172595
65
+ },
66
+ {
67
+ "task_name": "GEM/wiki_lingua_en",
68
+ "prompt_name": "tldr_en",
69
+ "rougeL_recall": 0.07403750267734954,
70
+ "dataset_path": "GEM/wiki_lingua",
71
+ "dataset_name": "en",
72
+ "subset": null,
73
+ "rougeL_recall_stderr": 0.0011941764542527037
74
+ },
75
+ {
76
+ "task_name": "GEM/wiki_lingua_en",
77
+ "prompt_name": "tldr_en",
78
+ "rougeL_fmeasure": 0.053008425140295905,
79
+ "dataset_path": "GEM/wiki_lingua",
80
+ "dataset_name": "en",
81
+ "subset": null,
82
+ "rougeL_fmeasure_stderr": 0.0008201296289562219
83
+ },
84
+ {
85
+ "task_name": "GEM/wiki_lingua_en",
86
+ "prompt_name": "tldr_en",
87
+ "rougeLsum_precision": 0.04816018092226108,
88
+ "dataset_path": "GEM/wiki_lingua",
89
+ "dataset_name": "en",
90
+ "subset": null,
91
+ "rougeLsum_precision_stderr": 0.0008696938078640268
92
+ },
93
+ {
94
+ "task_name": "GEM/wiki_lingua_en",
95
+ "prompt_name": "tldr_en",
96
+ "rougeLsum_recall": 0.07599978162074517,
97
+ "dataset_path": "GEM/wiki_lingua",
98
+ "dataset_name": "en",
99
+ "subset": null,
100
+ "rougeLsum_recall_stderr": 0.0012632037879506343
101
+ },
102
+ {
103
+ "task_name": "GEM/wiki_lingua_en",
104
+ "prompt_name": "tldr_en",
105
+ "rougeLsum_fmeasure": 0.05476968826480647,
106
+ "dataset_path": "GEM/wiki_lingua",
107
+ "dataset_name": "en",
108
+ "subset": null,
109
+ "rougeLsum_fmeasure_stderr": 0.0008953065390907387
110
+ },
111
+ {
112
+ "task_name": "GEM/wiki_lingua_en",
113
+ "prompt_name": "tldr_en",
114
+ "bleu": 0.14859459498800928,
115
+ "dataset_path": "GEM/wiki_lingua",
116
+ "dataset_name": "en",
117
+ "subset": null,
118
+ "bleu_stderr": 0.019538924284114197
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 0,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_1.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "GEM/wiki_lingua_en",
5
+ "prompt_name": "tldr_en",
6
+ "rouge1_precision": 0.13584035826185337,
7
+ "dataset_path": "GEM/wiki_lingua",
8
+ "dataset_name": "en",
9
+ "subset": null,
10
+ "rouge1_precision_stderr": 0.001883003830967405
11
+ },
12
+ {
13
+ "task_name": "GEM/wiki_lingua_en",
14
+ "prompt_name": "tldr_en",
15
+ "rouge1_recall": 0.2300655738827389,
16
+ "dataset_path": "GEM/wiki_lingua",
17
+ "dataset_name": "en",
18
+ "subset": null,
19
+ "rouge1_recall_stderr": 0.002771815067524841
20
+ },
21
+ {
22
+ "task_name": "GEM/wiki_lingua_en",
23
+ "prompt_name": "tldr_en",
24
+ "rouge1_fmeasure": 0.15836293018887368,
25
+ "dataset_path": "GEM/wiki_lingua",
26
+ "dataset_name": "en",
27
+ "subset": null,
28
+ "rouge1_fmeasure_stderr": 0.001891640433790272
29
+ },
30
+ {
31
+ "task_name": "GEM/wiki_lingua_en",
32
+ "prompt_name": "tldr_en",
33
+ "rouge2_precision": 0.02400583408187123,
34
+ "dataset_path": "GEM/wiki_lingua",
35
+ "dataset_name": "en",
36
+ "subset": null,
37
+ "rouge2_precision_stderr": 0.0007285343955308211
38
+ },
39
+ {
40
+ "task_name": "GEM/wiki_lingua_en",
41
+ "prompt_name": "tldr_en",
42
+ "rouge2_recall": 0.04289680599794199,
43
+ "dataset_path": "GEM/wiki_lingua",
44
+ "dataset_name": "en",
45
+ "subset": null,
46
+ "rouge2_recall_stderr": 0.001353605133009988
47
+ },
48
+ {
49
+ "task_name": "GEM/wiki_lingua_en",
50
+ "prompt_name": "tldr_en",
51
+ "rouge2_fmeasure": 0.028190707681194575,
52
+ "dataset_path": "GEM/wiki_lingua",
53
+ "dataset_name": "en",
54
+ "subset": null,
55
+ "rouge2_fmeasure_stderr": 0.0008118715649523407
56
+ },
57
+ {
58
+ "task_name": "GEM/wiki_lingua_en",
59
+ "prompt_name": "tldr_en",
60
+ "rougeL_precision": 0.10298984208878467,
61
+ "dataset_path": "GEM/wiki_lingua",
62
+ "dataset_name": "en",
63
+ "subset": null,
64
+ "rougeL_precision_stderr": 0.0012746961770117027
65
+ },
66
+ {
67
+ "task_name": "GEM/wiki_lingua_en",
68
+ "prompt_name": "tldr_en",
69
+ "rougeL_recall": 0.18009351587124983,
70
+ "dataset_path": "GEM/wiki_lingua",
71
+ "dataset_name": "en",
72
+ "subset": null,
73
+ "rougeL_recall_stderr": 0.002104050870387258
74
+ },
75
+ {
76
+ "task_name": "GEM/wiki_lingua_en",
77
+ "prompt_name": "tldr_en",
78
+ "rougeL_fmeasure": 0.12130392597137107,
79
+ "dataset_path": "GEM/wiki_lingua",
80
+ "dataset_name": "en",
81
+ "subset": null,
82
+ "rougeL_fmeasure_stderr": 0.0012906736654645788
83
+ },
84
+ {
85
+ "task_name": "GEM/wiki_lingua_en",
86
+ "prompt_name": "tldr_en",
87
+ "rougeLsum_precision": 0.12668865175777289,
88
+ "dataset_path": "GEM/wiki_lingua",
89
+ "dataset_name": "en",
90
+ "subset": null,
91
+ "rougeLsum_precision_stderr": 0.0017429813042225584
92
+ },
93
+ {
94
+ "task_name": "GEM/wiki_lingua_en",
95
+ "prompt_name": "tldr_en",
96
+ "rougeLsum_recall": 0.21558088869876474,
97
+ "dataset_path": "GEM/wiki_lingua",
98
+ "dataset_name": "en",
99
+ "subset": null,
100
+ "rougeLsum_recall_stderr": 0.0026000978058887433
101
+ },
102
+ {
103
+ "task_name": "GEM/wiki_lingua_en",
104
+ "prompt_name": "tldr_en",
105
+ "rougeLsum_fmeasure": 0.14788907195330203,
106
+ "dataset_path": "GEM/wiki_lingua",
107
+ "dataset_name": "en",
108
+ "subset": null,
109
+ "rougeLsum_fmeasure_stderr": 0.0017520956258023405
110
+ },
111
+ {
112
+ "task_name": "GEM/wiki_lingua_en",
113
+ "prompt_name": "tldr_en",
114
+ "bleu": 1.346869321685321,
115
+ "dataset_path": "GEM/wiki_lingua",
116
+ "dataset_name": "en",
117
+ "subset": null,
118
+ "bleu_stderr": 0.06607443264134674
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 1,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_2.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "GEM/wiki_lingua_en",
5
+ "prompt_name": "tldr_en",
6
+ "rouge1_precision": 0.1708173947979992,
7
+ "dataset_path": "GEM/wiki_lingua",
8
+ "dataset_name": "en",
9
+ "subset": null,
10
+ "rouge1_precision_stderr": 0.002081500885534108
11
+ },
12
+ {
13
+ "task_name": "GEM/wiki_lingua_en",
14
+ "prompt_name": "tldr_en",
15
+ "rouge1_recall": 0.2857546859749413,
16
+ "dataset_path": "GEM/wiki_lingua",
17
+ "dataset_name": "en",
18
+ "subset": null,
19
+ "rouge1_recall_stderr": 0.002766073063501205
20
+ },
21
+ {
22
+ "task_name": "GEM/wiki_lingua_en",
23
+ "prompt_name": "tldr_en",
24
+ "rouge1_fmeasure": 0.1979943409707515,
25
+ "dataset_path": "GEM/wiki_lingua",
26
+ "dataset_name": "en",
27
+ "subset": null,
28
+ "rouge1_fmeasure_stderr": 0.0019701979235594003
29
+ },
30
+ {
31
+ "task_name": "GEM/wiki_lingua_en",
32
+ "prompt_name": "tldr_en",
33
+ "rouge2_precision": 0.03867526801746755,
34
+ "dataset_path": "GEM/wiki_lingua",
35
+ "dataset_name": "en",
36
+ "subset": null,
37
+ "rouge2_precision_stderr": 0.0009249837176721441
38
+ },
39
+ {
40
+ "task_name": "GEM/wiki_lingua_en",
41
+ "prompt_name": "tldr_en",
42
+ "rouge2_recall": 0.06488659439579164,
43
+ "dataset_path": "GEM/wiki_lingua",
44
+ "dataset_name": "en",
45
+ "subset": null,
46
+ "rouge2_recall_stderr": 0.0015151297174771778
47
+ },
48
+ {
49
+ "task_name": "GEM/wiki_lingua_en",
50
+ "prompt_name": "tldr_en",
51
+ "rouge2_fmeasure": 0.04456119604899187,
52
+ "dataset_path": "GEM/wiki_lingua",
53
+ "dataset_name": "en",
54
+ "subset": null,
55
+ "rouge2_fmeasure_stderr": 0.0009779372383836055
56
+ },
57
+ {
58
+ "task_name": "GEM/wiki_lingua_en",
59
+ "prompt_name": "tldr_en",
60
+ "rougeL_precision": 0.12436000788073821,
61
+ "dataset_path": "GEM/wiki_lingua",
62
+ "dataset_name": "en",
63
+ "subset": null,
64
+ "rougeL_precision_stderr": 0.0013936495773448447
65
+ },
66
+ {
67
+ "task_name": "GEM/wiki_lingua_en",
68
+ "prompt_name": "tldr_en",
69
+ "rougeL_recall": 0.21512580022359384,
70
+ "dataset_path": "GEM/wiki_lingua",
71
+ "dataset_name": "en",
72
+ "subset": null,
73
+ "rougeL_recall_stderr": 0.0021392819398317084
74
+ },
75
+ {
76
+ "task_name": "GEM/wiki_lingua_en",
77
+ "prompt_name": "tldr_en",
78
+ "rougeL_fmeasure": 0.14561448339286645,
79
+ "dataset_path": "GEM/wiki_lingua",
80
+ "dataset_name": "en",
81
+ "subset": null,
82
+ "rougeL_fmeasure_stderr": 0.0013308339769298708
83
+ },
84
+ {
85
+ "task_name": "GEM/wiki_lingua_en",
86
+ "prompt_name": "tldr_en",
87
+ "rougeLsum_precision": 0.15896932268065497,
88
+ "dataset_path": "GEM/wiki_lingua",
89
+ "dataset_name": "en",
90
+ "subset": null,
91
+ "rougeLsum_precision_stderr": 0.0019322450604166179
92
+ },
93
+ {
94
+ "task_name": "GEM/wiki_lingua_en",
95
+ "prompt_name": "tldr_en",
96
+ "rougeLsum_recall": 0.2668532631896298,
97
+ "dataset_path": "GEM/wiki_lingua",
98
+ "dataset_name": "en",
99
+ "subset": null,
100
+ "rougeLsum_recall_stderr": 0.00261075911429361
101
+ },
102
+ {
103
+ "task_name": "GEM/wiki_lingua_en",
104
+ "prompt_name": "tldr_en",
105
+ "rougeLsum_fmeasure": 0.18443888049277404,
106
+ "dataset_path": "GEM/wiki_lingua",
107
+ "dataset_name": "en",
108
+ "subset": null,
109
+ "rougeLsum_fmeasure_stderr": 0.0018323545176278458
110
+ },
111
+ {
112
+ "task_name": "GEM/wiki_lingua_en",
113
+ "prompt_name": "tldr_en",
114
+ "bleu": 2.247794388992107,
115
+ "dataset_path": "GEM/wiki_lingua",
116
+ "dataset_name": "en",
117
+ "subset": null,
118
+ "bleu_stderr": 0.09928029909737168
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 2,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_3.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "GEM/wiki_lingua_en",
5
+ "prompt_name": "tldr_en",
6
+ "rouge1_precision": 0.15395480397824707,
7
+ "dataset_path": "GEM/wiki_lingua",
8
+ "dataset_name": "en",
9
+ "subset": null,
10
+ "rouge1_precision_stderr": 0.002402542953049042
11
+ },
12
+ {
13
+ "task_name": "GEM/wiki_lingua_en",
14
+ "prompt_name": "tldr_en",
15
+ "rouge1_recall": 0.24557521515724243,
16
+ "dataset_path": "GEM/wiki_lingua",
17
+ "dataset_name": "en",
18
+ "subset": null,
19
+ "rouge1_recall_stderr": 0.0032199609208037362
20
+ },
21
+ {
22
+ "task_name": "GEM/wiki_lingua_en",
23
+ "prompt_name": "tldr_en",
24
+ "rouge1_fmeasure": 0.17165885454358776,
25
+ "dataset_path": "GEM/wiki_lingua",
26
+ "dataset_name": "en",
27
+ "subset": null,
28
+ "rouge1_fmeasure_stderr": 0.002266523062316873
29
+ },
30
+ {
31
+ "task_name": "GEM/wiki_lingua_en",
32
+ "prompt_name": "tldr_en",
33
+ "rouge2_precision": 0.0345849314932535,
34
+ "dataset_path": "GEM/wiki_lingua",
35
+ "dataset_name": "en",
36
+ "subset": null,
37
+ "rouge2_precision_stderr": 0.000925086814362582
38
+ },
39
+ {
40
+ "task_name": "GEM/wiki_lingua_en",
41
+ "prompt_name": "tldr_en",
42
+ "rouge2_recall": 0.05685215254790676,
43
+ "dataset_path": "GEM/wiki_lingua",
44
+ "dataset_name": "en",
45
+ "subset": null,
46
+ "rouge2_recall_stderr": 0.0014741611375388177
47
+ },
48
+ {
49
+ "task_name": "GEM/wiki_lingua_en",
50
+ "prompt_name": "tldr_en",
51
+ "rouge2_fmeasure": 0.03887583188926559,
52
+ "dataset_path": "GEM/wiki_lingua",
53
+ "dataset_name": "en",
54
+ "subset": null,
55
+ "rouge2_fmeasure_stderr": 0.0009318869486006191
56
+ },
57
+ {
58
+ "task_name": "GEM/wiki_lingua_en",
59
+ "prompt_name": "tldr_en",
60
+ "rougeL_precision": 0.11262382269704937,
61
+ "dataset_path": "GEM/wiki_lingua",
62
+ "dataset_name": "en",
63
+ "subset": null,
64
+ "rougeL_precision_stderr": 0.001735467609168603
65
+ },
66
+ {
67
+ "task_name": "GEM/wiki_lingua_en",
68
+ "prompt_name": "tldr_en",
69
+ "rougeL_recall": 0.1853949845751863,
70
+ "dataset_path": "GEM/wiki_lingua",
71
+ "dataset_name": "en",
72
+ "subset": null,
73
+ "rougeL_recall_stderr": 0.002495030067359919
74
+ },
75
+ {
76
+ "task_name": "GEM/wiki_lingua_en",
77
+ "prompt_name": "tldr_en",
78
+ "rougeL_fmeasure": 0.12611107179368627,
79
+ "dataset_path": "GEM/wiki_lingua",
80
+ "dataset_name": "en",
81
+ "subset": null,
82
+ "rougeL_fmeasure_stderr": 0.0015753873989903184
83
+ },
84
+ {
85
+ "task_name": "GEM/wiki_lingua_en",
86
+ "prompt_name": "tldr_en",
87
+ "rougeLsum_precision": 0.1426484686620457,
88
+ "dataset_path": "GEM/wiki_lingua",
89
+ "dataset_name": "en",
90
+ "subset": null,
91
+ "rougeLsum_precision_stderr": 0.0022377962576445335
92
+ },
93
+ {
94
+ "task_name": "GEM/wiki_lingua_en",
95
+ "prompt_name": "tldr_en",
96
+ "rougeLsum_recall": 0.22876101619014125,
97
+ "dataset_path": "GEM/wiki_lingua",
98
+ "dataset_name": "en",
99
+ "subset": null,
100
+ "rougeLsum_recall_stderr": 0.0030299996388814796
101
+ },
102
+ {
103
+ "task_name": "GEM/wiki_lingua_en",
104
+ "prompt_name": "tldr_en",
105
+ "rougeLsum_fmeasure": 0.15920866146470747,
106
+ "dataset_path": "GEM/wiki_lingua",
107
+ "dataset_name": "en",
108
+ "subset": null,
109
+ "rougeLsum_fmeasure_stderr": 0.0021025779963518188
110
+ },
111
+ {
112
+ "task_name": "GEM/wiki_lingua_en",
113
+ "prompt_name": "tldr_en",
114
+ "bleu": 2.212268753332442,
115
+ "dataset_path": "GEM/wiki_lingua",
116
+ "dataset_name": "en",
117
+ "subset": null,
118
+ "bleu_stderr": 0.09749124513916169
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 3,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_4.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "GEM/wiki_lingua_en",
5
+ "prompt_name": "tldr_en",
6
+ "rouge1_precision": 0.05323367871163294,
7
+ "dataset_path": "GEM/wiki_lingua",
8
+ "dataset_name": "en",
9
+ "subset": null,
10
+ "rouge1_precision_stderr": 0.002019764759552398
11
+ },
12
+ {
13
+ "task_name": "GEM/wiki_lingua_en",
14
+ "prompt_name": "tldr_en",
15
+ "rouge1_recall": 0.08477832633294664,
16
+ "dataset_path": "GEM/wiki_lingua",
17
+ "dataset_name": "en",
18
+ "subset": null,
19
+ "rouge1_recall_stderr": 0.0029256767843087337
20
+ },
21
+ {
22
+ "task_name": "GEM/wiki_lingua_en",
23
+ "prompt_name": "tldr_en",
24
+ "rouge1_fmeasure": 0.057194516095400834,
25
+ "dataset_path": "GEM/wiki_lingua",
26
+ "dataset_name": "en",
27
+ "subset": null,
28
+ "rouge1_fmeasure_stderr": 0.0019532780600384314
29
+ },
30
+ {
31
+ "task_name": "GEM/wiki_lingua_en",
32
+ "prompt_name": "tldr_en",
33
+ "rouge2_precision": 0.011892613439611567,
34
+ "dataset_path": "GEM/wiki_lingua",
35
+ "dataset_name": "en",
36
+ "subset": null,
37
+ "rouge2_precision_stderr": 0.0006525291351364055
38
+ },
39
+ {
40
+ "task_name": "GEM/wiki_lingua_en",
41
+ "prompt_name": "tldr_en",
42
+ "rouge2_recall": 0.021014952711012305,
43
+ "dataset_path": "GEM/wiki_lingua",
44
+ "dataset_name": "en",
45
+ "subset": null,
46
+ "rouge2_recall_stderr": 0.0011420293722952714
47
+ },
48
+ {
49
+ "task_name": "GEM/wiki_lingua_en",
50
+ "prompt_name": "tldr_en",
51
+ "rouge2_fmeasure": 0.013407675922368708,
52
+ "dataset_path": "GEM/wiki_lingua",
53
+ "dataset_name": "en",
54
+ "subset": null,
55
+ "rouge2_fmeasure_stderr": 0.0006687476239191229
56
+ },
57
+ {
58
+ "task_name": "GEM/wiki_lingua_en",
59
+ "prompt_name": "tldr_en",
60
+ "rougeL_precision": 0.03994947969161068,
61
+ "dataset_path": "GEM/wiki_lingua",
62
+ "dataset_name": "en",
63
+ "subset": null,
64
+ "rougeL_precision_stderr": 0.0015323136750776513
65
+ },
66
+ {
67
+ "task_name": "GEM/wiki_lingua_en",
68
+ "prompt_name": "tldr_en",
69
+ "rougeL_recall": 0.0654114763710059,
70
+ "dataset_path": "GEM/wiki_lingua",
71
+ "dataset_name": "en",
72
+ "subset": null,
73
+ "rougeL_recall_stderr": 0.0022890377474177525
74
+ },
75
+ {
76
+ "task_name": "GEM/wiki_lingua_en",
77
+ "prompt_name": "tldr_en",
78
+ "rougeL_fmeasure": 0.0429641799328658,
79
+ "dataset_path": "GEM/wiki_lingua",
80
+ "dataset_name": "en",
81
+ "subset": null,
82
+ "rougeL_fmeasure_stderr": 0.0014362645716637073
83
+ },
84
+ {
85
+ "task_name": "GEM/wiki_lingua_en",
86
+ "prompt_name": "tldr_en",
87
+ "rougeLsum_precision": 0.049203521589357486,
88
+ "dataset_path": "GEM/wiki_lingua",
89
+ "dataset_name": "en",
90
+ "subset": null,
91
+ "rougeLsum_precision_stderr": 0.0018745212624263795
92
+ },
93
+ {
94
+ "task_name": "GEM/wiki_lingua_en",
95
+ "prompt_name": "tldr_en",
96
+ "rougeLsum_recall": 0.07828616324493042,
97
+ "dataset_path": "GEM/wiki_lingua",
98
+ "dataset_name": "en",
99
+ "subset": null,
100
+ "rougeLsum_recall_stderr": 0.00270998239625019
101
+ },
102
+ {
103
+ "task_name": "GEM/wiki_lingua_en",
104
+ "prompt_name": "tldr_en",
105
+ "rougeLsum_fmeasure": 0.052823275351703086,
106
+ "dataset_path": "GEM/wiki_lingua",
107
+ "dataset_name": "en",
108
+ "subset": null,
109
+ "rougeLsum_fmeasure_stderr": 0.0018085153746161442
110
+ },
111
+ {
112
+ "task_name": "GEM/wiki_lingua_en",
113
+ "prompt_name": "tldr_en",
114
+ "bleu": 0.5159665881377578,
115
+ "dataset_path": "GEM/wiki_lingua",
116
+ "dataset_name": "en",
117
+ "subset": null,
118
+ "bleu_stderr": 0.0355848354666848
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 4,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_5.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "GEM/wiki_lingua_en",
5
+ "prompt_name": "tldr_en",
6
+ "rouge1_precision": 0.008397509125114434,
7
+ "dataset_path": "GEM/wiki_lingua",
8
+ "dataset_name": "en",
9
+ "subset": null,
10
+ "rouge1_precision_stderr": 0.0008676037544483993
11
+ },
12
+ {
13
+ "task_name": "GEM/wiki_lingua_en",
14
+ "prompt_name": "tldr_en",
15
+ "rouge1_recall": 0.013182384245950918,
16
+ "dataset_path": "GEM/wiki_lingua",
17
+ "dataset_name": "en",
18
+ "subset": null,
19
+ "rouge1_recall_stderr": 0.0012488579545023588
20
+ },
21
+ {
22
+ "task_name": "GEM/wiki_lingua_en",
23
+ "prompt_name": "tldr_en",
24
+ "rouge1_fmeasure": 0.00885537771521418,
25
+ "dataset_path": "GEM/wiki_lingua",
26
+ "dataset_name": "en",
27
+ "subset": null,
28
+ "rouge1_fmeasure_stderr": 0.0008474145806800235
29
+ },
30
+ {
31
+ "task_name": "GEM/wiki_lingua_en",
32
+ "prompt_name": "tldr_en",
33
+ "rouge2_precision": 0.0018519121687661717,
34
+ "dataset_path": "GEM/wiki_lingua",
35
+ "dataset_name": "en",
36
+ "subset": null,
37
+ "rouge2_precision_stderr": 0.0002652232008037664
38
+ },
39
+ {
40
+ "task_name": "GEM/wiki_lingua_en",
41
+ "prompt_name": "tldr_en",
42
+ "rouge2_recall": 0.003201546871291623,
43
+ "dataset_path": "GEM/wiki_lingua",
44
+ "dataset_name": "en",
45
+ "subset": null,
46
+ "rouge2_recall_stderr": 0.00042951400653203656
47
+ },
48
+ {
49
+ "task_name": "GEM/wiki_lingua_en",
50
+ "prompt_name": "tldr_en",
51
+ "rouge2_fmeasure": 0.0020845828252393957,
52
+ "dataset_path": "GEM/wiki_lingua",
53
+ "dataset_name": "en",
54
+ "subset": null,
55
+ "rouge2_fmeasure_stderr": 0.0002776449859686965
56
+ },
57
+ {
58
+ "task_name": "GEM/wiki_lingua_en",
59
+ "prompt_name": "tldr_en",
60
+ "rougeL_precision": 0.006312806712827651,
61
+ "dataset_path": "GEM/wiki_lingua",
62
+ "dataset_name": "en",
63
+ "subset": null,
64
+ "rougeL_precision_stderr": 0.0006339300731280119
65
+ },
66
+ {
67
+ "task_name": "GEM/wiki_lingua_en",
68
+ "prompt_name": "tldr_en",
69
+ "rougeL_recall": 0.010466170637582555,
70
+ "dataset_path": "GEM/wiki_lingua",
71
+ "dataset_name": "en",
72
+ "subset": null,
73
+ "rougeL_recall_stderr": 0.0010119772330203227
74
+ },
75
+ {
76
+ "task_name": "GEM/wiki_lingua_en",
77
+ "prompt_name": "tldr_en",
78
+ "rougeL_fmeasure": 0.006795329450745382,
79
+ "dataset_path": "GEM/wiki_lingua",
80
+ "dataset_name": "en",
81
+ "subset": null,
82
+ "rougeL_fmeasure_stderr": 0.0006430172034977336
83
+ },
84
+ {
85
+ "task_name": "GEM/wiki_lingua_en",
86
+ "prompt_name": "tldr_en",
87
+ "rougeLsum_precision": 0.0076894646083944945,
88
+ "dataset_path": "GEM/wiki_lingua",
89
+ "dataset_name": "en",
90
+ "subset": null,
91
+ "rougeLsum_precision_stderr": 0.000786099271409283
92
+ },
93
+ {
94
+ "task_name": "GEM/wiki_lingua_en",
95
+ "prompt_name": "tldr_en",
96
+ "rougeLsum_recall": 0.012227793486074013,
97
+ "dataset_path": "GEM/wiki_lingua",
98
+ "dataset_name": "en",
99
+ "subset": null,
100
+ "rougeLsum_recall_stderr": 0.0011695312324965957
101
+ },
102
+ {
103
+ "task_name": "GEM/wiki_lingua_en",
104
+ "prompt_name": "tldr_en",
105
+ "rougeLsum_fmeasure": 0.008112137370163754,
106
+ "dataset_path": "GEM/wiki_lingua",
107
+ "dataset_name": "en",
108
+ "subset": null,
109
+ "rougeLsum_fmeasure_stderr": 0.0007719985459243486
110
+ },
111
+ {
112
+ "task_name": "GEM/wiki_lingua_en",
113
+ "prompt_name": "tldr_en",
114
+ "bleu": 5.133528491740168e-07,
115
+ "dataset_path": "GEM/wiki_lingua",
116
+ "dataset_name": "en",
117
+ "subset": null,
118
+ "bleu_stderr": 9.288876136024227e-07
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 5,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
4b284b12bc4/evaluation/generation/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "e2e_nlg_cleaned",
5
+ "prompt_name": "generate_text_restaurant",
6
+ "bleu": 0.02601889547824242,
7
+ "dataset_path": "e2e_nlg_cleaned",
8
+ "dataset_name": null,
9
+ "subset": null,
10
+ "bleu_stderr": 0.008845990174481217
11
+ },
12
+ {
13
+ "task_name": "e2e_nlg_cleaned",
14
+ "prompt_name": "generate_text_restaurant",
15
+ "rouge1_precision": 0.013188289488289925,
16
+ "dataset_path": "e2e_nlg_cleaned",
17
+ "dataset_name": null,
18
+ "subset": null,
19
+ "rouge1_precision_stderr": 0.0003020110162685144
20
+ },
21
+ {
22
+ "task_name": "e2e_nlg_cleaned",
23
+ "prompt_name": "generate_text_restaurant",
24
+ "rouge1_recall": 0.02331912229418711,
25
+ "dataset_path": "e2e_nlg_cleaned",
26
+ "dataset_name": null,
27
+ "subset": null,
28
+ "rouge1_recall_stderr": 0.0005487407492538043
29
+ },
30
+ {
31
+ "task_name": "e2e_nlg_cleaned",
32
+ "prompt_name": "generate_text_restaurant",
33
+ "rouge1_fmeasure": 0.016342850382717815,
34
+ "dataset_path": "e2e_nlg_cleaned",
35
+ "dataset_name": null,
36
+ "subset": null,
37
+ "rouge1_fmeasure_stderr": 0.00034912236129064063
38
+ },
39
+ {
40
+ "task_name": "e2e_nlg_cleaned",
41
+ "prompt_name": "generate_text_restaurant",
42
+ "rouge2_precision": 0.00018882696164487857,
43
+ "dataset_path": "e2e_nlg_cleaned",
44
+ "dataset_name": null,
45
+ "subset": null,
46
+ "rouge2_precision_stderr": 4.0143613745290394e-05
47
+ },
48
+ {
49
+ "task_name": "e2e_nlg_cleaned",
50
+ "prompt_name": "generate_text_restaurant",
51
+ "rouge2_recall": 0.0003925072247489244,
52
+ "dataset_path": "e2e_nlg_cleaned",
53
+ "dataset_name": null,
54
+ "subset": null,
55
+ "rouge2_recall_stderr": 9.925690772584578e-05
56
+ },
57
+ {
58
+ "task_name": "e2e_nlg_cleaned",
59
+ "prompt_name": "generate_text_restaurant",
60
+ "rouge2_fmeasure": 0.00024104025657346095,
61
+ "dataset_path": "e2e_nlg_cleaned",
62
+ "dataset_name": null,
63
+ "subset": null,
64
+ "rouge2_fmeasure_stderr": 5.3010616942847675e-05
65
+ },
66
+ {
67
+ "task_name": "e2e_nlg_cleaned",
68
+ "prompt_name": "generate_text_restaurant",
69
+ "rougeL_precision": 0.013188289488289925,
70
+ "dataset_path": "e2e_nlg_cleaned",
71
+ "dataset_name": null,
72
+ "subset": null,
73
+ "rougeL_precision_stderr": 0.0003020110162685144
74
+ },
75
+ {
76
+ "task_name": "e2e_nlg_cleaned",
77
+ "prompt_name": "generate_text_restaurant",
78
+ "rougeL_recall": 0.02331912229418711,
79
+ "dataset_path": "e2e_nlg_cleaned",
80
+ "dataset_name": null,
81
+ "subset": null,
82
+ "rougeL_recall_stderr": 0.0005487407492538043
83
+ },
84
+ {
85
+ "task_name": "e2e_nlg_cleaned",
86
+ "prompt_name": "generate_text_restaurant",
87
+ "rougeL_fmeasure": 0.016342850382717815,
88
+ "dataset_path": "e2e_nlg_cleaned",
89
+ "dataset_name": null,
90
+ "subset": null,
91
+ "rougeL_fmeasure_stderr": 0.00034912236129064063
92
+ },
93
+ {
94
+ "task_name": "e2e_nlg_cleaned",
95
+ "prompt_name": "generate_text_restaurant",
96
+ "rougeLsum_precision": 0.012772945572946016,
97
+ "dataset_path": "e2e_nlg_cleaned",
98
+ "dataset_name": null,
99
+ "subset": null,
100
+ "rougeLsum_precision_stderr": 0.000289959796992291
101
+ },
102
+ {
103
+ "task_name": "e2e_nlg_cleaned",
104
+ "prompt_name": "generate_text_restaurant",
105
+ "rougeLsum_recall": 0.022577540598345845,
106
+ "dataset_path": "e2e_nlg_cleaned",
107
+ "dataset_name": null,
108
+ "subset": null,
109
+ "rougeLsum_recall_stderr": 0.0005237847970514886
110
+ },
111
+ {
112
+ "task_name": "e2e_nlg_cleaned",
113
+ "prompt_name": "generate_text_restaurant",
114
+ "rougeLsum_fmeasure": 0.015820908294670494,
115
+ "dataset_path": "e2e_nlg_cleaned",
116
+ "dataset_name": null,
117
+ "subset": null,
118
+ "rougeLsum_fmeasure_stderr": 0.00033227991604236137
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 0,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
4b284b12bc4/evaluation/generation/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "e2e_nlg_cleaned",
5
+ "prompt_name": "generate_text_restaurant",
6
+ "bleu": 10.11194167971178,
7
+ "dataset_path": "e2e_nlg_cleaned",
8
+ "dataset_name": null,
9
+ "subset": null,
10
+ "bleu_stderr": 0.11325409958385195
11
+ },
12
+ {
13
+ "task_name": "e2e_nlg_cleaned",
14
+ "prompt_name": "generate_text_restaurant",
15
+ "rouge1_precision": 0.44662580344767955,
16
+ "dataset_path": "e2e_nlg_cleaned",
17
+ "dataset_name": null,
18
+ "subset": null,
19
+ "rouge1_precision_stderr": 0.0029145470980427935
20
+ },
21
+ {
22
+ "task_name": "e2e_nlg_cleaned",
23
+ "prompt_name": "generate_text_restaurant",
24
+ "rouge1_recall": 0.403325138761631,
25
+ "dataset_path": "e2e_nlg_cleaned",
26
+ "dataset_name": null,
27
+ "subset": null,
28
+ "rouge1_recall_stderr": 0.0029812540605776657
29
+ },
30
+ {
31
+ "task_name": "e2e_nlg_cleaned",
32
+ "prompt_name": "generate_text_restaurant",
33
+ "rouge1_fmeasure": 0.396708212066539,
34
+ "dataset_path": "e2e_nlg_cleaned",
35
+ "dataset_name": null,
36
+ "subset": null,
37
+ "rouge1_fmeasure_stderr": 0.0023271634805327288
38
+ },
39
+ {
40
+ "task_name": "e2e_nlg_cleaned",
41
+ "prompt_name": "generate_text_restaurant",
42
+ "rouge2_precision": 0.19849995481108837,
43
+ "dataset_path": "e2e_nlg_cleaned",
44
+ "dataset_name": null,
45
+ "subset": null,
46
+ "rouge2_precision_stderr": 0.0023301034372897886
47
+ },
48
+ {
49
+ "task_name": "e2e_nlg_cleaned",
50
+ "prompt_name": "generate_text_restaurant",
51
+ "rouge2_recall": 0.17476709268624893,
52
+ "dataset_path": "e2e_nlg_cleaned",
53
+ "dataset_name": null,
54
+ "subset": null,
55
+ "rouge2_recall_stderr": 0.0019834376005625296
56
+ },
57
+ {
58
+ "task_name": "e2e_nlg_cleaned",
59
+ "prompt_name": "generate_text_restaurant",
60
+ "rouge2_fmeasure": 0.1714205638298909,
61
+ "dataset_path": "e2e_nlg_cleaned",
62
+ "dataset_name": null,
63
+ "subset": null,
64
+ "rouge2_fmeasure_stderr": 0.0017391694364972787
65
+ },
66
+ {
67
+ "task_name": "e2e_nlg_cleaned",
68
+ "prompt_name": "generate_text_restaurant",
69
+ "rougeL_precision": 0.3259862976774872,
70
+ "dataset_path": "e2e_nlg_cleaned",
71
+ "dataset_name": null,
72
+ "subset": null,
73
+ "rougeL_precision_stderr": 0.0025491424613272398
74
+ },
75
+ {
76
+ "task_name": "e2e_nlg_cleaned",
77
+ "prompt_name": "generate_text_restaurant",
78
+ "rougeL_recall": 0.28928330619087117,
79
+ "dataset_path": "e2e_nlg_cleaned",
80
+ "dataset_name": null,
81
+ "subset": null,
82
+ "rougeL_recall_stderr": 0.002329063084463768
83
+ },
84
+ {
85
+ "task_name": "e2e_nlg_cleaned",
86
+ "prompt_name": "generate_text_restaurant",
87
+ "rougeL_fmeasure": 0.28469304359904984,
88
+ "dataset_path": "e2e_nlg_cleaned",
89
+ "dataset_name": null,
90
+ "subset": null,
91
+ "rougeL_fmeasure_stderr": 0.0018043411811584805
92
+ },
93
+ {
94
+ "task_name": "e2e_nlg_cleaned",
95
+ "prompt_name": "generate_text_restaurant",
96
+ "rougeLsum_precision": 0.37806486217321694,
97
+ "dataset_path": "e2e_nlg_cleaned",
98
+ "dataset_name": null,
99
+ "subset": null,
100
+ "rougeLsum_precision_stderr": 0.002787004218176944
101
+ },
102
+ {
103
+ "task_name": "e2e_nlg_cleaned",
104
+ "prompt_name": "generate_text_restaurant",
105
+ "rougeLsum_recall": 0.3367501905387613,
106
+ "dataset_path": "e2e_nlg_cleaned",
107
+ "dataset_name": null,
108
+ "subset": null,
109
+ "rougeLsum_recall_stderr": 0.002641205675752919
110
+ },
111
+ {
112
+ "task_name": "e2e_nlg_cleaned",
113
+ "prompt_name": "generate_text_restaurant",
114
+ "rougeLsum_fmeasure": 0.3323030796627126,
115
+ "dataset_path": "e2e_nlg_cleaned",
116
+ "dataset_name": null,
117
+ "subset": null,
118
+ "rougeLsum_fmeasure_stderr": 0.002111384569072981
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 1,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
4b284b12bc4/evaluation/generation/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "e2e_nlg_cleaned",
5
+ "prompt_name": "generate_text_restaurant",
6
+ "bleu": 10.765851233592166,
7
+ "dataset_path": "e2e_nlg_cleaned",
8
+ "dataset_name": null,
9
+ "subset": null,
10
+ "bleu_stderr": 0.1147637687545087
11
+ },
12
+ {
13
+ "task_name": "e2e_nlg_cleaned",
14
+ "prompt_name": "generate_text_restaurant",
15
+ "rouge1_precision": 0.5029109698404333,
16
+ "dataset_path": "e2e_nlg_cleaned",
17
+ "dataset_name": null,
18
+ "subset": null,
19
+ "rouge1_precision_stderr": 0.0032326251122724503
20
+ },
21
+ {
22
+ "task_name": "e2e_nlg_cleaned",
23
+ "prompt_name": "generate_text_restaurant",
24
+ "rouge1_recall": 0.399072749631299,
25
+ "dataset_path": "e2e_nlg_cleaned",
26
+ "dataset_name": null,
27
+ "subset": null,
28
+ "rouge1_recall_stderr": 0.002770754484292409
29
+ },
30
+ {
31
+ "task_name": "e2e_nlg_cleaned",
32
+ "prompt_name": "generate_text_restaurant",
33
+ "rouge1_fmeasure": 0.4194853660757534,
34
+ "dataset_path": "e2e_nlg_cleaned",
35
+ "dataset_name": null,
36
+ "subset": null,
37
+ "rouge1_fmeasure_stderr": 0.002222220598248002
38
+ },
39
+ {
40
+ "task_name": "e2e_nlg_cleaned",
41
+ "prompt_name": "generate_text_restaurant",
42
+ "rouge2_precision": 0.23608052397188775,
43
+ "dataset_path": "e2e_nlg_cleaned",
44
+ "dataset_name": null,
45
+ "subset": null,
46
+ "rouge2_precision_stderr": 0.002567411684370418
47
+ },
48
+ {
49
+ "task_name": "e2e_nlg_cleaned",
50
+ "prompt_name": "generate_text_restaurant",
51
+ "rouge2_recall": 0.18324767332759007,
52
+ "dataset_path": "e2e_nlg_cleaned",
53
+ "dataset_name": null,
54
+ "subset": null,
55
+ "rouge2_recall_stderr": 0.001989894281113984
56
+ },
57
+ {
58
+ "task_name": "e2e_nlg_cleaned",
59
+ "prompt_name": "generate_text_restaurant",
60
+ "rouge2_fmeasure": 0.19259169221915515,
61
+ "dataset_path": "e2e_nlg_cleaned",
62
+ "dataset_name": null,
63
+ "subset": null,
64
+ "rouge2_fmeasure_stderr": 0.0018423181170468268
65
+ },
66
+ {
67
+ "task_name": "e2e_nlg_cleaned",
68
+ "prompt_name": "generate_text_restaurant",
69
+ "rougeL_precision": 0.3660346353864222,
70
+ "dataset_path": "e2e_nlg_cleaned",
71
+ "dataset_name": null,
72
+ "subset": null,
73
+ "rougeL_precision_stderr": 0.0028533265797324394
74
+ },
75
+ {
76
+ "task_name": "e2e_nlg_cleaned",
77
+ "prompt_name": "generate_text_restaurant",
78
+ "rougeL_recall": 0.28763635547225785,
79
+ "dataset_path": "e2e_nlg_cleaned",
80
+ "dataset_name": null,
81
+ "subset": null,
82
+ "rougeL_recall_stderr": 0.002249079446710024
83
+ },
84
+ {
85
+ "task_name": "e2e_nlg_cleaned",
86
+ "prompt_name": "generate_text_restaurant",
87
+ "rougeL_fmeasure": 0.3027130154416743,
88
+ "dataset_path": "e2e_nlg_cleaned",
89
+ "dataset_name": null,
90
+ "subset": null,
91
+ "rougeL_fmeasure_stderr": 0.001900798692043736
92
+ },
93
+ {
94
+ "task_name": "e2e_nlg_cleaned",
95
+ "prompt_name": "generate_text_restaurant",
96
+ "rougeLsum_precision": 0.4152150772992765,
97
+ "dataset_path": "e2e_nlg_cleaned",
98
+ "dataset_name": null,
99
+ "subset": null,
100
+ "rougeLsum_precision_stderr": 0.0030631524445173153
101
+ },
102
+ {
103
+ "task_name": "e2e_nlg_cleaned",
104
+ "prompt_name": "generate_text_restaurant",
105
+ "rougeLsum_recall": 0.32804484742896467,
106
+ "dataset_path": "e2e_nlg_cleaned",
107
+ "dataset_name": null,
108
+ "subset": null,
109
+ "rougeLsum_recall_stderr": 0.002520109021821381
110
+ },
111
+ {
112
+ "task_name": "e2e_nlg_cleaned",
113
+ "prompt_name": "generate_text_restaurant",
114
+ "rougeLsum_fmeasure": 0.34512069878778673,
115
+ "dataset_path": "e2e_nlg_cleaned",
116
+ "dataset_name": null,
117
+ "subset": null,
118
+ "rougeLsum_fmeasure_stderr": 0.00213278119755698
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 2,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
4b284b12bc4/evaluation/generation/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "e2e_nlg_cleaned",
5
+ "prompt_name": "generate_text_restaurant",
6
+ "bleu": 10.504414399066166,
7
+ "dataset_path": "e2e_nlg_cleaned",
8
+ "dataset_name": null,
9
+ "subset": null,
10
+ "bleu_stderr": 0.14144404460789148
11
+ },
12
+ {
13
+ "task_name": "e2e_nlg_cleaned",
14
+ "prompt_name": "generate_text_restaurant",
15
+ "rouge1_precision": 0.509958754752616,
16
+ "dataset_path": "e2e_nlg_cleaned",
17
+ "dataset_name": null,
18
+ "subset": null,
19
+ "rouge1_precision_stderr": 0.003154730928607417
20
+ },
21
+ {
22
+ "task_name": "e2e_nlg_cleaned",
23
+ "prompt_name": "generate_text_restaurant",
24
+ "rouge1_recall": 0.3954899202585752,
25
+ "dataset_path": "e2e_nlg_cleaned",
26
+ "dataset_name": null,
27
+ "subset": null,
28
+ "rouge1_recall_stderr": 0.002608601302312822
29
+ },
30
+ {
31
+ "task_name": "e2e_nlg_cleaned",
32
+ "prompt_name": "generate_text_restaurant",
33
+ "rouge1_fmeasure": 0.42179412730593185,
34
+ "dataset_path": "e2e_nlg_cleaned",
35
+ "dataset_name": null,
36
+ "subset": null,
37
+ "rouge1_fmeasure_stderr": 0.0021527795098671134
38
+ },
39
+ {
40
+ "task_name": "e2e_nlg_cleaned",
41
+ "prompt_name": "generate_text_restaurant",
42
+ "rouge2_precision": 0.24242098353513022,
43
+ "dataset_path": "e2e_nlg_cleaned",
44
+ "dataset_name": null,
45
+ "subset": null,
46
+ "rouge2_precision_stderr": 0.002484157206893866
47
+ },
48
+ {
49
+ "task_name": "e2e_nlg_cleaned",
50
+ "prompt_name": "generate_text_restaurant",
51
+ "rouge2_recall": 0.18360253637850166,
52
+ "dataset_path": "e2e_nlg_cleaned",
53
+ "dataset_name": null,
54
+ "subset": null,
55
+ "rouge2_recall_stderr": 0.001904297402323581
56
+ },
57
+ {
58
+ "task_name": "e2e_nlg_cleaned",
59
+ "prompt_name": "generate_text_restaurant",
60
+ "rouge2_fmeasure": 0.19636018570824587,
61
+ "dataset_path": "e2e_nlg_cleaned",
62
+ "dataset_name": null,
63
+ "subset": null,
64
+ "rouge2_fmeasure_stderr": 0.0018014064871279597
65
+ },
66
+ {
67
+ "task_name": "e2e_nlg_cleaned",
68
+ "prompt_name": "generate_text_restaurant",
69
+ "rougeL_precision": 0.36777401786978037,
70
+ "dataset_path": "e2e_nlg_cleaned",
71
+ "dataset_name": null,
72
+ "subset": null,
73
+ "rougeL_precision_stderr": 0.002648283289980144
74
+ },
75
+ {
76
+ "task_name": "e2e_nlg_cleaned",
77
+ "prompt_name": "generate_text_restaurant",
78
+ "rougeL_recall": 0.28409076327843025,
79
+ "dataset_path": "e2e_nlg_cleaned",
80
+ "dataset_name": null,
81
+ "subset": null,
82
+ "rougeL_recall_stderr": 0.002115488452238855
83
+ },
84
+ {
85
+ "task_name": "e2e_nlg_cleaned",
86
+ "prompt_name": "generate_text_restaurant",
87
+ "rougeL_fmeasure": 0.30275824983777616,
88
+ "dataset_path": "e2e_nlg_cleaned",
89
+ "dataset_name": null,
90
+ "subset": null,
91
+ "rougeL_fmeasure_stderr": 0.0018031245552577217
92
+ },
93
+ {
94
+ "task_name": "e2e_nlg_cleaned",
95
+ "prompt_name": "generate_text_restaurant",
96
+ "rougeLsum_precision": 0.4185549498251625,
97
+ "dataset_path": "e2e_nlg_cleaned",
98
+ "dataset_name": null,
99
+ "subset": null,
100
+ "rougeLsum_precision_stderr": 0.0029283489596877298
101
+ },
102
+ {
103
+ "task_name": "e2e_nlg_cleaned",
104
+ "prompt_name": "generate_text_restaurant",
105
+ "rougeLsum_recall": 0.32342169868683124,
106
+ "dataset_path": "e2e_nlg_cleaned",
107
+ "dataset_name": null,
108
+ "subset": null,
109
+ "rougeLsum_recall_stderr": 0.0023588979812351725
110
+ },
111
+ {
112
+ "task_name": "e2e_nlg_cleaned",
113
+ "prompt_name": "generate_text_restaurant",
114
+ "rougeLsum_fmeasure": 0.3452111361624014,
115
+ "dataset_path": "e2e_nlg_cleaned",
116
+ "dataset_name": null,
117
+ "subset": null,
118
+ "rougeLsum_fmeasure_stderr": 0.0020448977561221436
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 3,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
4b284b12bc4/evaluation/generation/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "e2e_nlg_cleaned",
5
+ "prompt_name": "generate_text_restaurant",
6
+ "bleu": 10.453461006006084,
7
+ "dataset_path": "e2e_nlg_cleaned",
8
+ "dataset_name": null,
9
+ "subset": null,
10
+ "bleu_stderr": 0.20323399299325623
11
+ },
12
+ {
13
+ "task_name": "e2e_nlg_cleaned",
14
+ "prompt_name": "generate_text_restaurant",
15
+ "rouge1_precision": 0.5152975398825912,
16
+ "dataset_path": "e2e_nlg_cleaned",
17
+ "dataset_name": null,
18
+ "subset": null,
19
+ "rouge1_precision_stderr": 0.0032876121566522126
20
+ },
21
+ {
22
+ "task_name": "e2e_nlg_cleaned",
23
+ "prompt_name": "generate_text_restaurant",
24
+ "rouge1_recall": 0.3875757012647283,
25
+ "dataset_path": "e2e_nlg_cleaned",
26
+ "dataset_name": null,
27
+ "subset": null,
28
+ "rouge1_recall_stderr": 0.002563136912882847
29
+ },
30
+ {
31
+ "task_name": "e2e_nlg_cleaned",
32
+ "prompt_name": "generate_text_restaurant",
33
+ "rouge1_fmeasure": 0.4184745538314975,
34
+ "dataset_path": "e2e_nlg_cleaned",
35
+ "dataset_name": null,
36
+ "subset": null,
37
+ "rouge1_fmeasure_stderr": 0.002152424221911221
38
+ },
39
+ {
40
+ "task_name": "e2e_nlg_cleaned",
41
+ "prompt_name": "generate_text_restaurant",
42
+ "rouge2_precision": 0.24767323400172267,
43
+ "dataset_path": "e2e_nlg_cleaned",
44
+ "dataset_name": null,
45
+ "subset": null,
46
+ "rouge2_precision_stderr": 0.002578983235298861
47
+ },
48
+ {
49
+ "task_name": "e2e_nlg_cleaned",
50
+ "prompt_name": "generate_text_restaurant",
51
+ "rouge2_recall": 0.1802765312255684,
52
+ "dataset_path": "e2e_nlg_cleaned",
53
+ "dataset_name": null,
54
+ "subset": null,
55
+ "rouge2_recall_stderr": 0.0018268424519338505
56
+ },
57
+ {
58
+ "task_name": "e2e_nlg_cleaned",
59
+ "prompt_name": "generate_text_restaurant",
60
+ "rouge2_fmeasure": 0.19590832872090894,
61
+ "dataset_path": "e2e_nlg_cleaned",
62
+ "dataset_name": null,
63
+ "subset": null,
64
+ "rouge2_fmeasure_stderr": 0.0017695553874619732
65
+ },
66
+ {
67
+ "task_name": "e2e_nlg_cleaned",
68
+ "prompt_name": "generate_text_restaurant",
69
+ "rougeL_precision": 0.37129359217610464,
70
+ "dataset_path": "e2e_nlg_cleaned",
71
+ "dataset_name": null,
72
+ "subset": null,
73
+ "rougeL_precision_stderr": 0.0027272181980460375
74
+ },
75
+ {
76
+ "task_name": "e2e_nlg_cleaned",
77
+ "prompt_name": "generate_text_restaurant",
78
+ "rougeL_recall": 0.2780682094337028,
79
+ "dataset_path": "e2e_nlg_cleaned",
80
+ "dataset_name": null,
81
+ "subset": null,
82
+ "rougeL_recall_stderr": 0.0020775510109816452
83
+ },
84
+ {
85
+ "task_name": "e2e_nlg_cleaned",
86
+ "prompt_name": "generate_text_restaurant",
87
+ "rougeL_fmeasure": 0.30013711995116676,
88
+ "dataset_path": "e2e_nlg_cleaned",
89
+ "dataset_name": null,
90
+ "subset": null,
91
+ "rougeL_fmeasure_stderr": 0.001799774395616833
92
+ },
93
+ {
94
+ "task_name": "e2e_nlg_cleaned",
95
+ "prompt_name": "generate_text_restaurant",
96
+ "rougeLsum_precision": 0.4223690533102043,
97
+ "dataset_path": "e2e_nlg_cleaned",
98
+ "dataset_name": null,
99
+ "subset": null,
100
+ "rougeLsum_precision_stderr": 0.003005111083307187
101
+ },
102
+ {
103
+ "task_name": "e2e_nlg_cleaned",
104
+ "prompt_name": "generate_text_restaurant",
105
+ "rougeLsum_recall": 0.3169024351381635,
106
+ "dataset_path": "e2e_nlg_cleaned",
107
+ "dataset_name": null,
108
+ "subset": null,
109
+ "rougeLsum_recall_stderr": 0.0023241001320442878
110
+ },
111
+ {
112
+ "task_name": "e2e_nlg_cleaned",
113
+ "prompt_name": "generate_text_restaurant",
114
+ "rougeLsum_fmeasure": 0.34206938848652774,
115
+ "dataset_path": "e2e_nlg_cleaned",
116
+ "dataset_name": null,
117
+ "subset": null,
118
+ "rougeLsum_fmeasure_stderr": 0.0020120604230570572
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 4,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
4b284b12bc4/evaluation/generation/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "e2e_nlg_cleaned",
5
+ "prompt_name": "generate_text_restaurant",
6
+ "bleu": 10.336987597938899,
7
+ "dataset_path": "e2e_nlg_cleaned",
8
+ "dataset_name": null,
9
+ "subset": null,
10
+ "bleu_stderr": 0.20513507856533955
11
+ },
12
+ {
13
+ "task_name": "e2e_nlg_cleaned",
14
+ "prompt_name": "generate_text_restaurant",
15
+ "rouge1_precision": 0.5143545157562858,
16
+ "dataset_path": "e2e_nlg_cleaned",
17
+ "dataset_name": null,
18
+ "subset": null,
19
+ "rouge1_precision_stderr": 0.003337934175013788
20
+ },
21
+ {
22
+ "task_name": "e2e_nlg_cleaned",
23
+ "prompt_name": "generate_text_restaurant",
24
+ "rouge1_recall": 0.3857868235592206,
25
+ "dataset_path": "e2e_nlg_cleaned",
26
+ "dataset_name": null,
27
+ "subset": null,
28
+ "rouge1_recall_stderr": 0.0024923995035469678
29
+ },
30
+ {
31
+ "task_name": "e2e_nlg_cleaned",
32
+ "prompt_name": "generate_text_restaurant",
33
+ "rouge1_fmeasure": 0.4170315564856164,
34
+ "dataset_path": "e2e_nlg_cleaned",
35
+ "dataset_name": null,
36
+ "subset": null,
37
+ "rouge1_fmeasure_stderr": 0.002124259039598598
38
+ },
39
+ {
40
+ "task_name": "e2e_nlg_cleaned",
41
+ "prompt_name": "generate_text_restaurant",
42
+ "rouge2_precision": 0.24631251817723201,
43
+ "dataset_path": "e2e_nlg_cleaned",
44
+ "dataset_name": null,
45
+ "subset": null,
46
+ "rouge2_precision_stderr": 0.002657555991369566
47
+ },
48
+ {
49
+ "task_name": "e2e_nlg_cleaned",
50
+ "prompt_name": "generate_text_restaurant",
51
+ "rouge2_recall": 0.17787346956272435,
52
+ "dataset_path": "e2e_nlg_cleaned",
53
+ "dataset_name": null,
54
+ "subset": null,
55
+ "rouge2_recall_stderr": 0.001792217462473695
56
+ },
57
+ {
58
+ "task_name": "e2e_nlg_cleaned",
59
+ "prompt_name": "generate_text_restaurant",
60
+ "rouge2_fmeasure": 0.19402158147865167,
61
+ "dataset_path": "e2e_nlg_cleaned",
62
+ "dataset_name": null,
63
+ "subset": null,
64
+ "rouge2_fmeasure_stderr": 0.0017876994973534497
65
+ },
66
+ {
67
+ "task_name": "e2e_nlg_cleaned",
68
+ "prompt_name": "generate_text_restaurant",
69
+ "rougeL_precision": 0.37299125515052484,
70
+ "dataset_path": "e2e_nlg_cleaned",
71
+ "dataset_name": null,
72
+ "subset": null,
73
+ "rougeL_precision_stderr": 0.002815154394732632
74
+ },
75
+ {
76
+ "task_name": "e2e_nlg_cleaned",
77
+ "prompt_name": "generate_text_restaurant",
78
+ "rougeL_recall": 0.27850203471081203,
79
+ "dataset_path": "e2e_nlg_cleaned",
80
+ "dataset_name": null,
81
+ "subset": null,
82
+ "rougeL_recall_stderr": 0.002038628599689064
83
+ },
84
+ {
85
+ "task_name": "e2e_nlg_cleaned",
86
+ "prompt_name": "generate_text_restaurant",
87
+ "rougeL_fmeasure": 0.3009842944910936,
88
+ "dataset_path": "e2e_nlg_cleaned",
89
+ "dataset_name": null,
90
+ "subset": null,
91
+ "rougeL_fmeasure_stderr": 0.0018055725861154817
92
+ },
93
+ {
94
+ "task_name": "e2e_nlg_cleaned",
95
+ "prompt_name": "generate_text_restaurant",
96
+ "rougeLsum_precision": 0.42379837225389067,
97
+ "dataset_path": "e2e_nlg_cleaned",
98
+ "dataset_name": null,
99
+ "subset": null,
100
+ "rougeLsum_precision_stderr": 0.0031034115053880863
101
+ },
102
+ {
103
+ "task_name": "e2e_nlg_cleaned",
104
+ "prompt_name": "generate_text_restaurant",
105
+ "rougeLsum_recall": 0.31679500554979756,
106
+ "dataset_path": "e2e_nlg_cleaned",
107
+ "dataset_name": null,
108
+ "subset": null,
109
+ "rougeLsum_recall_stderr": 0.002259000761426462
110
+ },
111
+ {
112
+ "task_name": "e2e_nlg_cleaned",
113
+ "prompt_name": "generate_text_restaurant",
114
+ "rougeLsum_fmeasure": 0.3425854571657328,
115
+ "dataset_path": "e2e_nlg_cleaned",
116
+ "dataset_name": null,
117
+ "subset": null,
118
+ "rougeLsum_fmeasure_stderr": 0.0020124639191903327
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 5,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
4b284b12bc4/evaluation/generation/slim.4b284b12bc4_gem_xsum_article_DOC_summary_0.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "gem_xsum",
5
+ "prompt_name": "article_DOC_summary",
6
+ "rouge1_precision": 0.08057623396604993,
7
+ "dataset_path": "GEM/xsum",
8
+ "dataset_name": null,
9
+ "subset": "",
10
+ "rouge1_precision_stderr": 0.0018427190672612415
11
+ },
12
+ {
13
+ "task_name": "gem_xsum",
14
+ "prompt_name": "article_DOC_summary",
15
+ "rouge1_recall": 0.1903611222785915,
16
+ "dataset_path": "GEM/xsum",
17
+ "dataset_name": null,
18
+ "subset": "",
19
+ "rouge1_recall_stderr": 0.004119319495363413
20
+ },
21
+ {
22
+ "task_name": "gem_xsum",
23
+ "prompt_name": "article_DOC_summary",
24
+ "rouge1_fmeasure": 0.11144536072130062,
25
+ "dataset_path": "GEM/xsum",
26
+ "dataset_name": null,
27
+ "subset": "",
28
+ "rouge1_fmeasure_stderr": 0.002449613618907836
29
+ },
30
+ {
31
+ "task_name": "gem_xsum",
32
+ "prompt_name": "article_DOC_summary",
33
+ "rouge2_precision": 0.012439638488265747,
34
+ "dataset_path": "GEM/xsum",
35
+ "dataset_name": null,
36
+ "subset": "",
37
+ "rouge2_precision_stderr": 0.0007674450472725088
38
+ },
39
+ {
40
+ "task_name": "gem_xsum",
41
+ "prompt_name": "article_DOC_summary",
42
+ "rouge2_recall": 0.029923546174765742,
43
+ "dataset_path": "GEM/xsum",
44
+ "dataset_name": null,
45
+ "subset": "",
46
+ "rouge2_recall_stderr": 0.0018325311907880977
47
+ },
48
+ {
49
+ "task_name": "gem_xsum",
50
+ "prompt_name": "article_DOC_summary",
51
+ "rouge2_fmeasure": 0.01730052045113504,
52
+ "dataset_path": "GEM/xsum",
53
+ "dataset_name": null,
54
+ "subset": "",
55
+ "rouge2_fmeasure_stderr": 0.0010550283614850532
56
+ },
57
+ {
58
+ "task_name": "gem_xsum",
59
+ "prompt_name": "article_DOC_summary",
60
+ "rougeL_precision": 0.06836163772266587,
61
+ "dataset_path": "GEM/xsum",
62
+ "dataset_name": null,
63
+ "subset": "",
64
+ "rougeL_precision_stderr": 0.001418824283099982
65
+ },
66
+ {
67
+ "task_name": "gem_xsum",
68
+ "prompt_name": "article_DOC_summary",
69
+ "rougeL_recall": 0.16290273051352785,
70
+ "dataset_path": "GEM/xsum",
71
+ "dataset_name": null,
72
+ "subset": "",
73
+ "rougeL_recall_stderr": 0.0032456160924942976
74
+ },
75
+ {
76
+ "task_name": "gem_xsum",
77
+ "prompt_name": "article_DOC_summary",
78
+ "rougeL_fmeasure": 0.09482469984719238,
79
+ "dataset_path": "GEM/xsum",
80
+ "dataset_name": null,
81
+ "subset": "",
82
+ "rougeL_fmeasure_stderr": 0.001891115330305303
83
+ },
84
+ {
85
+ "task_name": "gem_xsum",
86
+ "prompt_name": "article_DOC_summary",
87
+ "rougeLsum_precision": 0.06720724706425169,
88
+ "dataset_path": "GEM/xsum",
89
+ "dataset_name": null,
90
+ "subset": "",
91
+ "rougeLsum_precision_stderr": 0.001458820899362241
92
+ },
93
+ {
94
+ "task_name": "gem_xsum",
95
+ "prompt_name": "article_DOC_summary",
96
+ "rougeLsum_recall": 0.16056752379802697,
97
+ "dataset_path": "GEM/xsum",
98
+ "dataset_name": null,
99
+ "subset": "",
100
+ "rougeLsum_recall_stderr": 0.003380030301216589
101
+ },
102
+ {
103
+ "task_name": "gem_xsum",
104
+ "prompt_name": "article_DOC_summary",
105
+ "rougeLsum_fmeasure": 0.09329416183941738,
106
+ "dataset_path": "GEM/xsum",
107
+ "dataset_name": null,
108
+ "subset": "",
109
+ "rougeLsum_fmeasure_stderr": 0.0019565438808381327
110
+ },
111
+ {
112
+ "task_name": "gem_xsum",
113
+ "prompt_name": "article_DOC_summary",
114
+ "bleu": 0.711911214189282,
115
+ "dataset_path": "GEM/xsum",
116
+ "dataset_name": null,
117
+ "subset": "",
118
+ "bleu_stderr": 0.062271095680873176
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 0,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
4b284b12bc4/evaluation/generation/slim.4b284b12bc4_gem_xsum_article_DOC_summary_1.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "gem_xsum",
5
+ "prompt_name": "article_DOC_summary",
6
+ "rouge1_precision": 0.08658830544513134,
7
+ "dataset_path": "GEM/xsum",
8
+ "dataset_name": null,
9
+ "subset": "",
10
+ "rouge1_precision_stderr": 0.0018612611645494558
11
+ },
12
+ {
13
+ "task_name": "gem_xsum",
14
+ "prompt_name": "article_DOC_summary",
15
+ "rouge1_recall": 0.21204874097915127,
16
+ "dataset_path": "GEM/xsum",
17
+ "dataset_name": null,
18
+ "subset": "",
19
+ "rouge1_recall_stderr": 0.004437751832600217
20
+ },
21
+ {
22
+ "task_name": "gem_xsum",
23
+ "prompt_name": "article_DOC_summary",
24
+ "rouge1_fmeasure": 0.12151658666281231,
25
+ "dataset_path": "GEM/xsum",
26
+ "dataset_name": null,
27
+ "subset": "",
28
+ "rouge1_fmeasure_stderr": 0.0025542911124592704
29
+ },
30
+ {
31
+ "task_name": "gem_xsum",
32
+ "prompt_name": "article_DOC_summary",
33
+ "rouge2_precision": 0.013510304634606875,
34
+ "dataset_path": "GEM/xsum",
35
+ "dataset_name": null,
36
+ "subset": "",
37
+ "rouge2_precision_stderr": 0.0007312649740318255
38
+ },
39
+ {
40
+ "task_name": "gem_xsum",
41
+ "prompt_name": "article_DOC_summary",
42
+ "rouge2_recall": 0.03415295300257043,
43
+ "dataset_path": "GEM/xsum",
44
+ "dataset_name": null,
45
+ "subset": "",
46
+ "rouge2_recall_stderr": 0.0019425669982816587
47
+ },
48
+ {
49
+ "task_name": "gem_xsum",
50
+ "prompt_name": "article_DOC_summary",
51
+ "rouge2_fmeasure": 0.019132118327200527,
52
+ "dataset_path": "GEM/xsum",
53
+ "dataset_name": null,
54
+ "subset": "",
55
+ "rouge2_fmeasure_stderr": 0.0010395559026960023
56
+ },
57
+ {
58
+ "task_name": "gem_xsum",
59
+ "prompt_name": "article_DOC_summary",
60
+ "rougeL_precision": 0.0702987343925042,
61
+ "dataset_path": "GEM/xsum",
62
+ "dataset_name": null,
63
+ "subset": "",
64
+ "rougeL_precision_stderr": 0.0013468612933160927
65
+ },
66
+ {
67
+ "task_name": "gem_xsum",
68
+ "prompt_name": "article_DOC_summary",
69
+ "rougeL_recall": 0.17295883718094748,
70
+ "dataset_path": "GEM/xsum",
71
+ "dataset_name": null,
72
+ "subset": "",
73
+ "rougeL_recall_stderr": 0.0033149082764143117
74
+ },
75
+ {
76
+ "task_name": "gem_xsum",
77
+ "prompt_name": "article_DOC_summary",
78
+ "rougeL_fmeasure": 0.09875044840716671,
79
+ "dataset_path": "GEM/xsum",
80
+ "dataset_name": null,
81
+ "subset": "",
82
+ "rougeL_fmeasure_stderr": 0.0018572603815356456
83
+ },
84
+ {
85
+ "task_name": "gem_xsum",
86
+ "prompt_name": "article_DOC_summary",
87
+ "rougeLsum_precision": 0.07169769773353042,
88
+ "dataset_path": "GEM/xsum",
89
+ "dataset_name": null,
90
+ "subset": "",
91
+ "rougeLsum_precision_stderr": 0.001454220219767976
92
+ },
93
+ {
94
+ "task_name": "gem_xsum",
95
+ "prompt_name": "article_DOC_summary",
96
+ "rougeLsum_recall": 0.17642520944692217,
97
+ "dataset_path": "GEM/xsum",
98
+ "dataset_name": null,
99
+ "subset": "",
100
+ "rougeLsum_recall_stderr": 0.0035711244979865823
101
+ },
102
+ {
103
+ "task_name": "gem_xsum",
104
+ "prompt_name": "article_DOC_summary",
105
+ "rougeLsum_fmeasure": 0.10073542075586238,
106
+ "dataset_path": "GEM/xsum",
107
+ "dataset_name": null,
108
+ "subset": "",
109
+ "rougeLsum_fmeasure_stderr": 0.002008424522536892
110
+ },
111
+ {
112
+ "task_name": "gem_xsum",
113
+ "prompt_name": "article_DOC_summary",
114
+ "bleu": 0.7485653629026496,
115
+ "dataset_path": "GEM/xsum",
116
+ "dataset_name": null,
117
+ "subset": "",
118
+ "bleu_stderr": 0.10161315249240252
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 1,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
4b284b12bc4/evaluation/generation/slim.4b284b12bc4_gem_xsum_article_DOC_summary_2.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "gem_xsum",
5
+ "prompt_name": "article_DOC_summary",
6
+ "rouge1_precision": 0.10919972863971208,
7
+ "dataset_path": "GEM/xsum",
8
+ "dataset_name": null,
9
+ "subset": "",
10
+ "rouge1_precision_stderr": 0.0018326070695569962
11
+ },
12
+ {
13
+ "task_name": "gem_xsum",
14
+ "prompt_name": "article_DOC_summary",
15
+ "rouge1_recall": 0.27085077984418787,
16
+ "dataset_path": "GEM/xsum",
17
+ "dataset_name": null,
18
+ "subset": "",
19
+ "rouge1_recall_stderr": 0.004340987074007443
20
+ },
21
+ {
22
+ "task_name": "gem_xsum",
23
+ "prompt_name": "article_DOC_summary",
24
+ "rouge1_fmeasure": 0.15372555310846955,
25
+ "dataset_path": "GEM/xsum",
26
+ "dataset_name": null,
27
+ "subset": "",
28
+ "rouge1_fmeasure_stderr": 0.0024917036123975646
29
+ },
30
+ {
31
+ "task_name": "gem_xsum",
32
+ "prompt_name": "article_DOC_summary",
33
+ "rouge2_precision": 0.018660104842681158,
34
+ "dataset_path": "GEM/xsum",
35
+ "dataset_name": null,
36
+ "subset": "",
37
+ "rouge2_precision_stderr": 0.0008521301906071727
38
+ },
39
+ {
40
+ "task_name": "gem_xsum",
41
+ "prompt_name": "article_DOC_summary",
42
+ "rouge2_recall": 0.048121178106264206,
43
+ "dataset_path": "GEM/xsum",
44
+ "dataset_name": null,
45
+ "subset": "",
46
+ "rouge2_recall_stderr": 0.002236763054096929
47
+ },
48
+ {
49
+ "task_name": "gem_xsum",
50
+ "prompt_name": "article_DOC_summary",
51
+ "rouge2_fmeasure": 0.026545543337132424,
52
+ "dataset_path": "GEM/xsum",
53
+ "dataset_name": null,
54
+ "subset": "",
55
+ "rouge2_fmeasure_stderr": 0.001202505861016018
56
+ },
57
+ {
58
+ "task_name": "gem_xsum",
59
+ "prompt_name": "article_DOC_summary",
60
+ "rougeL_precision": 0.08598679476507894,
61
+ "dataset_path": "GEM/xsum",
62
+ "dataset_name": null,
63
+ "subset": "",
64
+ "rougeL_precision_stderr": 0.001347479595633704
65
+ },
66
+ {
67
+ "task_name": "gem_xsum",
68
+ "prompt_name": "article_DOC_summary",
69
+ "rougeL_recall": 0.21491972476150967,
70
+ "dataset_path": "GEM/xsum",
71
+ "dataset_name": null,
72
+ "subset": "",
73
+ "rougeL_recall_stderr": 0.0033319697703370205
74
+ },
75
+ {
76
+ "task_name": "gem_xsum",
77
+ "prompt_name": "article_DOC_summary",
78
+ "rougeL_fmeasure": 0.12124448179130719,
79
+ "dataset_path": "GEM/xsum",
80
+ "dataset_name": null,
81
+ "subset": "",
82
+ "rougeL_fmeasure_stderr": 0.0018406701231741705
83
+ },
84
+ {
85
+ "task_name": "gem_xsum",
86
+ "prompt_name": "article_DOC_summary",
87
+ "rougeLsum_precision": 0.08936454412712036,
88
+ "dataset_path": "GEM/xsum",
89
+ "dataset_name": null,
90
+ "subset": "",
91
+ "rougeLsum_precision_stderr": 0.00147707143314301
92
+ },
93
+ {
94
+ "task_name": "gem_xsum",
95
+ "prompt_name": "article_DOC_summary",
96
+ "rougeLsum_recall": 0.22313932236999878,
97
+ "dataset_path": "GEM/xsum",
98
+ "dataset_name": null,
99
+ "subset": "",
100
+ "rougeLsum_recall_stderr": 0.003637762226705496
101
+ },
102
+ {
103
+ "task_name": "gem_xsum",
104
+ "prompt_name": "article_DOC_summary",
105
+ "rougeLsum_fmeasure": 0.12600811759241443,
106
+ "dataset_path": "GEM/xsum",
107
+ "dataset_name": null,
108
+ "subset": "",
109
+ "rougeLsum_fmeasure_stderr": 0.002023272430841333
110
+ },
111
+ {
112
+ "task_name": "gem_xsum",
113
+ "prompt_name": "article_DOC_summary",
114
+ "bleu": 1.0047358326681721,
115
+ "dataset_path": "GEM/xsum",
116
+ "dataset_name": null,
117
+ "subset": "",
118
+ "bleu_stderr": 0.07696741647689843
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 2,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
4b284b12bc4/evaluation/generation/slim.4b284b12bc4_gem_xsum_article_DOC_summary_3.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "gem_xsum",
5
+ "prompt_name": "article_DOC_summary",
6
+ "rouge1_precision": 0.12156176985300436,
7
+ "dataset_path": "GEM/xsum",
8
+ "dataset_name": null,
9
+ "subset": "",
10
+ "rouge1_precision_stderr": 0.0019916743006548566
11
+ },
12
+ {
13
+ "task_name": "gem_xsum",
14
+ "prompt_name": "article_DOC_summary",
15
+ "rouge1_recall": 0.2919486745230574,
16
+ "dataset_path": "GEM/xsum",
17
+ "dataset_name": null,
18
+ "subset": "",
19
+ "rouge1_recall_stderr": 0.004594202930152972
20
+ },
21
+ {
22
+ "task_name": "gem_xsum",
23
+ "prompt_name": "article_DOC_summary",
24
+ "rouge1_fmeasure": 0.16788870363340208,
25
+ "dataset_path": "GEM/xsum",
26
+ "dataset_name": null,
27
+ "subset": "",
28
+ "rouge1_fmeasure_stderr": 0.0026015869149755492
29
+ },
30
+ {
31
+ "task_name": "gem_xsum",
32
+ "prompt_name": "article_DOC_summary",
33
+ "rouge2_precision": 0.023390561110934703,
34
+ "dataset_path": "GEM/xsum",
35
+ "dataset_name": null,
36
+ "subset": "",
37
+ "rouge2_precision_stderr": 0.0009482124699139898
38
+ },
39
+ {
40
+ "task_name": "gem_xsum",
41
+ "prompt_name": "article_DOC_summary",
42
+ "rouge2_recall": 0.05931283277905219,
43
+ "dataset_path": "GEM/xsum",
44
+ "dataset_name": null,
45
+ "subset": "",
46
+ "rouge2_recall_stderr": 0.00253961685376344
47
+ },
48
+ {
49
+ "task_name": "gem_xsum",
50
+ "prompt_name": "article_DOC_summary",
51
+ "rouge2_fmeasure": 0.03291830334125208,
52
+ "dataset_path": "GEM/xsum",
53
+ "dataset_name": null,
54
+ "subset": "",
55
+ "rouge2_fmeasure_stderr": 0.0013348595001679636
56
+ },
57
+ {
58
+ "task_name": "gem_xsum",
59
+ "prompt_name": "article_DOC_summary",
60
+ "rougeL_precision": 0.09410425031338661,
61
+ "dataset_path": "GEM/xsum",
62
+ "dataset_name": null,
63
+ "subset": "",
64
+ "rougeL_precision_stderr": 0.001505615984367887
65
+ },
66
+ {
67
+ "task_name": "gem_xsum",
68
+ "prompt_name": "article_DOC_summary",
69
+ "rougeL_recall": 0.22729829891507616,
70
+ "dataset_path": "GEM/xsum",
71
+ "dataset_name": null,
72
+ "subset": "",
73
+ "rougeL_recall_stderr": 0.0035793671647219765
74
+ },
75
+ {
76
+ "task_name": "gem_xsum",
77
+ "prompt_name": "article_DOC_summary",
78
+ "rougeL_fmeasure": 0.12998891345659275,
79
+ "dataset_path": "GEM/xsum",
80
+ "dataset_name": null,
81
+ "subset": "",
82
+ "rougeL_fmeasure_stderr": 0.00194502365003499
83
+ },
84
+ {
85
+ "task_name": "gem_xsum",
86
+ "prompt_name": "article_DOC_summary",
87
+ "rougeLsum_precision": 0.09794367331761664,
88
+ "dataset_path": "GEM/xsum",
89
+ "dataset_name": null,
90
+ "subset": "",
91
+ "rougeLsum_precision_stderr": 0.0016077634502078913
92
+ },
93
+ {
94
+ "task_name": "gem_xsum",
95
+ "prompt_name": "article_DOC_summary",
96
+ "rougeLsum_recall": 0.23679450093245694,
97
+ "dataset_path": "GEM/xsum",
98
+ "dataset_name": null,
99
+ "subset": "",
100
+ "rougeLsum_recall_stderr": 0.003856459984346274
101
+ },
102
+ {
103
+ "task_name": "gem_xsum",
104
+ "prompt_name": "article_DOC_summary",
105
+ "rougeLsum_fmeasure": 0.13536430596241408,
106
+ "dataset_path": "GEM/xsum",
107
+ "dataset_name": null,
108
+ "subset": "",
109
+ "rougeLsum_fmeasure_stderr": 0.0020963457328233175
110
+ },
111
+ {
112
+ "task_name": "gem_xsum",
113
+ "prompt_name": "article_DOC_summary",
114
+ "bleu": 1.3876528749760366,
115
+ "dataset_path": "GEM/xsum",
116
+ "dataset_name": null,
117
+ "subset": "",
118
+ "bleu_stderr": 0.09352517366139018
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 3,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
4b284b12bc4/evaluation/generation/slim.4b284b12bc4_gem_xsum_article_DOC_summary_4.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "gem_xsum",
5
+ "prompt_name": "article_DOC_summary",
6
+ "rouge1_precision": 0.04061391536633297,
7
+ "dataset_path": "GEM/xsum",
8
+ "dataset_name": null,
9
+ "subset": "",
10
+ "rouge1_precision_stderr": 0.0025885117246031656
11
+ },
12
+ {
13
+ "task_name": "gem_xsum",
14
+ "prompt_name": "article_DOC_summary",
15
+ "rouge1_recall": 0.0774019664424454,
16
+ "dataset_path": "GEM/xsum",
17
+ "dataset_name": null,
18
+ "subset": "",
19
+ "rouge1_recall_stderr": 0.004501264779451914
20
+ },
21
+ {
22
+ "task_name": "gem_xsum",
23
+ "prompt_name": "article_DOC_summary",
24
+ "rouge1_fmeasure": 0.04817132624367883,
25
+ "dataset_path": "GEM/xsum",
26
+ "dataset_name": null,
27
+ "subset": "",
28
+ "rouge1_fmeasure_stderr": 0.002710318861619294
29
+ },
30
+ {
31
+ "task_name": "gem_xsum",
32
+ "prompt_name": "article_DOC_summary",
33
+ "rouge2_precision": 0.007571424737600509,
34
+ "dataset_path": "GEM/xsum",
35
+ "dataset_name": null,
36
+ "subset": "",
37
+ "rouge2_precision_stderr": 0.000774569805719568
38
+ },
39
+ {
40
+ "task_name": "gem_xsum",
41
+ "prompt_name": "article_DOC_summary",
42
+ "rouge2_recall": 0.017306355810365114,
43
+ "dataset_path": "GEM/xsum",
44
+ "dataset_name": null,
45
+ "subset": "",
46
+ "rouge2_recall_stderr": 0.0017078169253319931
47
+ },
48
+ {
49
+ "task_name": "gem_xsum",
50
+ "prompt_name": "article_DOC_summary",
51
+ "rouge2_fmeasure": 0.010104068388385765,
52
+ "dataset_path": "GEM/xsum",
53
+ "dataset_name": null,
54
+ "subset": "",
55
+ "rouge2_fmeasure_stderr": 0.0009810253225945517
56
+ },
57
+ {
58
+ "task_name": "gem_xsum",
59
+ "prompt_name": "article_DOC_summary",
60
+ "rougeL_precision": 0.03226926320041072,
61
+ "dataset_path": "GEM/xsum",
62
+ "dataset_name": null,
63
+ "subset": "",
64
+ "rougeL_precision_stderr": 0.0021853447839680425
65
+ },
66
+ {
67
+ "task_name": "gem_xsum",
68
+ "prompt_name": "article_DOC_summary",
69
+ "rougeL_recall": 0.0606544963878778,
70
+ "dataset_path": "GEM/xsum",
71
+ "dataset_name": null,
72
+ "subset": "",
73
+ "rougeL_recall_stderr": 0.0035695265285015203
74
+ },
75
+ {
76
+ "task_name": "gem_xsum",
77
+ "prompt_name": "article_DOC_summary",
78
+ "rougeL_fmeasure": 0.03758941214391744,
79
+ "dataset_path": "GEM/xsum",
80
+ "dataset_name": null,
81
+ "subset": "",
82
+ "rougeL_fmeasure_stderr": 0.002107269506833207
83
+ },
84
+ {
85
+ "task_name": "gem_xsum",
86
+ "prompt_name": "article_DOC_summary",
87
+ "rougeLsum_precision": 0.03361123521194327,
88
+ "dataset_path": "GEM/xsum",
89
+ "dataset_name": null,
90
+ "subset": "",
91
+ "rougeLsum_precision_stderr": 0.002255713167385482
92
+ },
93
+ {
94
+ "task_name": "gem_xsum",
95
+ "prompt_name": "article_DOC_summary",
96
+ "rougeLsum_recall": 0.063038545340123,
97
+ "dataset_path": "GEM/xsum",
98
+ "dataset_name": null,
99
+ "subset": "",
100
+ "rougeLsum_recall_stderr": 0.003730201727070476
101
+ },
102
+ {
103
+ "task_name": "gem_xsum",
104
+ "prompt_name": "article_DOC_summary",
105
+ "rougeLsum_fmeasure": 0.03925168824333697,
106
+ "dataset_path": "GEM/xsum",
107
+ "dataset_name": null,
108
+ "subset": "",
109
+ "rougeLsum_fmeasure_stderr": 0.0022220799806552142
110
+ },
111
+ {
112
+ "task_name": "gem_xsum",
113
+ "prompt_name": "article_DOC_summary",
114
+ "bleu": 0.8091606018729823,
115
+ "dataset_path": "GEM/xsum",
116
+ "dataset_name": null,
117
+ "subset": "",
118
+ "bleu_stderr": 0.13365493953263705
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 4,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
4b284b12bc4/evaluation/generation/slim.4b284b12bc4_gem_xsum_article_DOC_summary_5.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "gem_xsum",
5
+ "prompt_name": "article_DOC_summary",
6
+ "rouge1_precision": 0.0025292500918997793,
7
+ "dataset_path": "GEM/xsum",
8
+ "dataset_name": null,
9
+ "subset": "",
10
+ "rouge1_precision_stderr": 0.0006843948420078455
11
+ },
12
+ {
13
+ "task_name": "gem_xsum",
14
+ "prompt_name": "article_DOC_summary",
15
+ "rouge1_recall": 0.0020532297175197265,
16
+ "dataset_path": "GEM/xsum",
17
+ "dataset_name": null,
18
+ "subset": "",
19
+ "rouge1_recall_stderr": 0.0005556296741534733
20
+ },
21
+ {
22
+ "task_name": "gem_xsum",
23
+ "prompt_name": "article_DOC_summary",
24
+ "rouge1_fmeasure": 0.002218415772902317,
25
+ "dataset_path": "GEM/xsum",
26
+ "dataset_name": null,
27
+ "subset": "",
28
+ "rouge1_fmeasure_stderr": 0.0005977307451849004
29
+ },
30
+ {
31
+ "task_name": "gem_xsum",
32
+ "prompt_name": "article_DOC_summary",
33
+ "rouge2_precision": 0.0004376650603065697,
34
+ "dataset_path": "GEM/xsum",
35
+ "dataset_name": null,
36
+ "subset": "",
37
+ "rouge2_precision_stderr": 0.00024169751059179606
38
+ },
39
+ {
40
+ "task_name": "gem_xsum",
41
+ "prompt_name": "article_DOC_summary",
42
+ "rouge2_recall": 0.00040004436924525715,
43
+ "dataset_path": "GEM/xsum",
44
+ "dataset_name": null,
45
+ "subset": "",
46
+ "rouge2_recall_stderr": 0.0002447488200268485
47
+ },
48
+ {
49
+ "task_name": "gem_xsum",
50
+ "prompt_name": "article_DOC_summary",
51
+ "rouge2_fmeasure": 0.00041371259854665804,
52
+ "dataset_path": "GEM/xsum",
53
+ "dataset_name": null,
54
+ "subset": "",
55
+ "rouge2_fmeasure_stderr": 0.0002415623180229552
56
+ },
57
+ {
58
+ "task_name": "gem_xsum",
59
+ "prompt_name": "article_DOC_summary",
60
+ "rougeL_precision": 0.0021231909904583543,
61
+ "dataset_path": "GEM/xsum",
62
+ "dataset_name": null,
63
+ "subset": "",
64
+ "rougeL_precision_stderr": 0.0005742362353931501
65
+ },
66
+ {
67
+ "task_name": "gem_xsum",
68
+ "prompt_name": "article_DOC_summary",
69
+ "rougeL_recall": 0.0017609197535564964,
70
+ "dataset_path": "GEM/xsum",
71
+ "dataset_name": null,
72
+ "subset": "",
73
+ "rougeL_recall_stderr": 0.0004851127145778472
74
+ },
75
+ {
76
+ "task_name": "gem_xsum",
77
+ "prompt_name": "article_DOC_summary",
78
+ "rougeL_fmeasure": 0.0018828340816919485,
79
+ "dataset_path": "GEM/xsum",
80
+ "dataset_name": null,
81
+ "subset": "",
82
+ "rougeL_fmeasure_stderr": 0.0005121954193145257
83
+ },
84
+ {
85
+ "task_name": "gem_xsum",
86
+ "prompt_name": "article_DOC_summary",
87
+ "rougeLsum_precision": 0.002170837264519722,
88
+ "dataset_path": "GEM/xsum",
89
+ "dataset_name": null,
90
+ "subset": "",
91
+ "rougeLsum_precision_stderr": 0.0005838940116621144
92
+ },
93
+ {
94
+ "task_name": "gem_xsum",
95
+ "prompt_name": "article_DOC_summary",
96
+ "rougeLsum_recall": 0.0017952250708806817,
97
+ "dataset_path": "GEM/xsum",
98
+ "dataset_name": null,
99
+ "subset": "",
100
+ "rougeLsum_recall_stderr": 0.0004910386484249093
101
+ },
102
+ {
103
+ "task_name": "gem_xsum",
104
+ "prompt_name": "article_DOC_summary",
105
+ "rougeLsum_fmeasure": 0.0019227239855572795,
106
+ "dataset_path": "GEM/xsum",
107
+ "dataset_name": null,
108
+ "subset": "",
109
+ "rougeLsum_fmeasure_stderr": 0.0005197852399034371
110
+ },
111
+ {
112
+ "task_name": "gem_xsum",
113
+ "prompt_name": "article_DOC_summary",
114
+ "bleu": 2.9417748605436574e-39,
115
+ "dataset_path": "GEM/xsum",
116
+ "dataset_name": null,
117
+ "subset": "",
118
+ "bleu_stderr": 1.644365126953672e-33
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 5,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
4b284b17bc4/evaluation/4b284b17bc4_0.json ADDED
@@ -0,0 +1,87 @@
1
+ {
2
+ "results": {
3
+ "anli_r1": {
4
+ "acc": 0.332,
5
+ "acc_stderr": 0.014899597242811478
6
+ },
7
+ "anli_r2": {
8
+ "acc": 0.329,
9
+ "acc_stderr": 0.014865395385928362
10
+ },
11
+ "anli_r3": {
12
+ "acc": 0.34833333333333333,
13
+ "acc_stderr": 0.013759437498874075
14
+ },
15
+ "cb": {
16
+ "acc": 0.5714285714285714,
17
+ "acc_stderr": 0.06672848092813058,
18
+ "f1": 0.3888888888888889
19
+ },
20
+ "copa": {
21
+ "acc": 0.76,
22
+ "acc_stderr": 0.04292346959909283
23
+ },
24
+ "hellaswag": {
25
+ "acc": 0.469627564230233,
26
+ "acc_stderr": 0.004980566907790459,
27
+ "acc_norm": 0.6134236207926708,
28
+ "acc_norm_stderr": 0.004859699562451462
29
+ },
30
+ "rte": {
31
+ "acc": 0.5415162454873647,
32
+ "acc_stderr": 0.029992535385373314
33
+ },
34
+ "winogrande": {
35
+ "acc": 0.5737963693764798,
36
+ "acc_stderr": 0.013898585965412338
37
+ },
38
+ "storycloze_2016": {
39
+ "acc": 0.7108498129342598,
40
+ "acc_stderr": 0.010484068799942072
41
+ },
42
+ "boolq": {
43
+ "acc": 0.5623853211009174,
44
+ "acc_stderr": 0.008676717715731632
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.6052188552188552,
48
+ "acc_stderr": 0.010030038935883584,
49
+ "acc_norm": 0.5429292929292929,
50
+ "acc_norm_stderr": 0.01022189756425604
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.26791808873720135,
54
+ "acc_stderr": 0.012942030195136437,
55
+ "acc_norm": 0.2883959044368601,
56
+ "acc_norm_stderr": 0.013238394422428171
57
+ },
58
+ "sciq": {
59
+ "acc": 0.852,
60
+ "acc_stderr": 0.011234866364235235,
61
+ "acc_norm": 0.764,
62
+ "acc_norm_stderr": 0.013434451402438678
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7578890097932536,
66
+ "acc_stderr": 0.00999437126910438,
67
+ "acc_norm": 0.7622415669205659,
68
+ "acc_norm_stderr": 0.009932525779525492
69
+ }
70
+ },
71
+ "versions": {
72
+ "anli_r1": 0,
73
+ "anli_r2": 0,
74
+ "anli_r3": 0,
75
+ "cb": 1,
76
+ "copa": 0,
77
+ "hellaswag": 0,
78
+ "rte": 0,
79
+ "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
+ }
87
+ }
4b284b17bc4/evaluation/4b284b17bc4_0_lm-eval_global_step80108_2023-01-30-11-26-40_0shots_backup.json ADDED
@@ -0,0 +1,87 @@
1
+ {
2
+ "results": {
3
+ "anli_r1": {
4
+ "acc": 0.332,
5
+ "acc_stderr": 0.014899597242811478
6
+ },
7
+ "anli_r2": {
8
+ "acc": 0.329,
9
+ "acc_stderr": 0.014865395385928362
10
+ },
11
+ "anli_r3": {
12
+ "acc": 0.34833333333333333,
13
+ "acc_stderr": 0.013759437498874075
14
+ },
15
+ "cb": {
16
+ "acc": 0.5714285714285714,
17
+ "acc_stderr": 0.06672848092813058,
18
+ "f1": 0.3888888888888889
19
+ },
20
+ "copa": {
21
+ "acc": 0.76,
22
+ "acc_stderr": 0.04292346959909283
23
+ },
24
+ "hellaswag": {
25
+ "acc": 0.469627564230233,
26
+ "acc_stderr": 0.004980566907790459,
27
+ "acc_norm": 0.6134236207926708,
28
+ "acc_norm_stderr": 0.004859699562451462
29
+ },
30
+ "rte": {
31
+ "acc": 0.5415162454873647,
32
+ "acc_stderr": 0.029992535385373314
33
+ },
34
+ "winogrande": {
35
+ "acc": 0.5737963693764798,
36
+ "acc_stderr": 0.013898585965412338
37
+ },
38
+ "storycloze_2016": {
39
+ "acc": 0.7108498129342598,
40
+ "acc_stderr": 0.010484068799942072
41
+ },
42
+ "boolq": {
43
+ "acc": 0.5623853211009174,
44
+ "acc_stderr": 0.008676717715731632
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.6052188552188552,
48
+ "acc_stderr": 0.010030038935883584,
49
+ "acc_norm": 0.5429292929292929,
50
+ "acc_norm_stderr": 0.01022189756425604
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.26791808873720135,
54
+ "acc_stderr": 0.012942030195136437,
55
+ "acc_norm": 0.2883959044368601,
56
+ "acc_norm_stderr": 0.013238394422428171
57
+ },
58
+ "sciq": {
59
+ "acc": 0.852,
60
+ "acc_stderr": 0.011234866364235235,
61
+ "acc_norm": 0.764,
62
+ "acc_norm_stderr": 0.013434451402438678
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7578890097932536,
66
+ "acc_stderr": 0.00999437126910438,
67
+ "acc_norm": 0.7622415669205659,
68
+ "acc_norm_stderr": 0.009932525779525492
69
+ }
70
+ },
71
+ "versions": {
72
+ "anli_r1": 0,
73
+ "anli_r2": 0,
74
+ "anli_r3": 0,
75
+ "cb": 1,
76
+ "copa": 0,
77
+ "hellaswag": 0,
78
+ "rte": 0,
79
+ "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
+ }
87
+ }
4b284b17bc4/evaluation/4b284b17bc4_1.json ADDED
@@ -0,0 +1,87 @@
1
+ {
2
+ "results": {
3
+ "anli_r1": {
4
+ "acc": 0.32,
5
+ "acc_stderr": 0.014758652303574886
6
+ },
7
+ "anli_r2": {
8
+ "acc": 0.324,
9
+ "acc_stderr": 0.014806864733738854
10
+ },
11
+ "anli_r3": {
12
+ "acc": 0.3491666666666667,
13
+ "acc_stderr": 0.01376707539507725
14
+ },
15
+ "cb": {
16
+ "acc": 0.5535714285714286,
17
+ "acc_stderr": 0.06703189227942397,
18
+ "f1": 0.3890671420083185
19
+ },
20
+ "copa": {
21
+ "acc": 0.75,
22
+ "acc_stderr": 0.04351941398892446
23
+ },
24
+ "hellaswag": {
25
+ "acc": 0.4640509858593906,
26
+ "acc_stderr": 0.0049768677965835555,
27
+ "acc_norm": 0.6082453694483171,
28
+ "acc_norm_stderr": 0.004871447106554927
29
+ },
30
+ "rte": {
31
+ "acc": 0.5451263537906137,
32
+ "acc_stderr": 0.029973636495415252
33
+ },
34
+ "winogrande": {
35
+ "acc": 0.574585635359116,
36
+ "acc_stderr": 0.013895257666646378
37
+ },
38
+ "storycloze_2016": {
39
+ "acc": 0.711918760021379,
40
+ "acc_stderr": 0.010472537019822582
41
+ },
42
+ "boolq": {
43
+ "acc": 0.5409785932721712,
44
+ "acc_stderr": 0.008715635308774412
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.6342592592592593,
48
+ "acc_stderr": 0.009882988069418829,
49
+ "acc_norm": 0.5837542087542088,
50
+ "acc_norm_stderr": 0.01011481940450087
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.2901023890784983,
54
+ "acc_stderr": 0.013261573677520764,
55
+ "acc_norm": 0.30119453924914674,
56
+ "acc_norm_stderr": 0.013406741767847638
57
+ },
58
+ "sciq": {
59
+ "acc": 0.896,
60
+ "acc_stderr": 0.009658016218524301,
61
+ "acc_norm": 0.88,
62
+ "acc_norm_stderr": 0.010281328012747386
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7551686615886833,
66
+ "acc_stderr": 0.010032309105568793,
67
+ "acc_norm": 0.766050054406964,
68
+ "acc_norm_stderr": 0.009877236895137436
69
+ }
70
+ },
71
+ "versions": {
72
+ "anli_r1": 0,
73
+ "anli_r2": 0,
74
+ "anli_r3": 0,
75
+ "cb": 1,
76
+ "copa": 0,
77
+ "hellaswag": 0,
78
+ "rte": 0,
79
+ "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
+ }
87
+ }
4b284b17bc4/evaluation/4b284b17bc4_1_lm-eval_global_step80108_2023-01-30-11-26-39_1shots_backup.json ADDED
@@ -0,0 +1,87 @@
1
+ {
2
+ "results": {
3
+ "anli_r1": {
4
+ "acc": 0.32,
5
+ "acc_stderr": 0.014758652303574886
6
+ },
7
+ "anli_r2": {
8
+ "acc": 0.324,
9
+ "acc_stderr": 0.014806864733738854
10
+ },
11
+ "anli_r3": {
12
+ "acc": 0.3491666666666667,
13
+ "acc_stderr": 0.01376707539507725
14
+ },
15
+ "cb": {
16
+ "acc": 0.5535714285714286,
17
+ "acc_stderr": 0.06703189227942397,
18
+ "f1": 0.3890671420083185
19
+ },
20
+ "copa": {
21
+ "acc": 0.75,
22
+ "acc_stderr": 0.04351941398892446
23
+ },
24
+ "hellaswag": {
25
+ "acc": 0.4640509858593906,
26
+ "acc_stderr": 0.0049768677965835555,
27
+ "acc_norm": 0.6082453694483171,
28
+ "acc_norm_stderr": 0.004871447106554927
29
+ },
30
+ "rte": {
31
+ "acc": 0.5451263537906137,
32
+ "acc_stderr": 0.029973636495415252
33
+ },
34
+ "winogrande": {
35
+ "acc": 0.574585635359116,
36
+ "acc_stderr": 0.013895257666646378
37
+ },
38
+ "storycloze_2016": {
39
+ "acc": 0.711918760021379,
40
+ "acc_stderr": 0.010472537019822582
41
+ },
42
+ "boolq": {
43
+ "acc": 0.5409785932721712,
44
+ "acc_stderr": 0.008715635308774412
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.6342592592592593,
48
+ "acc_stderr": 0.009882988069418829,
49
+ "acc_norm": 0.5837542087542088,
50
+ "acc_norm_stderr": 0.01011481940450087
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.2901023890784983,
54
+ "acc_stderr": 0.013261573677520764,
55
+ "acc_norm": 0.30119453924914674,
56
+ "acc_norm_stderr": 0.013406741767847638
57
+ },
58
+ "sciq": {
59
+ "acc": 0.896,
60
+ "acc_stderr": 0.009658016218524301,
61
+ "acc_norm": 0.88,
62
+ "acc_norm_stderr": 0.010281328012747386
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7551686615886833,
66
+ "acc_stderr": 0.010032309105568793,
67
+ "acc_norm": 0.766050054406964,
68
+ "acc_norm_stderr": 0.009877236895137436
69
+ }
70
+ },
71
+ "versions": {
72
+ "anli_r1": 0,
73
+ "anli_r2": 0,
74
+ "anli_r3": 0,
75
+ "cb": 1,
76
+ "copa": 0,
77
+ "hellaswag": 0,
78
+ "rte": 0,
79
+ "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
+ }
87
+ }
4b284b17bc4/evaluation/4b284b17bc4_2.json ADDED
@@ -0,0 +1,87 @@
1
+ {
2
+ "results": {
3
+ "anli_r1": {
4
+ "acc": 0.343,
5
+ "acc_stderr": 0.015019206922356953
6
+ },
7
+ "anli_r2": {
8
+ "acc": 0.318,
9
+ "acc_stderr": 0.014734079309311901
10
+ },
11
+ "anli_r3": {
12
+ "acc": 0.325,
13
+ "acc_stderr": 0.013526454480351028
14
+ },
15
+ "cb": {
16
+ "acc": 0.42857142857142855,
17
+ "acc_stderr": 0.06672848092813058,
18
+ "f1": 0.3058470764617691
19
+ },
20
+ "copa": {
21
+ "acc": 0.78,
22
+ "acc_stderr": 0.04163331998932263
23
+ },
24
+ "hellaswag": {
25
+ "acc": 0.45727942640908187,
26
+ "acc_stderr": 0.004971534874389935,
27
+ "acc_norm": 0.602867954590719,
28
+ "acc_norm_stderr": 0.004883037758919964
29
+ },
30
+ "rte": {
31
+ "acc": 0.48736462093862815,
32
+ "acc_stderr": 0.030086851767188564
33
+ },
34
+ "winogrande": {
35
+ "acc": 0.5808997632202052,
36
+ "acc_stderr": 0.013867325192210116
37
+ },
38
+ "storycloze_2016": {
39
+ "acc": 0.7215392838054516,
40
+ "acc_stderr": 0.010365521460604415
41
+ },
42
+ "boolq": {
43
+ "acc": 0.5489296636085627,
44
+ "acc_stderr": 0.008703080962379622
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.6325757575757576,
48
+ "acc_stderr": 0.009892552616211558,
49
+ "acc_norm": 0.617003367003367,
50
+ "acc_norm_stderr": 0.009974920384536479
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.2901023890784983,
54
+ "acc_stderr": 0.013261573677520759,
55
+ "acc_norm": 0.31313993174061433,
56
+ "acc_norm_stderr": 0.013552671543623496
57
+ },
58
+ "sciq": {
59
+ "acc": 0.906,
60
+ "acc_stderr": 0.009233052000787738,
61
+ "acc_norm": 0.891,
62
+ "acc_norm_stderr": 0.009859828407037186
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7540805223068553,
66
+ "acc_stderr": 0.010047331865625194,
67
+ "acc_norm": 0.7698585418933623,
68
+ "acc_norm_stderr": 0.009820832826839796
69
+ }
70
+ },
71
+ "versions": {
72
+ "anli_r1": 0,
73
+ "anli_r2": 0,
74
+ "anli_r3": 0,
75
+ "cb": 1,
76
+ "copa": 0,
77
+ "hellaswag": 0,
78
+ "rte": 0,
79
+ "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
+ }
87
+ }
4b284b17bc4/evaluation/4b284b17bc4_2_lm-eval_global_step80108_2023-01-30-11-26-39_2shots_backup.json ADDED
@@ -0,0 +1,87 @@
1
+ {
2
+ "results": {
3
+ "anli_r1": {
4
+ "acc": 0.343,
5
+ "acc_stderr": 0.015019206922356953
6
+ },
7
+ "anli_r2": {
8
+ "acc": 0.318,
9
+ "acc_stderr": 0.014734079309311901
10
+ },
11
+ "anli_r3": {
12
+ "acc": 0.325,
13
+ "acc_stderr": 0.013526454480351028
14
+ },
15
+ "cb": {
16
+ "acc": 0.42857142857142855,
17
+ "acc_stderr": 0.06672848092813058,
18
+ "f1": 0.3058470764617691
19
+ },
20
+ "copa": {
21
+ "acc": 0.78,
22
+ "acc_stderr": 0.04163331998932263
23
+ },
24
+ "hellaswag": {
25
+ "acc": 0.45727942640908187,
26
+ "acc_stderr": 0.004971534874389935,
27
+ "acc_norm": 0.602867954590719,
28
+ "acc_norm_stderr": 0.004883037758919964
29
+ },
30
+ "rte": {
31
+ "acc": 0.48736462093862815,
32
+ "acc_stderr": 0.030086851767188564
33
+ },
34
+ "winogrande": {
35
+ "acc": 0.5808997632202052,
36
+ "acc_stderr": 0.013867325192210116
37
+ },
38
+ "storycloze_2016": {
39
+ "acc": 0.7215392838054516,
40
+ "acc_stderr": 0.010365521460604415
41
+ },
42
+ "boolq": {
43
+ "acc": 0.5489296636085627,
44
+ "acc_stderr": 0.008703080962379622
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.6325757575757576,
48
+ "acc_stderr": 0.009892552616211558,
49
+ "acc_norm": 0.617003367003367,
50
+ "acc_norm_stderr": 0.009974920384536479
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.2901023890784983,
54
+ "acc_stderr": 0.013261573677520759,
55
+ "acc_norm": 0.31313993174061433,
56
+ "acc_norm_stderr": 0.013552671543623496
57
+ },
58
+ "sciq": {
59
+ "acc": 0.906,
60
+ "acc_stderr": 0.009233052000787738,
61
+ "acc_norm": 0.891,
62
+ "acc_norm_stderr": 0.009859828407037186
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7540805223068553,
66
+ "acc_stderr": 0.010047331865625194,
67
+ "acc_norm": 0.7698585418933623,
68
+ "acc_norm_stderr": 0.009820832826839796
69
+ }
70
+ },
71
+ "versions": {
72
+ "anli_r1": 0,
73
+ "anli_r2": 0,
74
+ "anli_r3": 0,
75
+ "cb": 1,
76
+ "copa": 0,
77
+ "hellaswag": 0,
78
+ "rte": 0,
79
+ "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
+ }
87
+ }
4b284b17bc4/evaluation/4b284b17bc4_3.json ADDED
@@ -0,0 +1,87 @@
1
+ {
2
+ "results": {
3
+ "anli_r1": {
4
+ "acc": 0.348,
5
+ "acc_stderr": 0.015070604603768408
6
+ },
7
+ "anli_r2": {
8
+ "acc": 0.36,
9
+ "acc_stderr": 0.01518652793204012
10
+ },
11
+ "anli_r3": {
12
+ "acc": 0.35083333333333333,
13
+ "acc_stderr": 0.013782212417178195
14
+ },
15
+ "cb": {
16
+ "acc": 0.48214285714285715,
17
+ "acc_stderr": 0.0673769750864465,
18
+ "f1": 0.40387403446226977
19
+ },
20
+ "copa": {
21
+ "acc": 0.79,
22
+ "acc_stderr": 0.040936018074033256
23
+ },
24
+ "hellaswag": {
25
+ "acc": 0.4567815176259709,
26
+ "acc_stderr": 0.004971106265046551,
27
+ "acc_norm": 0.5992830113523202,
28
+ "acc_norm_stderr": 0.004890422457747258
29
+ },
30
+ "rte": {
31
+ "acc": 0.48375451263537905,
32
+ "acc_stderr": 0.030080573208738064
33
+ },
34
+ "winogrande": {
35
+ "acc": 0.569060773480663,
36
+ "acc_stderr": 0.013917796623335966
37
+ },
38
+ "storycloze_2016": {
39
+ "acc": 0.7247461250668092,
40
+ "acc_stderr": 0.010328538400500567
41
+ },
42
+ "boolq": {
43
+ "acc": 0.5498470948012233,
44
+ "acc_stderr": 0.008701488203356937
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.6266835016835017,
48
+ "acc_stderr": 0.009925009142802903,
49
+ "acc_norm": 0.6203703703703703,
50
+ "acc_norm_stderr": 0.009958037725468558
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.2901023890784983,
54
+ "acc_stderr": 0.013261573677520769,
55
+ "acc_norm": 0.31143344709897613,
56
+ "acc_norm_stderr": 0.013532472099850949
57
+ },
58
+ "sciq": {
59
+ "acc": 0.923,
60
+ "acc_stderr": 0.008434580140240632,
61
+ "acc_norm": 0.903,
62
+ "acc_norm_stderr": 0.00936368937324812
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7578890097932536,
66
+ "acc_stderr": 0.009994371269104387,
67
+ "acc_norm": 0.7682263329706203,
68
+ "acc_norm_stderr": 0.00984514377279405
69
+ }
70
+ },
71
+ "versions": {
72
+ "anli_r1": 0,
73
+ "anli_r2": 0,
74
+ "anli_r3": 0,
75
+ "cb": 1,
76
+ "copa": 0,
77
+ "hellaswag": 0,
78
+ "rte": 0,
79
+ "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
+ }
87
+ }
4b284b17bc4/evaluation/4b284b17bc4_3_lm-eval_global_step80108_2023-01-30-11-26-39_3shots_backup.json ADDED
@@ -0,0 +1,87 @@
1
+ {
2
+ "results": {
3
+ "anli_r1": {
4
+ "acc": 0.348,
5
+ "acc_stderr": 0.015070604603768408
6
+ },
7
+ "anli_r2": {
8
+ "acc": 0.36,
9
+ "acc_stderr": 0.01518652793204012
10
+ },
11
+ "anli_r3": {
12
+ "acc": 0.35083333333333333,
13
+ "acc_stderr": 0.013782212417178195
14
+ },
15
+ "cb": {
16
+ "acc": 0.48214285714285715,
17
+ "acc_stderr": 0.0673769750864465,
18
+ "f1": 0.40387403446226977
19
+ },
20
+ "copa": {
21
+ "acc": 0.79,
22
+ "acc_stderr": 0.040936018074033256
23
+ },
24
+ "hellaswag": {
25
+ "acc": 0.4567815176259709,
26
+ "acc_stderr": 0.004971106265046551,
27
+ "acc_norm": 0.5992830113523202,
28
+ "acc_norm_stderr": 0.004890422457747258
29
+ },
30
+ "rte": {
31
+ "acc": 0.48375451263537905,
32
+ "acc_stderr": 0.030080573208738064
33
+ },
34
+ "winogrande": {
35
+ "acc": 0.569060773480663,
36
+ "acc_stderr": 0.013917796623335966
37
+ },
38
+ "storycloze_2016": {
39
+ "acc": 0.7247461250668092,
40
+ "acc_stderr": 0.010328538400500567
41
+ },
42
+ "boolq": {
43
+ "acc": 0.5498470948012233,
44
+ "acc_stderr": 0.008701488203356937
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.6266835016835017,
48
+ "acc_stderr": 0.009925009142802903,
49
+ "acc_norm": 0.6203703703703703,
50
+ "acc_norm_stderr": 0.009958037725468558
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.2901023890784983,
54
+ "acc_stderr": 0.013261573677520769,
55
+ "acc_norm": 0.31143344709897613,
56
+ "acc_norm_stderr": 0.013532472099850949
57
+ },
58
+ "sciq": {
59
+ "acc": 0.923,
60
+ "acc_stderr": 0.008434580140240632,
61
+ "acc_norm": 0.903,
62
+ "acc_norm_stderr": 0.00936368937324812
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7578890097932536,
66
+ "acc_stderr": 0.009994371269104387,
67
+ "acc_norm": 0.7682263329706203,
68
+ "acc_norm_stderr": 0.00984514377279405
69
+ }
70
+ },
71
+ "versions": {
72
+ "anli_r1": 0,
73
+ "anli_r2": 0,
74
+ "anli_r3": 0,
75
+ "cb": 1,
76
+ "copa": 0,
77
+ "hellaswag": 0,
78
+ "rte": 0,
79
+ "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
+ }
87
+ }
4b284b17bc4/evaluation/4b284b17bc4_4.json ADDED
@@ -0,0 +1,87 @@
1
+ {
2
+ "results": {
3
+ "anli_r1": {
4
+ "acc": 0.36,
5
+ "acc_stderr": 0.015186527932040117
6
+ },
7
+ "anli_r2": {
8
+ "acc": 0.347,
9
+ "acc_stderr": 0.015060472031706625
10
+ },
11
+ "anli_r3": {
12
+ "acc": 0.3625,
13
+ "acc_stderr": 0.01388303787422552
14
+ },
15
+ "cb": {
16
+ "acc": 0.5535714285714286,
17
+ "acc_stderr": 0.06703189227942395,
18
+ "f1": 0.4538378958668814
19
+ },
20
+ "copa": {
21
+ "acc": 0.79,
22
+ "acc_stderr": 0.040936018074033256
23
+ },
24
+ "hellaswag": {
25
+ "acc": 0.45180242979486157,
26
+ "acc_stderr": 0.004966544724452227,
27
+ "acc_norm": 0.5955984863572994,
28
+ "acc_norm_stderr": 0.004897728370737246
29
+ },
30
+ "rte": {
31
+ "acc": 0.48375451263537905,
32
+ "acc_stderr": 0.030080573208738064
33
+ },
34
+ "winogrande": {
35
+ "acc": 0.5706393054459353,
36
+ "acc_stderr": 0.013911537499969163
37
+ },
38
+ "storycloze_2016": {
39
+ "acc": 0.7177979690005345,
40
+ "acc_stderr": 0.010407834479647672
41
+ },
42
+ "boolq": {
43
+ "acc": 0.545565749235474,
44
+ "acc_stderr": 0.008708665643758015
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.640993265993266,
48
+ "acc_stderr": 0.009843424713072174,
49
+ "acc_norm": 0.6186868686868687,
50
+ "acc_norm_stderr": 0.009966542497171025
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.302901023890785,
54
+ "acc_stderr": 0.013428241573185349,
55
+ "acc_norm": 0.32337883959044367,
56
+ "acc_norm_stderr": 0.013669421630012129
57
+ },
58
+ "sciq": {
59
+ "acc": 0.915,
60
+ "acc_stderr": 0.008823426366942331,
61
+ "acc_norm": 0.912,
62
+ "acc_norm_stderr": 0.008963053962592085
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7578890097932536,
66
+ "acc_stderr": 0.009994371269104385,
67
+ "acc_norm": 0.7752992383025027,
68
+ "acc_norm_stderr": 0.009738282586548389
69
+ }
70
+ },
71
+ "versions": {
72
+ "anli_r1": 0,
73
+ "anli_r2": 0,
74
+ "anli_r3": 0,
75
+ "cb": 1,
76
+ "copa": 0,
77
+ "hellaswag": 0,
78
+ "rte": 0,
79
+ "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
+ }
87
+ }
4b284b17bc4/evaluation/4b284b17bc4_4_lm-eval_global_step80108_2023-01-30-11-26-39_4shots_backup.json ADDED
@@ -0,0 +1,87 @@
1
+ {
2
+ "results": {
3
+ "anli_r1": {
4
+ "acc": 0.36,
5
+ "acc_stderr": 0.015186527932040117
6
+ },
7
+ "anli_r2": {
8
+ "acc": 0.347,
9
+ "acc_stderr": 0.015060472031706625
10
+ },
11
+ "anli_r3": {
12
+ "acc": 0.3625,
13
+ "acc_stderr": 0.01388303787422552
14
+ },
15
+ "cb": {
16
+ "acc": 0.5535714285714286,
17
+ "acc_stderr": 0.06703189227942395,
18
+ "f1": 0.4538378958668814
19
+ },
20
+ "copa": {
21
+ "acc": 0.79,
22
+ "acc_stderr": 0.040936018074033256
23
+ },
24
+ "hellaswag": {
25
+ "acc": 0.45180242979486157,
26
+ "acc_stderr": 0.004966544724452227,
27
+ "acc_norm": 0.5955984863572994,
28
+ "acc_norm_stderr": 0.004897728370737246
29
+ },
30
+ "rte": {
31
+ "acc": 0.48375451263537905,
32
+ "acc_stderr": 0.030080573208738064
33
+ },
34
+ "winogrande": {
35
+ "acc": 0.5706393054459353,
36
+ "acc_stderr": 0.013911537499969163
37
+ },
38
+ "storycloze_2016": {
39
+ "acc": 0.7177979690005345,
40
+ "acc_stderr": 0.010407834479647672
41
+ },
42
+ "boolq": {
43
+ "acc": 0.545565749235474,
44
+ "acc_stderr": 0.008708665643758015
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.640993265993266,
48
+ "acc_stderr": 0.009843424713072174,
49
+ "acc_norm": 0.6186868686868687,
50
+ "acc_norm_stderr": 0.009966542497171025
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.302901023890785,
54
+ "acc_stderr": 0.013428241573185349,
55
+ "acc_norm": 0.32337883959044367,
56
+ "acc_norm_stderr": 0.013669421630012129
57
+ },
58
+ "sciq": {
59
+ "acc": 0.915,
60
+ "acc_stderr": 0.008823426366942331,
61
+ "acc_norm": 0.912,
62
+ "acc_norm_stderr": 0.008963053962592085
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7578890097932536,
66
+ "acc_stderr": 0.009994371269104385,
67
+ "acc_norm": 0.7752992383025027,
68
+ "acc_norm_stderr": 0.009738282586548389
69
+ }
70
+ },
71
+ "versions": {
72
+ "anli_r1": 0,
73
+ "anli_r2": 0,
74
+ "anli_r3": 0,
75
+ "cb": 1,
76
+ "copa": 0,
77
+ "hellaswag": 0,
78
+ "rte": 0,
79
+ "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
+ }
87
+ }
4b284b17bc4/evaluation/4b284b17bc4_5.json ADDED
@@ -0,0 +1,73 @@
1
+ {
2
+ "results": {
3
+ "anli_r1": {
4
+ "acc": 0.363,
5
+ "acc_stderr": 0.015213890444671281
6
+ },
7
+ "anli_r2": {
8
+ "acc": 0.347,
9
+ "acc_stderr": 0.015060472031706624
10
+ },
11
+ "anli_r3": {
12
+ "acc": 0.34,
13
+ "acc_stderr": 0.013680495725767794
14
+ },
15
+ "cb": {
16
+ "acc": 0.5535714285714286,
17
+ "acc_stderr": 0.06703189227942397,
18
+ "f1": 0.3974410235905637
19
+ },
20
+ "copa": {
21
+ "acc": 0.81,
22
+ "acc_stderr": 0.03942772444036623
23
+ },
24
+ "hellaswag": {
25
+ "acc": 0.44981079466241786,
26
+ "acc_stderr": 0.004964579685712439,
27
+ "acc_norm": 0.6002788289185421,
28
+ "acc_norm_stderr": 0.004888398535520516
29
+ },
30
+ "rte": {
31
+ "acc": 0.49097472924187724,
32
+ "acc_stderr": 0.030091559826331334
33
+ },
34
+ "winogrande": {
35
+ "acc": 0.5785319652722968,
36
+ "acc_stderr": 0.013878072377497603
37
+ },
38
+ "storycloze_2016": {
39
+ "acc": 0.7113842864778194,
40
+ "acc_stderr": 0.01047831178564294
41
+ },
42
+ "boolq": {
43
+ "acc": 0.5376146788990825,
44
+ "acc_stderr": 0.008720273736433679
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.6447811447811448,
48
+ "acc_stderr": 0.009820245899287117,
49
+ "acc_norm": 0.625,
50
+ "acc_norm_stderr": 0.009933992677987828
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.2986348122866894,
54
+ "acc_stderr": 0.013374078615068756,
55
+ "acc_norm": 0.310580204778157,
56
+ "acc_norm_stderr": 0.013522292098053052
57
+ }
58
+ },
59
+ "versions": {
60
+ "anli_r1": 0,
61
+ "anli_r2": 0,
62
+ "anli_r3": 0,
63
+ "cb": 1,
64
+ "copa": 0,
65
+ "hellaswag": 0,
66
+ "rte": 0,
67
+ "winogrande": 0,
68
+ "storycloze_2016": 0,
69
+ "boolq": 1,
70
+ "arc_easy": 0,
71
+ "arc_challenge": 0
72
+ }
73
+ }
4b284b17bc4/evaluation/4b284b17bc4_5_lm-eval_global_step80108_2023-01-30-11-26-39_5shots_backup.json ADDED
@@ -0,0 +1,73 @@
1
+ {
2
+ "results": {
3
+ "anli_r1": {
4
+ "acc": 0.363,
5
+ "acc_stderr": 0.015213890444671281
6
+ },
7
+ "anli_r2": {
8
+ "acc": 0.347,
9
+ "acc_stderr": 0.015060472031706624
10
+ },
11
+ "anli_r3": {
12
+ "acc": 0.34,
13
+ "acc_stderr": 0.013680495725767794
14
+ },
15
+ "cb": {
16
+ "acc": 0.5535714285714286,
17
+ "acc_stderr": 0.06703189227942397,
18
+ "f1": 0.3974410235905637
19
+ },
20
+ "copa": {
21
+ "acc": 0.81,
22
+ "acc_stderr": 0.03942772444036623
23
+ },
24
+ "hellaswag": {
25
+ "acc": 0.44981079466241786,
26
+ "acc_stderr": 0.004964579685712439,
27
+ "acc_norm": 0.6002788289185421,
28
+ "acc_norm_stderr": 0.004888398535520516
29
+ },
30
+ "rte": {
31
+ "acc": 0.49097472924187724,
32
+ "acc_stderr": 0.030091559826331334
33
+ },
34
+ "winogrande": {
35
+ "acc": 0.5785319652722968,
36
+ "acc_stderr": 0.013878072377497603
37
+ },
38
+ "storycloze_2016": {
39
+ "acc": 0.7113842864778194,
40
+ "acc_stderr": 0.01047831178564294
41
+ },
42
+ "boolq": {
43
+ "acc": 0.5376146788990825,
44
+ "acc_stderr": 0.008720273736433679
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.6447811447811448,
48
+ "acc_stderr": 0.009820245899287117,
49
+ "acc_norm": 0.625,
50
+ "acc_norm_stderr": 0.009933992677987828
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.2986348122866894,
54
+ "acc_stderr": 0.013374078615068756,
55
+ "acc_norm": 0.310580204778157,
56
+ "acc_norm_stderr": 0.013522292098053052
57
+ }
58
+ },
59
+ "versions": {
60
+ "anli_r1": 0,
61
+ "anli_r2": 0,
62
+ "anli_r3": 0,
63
+ "cb": 1,
64
+ "copa": 0,
65
+ "hellaswag": 0,
66
+ "rte": 0,
67
+ "winogrande": 0,
68
+ "storycloze_2016": 0,
69
+ "boolq": 1,
70
+ "arc_easy": 0,
71
+ "arc_challenge": 0
72
+ }
73
+ }
4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_0.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "GEM/web_nlg_en",
5
+ "prompt_name": "PALM_prompt",
6
+ "bleu": 0.4040857346605273,
7
+ "dataset_path": "GEM/web_nlg",
8
+ "dataset_name": "en",
9
+ "subset": null,
10
+ "bleu_stderr": 0.04358756352339084
11
+ },
12
+ {
13
+ "task_name": "GEM/web_nlg_en",
14
+ "prompt_name": "PALM_prompt",
15
+ "rouge1_precision": 0.0759904796250538,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_precision_stderr": 0.0019478615830651011
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rouge1_recall": 0.3009878218567671,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_recall_stderr": 0.0046586299284223885
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rouge1_fmeasure": 0.11153940555452811,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_fmeasure_stderr": 0.002178375350508395
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rouge2_precision": 0.03649587266689502,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_precision_stderr": 0.001284173883599284
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rouge2_recall": 0.14683508450255534,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_recall_stderr": 0.0032437032345681857
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rouge2_fmeasure": 0.05308201459552208,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_fmeasure_stderr": 0.0013729761880117914
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rougeL_precision": 0.07180766426825755,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_precision_stderr": 0.001761388989113738
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rougeL_recall": 0.28987291523705844,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_recall_stderr": 0.0045326872175802165
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rougeL_fmeasure": 0.10594741371659425,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_fmeasure_stderr": 0.0019835178597520093
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rougeLsum_precision": 0.07245769602542276,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_precision_stderr": 0.00184585307065297
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rougeLsum_recall": 0.2877191217238231,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_recall_stderr": 0.004439207690226351
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rougeLsum_fmeasure": 0.10637513260264994,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_fmeasure_stderr": 0.00204505600505615
+ }
+ ],
+ "config": {
+ "model": "hf-causal",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+ "task_args": "",
+ "num_fewshot": 0,
+ "batch_size": 16,
+ "device": "cuda",
+ "use_cache": false,
+ "limit": 3000,
+ "bootstrap_iters": 10,
+ "seed": 1234
+ }
+ }
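All slim result files added in this commit follow the layout shown above: a flat "results" list with one entry per metric (each repeating the task_name, prompt_name and dataset fields) plus a "config" block recording the evaluation settings. As a minimal sketch of how these files can be consumed, assuming only the Python standard library and an illustrative file path (not part of the committed tooling), the per-metric entries could be regrouped like this:

import json
from collections import defaultdict

# Illustrative path; every slim.*.json added in this commit shares the same schema.
path = "4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_0.json"

with open(path) as f:
    data = json.load(f)

# Collapse the flat per-metric entries into one dict per (task, prompt) pair.
grouped = defaultdict(dict)
for entry in data["results"]:
    key = (entry["task_name"], entry["prompt_name"])
    for field, value in entry.items():
        if field not in {"task_name", "prompt_name", "dataset_path", "dataset_name", "subset"}:
            grouped[key][field] = value

for (task, prompt), metrics in grouped.items():
    print(f"{task} / {prompt} (num_fewshot={data['config']['num_fewshot']})")
    for field, value in sorted(metrics.items()):
        print(f"  {field}: {value}")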
4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_1.json ADDED
@@ -0,0 +1,133 @@
+ {
+ "results": [
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "bleu": 0.5179012826475189,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "bleu_stderr": 0.03546328546887922
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rouge1_precision": 0.07590692259956473,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_precision_stderr": 0.0015260502670476222
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rouge1_recall": 0.3587176031003754,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_recall_stderr": 0.005304902318303979
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rouge1_fmeasure": 0.11715894355967386,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_fmeasure_stderr": 0.0019757967913343107
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rouge2_precision": 0.03541306461486918,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_precision_stderr": 0.0009438407351314926
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rouge2_recall": 0.17490815925289047,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_recall_stderr": 0.003575953294927914
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rouge2_fmeasure": 0.054620931903283015,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_fmeasure_stderr": 0.001228796960295478
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rougeL_precision": 0.07063353254188104,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_precision_stderr": 0.0013315866040551792
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rougeL_recall": 0.33926359580036936,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_recall_stderr": 0.004980489876121937
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rougeL_fmeasure": 0.10948455596662156,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_fmeasure_stderr": 0.0017556314356608658
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rougeLsum_precision": 0.07145722539365447,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_precision_stderr": 0.0014201666549177136
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rougeLsum_recall": 0.338101035904291,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_recall_stderr": 0.004861723234525834
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rougeLsum_fmeasure": 0.11027827926137282,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_fmeasure_stderr": 0.0018206662711825689
+ }
+ ],
+ "config": {
+ "model": "hf-causal",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+ "task_args": "",
+ "num_fewshot": 1,
+ "batch_size": 16,
+ "device": "cuda",
+ "use_cache": false,
+ "limit": 3000,
+ "bootstrap_iters": 10,
+ "seed": 1234
+ }
+ }