Muennighoff
commited on
Commit
•
78797a3
1
Parent(s):
411a88c
Add
Browse files- 4b284b84bc4v2/evaluation/generation/merged.csv +53 -0
- 4b284b84bc4v2/evaluation/generation/merged.json +1 -0
- 4b284b84bc4v2/evaluation/rankeval/4b284b84bc4v2_0.csv +21 -0
- 4b284b84bc4v2/evaluation/rankeval/4b284b84bc4v2_0_lm-eval_global_step80108_2023-02-15-14-49-22_0shots_backup.json +0 -87
- 4b284b84bc4v2/evaluation/rankeval/4b284b84bc4v2_1.csv +21 -0
- 4b284b84bc4v2/evaluation/rankeval/4b284b84bc4v2_1_lm-eval_global_step80108_2023-02-15-14-49-22_1shots_backup.json +0 -87
- 4b284b84bc4v2/evaluation/rankeval/4b284b84bc4v2_2.csv +21 -0
- 4b284b84bc4v2/evaluation/rankeval/4b284b84bc4v2_2_lm-eval_global_step80108_2023-02-15-14-49-22_2shots_backup.json +0 -87
- 4b284b84bc4v2/evaluation/rankeval/4b284b84bc4v2_3.csv +21 -0
- 4b284b84bc4v2/evaluation/rankeval/4b284b84bc4v2_3_lm-eval_global_step80108_2023-02-15-14-49-22_3shots_backup.json +0 -87
- 4b284b84bc4v2/evaluation/rankeval/4b284b84bc4v2_4.csv +21 -0
- 4b284b84bc4v2/evaluation/rankeval/4b284b84bc4v2_4_lm-eval_global_step80108_2023-02-15-14-49-22_4shots_backup.json +0 -87
- 4b284b84bc4v2/evaluation/rankeval/4b284b84bc4v2_5.csv +21 -0
- 4b284b84bc4v2/evaluation/rankeval/4b284b84bc4v2_5_lm-eval_global_step80108_2023-02-15-14-49-22_5shots_backup.json +0 -87
4b284b84bc4v2/evaluation/generation/merged.csv
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
dataset,fewshots,prompt,metric,value
|
2 |
+
e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.03077709464553236
|
3 |
+
e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.03077709464553236
|
4 |
+
e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.17699912791282796
|
5 |
+
e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.17699912791282796
|
6 |
+
e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.20955751814613366
|
7 |
+
e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.20955751814613366
|
8 |
+
e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.21988121787054832
|
9 |
+
e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.21988121787054832
|
10 |
+
e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.22411289670678503
|
11 |
+
e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.22411289670678503
|
12 |
+
e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.22601021194014084
|
13 |
+
e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.22601021194014084
|
14 |
+
e2e_nlg_cleaned,5,average,multiple,0.18122301120366135
|
15 |
+
gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.04120350350492188
|
16 |
+
gem_xsum,0,median,rouge2_fmeasure,0.04120350350492188
|
17 |
+
gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.03612025716424621
|
18 |
+
gem_xsum,1,median,rouge2_fmeasure,0.03612025716424621
|
19 |
+
gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.03605595900823992
|
20 |
+
gem_xsum,2,median,rouge2_fmeasure,0.03605595900823992
|
21 |
+
gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.03745595159864199
|
22 |
+
gem_xsum,3,median,rouge2_fmeasure,0.03745595159864199
|
23 |
+
gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.009095248607177277
|
24 |
+
gem_xsum,4,median,rouge2_fmeasure,0.009095248607177277
|
25 |
+
gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0008276164457091518
|
26 |
+
gem_xsum,5,median,rouge2_fmeasure,0.0008276164457091518
|
27 |
+
gem_xsum,5,average,multiple,0.026793089388156072
|
28 |
+
web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.0005024848669286521
|
29 |
+
web_nlg_en,0,median,rouge2_fmeasure,0.0005024848669286521
|
30 |
+
web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.00011613167335155065
|
31 |
+
web_nlg_en,1,median,rouge2_fmeasure,0.00011613167335155065
|
32 |
+
web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.00030928662436137877
|
33 |
+
web_nlg_en,2,median,rouge2_fmeasure,0.00030928662436137877
|
34 |
+
web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.0034293208231205925
|
35 |
+
web_nlg_en,3,median,rouge2_fmeasure,0.0034293208231205925
|
36 |
+
web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.015493410230104816
|
37 |
+
web_nlg_en,4,median,rouge2_fmeasure,0.015493410230104816
|
38 |
+
web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.03207493916916435
|
39 |
+
web_nlg_en,5,median,rouge2_fmeasure,0.03207493916916435
|
40 |
+
web_nlg_en,5,average,multiple,0.00865426223117189
|
41 |
+
wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.022488534485982925
|
42 |
+
wiki_lingua_en,0,median,rouge2_fmeasure,0.022488534485982925
|
43 |
+
wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.03119150310857869
|
44 |
+
wiki_lingua_en,1,median,rouge2_fmeasure,0.03119150310857869
|
45 |
+
wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.03642005087819046
|
46 |
+
wiki_lingua_en,2,median,rouge2_fmeasure,0.03642005087819046
|
47 |
+
wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.033404677016268654
|
48 |
+
wiki_lingua_en,3,median,rouge2_fmeasure,0.033404677016268654
|
49 |
+
wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.012263061789486048
|
50 |
+
wiki_lingua_en,4,median,rouge2_fmeasure,0.012263061789486048
|
51 |
+
wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.002039871936933482
|
52 |
+
wiki_lingua_en,5,median,rouge2_fmeasure,0.002039871936933482
|
53 |
+
wiki_lingua_en,5,average,multiple,0.022967949869240042
|
4b284b84bc4v2/evaluation/generation/merged.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.0029280550995302184, "bleu_stderr": 0.0016366911962026006, "rouge1_fmeasure": 0.01166133384965969, "rouge1_fmeasure_stderr": 0.0005948210079967459, "rouge1_precision": 0.02476807814768773, "rouge1_precision_stderr": 0.0028530818463061774, "rouge1_recall": 0.02678260190311703, "rouge1_recall_stderr": 0.0012109603137937753, "rouge2_fmeasure": 0.0005024848669286521, "rouge2_fmeasure_stderr": 9.489234963288616e-05, "rouge2_precision": 0.000817405348780765, "rouge2_precision_stderr": 0.000563057630226192, "rouge2_recall": 0.0018475799663002137, "rouge2_recall_stderr": 0.00026242303057926657, "rougeL_fmeasure": 0.011371922090005586, "rougeL_fmeasure_stderr": 0.0005646757671758188, "rougeL_precision": 0.024307558662430548, "rougeL_precision_stderr": 0.002809259680412027, "rougeL_recall": 0.026306702058363073, "rougeL_recall_stderr": 0.0011921693661227405, "rougeLsum_fmeasure": 0.011437412029266373, "rougeLsum_fmeasure_stderr": 0.0005664967802496282, "rougeLsum_precision": 0.024354402252677848, "rougeLsum_precision_stderr": 0.0028093033912843027, "rougeLsum_recall": 0.026422971471523896, "rougeLsum_recall_stderr": 0.0011947591414667835}}, "1": {"PALM_prompt": {"bleu": 0.0007166390659057791, "bleu_stderr": 0.00015271503689012457, "rouge1_fmeasure": 0.004296871966216284, "rouge1_fmeasure_stderr": 0.00041150104568959847, "rouge1_precision": 0.0068975376976135325, "rouge1_precision_stderr": 0.0014736511243282327, "rouge1_recall": 0.00885887681369143, "rouge1_recall_stderr": 0.0006944193964364368, "rouge2_fmeasure": 0.00011613167335155065, "rouge2_fmeasure_stderr": 6.102001124245185e-05, "rouge2_precision": 0.0003171875473118265, "rouge2_precision_stderr": 0.0002814151389304619, "rouge2_recall": 0.00028567405139197236, "rouge2_recall_stderr": 0.00011142619043034172, "rougeL_fmeasure": 0.004206988239060447, "rougeL_fmeasure_stderr": 0.0003855165321832455, "rougeL_precision": 0.006738301550344965, "rougeL_precision_stderr": 0.0014262564028128674, "rougeL_recall": 0.008632810279305616, "rougeL_recall_stderr": 0.0006644470939899456, "rougeLsum_fmeasure": 0.004206988239060447, "rougeLsum_fmeasure_stderr": 0.0003855165321832455, "rougeLsum_precision": 0.006738301550344965, "rougeLsum_precision_stderr": 0.0014262564028128674, "rougeLsum_recall": 0.008632810279305616, "rougeLsum_recall_stderr": 0.0006644470939899456}}, "2": {"PALM_prompt": {"bleu": 0.0011433649657405618, "bleu_stderr": 9.738318964881656e-05, "rouge1_fmeasure": 0.006962873263876942, "rouge1_fmeasure_stderr": 0.0004895823378957566, "rouge1_precision": 0.01320147157904189, "rouge1_precision_stderr": 0.001952068982460362, "rouge1_recall": 0.01579302663455386, "rouge1_recall_stderr": 0.0009653406116352021, "rouge2_fmeasure": 0.00030928662436137877, "rouge2_fmeasure_stderr": 9.611704402898231e-05, "rouge2_precision": 0.0002858015060017772, "rouge2_precision_stderr": 0.00014402794289871516, "rouge2_recall": 0.000819950815164877, "rouge2_recall_stderr": 0.00019418552636900682, "rougeL_fmeasure": 0.006935271321832535, "rougeL_fmeasure_stderr": 0.0004886807794040627, "rougeL_precision": 0.013185732399727665, "rougeL_precision_stderr": 0.0019520342297636402, "rougeL_recall": 0.015624421046336484, "rougeL_recall_stderr": 0.000944465094721435, "rougeLsum_fmeasure": 0.006941075328744422, "rougeLsum_fmeasure_stderr": 0.000488906538384857, "rougeLsum_precision": 0.013190229308103155, "rougeLsum_precision_stderr": 0.0019520534211779996, "rougeLsum_recall": 0.015631466185916838, "rougeLsum_recall_stderr": 0.0009447148312929678}}, "3": {"PALM_prompt": {"bleu": 0.017458232906879215, "bleu_stderr": 0.0033790622867434076, "rouge1_fmeasure": 0.019724953977195446, "rouge1_fmeasure_stderr": 0.0007547850264824778, "rouge1_precision": 0.023119662960575944, "rouge1_precision_stderr": 0.00233355843036657, "rouge1_recall": 0.05282846241750935, "rouge1_recall_stderr": 0.0019359702235571255, "rouge2_fmeasure": 0.0034293208231205925, "rouge2_fmeasure_stderr": 0.00026671815521471114, "rouge2_precision": 0.0026448252006929015, "rouge2_precision_stderr": 0.00042559571122428457, "rouge2_recall": 0.010346087040436342, "rouge2_recall_stderr": 0.0008425074141827781, "rougeL_fmeasure": 0.018929605765833746, "rougeL_fmeasure_stderr": 0.000709339053434189, "rougeL_precision": 0.022478179089093484, "rougeL_precision_stderr": 0.002302691625879073, "rougeL_recall": 0.050774510212971205, "rougeL_recall_stderr": 0.001845862101982268, "rougeLsum_fmeasure": 0.01914065951998809, "rougeLsum_fmeasure_stderr": 0.0007220978771409472, "rougeLsum_precision": 0.022715634939831118, "rougeLsum_precision_stderr": 0.0023247590603593525, "rougeLsum_recall": 0.05112714494797748, "rougeLsum_recall_stderr": 0.0018634753399370816}}, "4": {"PALM_prompt": {"bleu": 0.07367508334810638, "bleu_stderr": 0.008598772765141552, "rouge1_fmeasure": 0.04840826317900296, "rouge1_fmeasure_stderr": 0.0012611715103482287, "rouge1_precision": 0.03481743016243523, "rouge1_precision_stderr": 0.0015627658049730399, "rouge1_recall": 0.13671695024124966, "rouge1_recall_stderr": 0.003247662069099122, "rouge2_fmeasure": 0.015493410230104816, "rouge2_fmeasure_stderr": 0.0006612843945157879, "rouge2_precision": 0.010032418466509586, "rouge2_precision_stderr": 0.00046386127467674785, "rouge2_recall": 0.04572678691854654, "rouge2_recall_stderr": 0.0019536943501289607, "rougeL_fmeasure": 0.04611942981100928, "rougeL_fmeasure_stderr": 0.001185835572456282, "rougeL_precision": 0.03330887063532685, "rougeL_precision_stderr": 0.0015356588851509804, "rougeL_recall": 0.1311235441315499, "rougeL_recall_stderr": 0.003109936140992514, "rougeLsum_fmeasure": 0.04632371389499153, "rougeLsum_fmeasure_stderr": 0.0011928970045761962, "rougeLsum_precision": 0.03348521211452374, "rougeLsum_precision_stderr": 0.0015401354090072426, "rougeLsum_recall": 0.13094153361519625, "rougeLsum_recall_stderr": 0.0030817560251935527}}, "5": {"PALM_prompt": {"bleu": 0.14403240257682612, "bleu_stderr": 0.020502126673992448, "rouge1_fmeasure": 0.07612242231547006, "rouge1_fmeasure_stderr": 0.0016048490141600572, "rouge1_precision": 0.05148247265764308, "rouge1_precision_stderr": 0.0015275173054537983, "rouge1_recall": 0.22173682737900655, "rouge1_recall_stderr": 0.004161854186037113, "rouge2_fmeasure": 0.03207493916916435, "rouge2_fmeasure_stderr": 0.0009360890624163692, "rouge2_precision": 0.020599492046193216, "rouge2_precision_stderr": 0.0006362341280141862, "rouge2_recall": 0.09581728290567497, "rouge2_recall_stderr": 0.002689360100598362, "rougeL_fmeasure": 0.07338633751298693, "rougeL_fmeasure_stderr": 0.0015133873994071854, "rougeL_precision": 0.049523715375363805, "rougeL_precision_stderr": 0.0014392263349550436, "rougeL_recall": 0.21425862449161553, "rougeL_recall_stderr": 0.003954730793317483, "rougeLsum_fmeasure": 0.07323243858705794, "rougeLsum_fmeasure_stderr": 0.0015324335796681066, "rougeLsum_precision": 0.04964705800825754, "rougeLsum_precision_stderr": 0.0014944377050311116, "rougeLsum_recall": 0.2128858004458588, "rougeLsum_recall_stderr": 0.003935525678149997}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 0.9851933422266917, "bleu_stderr": 0.06952404492630372, "rouge1_fmeasure": 0.126716346898361, "rouge1_fmeasure_stderr": 0.00183643925768546, "rouge1_precision": 0.1132744689534206, "rouge1_precision_stderr": 0.001964062271703089, "rouge1_recall": 0.17920924171315153, "rouge1_recall_stderr": 0.0026059508027650407, "rouge2_fmeasure": 0.022488534485982925, "rouge2_fmeasure_stderr": 0.0006967583056651384, "rouge2_precision": 0.019843012980071302, "rouge2_precision_stderr": 0.0006687266299046368, "rouge2_recall": 0.033167544297920445, "rouge2_recall_stderr": 0.0011346551086910673, "rougeL_fmeasure": 0.10671839542240537, "rougeL_fmeasure_stderr": 0.0014384846189327972, "rougeL_precision": 0.09498424083859268, "rougeL_precision_stderr": 0.001593607551270447, "rougeL_recall": 0.1533714901279331, "rougeL_recall_stderr": 0.002170011502145423, "rougeLsum_fmeasure": 0.11738170897482265, "rougeLsum_fmeasure_stderr": 0.0016744399974736352, "rougeLsum_precision": 0.10485890631099086, "rougeLsum_precision_stderr": 0.0018089123910430586, "rougeLsum_recall": 0.16659652845304845, "rougeLsum_recall_stderr": 0.0024097743667806444}}, "1": {"tldr_en": {"bleu": 1.6610768648433896, "bleu_stderr": 0.056832353542049345, "rouge1_fmeasure": 0.15184455155867413, "rouge1_fmeasure_stderr": 0.002034563245416516, "rouge1_precision": 0.13908823518286104, "rouge1_precision_stderr": 0.0022446753794830056, "rouge1_recall": 0.21098808280302153, "rouge1_recall_stderr": 0.0029368504277361264, "rouge2_fmeasure": 0.03119150310857869, "rouge2_fmeasure_stderr": 0.0008867537820946861, "rouge2_precision": 0.027877042844761103, "rouge2_precision_stderr": 0.0009018124252275846, "rouge2_recall": 0.04519203101491114, "rouge2_recall_stderr": 0.0014098192399706936, "rougeL_fmeasure": 0.11913273540788426, "rougeL_fmeasure_stderr": 0.001473272989772387, "rougeL_precision": 0.10949982767815743, "rougeL_precision_stderr": 0.0017540251498615006, "rougeL_recall": 0.1676577471462561, "rougeL_recall_stderr": 0.0022806710881294024, "rougeLsum_fmeasure": 0.14194042014754973, "rougeLsum_fmeasure_stderr": 0.001891487197261062, "rougeLsum_precision": 0.12996252462048752, "rougeLsum_precision_stderr": 0.002095806974972115, "rougeLsum_recall": 0.19761740636135, "rougeLsum_recall_stderr": 0.0027462487095581455}}, "2": {"tldr_en": {"bleu": 2.0178632068220144, "bleu_stderr": 0.05630752545151827, "rouge1_fmeasure": 0.16464877037969322, "rouge1_fmeasure_stderr": 0.0020287077043937914, "rouge1_precision": 0.15190468435288887, "rouge1_precision_stderr": 0.002345142424551451, "rouge1_recall": 0.22705395048844934, "rouge1_recall_stderr": 0.002887946301050379, "rouge2_fmeasure": 0.03642005087819046, "rouge2_fmeasure_stderr": 0.0009241245974228669, "rouge2_precision": 0.03395974615464393, "rouge2_precision_stderr": 0.0010677653838954393, "rouge2_recall": 0.05172253964536197, "rouge2_recall_stderr": 0.0014222260298251664, "rougeL_fmeasure": 0.13112025398048113, "rougeL_fmeasure_stderr": 0.0015145211634699073, "rougeL_precision": 0.12120650835304664, "rougeL_precision_stderr": 0.001876346841785363, "rougeL_recall": 0.18342257938869802, "rougeL_recall_stderr": 0.0023311986779411147, "rougeLsum_fmeasure": 0.15343386118565805, "rougeLsum_fmeasure_stderr": 0.001886311570474951, "rougeLsum_precision": 0.1413354912495611, "rougeLsum_precision_stderr": 0.002183469939019903, "rougeLsum_recall": 0.21212536828784073, "rougeLsum_recall_stderr": 0.002709014023103977}}, "3": {"tldr_en": {"bleu": 2.2124816637688673, "bleu_stderr": 0.05324645847028149, "rouge1_fmeasure": 0.14848863343770827, "rouge1_fmeasure_stderr": 0.0021653237700268767, "rouge1_precision": 0.1398125873700489, "rouge1_precision_stderr": 0.0024590771386981468, "rouge1_recall": 0.20581770819572856, "rouge1_recall_stderr": 0.0031688908975941706, "rouge2_fmeasure": 0.033404677016268654, "rouge2_fmeasure_stderr": 0.0009001030881667151, "rouge2_precision": 0.03147032611832327, "rouge2_precision_stderr": 0.0010041816927645, "rouge2_recall": 0.048666615188189155, "rouge2_recall_stderr": 0.0015022300615495168, "rougeL_fmeasure": 0.11645203897433642, "rougeL_fmeasure_stderr": 0.0016301987941532085, "rougeL_precision": 0.10967938260112484, "rougeL_precision_stderr": 0.001939118747782662, "rougeL_recall": 0.16415201449685513, "rougeL_recall_stderr": 0.002568933583030408, "rougeLsum_fmeasure": 0.1389363951676251, "rougeLsum_fmeasure_stderr": 0.002028209807222595, "rougeLsum_precision": 0.13081164522556862, "rougeLsum_precision_stderr": 0.0023048843089674047, "rougeLsum_recall": 0.19298480421501055, "rougeLsum_recall_stderr": 0.002991483092116459}}, "4": {"tldr_en": {"bleu": 0.518348574818673, "bleu_stderr": 0.04728338657384372, "rouge1_fmeasure": 0.05243492704960272, "rouge1_fmeasure_stderr": 0.0018267137943086668, "rouge1_precision": 0.051163725460641254, "rouge1_precision_stderr": 0.0020175130167421618, "rouge1_recall": 0.07547098530527592, "rouge1_recall_stderr": 0.002712368234388136, "rouge2_fmeasure": 0.012263061789486048, "rouge2_fmeasure_stderr": 0.0006259044145290426, "rouge2_precision": 0.012226230333192529, "rouge2_precision_stderr": 0.0008303652374957706, "rouge2_recall": 0.018957055186055123, "rouge2_recall_stderr": 0.0010937223070813828, "rougeL_fmeasure": 0.04136291423682112, "rougeL_fmeasure_stderr": 0.0014246743415782, "rougeL_precision": 0.040637879243783245, "rougeL_precision_stderr": 0.0016425005398963664, "rougeL_recall": 0.06062023770963191, "rougeL_recall_stderr": 0.0022164851906403818, "rougeLsum_fmeasure": 0.048791456361098884, "rougeLsum_fmeasure_stderr": 0.0016961708904439533, "rougeLsum_precision": 0.047669356275269074, "rougeLsum_precision_stderr": 0.0018834597641042353, "rougeLsum_recall": 0.07047643888720821, "rougeLsum_recall_stderr": 0.002538522085522552}}, "5": {"tldr_en": {"bleu": 5.830006972912073e-07, "bleu_stderr": 1.5242170268698066e-06, "rouge1_fmeasure": 0.008258433412209281, "rouge1_fmeasure_stderr": 0.0008152843888710176, "rouge1_precision": 0.00798899802009205, "rouge1_precision_stderr": 0.0008684678804321791, "rouge1_recall": 0.012157252874457363, "rouge1_recall_stderr": 0.0012354312911594197, "rouge2_fmeasure": 0.002039871936933482, "rouge2_fmeasure_stderr": 0.00026887096221603856, "rouge2_precision": 0.0019747973882538582, "rouge2_precision_stderr": 0.00029691155526316755, "rouge2_recall": 0.003195835732862682, "rouge2_recall_stderr": 0.0004683017569443119, "rougeL_fmeasure": 0.006505664193233564, "rougeL_fmeasure_stderr": 0.0006339130525234551, "rougeL_precision": 0.006327383054345089, "rougeL_precision_stderr": 0.0006970601584393748, "rougeL_recall": 0.009798952635742871, "rougeL_recall_stderr": 0.0010113333254833609, "rougeLsum_fmeasure": 0.007628696622003188, "rougeLsum_fmeasure_stderr": 0.0007515251763857421, "rougeLsum_precision": 0.007380499125010994, "rougeLsum_precision_stderr": 0.0007963027803529877, "rougeLsum_recall": 0.011247846618505664, "rougeLsum_recall_stderr": 0.0011352979551129004}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 1.9878161967998398, "bleu_stderr": 0.06641862588673211, "rouge1_fmeasure": 0.14439442209508027, "rouge1_fmeasure_stderr": 0.001276646371980532, "rouge1_precision": 0.1230414146056, "rouge1_precision_stderr": 0.0018011287640235026, "rouge1_recall": 0.2149932361322497, "rouge1_recall_stderr": 0.0017187067878016052, "rouge2_fmeasure": 0.03077709464553236, "rouge2_fmeasure_stderr": 0.0007515788730606631, "rouge2_precision": 0.0249827917795897, "rouge2_precision_stderr": 0.0006558676504376822, "rouge2_recall": 0.04571245193961166, "rouge2_recall_stderr": 0.0011353362441016387, "rougeL_fmeasure": 0.13928620677060347, "rougeL_fmeasure_stderr": 0.0011638837692616326, "rougeL_precision": 0.11674536895319193, "rougeL_precision_stderr": 0.00152188794899193, "rougeL_recall": 0.20906254512253195, "rougeL_recall_stderr": 0.0016186984135260216, "rougeLsum_fmeasure": 0.11892783850700991, "rougeLsum_fmeasure_stderr": 0.001162432472538215, "rougeLsum_precision": 0.10170099056081539, "rougeLsum_precision_stderr": 0.0015751175790873655, "rougeLsum_recall": 0.177028719967226, "rougeLsum_recall_stderr": 0.0015830122478709882}}, "1": {"generate_text_restaurant": {"bleu": 9.080481219293423, "bleu_stderr": 0.13158038055159502, "rouge1_fmeasure": 0.4015743081483971, "rouge1_fmeasure_stderr": 0.0021978471318253212, "rouge1_precision": 0.3943302810116597, "rouge1_precision_stderr": 0.002747401885321671, "rouge1_recall": 0.45740536668656395, "rouge1_recall_stderr": 0.002914926103453838, "rouge2_fmeasure": 0.17699912791282796, "rouge2_fmeasure_stderr": 0.001750185332023163, "rouge2_precision": 0.1739497922758478, "rouge2_precision_stderr": 0.0019379057184171025, "rouge2_recall": 0.20361607378340835, "rouge2_recall_stderr": 0.0021737793672060774, "rougeL_fmeasure": 0.2944824817977815, "rougeL_fmeasure_stderr": 0.0017154971121821877, "rougeL_precision": 0.2883093375260347, "rougeL_precision_stderr": 0.0021332295123494853, "rougeL_recall": 0.33875745510049593, "rougeL_recall_stderr": 0.002404798937078158, "rougeLsum_fmeasure": 0.33455401313195715, "rougeLsum_fmeasure_stderr": 0.0021144031329413423, "rougeLsum_precision": 0.32905879506709923, "rougeLsum_precision_stderr": 0.002548531871233347, "rougeLsum_recall": 0.380730834219128, "rougeLsum_recall_stderr": 0.002722245216573897}}, "2": {"generate_text_restaurant": {"bleu": 11.880857643381121, "bleu_stderr": 0.13297726203755633, "rouge1_fmeasure": 0.4405962380870851, "rouge1_fmeasure_stderr": 0.001977493224162921, "rouge1_precision": 0.43508506023556653, "rouge1_precision_stderr": 0.002294177199252108, "rouge1_recall": 0.48224900238091717, "rouge1_recall_stderr": 0.002858032856284127, "rouge2_fmeasure": 0.20955751814613366, "rouge2_fmeasure_stderr": 0.0018140125780760031, "rouge2_precision": 0.20629151293135214, "rouge2_precision_stderr": 0.0018701912148922787, "rouge2_recall": 0.2318486855887002, "rouge2_recall_stderr": 0.0022863983188674076, "rougeL_fmeasure": 0.3259684050538889, "rougeL_fmeasure_stderr": 0.0017402765794117868, "rougeL_precision": 0.32194109393814613, "rougeL_precision_stderr": 0.0019483523439828958, "rougeL_recall": 0.35732875480485404, "rougeL_recall_stderr": 0.0024237882323617684, "rougeLsum_fmeasure": 0.3722300448615989, "rougeLsum_fmeasure_stderr": 0.002006603335020953, "rougeLsum_precision": 0.36746907546290897, "rougeLsum_precision_stderr": 0.0022249091259755735, "rougeLsum_recall": 0.4075428373909175, "rougeLsum_recall_stderr": 0.00272896532835388}}, "3": {"generate_text_restaurant": {"bleu": 12.668710535676373, "bleu_stderr": 0.11599960553637455, "rouge1_fmeasure": 0.4483224588704832, "rouge1_fmeasure_stderr": 0.00196283613596949, "rouge1_precision": 0.4395509329611724, "rouge1_precision_stderr": 0.0022799777461364563, "rouge1_recall": 0.4928397756989274, "rouge1_recall_stderr": 0.002846184524464648, "rouge2_fmeasure": 0.21988121787054832, "rouge2_fmeasure_stderr": 0.0018519515945325045, "rouge2_precision": 0.21493759260102677, "rouge2_precision_stderr": 0.001901227495395828, "rouge2_recall": 0.24437272912243296, "rouge2_recall_stderr": 0.002354647079691416, "rougeL_fmeasure": 0.33468933092034114, "rougeL_fmeasure_stderr": 0.0017729142046844634, "rougeL_precision": 0.3282468443976815, "rougeL_precision_stderr": 0.001981937273750522, "rougeL_recall": 0.3683889985662075, "rougeL_recall_stderr": 0.002459195239143975, "rougeLsum_fmeasure": 0.38209771956653327, "rougeLsum_fmeasure_stderr": 0.0020243590337524803, "rougeLsum_precision": 0.37463074084445375, "rougeLsum_precision_stderr": 0.0022487871016798645, "rougeLsum_recall": 0.4200446710884972, "rougeLsum_recall_stderr": 0.0027467231496944235}}, "4": {"generate_text_restaurant": {"bleu": 12.987984921520598, "bleu_stderr": 0.16631332095853574, "rouge1_fmeasure": 0.4550163932883833, "rouge1_fmeasure_stderr": 0.002006124106156925, "rouge1_precision": 0.4426901446726453, "rouge1_precision_stderr": 0.002312374714781613, "rouge1_recall": 0.501289222749113, "rouge1_recall_stderr": 0.0028143944479086143, "rouge2_fmeasure": 0.22411289670678503, "rouge2_fmeasure_stderr": 0.0018929411054572837, "rouge2_precision": 0.21741314567975936, "rouge2_precision_stderr": 0.0019349399124124745, "rouge2_recall": 0.24947639077879472, "rouge2_recall_stderr": 0.0023667097986760496, "rougeL_fmeasure": 0.3385148446888659, "rougeL_fmeasure_stderr": 0.0017963558283099884, "rougeL_precision": 0.32911789740564507, "rougeL_precision_stderr": 0.0019701871866358513, "rougeL_recall": 0.373872891264614, "rougeL_recall_stderr": 0.002468625894339517, "rougeLsum_fmeasure": 0.38895985734645316, "rougeLsum_fmeasure_stderr": 0.002094963829800414, "rougeLsum_precision": 0.3781176477461178, "rougeLsum_precision_stderr": 0.002287152579955625, "rougeLsum_recall": 0.429023929215127, "rougeLsum_recall_stderr": 0.0027943725912901794}}, "5": {"generate_text_restaurant": {"bleu": 12.952424194291106, "bleu_stderr": 0.17721359501081466, "rouge1_fmeasure": 0.456966571846599, "rouge1_fmeasure_stderr": 0.0019734749210126813, "rouge1_precision": 0.4445492383379575, "rouge1_precision_stderr": 0.002320357386908294, "rouge1_recall": 0.5029642206493721, "rouge1_recall_stderr": 0.0027671737574197605, "rouge2_fmeasure": 0.22601021194014084, "rouge2_fmeasure_stderr": 0.00185518963183468, "rouge2_precision": 0.21950338541571118, "rouge2_precision_stderr": 0.0019236400840460968, "rouge2_recall": 0.25104119569831557, "rouge2_recall_stderr": 0.0023122193486125706, "rougeL_fmeasure": 0.3408087694501858, "rougeL_fmeasure_stderr": 0.0017879939906870438, "rougeL_precision": 0.3311755105323646, "rougeL_precision_stderr": 0.0019790975666403757, "rougeL_recall": 0.37610609585541155, "rougeL_recall_stderr": 0.0024496405296489626, "rougeLsum_fmeasure": 0.3904609207750673, "rougeLsum_fmeasure_stderr": 0.0020619775453817987, "rougeLsum_precision": 0.3796624983587727, "rougeLsum_precision_stderr": 0.002287049071998125, "rougeLsum_recall": 0.430049661746005, "rougeLsum_recall_stderr": 0.0027398492336168975}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.6178490169994428, "bleu_stderr": 0.0877349858045454, "rouge1_fmeasure": 0.19612972469676948, "rouge1_fmeasure_stderr": 0.0025764100053972294, "rouge1_precision": 0.14359819378832311, "rouge1_precision_stderr": 0.0020650285188626525, "rouge1_recall": 0.3344043431088508, "rouge1_recall_stderr": 0.004372644717501253, "rouge2_fmeasure": 0.04120350350492188, "rouge2_fmeasure_stderr": 0.001445411825760303, "rouge2_precision": 0.029674664673500505, "rouge2_precision_stderr": 0.0010650293180486199, "rouge2_recall": 0.07214641016418301, "rouge2_recall_stderr": 0.0025347435955977373, "rougeL_fmeasure": 0.14563333421498437, "rougeL_fmeasure_stderr": 0.0018388552480541446, "rougeL_precision": 0.10655263726146283, "rougeL_precision_stderr": 0.0014837973374642718, "rougeL_recall": 0.2496302152114623, "rougeL_recall_stderr": 0.0032564726396995787, "rougeLsum_fmeasure": 0.15219649916955252, "rougeLsum_fmeasure_stderr": 0.0021005836765910536, "rougeLsum_precision": 0.11122305842755399, "rougeLsum_precision_stderr": 0.0016516903041689486, "rougeLsum_recall": 0.2610787813927188, "rougeLsum_recall_stderr": 0.0037057405165489008}}, "1": {"article_DOC_summary": {"bleu": 1.401872820123109, "bleu_stderr": 0.07118073185060977, "rouge1_fmeasure": 0.1884813019546172, "rouge1_fmeasure_stderr": 0.002494762200065633, "rouge1_precision": 0.1343874942138253, "rouge1_precision_stderr": 0.0018570510799802737, "rouge1_recall": 0.32893212915185643, "rouge1_recall_stderr": 0.004321710877930019, "rouge2_fmeasure": 0.03612025716424621, "rouge2_fmeasure_stderr": 0.0013694969765650487, "rouge2_precision": 0.025517600369870188, "rouge2_precision_stderr": 0.0009688572436838881, "rouge2_recall": 0.06465696110332493, "rouge2_recall_stderr": 0.002538422888293853, "rougeL_fmeasure": 0.139572275480932, "rougeL_fmeasure_stderr": 0.001773272158023383, "rougeL_precision": 0.09936945059700034, "rougeL_precision_stderr": 0.0013159322617693906, "rougeL_recall": 0.2449745089798639, "rougeL_recall_stderr": 0.0031984284255980005, "rougeLsum_fmeasure": 0.14792628000904443, "rougeLsum_fmeasure_stderr": 0.002038823331111779, "rougeLsum_precision": 0.10528930235196655, "rougeLsum_precision_stderr": 0.00150409519612042, "rougeLsum_recall": 0.25955915490668724, "rougeLsum_recall_stderr": 0.0036385063619601863}}, "2": {"article_DOC_summary": {"bleu": 1.3239175335791302, "bleu_stderr": 0.07771342322834383, "rouge1_fmeasure": 0.18293512337697232, "rouge1_fmeasure_stderr": 0.0024681499873010822, "rouge1_precision": 0.1306097070189259, "rouge1_precision_stderr": 0.0018291400497876672, "rouge1_recall": 0.3182133118841435, "rouge1_recall_stderr": 0.00432268646841341, "rouge2_fmeasure": 0.03605595900823992, "rouge2_fmeasure_stderr": 0.0013964259042751057, "rouge2_precision": 0.025394792824575525, "rouge2_precision_stderr": 0.000978591013958908, "rouge2_recall": 0.06504088261073109, "rouge2_recall_stderr": 0.0026333026819242173, "rougeL_fmeasure": 0.13986197658478078, "rougeL_fmeasure_stderr": 0.0018317764059515805, "rougeL_precision": 0.09964468455116615, "rougeL_precision_stderr": 0.001340823504149176, "rougeL_recall": 0.24488309458384658, "rougeL_recall_stderr": 0.0033667400945906413, "rougeLsum_fmeasure": 0.1422604749544217, "rougeLsum_fmeasure_stderr": 0.002028772651519479, "rougeLsum_precision": 0.10128534493133166, "rougeLsum_precision_stderr": 0.0014759117790227945, "rougeLsum_recall": 0.24926384915068273, "rougeLsum_recall_stderr": 0.0037026007649822485}}, "3": {"article_DOC_summary": {"bleu": 1.5403664660028038, "bleu_stderr": 0.049650292585335425, "rouge1_fmeasure": 0.17780864985044104, "rouge1_fmeasure_stderr": 0.0026565418832362885, "rouge1_precision": 0.12999892702611107, "rouge1_precision_stderr": 0.002100607591188661, "rouge1_recall": 0.3038746779145746, "rouge1_recall_stderr": 0.004634536642096101, "rouge2_fmeasure": 0.03745595159864199, "rouge2_fmeasure_stderr": 0.001473992703184434, "rouge2_precision": 0.02673239688670725, "rouge2_precision_stderr": 0.0010541854289451763, "rouge2_recall": 0.06663818620575768, "rouge2_recall_stderr": 0.0027120422899213088, "rougeL_fmeasure": 0.14056180949140154, "rougeL_fmeasure_stderr": 0.0020638724553360224, "rougeL_precision": 0.10259957486471549, "rougeL_precision_stderr": 0.0016121168108099548, "rougeL_recall": 0.24133064775652324, "rougeL_recall_stderr": 0.0037353718988143852, "rougeLsum_fmeasure": 0.13979687550731743, "rougeLsum_fmeasure_stderr": 0.0022231392203689872, "rougeLsum_precision": 0.10210305798275769, "rougeLsum_precision_stderr": 0.001744113269797536, "rougeLsum_recall": 0.24019653384515519, "rougeLsum_recall_stderr": 0.003969364769314652}}, "4": {"article_DOC_summary": {"bleu": 0.6166062478757451, "bleu_stderr": 0.08670636927186123, "rouge1_fmeasure": 0.04917817586535565, "rouge1_fmeasure_stderr": 0.0027572778499565737, "rouge1_precision": 0.04160930994559087, "rouge1_precision_stderr": 0.002537269682812133, "rouge1_recall": 0.07694631808333259, "rouge1_recall_stderr": 0.004436240229665493, "rouge2_fmeasure": 0.009095248607177277, "rouge2_fmeasure_stderr": 0.0007994257614849805, "rouge2_precision": 0.00694315085705961, "rouge2_precision_stderr": 0.0006208373979404384, "rouge2_recall": 0.015318807336869808, "rouge2_recall_stderr": 0.0013960755776362798, "rougeL_fmeasure": 0.03787247289667727, "rougeL_fmeasure_stderr": 0.002096158407281678, "rougeL_precision": 0.03183491268441076, "rougeL_precision_stderr": 0.0019187904929101934, "rougeL_recall": 0.060155704535713776, "rougeL_recall_stderr": 0.003486439319802224, "rougeLsum_fmeasure": 0.03848782014971929, "rougeLsum_fmeasure_stderr": 0.0021772105235262963, "rougeLsum_precision": 0.03267719971454053, "rougeLsum_precision_stderr": 0.0020211511957105638, "rougeLsum_recall": 0.06063462814345425, "rougeLsum_recall_stderr": 0.003564576673636372}}, "5": {"article_DOC_summary": {"bleu": 6.389248152486678e-40, "bleu_stderr": 1.8294130292828673e-32, "rouge1_fmeasure": 0.0031440827808136107, "rouge1_fmeasure_stderr": 0.0008706294734819288, "rouge1_precision": 0.003491903968713361, "rouge1_precision_stderr": 0.0009506970135154758, "rouge1_recall": 0.002931011957260622, "rouge1_recall_stderr": 0.0008267237760907469, "rouge2_fmeasure": 0.0008276164457091518, "rouge2_fmeasure_stderr": 0.0003287842625521013, "rouge2_precision": 0.0008845399743139629, "rouge2_precision_stderr": 0.000336178820671534, "rouge2_recall": 0.0007962691827841661, "rouge2_recall_stderr": 0.0003277599279605881, "rougeL_fmeasure": 0.0023295730760623354, "rougeL_fmeasure_stderr": 0.0006241983687001699, "rougeL_precision": 0.0026031866359134312, "rougeL_precision_stderr": 0.0006864560352179834, "rougeL_recall": 0.002164363471922377, "rougeL_recall_stderr": 0.0005932076779655039, "rougeLsum_fmeasure": 0.002374711651488895, "rougeLsum_fmeasure_stderr": 0.0006401770663377113, "rougeLsum_precision": 0.0026603621647870728, "rougeLsum_precision_stderr": 0.0007053452661430197, "rougeLsum_recall": 0.0021998927147651314, "rougeLsum_recall_stderr": 0.0006064303044380291}}}}
|
4b284b84bc4v2/evaluation/rankeval/4b284b84bc4v2_0.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.337,0.014955087918653605,0
|
3 |
+
anli_r2,acc,0.327,0.014842213153411242,0
|
4 |
+
anli_r3,acc,0.3475,0.013751753243291854,0
|
5 |
+
arc_challenge,acc,0.24744027303754265,0.012610352663292673,0
|
6 |
+
arc_challenge,acc_norm,0.28924914675767915,0.013250012579393443,0
|
7 |
+
arc_easy,acc,0.5286195286195287,0.010242962617927195,0
|
8 |
+
arc_easy,acc_norm,0.4654882154882155,0.010235314238969397,0
|
9 |
+
boolq,acc,0.5223241590214067,0.00873633411558504,1
|
10 |
+
cb,acc,0.4642857142857143,0.06724777654937658,1
|
11 |
+
cb,f1,0.3011204481792717,,1
|
12 |
+
copa,acc,0.79,0.040936018074033256,0
|
13 |
+
hellaswag,acc,0.4069906393148775,0.004902690765066426,0
|
14 |
+
hellaswag,acc_norm,0.518621788488349,0.004986319587524956,0
|
15 |
+
piqa,acc,0.7143634385201306,0.010539303948661932,0
|
16 |
+
piqa,acc_norm,0.7241566920565833,0.01042780550272912,0
|
17 |
+
rte,acc,0.5487364620938628,0.029953149241808946,0
|
18 |
+
sciq,acc,0.75,0.013699915608779773,0
|
19 |
+
sciq,acc_norm,0.681,0.014746404865473493,0
|
20 |
+
storycloze_2016,acc,0.6440406199893105,0.01107225418438284,0
|
21 |
+
winogrande,acc,0.5524861878453039,0.013974847640536203,0
|
4b284b84bc4v2/evaluation/rankeval/4b284b84bc4v2_0_lm-eval_global_step80108_2023-02-15-14-49-22_0shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.337,
|
5 |
-
"acc_stderr": 0.014955087918653605
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.327,
|
9 |
-
"acc_stderr": 0.014842213153411242
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3475,
|
13 |
-
"acc_stderr": 0.013751753243291854
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.4642857142857143,
|
17 |
-
"acc_stderr": 0.06724777654937658,
|
18 |
-
"f1": 0.3011204481792717
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.79,
|
22 |
-
"acc_stderr": 0.040936018074033256
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.4069906393148775,
|
26 |
-
"acc_stderr": 0.004902690765066426,
|
27 |
-
"acc_norm": 0.518621788488349,
|
28 |
-
"acc_norm_stderr": 0.004986319587524956
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5487364620938628,
|
32 |
-
"acc_stderr": 0.029953149241808946
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5524861878453039,
|
36 |
-
"acc_stderr": 0.013974847640536203
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6440406199893105,
|
40 |
-
"acc_stderr": 0.01107225418438284
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5223241590214067,
|
44 |
-
"acc_stderr": 0.00873633411558504
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5286195286195287,
|
48 |
-
"acc_stderr": 0.010242962617927195,
|
49 |
-
"acc_norm": 0.4654882154882155,
|
50 |
-
"acc_norm_stderr": 0.010235314238969397
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.24744027303754265,
|
54 |
-
"acc_stderr": 0.012610352663292673,
|
55 |
-
"acc_norm": 0.28924914675767915,
|
56 |
-
"acc_norm_stderr": 0.013250012579393443
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.75,
|
60 |
-
"acc_stderr": 0.013699915608779773,
|
61 |
-
"acc_norm": 0.681,
|
62 |
-
"acc_norm_stderr": 0.014746404865473493
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7143634385201306,
|
66 |
-
"acc_stderr": 0.010539303948661932,
|
67 |
-
"acc_norm": 0.7241566920565833,
|
68 |
-
"acc_norm_stderr": 0.01042780550272912
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4b284b84bc4v2/evaluation/rankeval/4b284b84bc4v2_1.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.312,0.014658474370509005,0
|
3 |
+
anli_r2,acc,0.328,0.014853842487270336,0
|
4 |
+
anli_r3,acc,0.32666666666666666,0.013544340907003665,0
|
5 |
+
arc_challenge,acc,0.2832764505119454,0.013167478735134575,0
|
6 |
+
arc_challenge,acc_norm,0.29436860068259385,0.013318528460539422,0
|
7 |
+
arc_easy,acc,0.6094276094276094,0.01001105911206424,0
|
8 |
+
arc_easy,acc_norm,0.5631313131313131,0.010177672928157695,0
|
9 |
+
boolq,acc,0.5324159021406728,0.008726657178723137,1
|
10 |
+
cb,acc,0.5,0.06741998624632421,1
|
11 |
+
cb,f1,0.4627446995868048,,1
|
12 |
+
copa,acc,0.71,0.04560480215720684,0
|
13 |
+
hellaswag,acc,0.3833897629954192,0.0048521826212742526,0
|
14 |
+
hellaswag,acc_norm,0.47769368651663013,0.00498481339101621,0
|
15 |
+
piqa,acc,0.750816104461371,0.010091882770120216,0
|
16 |
+
piqa,acc_norm,0.7584330794341676,0.009986718001804439,0
|
17 |
+
rte,acc,0.4657039711191336,0.030025579819366426,0
|
18 |
+
sciq,acc,0.84,0.011598902298689004,0
|
19 |
+
sciq,acc_norm,0.795,0.012772554096113118,0
|
20 |
+
storycloze_2016,acc,0.6456440406199893,0.011061031791615487,0
|
21 |
+
winogrande,acc,0.5706393054459353,0.01391153749996917,0
|
4b284b84bc4v2/evaluation/rankeval/4b284b84bc4v2_1_lm-eval_global_step80108_2023-02-15-14-49-22_1shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.312,
|
5 |
-
"acc_stderr": 0.014658474370509005
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.328,
|
9 |
-
"acc_stderr": 0.014853842487270336
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.32666666666666666,
|
13 |
-
"acc_stderr": 0.013544340907003665
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.5,
|
17 |
-
"acc_stderr": 0.06741998624632421,
|
18 |
-
"f1": 0.4627446995868048
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.71,
|
22 |
-
"acc_stderr": 0.04560480215720684
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.3833897629954192,
|
26 |
-
"acc_stderr": 0.0048521826212742526,
|
27 |
-
"acc_norm": 0.47769368651663013,
|
28 |
-
"acc_norm_stderr": 0.00498481339101621
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.4657039711191336,
|
32 |
-
"acc_stderr": 0.030025579819366426
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5706393054459353,
|
36 |
-
"acc_stderr": 0.01391153749996917
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6456440406199893,
|
40 |
-
"acc_stderr": 0.011061031791615487
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5324159021406728,
|
44 |
-
"acc_stderr": 0.008726657178723137
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6094276094276094,
|
48 |
-
"acc_stderr": 0.01001105911206424,
|
49 |
-
"acc_norm": 0.5631313131313131,
|
50 |
-
"acc_norm_stderr": 0.010177672928157695
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.2832764505119454,
|
54 |
-
"acc_stderr": 0.013167478735134575,
|
55 |
-
"acc_norm": 0.29436860068259385,
|
56 |
-
"acc_norm_stderr": 0.013318528460539422
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.84,
|
60 |
-
"acc_stderr": 0.011598902298689004,
|
61 |
-
"acc_norm": 0.795,
|
62 |
-
"acc_norm_stderr": 0.012772554096113118
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.750816104461371,
|
66 |
-
"acc_stderr": 0.010091882770120216,
|
67 |
-
"acc_norm": 0.7584330794341676,
|
68 |
-
"acc_norm_stderr": 0.009986718001804439
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4b284b84bc4v2/evaluation/rankeval/4b284b84bc4v2_2.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.308,0.014606483127342763,0
|
3 |
+
anli_r2,acc,0.323,0.014794927843348639,0
|
4 |
+
anli_r3,acc,0.3441666666666667,0.013720551062295755,0
|
5 |
+
arc_challenge,acc,0.29692832764505117,0.013352025976725223,0
|
6 |
+
arc_challenge,acc_norm,0.32081911262798635,0.013640943091946531,0
|
7 |
+
arc_easy,acc,0.6191077441077442,0.009964428212260372,0
|
8 |
+
arc_easy,acc_norm,0.5858585858585859,0.010107387673002528,0
|
9 |
+
boolq,acc,0.5577981651376147,0.008686430526114496,1
|
10 |
+
cb,acc,0.30357142857142855,0.06199938655510754,1
|
11 |
+
cb,f1,0.262831508114527,,1
|
12 |
+
copa,acc,0.71,0.045604802157206845,0
|
13 |
+
hellaswag,acc,0.3874726150169289,0.0048617741296124945,0
|
14 |
+
hellaswag,acc_norm,0.47759410476000796,0.004984768912326942,0
|
15 |
+
piqa,acc,0.7584330794341676,0.009986718001804463,0
|
16 |
+
piqa,acc_norm,0.766050054406964,0.009877236895137432,0
|
17 |
+
rte,acc,0.51985559566787,0.030072723167317184,0
|
18 |
+
sciq,acc,0.833,0.01180043432464459,0
|
19 |
+
sciq,acc_norm,0.8,0.012655439943366665,0
|
20 |
+
storycloze_2016,acc,0.6515232495991449,0.01101871778478849,0
|
21 |
+
winogrande,acc,0.5564325177584846,0.0139626949076204,0
|
4b284b84bc4v2/evaluation/rankeval/4b284b84bc4v2_2_lm-eval_global_step80108_2023-02-15-14-49-22_2shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.308,
|
5 |
-
"acc_stderr": 0.014606483127342763
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.323,
|
9 |
-
"acc_stderr": 0.014794927843348639
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3441666666666667,
|
13 |
-
"acc_stderr": 0.013720551062295755
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.30357142857142855,
|
17 |
-
"acc_stderr": 0.06199938655510754,
|
18 |
-
"f1": 0.262831508114527
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.71,
|
22 |
-
"acc_stderr": 0.045604802157206845
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.3874726150169289,
|
26 |
-
"acc_stderr": 0.0048617741296124945,
|
27 |
-
"acc_norm": 0.47759410476000796,
|
28 |
-
"acc_norm_stderr": 0.004984768912326942
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.51985559566787,
|
32 |
-
"acc_stderr": 0.030072723167317184
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5564325177584846,
|
36 |
-
"acc_stderr": 0.0139626949076204
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6515232495991449,
|
40 |
-
"acc_stderr": 0.01101871778478849
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5577981651376147,
|
44 |
-
"acc_stderr": 0.008686430526114496
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6191077441077442,
|
48 |
-
"acc_stderr": 0.009964428212260372,
|
49 |
-
"acc_norm": 0.5858585858585859,
|
50 |
-
"acc_norm_stderr": 0.010107387673002528
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.29692832764505117,
|
54 |
-
"acc_stderr": 0.013352025976725223,
|
55 |
-
"acc_norm": 0.32081911262798635,
|
56 |
-
"acc_norm_stderr": 0.013640943091946531
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.833,
|
60 |
-
"acc_stderr": 0.01180043432464459,
|
61 |
-
"acc_norm": 0.8,
|
62 |
-
"acc_norm_stderr": 0.012655439943366665
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7584330794341676,
|
66 |
-
"acc_stderr": 0.009986718001804463,
|
67 |
-
"acc_norm": 0.766050054406964,
|
68 |
-
"acc_norm_stderr": 0.009877236895137432
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4b284b84bc4v2/evaluation/rankeval/4b284b84bc4v2_3.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.347,0.015060472031706622,0
|
3 |
+
anli_r2,acc,0.346,0.015050266127564448,0
|
4 |
+
anli_r3,acc,0.335,0.013630871843821476,0
|
5 |
+
arc_challenge,acc,0.2935153583617747,0.013307250444941122,0
|
6 |
+
arc_challenge,acc_norm,0.31143344709897613,0.013532472099850942,0
|
7 |
+
arc_easy,acc,0.6174242424242424,0.009972837790531477,0
|
8 |
+
arc_easy,acc_norm,0.6102693602693603,0.010007169391797055,0
|
9 |
+
boolq,acc,0.5654434250764526,0.008669824006668013,1
|
10 |
+
cb,acc,0.35714285714285715,0.06460957383809221,1
|
11 |
+
cb,f1,0.2986564996368918,,1
|
12 |
+
copa,acc,0.8,0.04020151261036845,0
|
13 |
+
hellaswag,acc,0.38259310894244175,0.004850268986903357,0
|
14 |
+
hellaswag,acc_norm,0.48078072097191793,0.004986093791041665,0
|
15 |
+
piqa,acc,0.7573449401523396,0.010002002569708698,0
|
16 |
+
piqa,acc_norm,0.7665941240478781,0.00986924788952099,0
|
17 |
+
rte,acc,0.5595667870036101,0.029882123363118726,0
|
18 |
+
sciq,acc,0.84,0.011598902298689009,0
|
19 |
+
sciq,acc_norm,0.818,0.012207580637662144,0
|
20 |
+
storycloze_2016,acc,0.6483164083377873,0.011042025772682543,0
|
21 |
+
winogrande,acc,0.580110497237569,0.013870943986310396,0
|
4b284b84bc4v2/evaluation/rankeval/4b284b84bc4v2_3_lm-eval_global_step80108_2023-02-15-14-49-22_3shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.347,
|
5 |
-
"acc_stderr": 0.015060472031706622
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.346,
|
9 |
-
"acc_stderr": 0.015050266127564448
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.335,
|
13 |
-
"acc_stderr": 0.013630871843821476
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.35714285714285715,
|
17 |
-
"acc_stderr": 0.06460957383809221,
|
18 |
-
"f1": 0.2986564996368918
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.8,
|
22 |
-
"acc_stderr": 0.04020151261036845
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.38259310894244175,
|
26 |
-
"acc_stderr": 0.004850268986903357,
|
27 |
-
"acc_norm": 0.48078072097191793,
|
28 |
-
"acc_norm_stderr": 0.004986093791041665
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5595667870036101,
|
32 |
-
"acc_stderr": 0.029882123363118726
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.580110497237569,
|
36 |
-
"acc_stderr": 0.013870943986310396
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6483164083377873,
|
40 |
-
"acc_stderr": 0.011042025772682543
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5654434250764526,
|
44 |
-
"acc_stderr": 0.008669824006668013
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6174242424242424,
|
48 |
-
"acc_stderr": 0.009972837790531477,
|
49 |
-
"acc_norm": 0.6102693602693603,
|
50 |
-
"acc_norm_stderr": 0.010007169391797055
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.2935153583617747,
|
54 |
-
"acc_stderr": 0.013307250444941122,
|
55 |
-
"acc_norm": 0.31143344709897613,
|
56 |
-
"acc_norm_stderr": 0.013532472099850942
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.84,
|
60 |
-
"acc_stderr": 0.011598902298689009,
|
61 |
-
"acc_norm": 0.818,
|
62 |
-
"acc_norm_stderr": 0.012207580637662144
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7573449401523396,
|
66 |
-
"acc_stderr": 0.010002002569708698,
|
67 |
-
"acc_norm": 0.7665941240478781,
|
68 |
-
"acc_norm_stderr": 0.00986924788952099
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4b284b84bc4v2/evaluation/rankeval/4b284b84bc4v2_4.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.322,0.014782913600996666,0
|
3 |
+
anli_r2,acc,0.357,0.015158521721486774,0
|
4 |
+
anli_r3,acc,0.34833333333333333,0.013759437498874079,0
|
5 |
+
arc_challenge,acc,0.2901023890784983,0.01326157367752077,0
|
6 |
+
arc_challenge,acc_norm,0.3250853242320819,0.013688147309729124,0
|
7 |
+
arc_easy,acc,0.6321548821548821,0.009894923464455193,0
|
8 |
+
arc_easy,acc_norm,0.61489898989899,0.00998521479873725,0
|
9 |
+
boolq,acc,0.563914373088685,0.008673312776324932,1
|
10 |
+
cb,acc,0.32142857142857145,0.06297362289056341,1
|
11 |
+
cb,f1,0.2855772439105772,,1
|
12 |
+
copa,acc,0.77,0.042295258468165044,0
|
13 |
+
hellaswag,acc,0.3828918542123083,0.004850988215167546,0
|
14 |
+
hellaswag,acc_norm,0.4871539533957379,0.004988134303021793,0
|
15 |
+
piqa,acc,0.7600652883569097,0.009963625892809544,0
|
16 |
+
piqa,acc_norm,0.7687704026115343,0.009837063180625334,0
|
17 |
+
rte,acc,0.4729241877256318,0.030052303463143713,0
|
18 |
+
sciq,acc,0.85,0.0112972398234093,0
|
19 |
+
sciq,acc_norm,0.842,0.01153989467755957,0
|
20 |
+
storycloze_2016,acc,0.6702298236237306,0.010871682471395135,0
|
21 |
+
winogrande,acc,0.5722178374112076,0.013905134013839943,0
|
4b284b84bc4v2/evaluation/rankeval/4b284b84bc4v2_4_lm-eval_global_step80108_2023-02-15-14-49-22_4shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.322,
|
5 |
-
"acc_stderr": 0.014782913600996666
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.357,
|
9 |
-
"acc_stderr": 0.015158521721486774
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.34833333333333333,
|
13 |
-
"acc_stderr": 0.013759437498874079
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.32142857142857145,
|
17 |
-
"acc_stderr": 0.06297362289056341,
|
18 |
-
"f1": 0.2855772439105772
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.77,
|
22 |
-
"acc_stderr": 0.042295258468165044
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.3828918542123083,
|
26 |
-
"acc_stderr": 0.004850988215167546,
|
27 |
-
"acc_norm": 0.4871539533957379,
|
28 |
-
"acc_norm_stderr": 0.004988134303021793
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.4729241877256318,
|
32 |
-
"acc_stderr": 0.030052303463143713
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5722178374112076,
|
36 |
-
"acc_stderr": 0.013905134013839943
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6702298236237306,
|
40 |
-
"acc_stderr": 0.010871682471395135
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.563914373088685,
|
44 |
-
"acc_stderr": 0.008673312776324932
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6321548821548821,
|
48 |
-
"acc_stderr": 0.009894923464455193,
|
49 |
-
"acc_norm": 0.61489898989899,
|
50 |
-
"acc_norm_stderr": 0.00998521479873725
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.2901023890784983,
|
54 |
-
"acc_stderr": 0.01326157367752077,
|
55 |
-
"acc_norm": 0.3250853242320819,
|
56 |
-
"acc_norm_stderr": 0.013688147309729124
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.85,
|
60 |
-
"acc_stderr": 0.0112972398234093,
|
61 |
-
"acc_norm": 0.842,
|
62 |
-
"acc_norm_stderr": 0.01153989467755957
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7600652883569097,
|
66 |
-
"acc_stderr": 0.009963625892809544,
|
67 |
-
"acc_norm": 0.7687704026115343,
|
68 |
-
"acc_norm_stderr": 0.009837063180625334
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4b284b84bc4v2/evaluation/rankeval/4b284b84bc4v2_5.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.345,0.015039986742055238,0
|
3 |
+
anli_r2,acc,0.339,0.014976758771620345,0
|
4 |
+
anli_r3,acc,0.3516666666666667,0.013789711695404789,0
|
5 |
+
arc_challenge,acc,0.29692832764505117,0.013352025976725222,0
|
6 |
+
arc_challenge,acc_norm,0.32764505119453924,0.013715847940719346,0
|
7 |
+
arc_easy,acc,0.6393097643097643,0.009853512108416734,0
|
8 |
+
arc_easy,acc_norm,0.6220538720538721,0.009949405744045481,0
|
9 |
+
boolq,acc,0.5700305810397553,0.008658853690729254,1
|
10 |
+
cb,acc,0.2857142857142857,0.060914490387317256,1
|
11 |
+
cb,f1,0.26703155274583845,,1
|
12 |
+
copa,acc,0.75,0.04351941398892446,0
|
13 |
+
hellaswag,acc,0.3828918542123083,0.004850988215167541,0
|
14 |
+
hellaswag,acc_norm,0.48605855407289383,0.00498784136740252,0
|
15 |
+
piqa,acc,0.7546245919477693,0.010039831320422396,0
|
16 |
+
piqa,acc_norm,0.76550598476605,0.00988520314324054,0
|
17 |
+
rte,acc,0.516245487364621,0.030080573208738064,0
|
18 |
+
sciq,acc,0.853,0.011203415395160336,0
|
19 |
+
sciq,acc_norm,0.851,0.01126614068463217,0
|
20 |
+
storycloze_2016,acc,0.6622127204703367,0.010937034991003881,0
|
21 |
+
winogrande,acc,0.55327545382794,0.01397248837161669,0
|
4b284b84bc4v2/evaluation/rankeval/4b284b84bc4v2_5_lm-eval_global_step80108_2023-02-15-14-49-22_5shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.345,
|
5 |
-
"acc_stderr": 0.015039986742055238
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.339,
|
9 |
-
"acc_stderr": 0.014976758771620345
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3516666666666667,
|
13 |
-
"acc_stderr": 0.013789711695404789
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.2857142857142857,
|
17 |
-
"acc_stderr": 0.060914490387317256,
|
18 |
-
"f1": 0.26703155274583845
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.75,
|
22 |
-
"acc_stderr": 0.04351941398892446
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.3828918542123083,
|
26 |
-
"acc_stderr": 0.004850988215167541,
|
27 |
-
"acc_norm": 0.48605855407289383,
|
28 |
-
"acc_norm_stderr": 0.00498784136740252
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.516245487364621,
|
32 |
-
"acc_stderr": 0.030080573208738064
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.55327545382794,
|
36 |
-
"acc_stderr": 0.01397248837161669
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6622127204703367,
|
40 |
-
"acc_stderr": 0.010937034991003881
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5700305810397553,
|
44 |
-
"acc_stderr": 0.008658853690729254
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6393097643097643,
|
48 |
-
"acc_stderr": 0.009853512108416734,
|
49 |
-
"acc_norm": 0.6220538720538721,
|
50 |
-
"acc_norm_stderr": 0.009949405744045481
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.29692832764505117,
|
54 |
-
"acc_stderr": 0.013352025976725222,
|
55 |
-
"acc_norm": 0.32764505119453924,
|
56 |
-
"acc_norm_stderr": 0.013715847940719346
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.853,
|
60 |
-
"acc_stderr": 0.011203415395160336,
|
61 |
-
"acc_norm": 0.851,
|
62 |
-
"acc_norm_stderr": 0.01126614068463217
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7546245919477693,
|
66 |
-
"acc_stderr": 0.010039831320422396,
|
67 |
-
"acc_norm": 0.76550598476605,
|
68 |
-
"acc_norm_stderr": 0.00988520314324054
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|