Muennighoff committed on
Commit 78797a3
1 Parent(s): 411a88c
4b284b84bc4v2/evaluation/generation/merged.csv ADDED
@@ -0,0 +1,53 @@
+ dataset,fewshots,prompt,metric,value
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.03077709464553236
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.03077709464553236
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.17699912791282796
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.17699912791282796
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.20955751814613366
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.20955751814613366
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.21988121787054832
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.21988121787054832
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.22411289670678503
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.22411289670678503
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.22601021194014084
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.22601021194014084
+ e2e_nlg_cleaned,5,average,multiple,0.18122301120366135
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.04120350350492188
+ gem_xsum,0,median,rouge2_fmeasure,0.04120350350492188
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.03612025716424621
+ gem_xsum,1,median,rouge2_fmeasure,0.03612025716424621
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.03605595900823992
+ gem_xsum,2,median,rouge2_fmeasure,0.03605595900823992
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.03745595159864199
+ gem_xsum,3,median,rouge2_fmeasure,0.03745595159864199
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.009095248607177277
+ gem_xsum,4,median,rouge2_fmeasure,0.009095248607177277
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0008276164457091518
+ gem_xsum,5,median,rouge2_fmeasure,0.0008276164457091518
+ gem_xsum,5,average,multiple,0.026793089388156072
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.0005024848669286521
+ web_nlg_en,0,median,rouge2_fmeasure,0.0005024848669286521
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.00011613167335155065
+ web_nlg_en,1,median,rouge2_fmeasure,0.00011613167335155065
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.00030928662436137877
+ web_nlg_en,2,median,rouge2_fmeasure,0.00030928662436137877
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.0034293208231205925
+ web_nlg_en,3,median,rouge2_fmeasure,0.0034293208231205925
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.015493410230104816
+ web_nlg_en,4,median,rouge2_fmeasure,0.015493410230104816
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.03207493916916435
+ web_nlg_en,5,median,rouge2_fmeasure,0.03207493916916435
+ web_nlg_en,5,average,multiple,0.00865426223117189
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.022488534485982925
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.022488534485982925
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.03119150310857869
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.03119150310857869
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.03642005087819046
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.03642005087819046
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.033404677016268654
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.033404677016268654
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.012263061789486048
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.012263061789486048
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.002039871936933482
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.002039871936933482
+ wiki_lingua_en,5,average,multiple,0.022967949869240042
4b284b84bc4v2/evaluation/generation/merged.json ADDED
@@ -0,0 +1 @@
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.0029280550995302184, "bleu_stderr": 0.0016366911962026006, "rouge1_fmeasure": 0.01166133384965969, "rouge1_fmeasure_stderr": 0.0005948210079967459, "rouge1_precision": 0.02476807814768773, "rouge1_precision_stderr": 0.0028530818463061774, "rouge1_recall": 0.02678260190311703, "rouge1_recall_stderr": 0.0012109603137937753, "rouge2_fmeasure": 0.0005024848669286521, "rouge2_fmeasure_stderr": 9.489234963288616e-05, "rouge2_precision": 0.000817405348780765, "rouge2_precision_stderr": 0.000563057630226192, "rouge2_recall": 0.0018475799663002137, "rouge2_recall_stderr": 0.00026242303057926657, "rougeL_fmeasure": 0.011371922090005586, "rougeL_fmeasure_stderr": 0.0005646757671758188, "rougeL_precision": 0.024307558662430548, "rougeL_precision_stderr": 0.002809259680412027, "rougeL_recall": 0.026306702058363073, "rougeL_recall_stderr": 0.0011921693661227405, "rougeLsum_fmeasure": 0.011437412029266373, "rougeLsum_fmeasure_stderr": 0.0005664967802496282, "rougeLsum_precision": 0.024354402252677848, "rougeLsum_precision_stderr": 0.0028093033912843027, "rougeLsum_recall": 0.026422971471523896, "rougeLsum_recall_stderr": 0.0011947591414667835}}, "1": {"PALM_prompt": {"bleu": 0.0007166390659057791, "bleu_stderr": 0.00015271503689012457, "rouge1_fmeasure": 0.004296871966216284, "rouge1_fmeasure_stderr": 0.00041150104568959847, "rouge1_precision": 0.0068975376976135325, "rouge1_precision_stderr": 0.0014736511243282327, "rouge1_recall": 0.00885887681369143, "rouge1_recall_stderr": 0.0006944193964364368, "rouge2_fmeasure": 0.00011613167335155065, "rouge2_fmeasure_stderr": 6.102001124245185e-05, "rouge2_precision": 0.0003171875473118265, "rouge2_precision_stderr": 0.0002814151389304619, "rouge2_recall": 0.00028567405139197236, "rouge2_recall_stderr": 0.00011142619043034172, "rougeL_fmeasure": 0.004206988239060447, "rougeL_fmeasure_stderr": 0.0003855165321832455, "rougeL_precision": 0.006738301550344965, "rougeL_precision_stderr": 0.0014262564028128674, "rougeL_recall": 0.008632810279305616, "rougeL_recall_stderr": 0.0006644470939899456, "rougeLsum_fmeasure": 0.004206988239060447, "rougeLsum_fmeasure_stderr": 0.0003855165321832455, "rougeLsum_precision": 0.006738301550344965, "rougeLsum_precision_stderr": 0.0014262564028128674, "rougeLsum_recall": 0.008632810279305616, "rougeLsum_recall_stderr": 0.0006644470939899456}}, "2": {"PALM_prompt": {"bleu": 0.0011433649657405618, "bleu_stderr": 9.738318964881656e-05, "rouge1_fmeasure": 0.006962873263876942, "rouge1_fmeasure_stderr": 0.0004895823378957566, "rouge1_precision": 0.01320147157904189, "rouge1_precision_stderr": 0.001952068982460362, "rouge1_recall": 0.01579302663455386, "rouge1_recall_stderr": 0.0009653406116352021, "rouge2_fmeasure": 0.00030928662436137877, "rouge2_fmeasure_stderr": 9.611704402898231e-05, "rouge2_precision": 0.0002858015060017772, "rouge2_precision_stderr": 0.00014402794289871516, "rouge2_recall": 0.000819950815164877, "rouge2_recall_stderr": 0.00019418552636900682, "rougeL_fmeasure": 0.006935271321832535, "rougeL_fmeasure_stderr": 0.0004886807794040627, "rougeL_precision": 0.013185732399727665, "rougeL_precision_stderr": 0.0019520342297636402, "rougeL_recall": 0.015624421046336484, "rougeL_recall_stderr": 0.000944465094721435, "rougeLsum_fmeasure": 0.006941075328744422, "rougeLsum_fmeasure_stderr": 0.000488906538384857, "rougeLsum_precision": 0.013190229308103155, "rougeLsum_precision_stderr": 0.0019520534211779996, "rougeLsum_recall": 0.015631466185916838, "rougeLsum_recall_stderr": 
0.0009447148312929678}}, "3": {"PALM_prompt": {"bleu": 0.017458232906879215, "bleu_stderr": 0.0033790622867434076, "rouge1_fmeasure": 0.019724953977195446, "rouge1_fmeasure_stderr": 0.0007547850264824778, "rouge1_precision": 0.023119662960575944, "rouge1_precision_stderr": 0.00233355843036657, "rouge1_recall": 0.05282846241750935, "rouge1_recall_stderr": 0.0019359702235571255, "rouge2_fmeasure": 0.0034293208231205925, "rouge2_fmeasure_stderr": 0.00026671815521471114, "rouge2_precision": 0.0026448252006929015, "rouge2_precision_stderr": 0.00042559571122428457, "rouge2_recall": 0.010346087040436342, "rouge2_recall_stderr": 0.0008425074141827781, "rougeL_fmeasure": 0.018929605765833746, "rougeL_fmeasure_stderr": 0.000709339053434189, "rougeL_precision": 0.022478179089093484, "rougeL_precision_stderr": 0.002302691625879073, "rougeL_recall": 0.050774510212971205, "rougeL_recall_stderr": 0.001845862101982268, "rougeLsum_fmeasure": 0.01914065951998809, "rougeLsum_fmeasure_stderr": 0.0007220978771409472, "rougeLsum_precision": 0.022715634939831118, "rougeLsum_precision_stderr": 0.0023247590603593525, "rougeLsum_recall": 0.05112714494797748, "rougeLsum_recall_stderr": 0.0018634753399370816}}, "4": {"PALM_prompt": {"bleu": 0.07367508334810638, "bleu_stderr": 0.008598772765141552, "rouge1_fmeasure": 0.04840826317900296, "rouge1_fmeasure_stderr": 0.0012611715103482287, "rouge1_precision": 0.03481743016243523, "rouge1_precision_stderr": 0.0015627658049730399, "rouge1_recall": 0.13671695024124966, "rouge1_recall_stderr": 0.003247662069099122, "rouge2_fmeasure": 0.015493410230104816, "rouge2_fmeasure_stderr": 0.0006612843945157879, "rouge2_precision": 0.010032418466509586, "rouge2_precision_stderr": 0.00046386127467674785, "rouge2_recall": 0.04572678691854654, "rouge2_recall_stderr": 0.0019536943501289607, "rougeL_fmeasure": 0.04611942981100928, "rougeL_fmeasure_stderr": 0.001185835572456282, "rougeL_precision": 0.03330887063532685, "rougeL_precision_stderr": 0.0015356588851509804, "rougeL_recall": 0.1311235441315499, "rougeL_recall_stderr": 0.003109936140992514, "rougeLsum_fmeasure": 0.04632371389499153, "rougeLsum_fmeasure_stderr": 0.0011928970045761962, "rougeLsum_precision": 0.03348521211452374, "rougeLsum_precision_stderr": 0.0015401354090072426, "rougeLsum_recall": 0.13094153361519625, "rougeLsum_recall_stderr": 0.0030817560251935527}}, "5": {"PALM_prompt": {"bleu": 0.14403240257682612, "bleu_stderr": 0.020502126673992448, "rouge1_fmeasure": 0.07612242231547006, "rouge1_fmeasure_stderr": 0.0016048490141600572, "rouge1_precision": 0.05148247265764308, "rouge1_precision_stderr": 0.0015275173054537983, "rouge1_recall": 0.22173682737900655, "rouge1_recall_stderr": 0.004161854186037113, "rouge2_fmeasure": 0.03207493916916435, "rouge2_fmeasure_stderr": 0.0009360890624163692, "rouge2_precision": 0.020599492046193216, "rouge2_precision_stderr": 0.0006362341280141862, "rouge2_recall": 0.09581728290567497, "rouge2_recall_stderr": 0.002689360100598362, "rougeL_fmeasure": 0.07338633751298693, "rougeL_fmeasure_stderr": 0.0015133873994071854, "rougeL_precision": 0.049523715375363805, "rougeL_precision_stderr": 0.0014392263349550436, "rougeL_recall": 0.21425862449161553, "rougeL_recall_stderr": 0.003954730793317483, "rougeLsum_fmeasure": 0.07323243858705794, "rougeLsum_fmeasure_stderr": 0.0015324335796681066, "rougeLsum_precision": 0.04964705800825754, "rougeLsum_precision_stderr": 0.0014944377050311116, "rougeLsum_recall": 0.2128858004458588, "rougeLsum_recall_stderr": 0.003935525678149997}}}, 
"GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 0.9851933422266917, "bleu_stderr": 0.06952404492630372, "rouge1_fmeasure": 0.126716346898361, "rouge1_fmeasure_stderr": 0.00183643925768546, "rouge1_precision": 0.1132744689534206, "rouge1_precision_stderr": 0.001964062271703089, "rouge1_recall": 0.17920924171315153, "rouge1_recall_stderr": 0.0026059508027650407, "rouge2_fmeasure": 0.022488534485982925, "rouge2_fmeasure_stderr": 0.0006967583056651384, "rouge2_precision": 0.019843012980071302, "rouge2_precision_stderr": 0.0006687266299046368, "rouge2_recall": 0.033167544297920445, "rouge2_recall_stderr": 0.0011346551086910673, "rougeL_fmeasure": 0.10671839542240537, "rougeL_fmeasure_stderr": 0.0014384846189327972, "rougeL_precision": 0.09498424083859268, "rougeL_precision_stderr": 0.001593607551270447, "rougeL_recall": 0.1533714901279331, "rougeL_recall_stderr": 0.002170011502145423, "rougeLsum_fmeasure": 0.11738170897482265, "rougeLsum_fmeasure_stderr": 0.0016744399974736352, "rougeLsum_precision": 0.10485890631099086, "rougeLsum_precision_stderr": 0.0018089123910430586, "rougeLsum_recall": 0.16659652845304845, "rougeLsum_recall_stderr": 0.0024097743667806444}}, "1": {"tldr_en": {"bleu": 1.6610768648433896, "bleu_stderr": 0.056832353542049345, "rouge1_fmeasure": 0.15184455155867413, "rouge1_fmeasure_stderr": 0.002034563245416516, "rouge1_precision": 0.13908823518286104, "rouge1_precision_stderr": 0.0022446753794830056, "rouge1_recall": 0.21098808280302153, "rouge1_recall_stderr": 0.0029368504277361264, "rouge2_fmeasure": 0.03119150310857869, "rouge2_fmeasure_stderr": 0.0008867537820946861, "rouge2_precision": 0.027877042844761103, "rouge2_precision_stderr": 0.0009018124252275846, "rouge2_recall": 0.04519203101491114, "rouge2_recall_stderr": 0.0014098192399706936, "rougeL_fmeasure": 0.11913273540788426, "rougeL_fmeasure_stderr": 0.001473272989772387, "rougeL_precision": 0.10949982767815743, "rougeL_precision_stderr": 0.0017540251498615006, "rougeL_recall": 0.1676577471462561, "rougeL_recall_stderr": 0.0022806710881294024, "rougeLsum_fmeasure": 0.14194042014754973, "rougeLsum_fmeasure_stderr": 0.001891487197261062, "rougeLsum_precision": 0.12996252462048752, "rougeLsum_precision_stderr": 0.002095806974972115, "rougeLsum_recall": 0.19761740636135, "rougeLsum_recall_stderr": 0.0027462487095581455}}, "2": {"tldr_en": {"bleu": 2.0178632068220144, "bleu_stderr": 0.05630752545151827, "rouge1_fmeasure": 0.16464877037969322, "rouge1_fmeasure_stderr": 0.0020287077043937914, "rouge1_precision": 0.15190468435288887, "rouge1_precision_stderr": 0.002345142424551451, "rouge1_recall": 0.22705395048844934, "rouge1_recall_stderr": 0.002887946301050379, "rouge2_fmeasure": 0.03642005087819046, "rouge2_fmeasure_stderr": 0.0009241245974228669, "rouge2_precision": 0.03395974615464393, "rouge2_precision_stderr": 0.0010677653838954393, "rouge2_recall": 0.05172253964536197, "rouge2_recall_stderr": 0.0014222260298251664, "rougeL_fmeasure": 0.13112025398048113, "rougeL_fmeasure_stderr": 0.0015145211634699073, "rougeL_precision": 0.12120650835304664, "rougeL_precision_stderr": 0.001876346841785363, "rougeL_recall": 0.18342257938869802, "rougeL_recall_stderr": 0.0023311986779411147, "rougeLsum_fmeasure": 0.15343386118565805, "rougeLsum_fmeasure_stderr": 0.001886311570474951, "rougeLsum_precision": 0.1413354912495611, "rougeLsum_precision_stderr": 0.002183469939019903, "rougeLsum_recall": 0.21212536828784073, "rougeLsum_recall_stderr": 0.002709014023103977}}, "3": {"tldr_en": {"bleu": 2.2124816637688673, "bleu_stderr": 
0.05324645847028149, "rouge1_fmeasure": 0.14848863343770827, "rouge1_fmeasure_stderr": 0.0021653237700268767, "rouge1_precision": 0.1398125873700489, "rouge1_precision_stderr": 0.0024590771386981468, "rouge1_recall": 0.20581770819572856, "rouge1_recall_stderr": 0.0031688908975941706, "rouge2_fmeasure": 0.033404677016268654, "rouge2_fmeasure_stderr": 0.0009001030881667151, "rouge2_precision": 0.03147032611832327, "rouge2_precision_stderr": 0.0010041816927645, "rouge2_recall": 0.048666615188189155, "rouge2_recall_stderr": 0.0015022300615495168, "rougeL_fmeasure": 0.11645203897433642, "rougeL_fmeasure_stderr": 0.0016301987941532085, "rougeL_precision": 0.10967938260112484, "rougeL_precision_stderr": 0.001939118747782662, "rougeL_recall": 0.16415201449685513, "rougeL_recall_stderr": 0.002568933583030408, "rougeLsum_fmeasure": 0.1389363951676251, "rougeLsum_fmeasure_stderr": 0.002028209807222595, "rougeLsum_precision": 0.13081164522556862, "rougeLsum_precision_stderr": 0.0023048843089674047, "rougeLsum_recall": 0.19298480421501055, "rougeLsum_recall_stderr": 0.002991483092116459}}, "4": {"tldr_en": {"bleu": 0.518348574818673, "bleu_stderr": 0.04728338657384372, "rouge1_fmeasure": 0.05243492704960272, "rouge1_fmeasure_stderr": 0.0018267137943086668, "rouge1_precision": 0.051163725460641254, "rouge1_precision_stderr": 0.0020175130167421618, "rouge1_recall": 0.07547098530527592, "rouge1_recall_stderr": 0.002712368234388136, "rouge2_fmeasure": 0.012263061789486048, "rouge2_fmeasure_stderr": 0.0006259044145290426, "rouge2_precision": 0.012226230333192529, "rouge2_precision_stderr": 0.0008303652374957706, "rouge2_recall": 0.018957055186055123, "rouge2_recall_stderr": 0.0010937223070813828, "rougeL_fmeasure": 0.04136291423682112, "rougeL_fmeasure_stderr": 0.0014246743415782, "rougeL_precision": 0.040637879243783245, "rougeL_precision_stderr": 0.0016425005398963664, "rougeL_recall": 0.06062023770963191, "rougeL_recall_stderr": 0.0022164851906403818, "rougeLsum_fmeasure": 0.048791456361098884, "rougeLsum_fmeasure_stderr": 0.0016961708904439533, "rougeLsum_precision": 0.047669356275269074, "rougeLsum_precision_stderr": 0.0018834597641042353, "rougeLsum_recall": 0.07047643888720821, "rougeLsum_recall_stderr": 0.002538522085522552}}, "5": {"tldr_en": {"bleu": 5.830006972912073e-07, "bleu_stderr": 1.5242170268698066e-06, "rouge1_fmeasure": 0.008258433412209281, "rouge1_fmeasure_stderr": 0.0008152843888710176, "rouge1_precision": 0.00798899802009205, "rouge1_precision_stderr": 0.0008684678804321791, "rouge1_recall": 0.012157252874457363, "rouge1_recall_stderr": 0.0012354312911594197, "rouge2_fmeasure": 0.002039871936933482, "rouge2_fmeasure_stderr": 0.00026887096221603856, "rouge2_precision": 0.0019747973882538582, "rouge2_precision_stderr": 0.00029691155526316755, "rouge2_recall": 0.003195835732862682, "rouge2_recall_stderr": 0.0004683017569443119, "rougeL_fmeasure": 0.006505664193233564, "rougeL_fmeasure_stderr": 0.0006339130525234551, "rougeL_precision": 0.006327383054345089, "rougeL_precision_stderr": 0.0006970601584393748, "rougeL_recall": 0.009798952635742871, "rougeL_recall_stderr": 0.0010113333254833609, "rougeLsum_fmeasure": 0.007628696622003188, "rougeLsum_fmeasure_stderr": 0.0007515251763857421, "rougeLsum_precision": 0.007380499125010994, "rougeLsum_precision_stderr": 0.0007963027803529877, "rougeLsum_recall": 0.011247846618505664, "rougeLsum_recall_stderr": 0.0011352979551129004}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 1.9878161967998398, "bleu_stderr": 
0.06641862588673211, "rouge1_fmeasure": 0.14439442209508027, "rouge1_fmeasure_stderr": 0.001276646371980532, "rouge1_precision": 0.1230414146056, "rouge1_precision_stderr": 0.0018011287640235026, "rouge1_recall": 0.2149932361322497, "rouge1_recall_stderr": 0.0017187067878016052, "rouge2_fmeasure": 0.03077709464553236, "rouge2_fmeasure_stderr": 0.0007515788730606631, "rouge2_precision": 0.0249827917795897, "rouge2_precision_stderr": 0.0006558676504376822, "rouge2_recall": 0.04571245193961166, "rouge2_recall_stderr": 0.0011353362441016387, "rougeL_fmeasure": 0.13928620677060347, "rougeL_fmeasure_stderr": 0.0011638837692616326, "rougeL_precision": 0.11674536895319193, "rougeL_precision_stderr": 0.00152188794899193, "rougeL_recall": 0.20906254512253195, "rougeL_recall_stderr": 0.0016186984135260216, "rougeLsum_fmeasure": 0.11892783850700991, "rougeLsum_fmeasure_stderr": 0.001162432472538215, "rougeLsum_precision": 0.10170099056081539, "rougeLsum_precision_stderr": 0.0015751175790873655, "rougeLsum_recall": 0.177028719967226, "rougeLsum_recall_stderr": 0.0015830122478709882}}, "1": {"generate_text_restaurant": {"bleu": 9.080481219293423, "bleu_stderr": 0.13158038055159502, "rouge1_fmeasure": 0.4015743081483971, "rouge1_fmeasure_stderr": 0.0021978471318253212, "rouge1_precision": 0.3943302810116597, "rouge1_precision_stderr": 0.002747401885321671, "rouge1_recall": 0.45740536668656395, "rouge1_recall_stderr": 0.002914926103453838, "rouge2_fmeasure": 0.17699912791282796, "rouge2_fmeasure_stderr": 0.001750185332023163, "rouge2_precision": 0.1739497922758478, "rouge2_precision_stderr": 0.0019379057184171025, "rouge2_recall": 0.20361607378340835, "rouge2_recall_stderr": 0.0021737793672060774, "rougeL_fmeasure": 0.2944824817977815, "rougeL_fmeasure_stderr": 0.0017154971121821877, "rougeL_precision": 0.2883093375260347, "rougeL_precision_stderr": 0.0021332295123494853, "rougeL_recall": 0.33875745510049593, "rougeL_recall_stderr": 0.002404798937078158, "rougeLsum_fmeasure": 0.33455401313195715, "rougeLsum_fmeasure_stderr": 0.0021144031329413423, "rougeLsum_precision": 0.32905879506709923, "rougeLsum_precision_stderr": 0.002548531871233347, "rougeLsum_recall": 0.380730834219128, "rougeLsum_recall_stderr": 0.002722245216573897}}, "2": {"generate_text_restaurant": {"bleu": 11.880857643381121, "bleu_stderr": 0.13297726203755633, "rouge1_fmeasure": 0.4405962380870851, "rouge1_fmeasure_stderr": 0.001977493224162921, "rouge1_precision": 0.43508506023556653, "rouge1_precision_stderr": 0.002294177199252108, "rouge1_recall": 0.48224900238091717, "rouge1_recall_stderr": 0.002858032856284127, "rouge2_fmeasure": 0.20955751814613366, "rouge2_fmeasure_stderr": 0.0018140125780760031, "rouge2_precision": 0.20629151293135214, "rouge2_precision_stderr": 0.0018701912148922787, "rouge2_recall": 0.2318486855887002, "rouge2_recall_stderr": 0.0022863983188674076, "rougeL_fmeasure": 0.3259684050538889, "rougeL_fmeasure_stderr": 0.0017402765794117868, "rougeL_precision": 0.32194109393814613, "rougeL_precision_stderr": 0.0019483523439828958, "rougeL_recall": 0.35732875480485404, "rougeL_recall_stderr": 0.0024237882323617684, "rougeLsum_fmeasure": 0.3722300448615989, "rougeLsum_fmeasure_stderr": 0.002006603335020953, "rougeLsum_precision": 0.36746907546290897, "rougeLsum_precision_stderr": 0.0022249091259755735, "rougeLsum_recall": 0.4075428373909175, "rougeLsum_recall_stderr": 0.00272896532835388}}, "3": {"generate_text_restaurant": {"bleu": 12.668710535676373, "bleu_stderr": 0.11599960553637455, "rouge1_fmeasure": 
0.4483224588704832, "rouge1_fmeasure_stderr": 0.00196283613596949, "rouge1_precision": 0.4395509329611724, "rouge1_precision_stderr": 0.0022799777461364563, "rouge1_recall": 0.4928397756989274, "rouge1_recall_stderr": 0.002846184524464648, "rouge2_fmeasure": 0.21988121787054832, "rouge2_fmeasure_stderr": 0.0018519515945325045, "rouge2_precision": 0.21493759260102677, "rouge2_precision_stderr": 0.001901227495395828, "rouge2_recall": 0.24437272912243296, "rouge2_recall_stderr": 0.002354647079691416, "rougeL_fmeasure": 0.33468933092034114, "rougeL_fmeasure_stderr": 0.0017729142046844634, "rougeL_precision": 0.3282468443976815, "rougeL_precision_stderr": 0.001981937273750522, "rougeL_recall": 0.3683889985662075, "rougeL_recall_stderr": 0.002459195239143975, "rougeLsum_fmeasure": 0.38209771956653327, "rougeLsum_fmeasure_stderr": 0.0020243590337524803, "rougeLsum_precision": 0.37463074084445375, "rougeLsum_precision_stderr": 0.0022487871016798645, "rougeLsum_recall": 0.4200446710884972, "rougeLsum_recall_stderr": 0.0027467231496944235}}, "4": {"generate_text_restaurant": {"bleu": 12.987984921520598, "bleu_stderr": 0.16631332095853574, "rouge1_fmeasure": 0.4550163932883833, "rouge1_fmeasure_stderr": 0.002006124106156925, "rouge1_precision": 0.4426901446726453, "rouge1_precision_stderr": 0.002312374714781613, "rouge1_recall": 0.501289222749113, "rouge1_recall_stderr": 0.0028143944479086143, "rouge2_fmeasure": 0.22411289670678503, "rouge2_fmeasure_stderr": 0.0018929411054572837, "rouge2_precision": 0.21741314567975936, "rouge2_precision_stderr": 0.0019349399124124745, "rouge2_recall": 0.24947639077879472, "rouge2_recall_stderr": 0.0023667097986760496, "rougeL_fmeasure": 0.3385148446888659, "rougeL_fmeasure_stderr": 0.0017963558283099884, "rougeL_precision": 0.32911789740564507, "rougeL_precision_stderr": 0.0019701871866358513, "rougeL_recall": 0.373872891264614, "rougeL_recall_stderr": 0.002468625894339517, "rougeLsum_fmeasure": 0.38895985734645316, "rougeLsum_fmeasure_stderr": 0.002094963829800414, "rougeLsum_precision": 0.3781176477461178, "rougeLsum_precision_stderr": 0.002287152579955625, "rougeLsum_recall": 0.429023929215127, "rougeLsum_recall_stderr": 0.0027943725912901794}}, "5": {"generate_text_restaurant": {"bleu": 12.952424194291106, "bleu_stderr": 0.17721359501081466, "rouge1_fmeasure": 0.456966571846599, "rouge1_fmeasure_stderr": 0.0019734749210126813, "rouge1_precision": 0.4445492383379575, "rouge1_precision_stderr": 0.002320357386908294, "rouge1_recall": 0.5029642206493721, "rouge1_recall_stderr": 0.0027671737574197605, "rouge2_fmeasure": 0.22601021194014084, "rouge2_fmeasure_stderr": 0.00185518963183468, "rouge2_precision": 0.21950338541571118, "rouge2_precision_stderr": 0.0019236400840460968, "rouge2_recall": 0.25104119569831557, "rouge2_recall_stderr": 0.0023122193486125706, "rougeL_fmeasure": 0.3408087694501858, "rougeL_fmeasure_stderr": 0.0017879939906870438, "rougeL_precision": 0.3311755105323646, "rougeL_precision_stderr": 0.0019790975666403757, "rougeL_recall": 0.37610609585541155, "rougeL_recall_stderr": 0.0024496405296489626, "rougeLsum_fmeasure": 0.3904609207750673, "rougeLsum_fmeasure_stderr": 0.0020619775453817987, "rougeLsum_precision": 0.3796624983587727, "rougeLsum_precision_stderr": 0.002287049071998125, "rougeLsum_recall": 0.430049661746005, "rougeLsum_recall_stderr": 0.0027398492336168975}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.6178490169994428, "bleu_stderr": 0.0877349858045454, "rouge1_fmeasure": 0.19612972469676948, "rouge1_fmeasure_stderr": 
0.0025764100053972294, "rouge1_precision": 0.14359819378832311, "rouge1_precision_stderr": 0.0020650285188626525, "rouge1_recall": 0.3344043431088508, "rouge1_recall_stderr": 0.004372644717501253, "rouge2_fmeasure": 0.04120350350492188, "rouge2_fmeasure_stderr": 0.001445411825760303, "rouge2_precision": 0.029674664673500505, "rouge2_precision_stderr": 0.0010650293180486199, "rouge2_recall": 0.07214641016418301, "rouge2_recall_stderr": 0.0025347435955977373, "rougeL_fmeasure": 0.14563333421498437, "rougeL_fmeasure_stderr": 0.0018388552480541446, "rougeL_precision": 0.10655263726146283, "rougeL_precision_stderr": 0.0014837973374642718, "rougeL_recall": 0.2496302152114623, "rougeL_recall_stderr": 0.0032564726396995787, "rougeLsum_fmeasure": 0.15219649916955252, "rougeLsum_fmeasure_stderr": 0.0021005836765910536, "rougeLsum_precision": 0.11122305842755399, "rougeLsum_precision_stderr": 0.0016516903041689486, "rougeLsum_recall": 0.2610787813927188, "rougeLsum_recall_stderr": 0.0037057405165489008}}, "1": {"article_DOC_summary": {"bleu": 1.401872820123109, "bleu_stderr": 0.07118073185060977, "rouge1_fmeasure": 0.1884813019546172, "rouge1_fmeasure_stderr": 0.002494762200065633, "rouge1_precision": 0.1343874942138253, "rouge1_precision_stderr": 0.0018570510799802737, "rouge1_recall": 0.32893212915185643, "rouge1_recall_stderr": 0.004321710877930019, "rouge2_fmeasure": 0.03612025716424621, "rouge2_fmeasure_stderr": 0.0013694969765650487, "rouge2_precision": 0.025517600369870188, "rouge2_precision_stderr": 0.0009688572436838881, "rouge2_recall": 0.06465696110332493, "rouge2_recall_stderr": 0.002538422888293853, "rougeL_fmeasure": 0.139572275480932, "rougeL_fmeasure_stderr": 0.001773272158023383, "rougeL_precision": 0.09936945059700034, "rougeL_precision_stderr": 0.0013159322617693906, "rougeL_recall": 0.2449745089798639, "rougeL_recall_stderr": 0.0031984284255980005, "rougeLsum_fmeasure": 0.14792628000904443, "rougeLsum_fmeasure_stderr": 0.002038823331111779, "rougeLsum_precision": 0.10528930235196655, "rougeLsum_precision_stderr": 0.00150409519612042, "rougeLsum_recall": 0.25955915490668724, "rougeLsum_recall_stderr": 0.0036385063619601863}}, "2": {"article_DOC_summary": {"bleu": 1.3239175335791302, "bleu_stderr": 0.07771342322834383, "rouge1_fmeasure": 0.18293512337697232, "rouge1_fmeasure_stderr": 0.0024681499873010822, "rouge1_precision": 0.1306097070189259, "rouge1_precision_stderr": 0.0018291400497876672, "rouge1_recall": 0.3182133118841435, "rouge1_recall_stderr": 0.00432268646841341, "rouge2_fmeasure": 0.03605595900823992, "rouge2_fmeasure_stderr": 0.0013964259042751057, "rouge2_precision": 0.025394792824575525, "rouge2_precision_stderr": 0.000978591013958908, "rouge2_recall": 0.06504088261073109, "rouge2_recall_stderr": 0.0026333026819242173, "rougeL_fmeasure": 0.13986197658478078, "rougeL_fmeasure_stderr": 0.0018317764059515805, "rougeL_precision": 0.09964468455116615, "rougeL_precision_stderr": 0.001340823504149176, "rougeL_recall": 0.24488309458384658, "rougeL_recall_stderr": 0.0033667400945906413, "rougeLsum_fmeasure": 0.1422604749544217, "rougeLsum_fmeasure_stderr": 0.002028772651519479, "rougeLsum_precision": 0.10128534493133166, "rougeLsum_precision_stderr": 0.0014759117790227945, "rougeLsum_recall": 0.24926384915068273, "rougeLsum_recall_stderr": 0.0037026007649822485}}, "3": {"article_DOC_summary": {"bleu": 1.5403664660028038, "bleu_stderr": 0.049650292585335425, "rouge1_fmeasure": 0.17780864985044104, "rouge1_fmeasure_stderr": 0.0026565418832362885, "rouge1_precision": 
0.12999892702611107, "rouge1_precision_stderr": 0.002100607591188661, "rouge1_recall": 0.3038746779145746, "rouge1_recall_stderr": 0.004634536642096101, "rouge2_fmeasure": 0.03745595159864199, "rouge2_fmeasure_stderr": 0.001473992703184434, "rouge2_precision": 0.02673239688670725, "rouge2_precision_stderr": 0.0010541854289451763, "rouge2_recall": 0.06663818620575768, "rouge2_recall_stderr": 0.0027120422899213088, "rougeL_fmeasure": 0.14056180949140154, "rougeL_fmeasure_stderr": 0.0020638724553360224, "rougeL_precision": 0.10259957486471549, "rougeL_precision_stderr": 0.0016121168108099548, "rougeL_recall": 0.24133064775652324, "rougeL_recall_stderr": 0.0037353718988143852, "rougeLsum_fmeasure": 0.13979687550731743, "rougeLsum_fmeasure_stderr": 0.0022231392203689872, "rougeLsum_precision": 0.10210305798275769, "rougeLsum_precision_stderr": 0.001744113269797536, "rougeLsum_recall": 0.24019653384515519, "rougeLsum_recall_stderr": 0.003969364769314652}}, "4": {"article_DOC_summary": {"bleu": 0.6166062478757451, "bleu_stderr": 0.08670636927186123, "rouge1_fmeasure": 0.04917817586535565, "rouge1_fmeasure_stderr": 0.0027572778499565737, "rouge1_precision": 0.04160930994559087, "rouge1_precision_stderr": 0.002537269682812133, "rouge1_recall": 0.07694631808333259, "rouge1_recall_stderr": 0.004436240229665493, "rouge2_fmeasure": 0.009095248607177277, "rouge2_fmeasure_stderr": 0.0007994257614849805, "rouge2_precision": 0.00694315085705961, "rouge2_precision_stderr": 0.0006208373979404384, "rouge2_recall": 0.015318807336869808, "rouge2_recall_stderr": 0.0013960755776362798, "rougeL_fmeasure": 0.03787247289667727, "rougeL_fmeasure_stderr": 0.002096158407281678, "rougeL_precision": 0.03183491268441076, "rougeL_precision_stderr": 0.0019187904929101934, "rougeL_recall": 0.060155704535713776, "rougeL_recall_stderr": 0.003486439319802224, "rougeLsum_fmeasure": 0.03848782014971929, "rougeLsum_fmeasure_stderr": 0.0021772105235262963, "rougeLsum_precision": 0.03267719971454053, "rougeLsum_precision_stderr": 0.0020211511957105638, "rougeLsum_recall": 0.06063462814345425, "rougeLsum_recall_stderr": 0.003564576673636372}}, "5": {"article_DOC_summary": {"bleu": 6.389248152486678e-40, "bleu_stderr": 1.8294130292828673e-32, "rouge1_fmeasure": 0.0031440827808136107, "rouge1_fmeasure_stderr": 0.0008706294734819288, "rouge1_precision": 0.003491903968713361, "rouge1_precision_stderr": 0.0009506970135154758, "rouge1_recall": 0.002931011957260622, "rouge1_recall_stderr": 0.0008267237760907469, "rouge2_fmeasure": 0.0008276164457091518, "rouge2_fmeasure_stderr": 0.0003287842625521013, "rouge2_precision": 0.0008845399743139629, "rouge2_precision_stderr": 0.000336178820671534, "rouge2_recall": 0.0007962691827841661, "rouge2_recall_stderr": 0.0003277599279605881, "rougeL_fmeasure": 0.0023295730760623354, "rougeL_fmeasure_stderr": 0.0006241983687001699, "rougeL_precision": 0.0026031866359134312, "rougeL_precision_stderr": 0.0006864560352179834, "rougeL_recall": 0.002164363471922377, "rougeL_recall_stderr": 0.0005932076779655039, "rougeLsum_fmeasure": 0.002374711651488895, "rougeLsum_fmeasure_stderr": 0.0006401770663377113, "rougeLsum_precision": 0.0026603621647870728, "rougeLsum_precision_stderr": 0.0007053452661430197, "rougeLsum_recall": 0.0021998927147651314, "rougeLsum_recall_stderr": 0.0006064303044380291}}}}
4b284b84bc4v2/evaluation/rankeval/4b284b84bc4v2_0.csv ADDED
@@ -0,0 +1,21 @@
+ task,metric,value,err,version
+ anli_r1,acc,0.337,0.014955087918653605,0
+ anli_r2,acc,0.327,0.014842213153411242,0
+ anli_r3,acc,0.3475,0.013751753243291854,0
+ arc_challenge,acc,0.24744027303754265,0.012610352663292673,0
+ arc_challenge,acc_norm,0.28924914675767915,0.013250012579393443,0
+ arc_easy,acc,0.5286195286195287,0.010242962617927195,0
+ arc_easy,acc_norm,0.4654882154882155,0.010235314238969397,0
+ boolq,acc,0.5223241590214067,0.00873633411558504,1
+ cb,acc,0.4642857142857143,0.06724777654937658,1
+ cb,f1,0.3011204481792717,,1
+ copa,acc,0.79,0.040936018074033256,0
+ hellaswag,acc,0.4069906393148775,0.004902690765066426,0
+ hellaswag,acc_norm,0.518621788488349,0.004986319587524956,0
+ piqa,acc,0.7143634385201306,0.010539303948661932,0
+ piqa,acc_norm,0.7241566920565833,0.01042780550272912,0
+ rte,acc,0.5487364620938628,0.029953149241808946,0
+ sciq,acc,0.75,0.013699915608779773,0
+ sciq,acc_norm,0.681,0.014746404865473493,0
+ storycloze_2016,acc,0.6440406199893105,0.01107225418438284,0
+ winogrande,acc,0.5524861878453039,0.013974847640536203,0
4b284b84bc4v2/evaluation/rankeval/4b284b84bc4v2_0_lm-eval_global_step80108_2023-02-15-14-49-22_0shots_backup.json DELETED
@@ -1,87 +0,0 @@
- {
-   "results": {
-     "anli_r1": {
-       "acc": 0.337,
-       "acc_stderr": 0.014955087918653605
-     },
-     "anli_r2": {
-       "acc": 0.327,
-       "acc_stderr": 0.014842213153411242
-     },
-     "anli_r3": {
-       "acc": 0.3475,
-       "acc_stderr": 0.013751753243291854
-     },
-     "cb": {
-       "acc": 0.4642857142857143,
-       "acc_stderr": 0.06724777654937658,
-       "f1": 0.3011204481792717
-     },
-     "copa": {
-       "acc": 0.79,
-       "acc_stderr": 0.040936018074033256
-     },
-     "hellaswag": {
-       "acc": 0.4069906393148775,
-       "acc_stderr": 0.004902690765066426,
-       "acc_norm": 0.518621788488349,
-       "acc_norm_stderr": 0.004986319587524956
-     },
-     "rte": {
-       "acc": 0.5487364620938628,
-       "acc_stderr": 0.029953149241808946
-     },
-     "winogrande": {
-       "acc": 0.5524861878453039,
-       "acc_stderr": 0.013974847640536203
-     },
-     "storycloze_2016": {
-       "acc": 0.6440406199893105,
-       "acc_stderr": 0.01107225418438284
-     },
-     "boolq": {
-       "acc": 0.5223241590214067,
-       "acc_stderr": 0.00873633411558504
-     },
-     "arc_easy": {
-       "acc": 0.5286195286195287,
-       "acc_stderr": 0.010242962617927195,
-       "acc_norm": 0.4654882154882155,
-       "acc_norm_stderr": 0.010235314238969397
-     },
-     "arc_challenge": {
-       "acc": 0.24744027303754265,
-       "acc_stderr": 0.012610352663292673,
-       "acc_norm": 0.28924914675767915,
-       "acc_norm_stderr": 0.013250012579393443
-     },
-     "sciq": {
-       "acc": 0.75,
-       "acc_stderr": 0.013699915608779773,
-       "acc_norm": 0.681,
-       "acc_norm_stderr": 0.014746404865473493
-     },
-     "piqa": {
-       "acc": 0.7143634385201306,
-       "acc_stderr": 0.010539303948661932,
-       "acc_norm": 0.7241566920565833,
-       "acc_norm_stderr": 0.01042780550272912
-     }
-   },
-   "versions": {
-     "anli_r1": 0,
-     "anli_r2": 0,
-     "anli_r3": 0,
-     "cb": 1,
-     "copa": 0,
-     "hellaswag": 0,
-     "rte": 0,
-     "winogrande": 0,
-     "storycloze_2016": 0,
-     "boolq": 1,
-     "arc_easy": 0,
-     "arc_challenge": 0,
-     "sciq": 0,
-     "piqa": 0
-   }
- }
4b284b84bc4v2/evaluation/rankeval/4b284b84bc4v2_1.csv ADDED
@@ -0,0 +1,21 @@
+ task,metric,value,err,version
+ anli_r1,acc,0.312,0.014658474370509005,0
+ anli_r2,acc,0.328,0.014853842487270336,0
+ anli_r3,acc,0.32666666666666666,0.013544340907003665,0
+ arc_challenge,acc,0.2832764505119454,0.013167478735134575,0
+ arc_challenge,acc_norm,0.29436860068259385,0.013318528460539422,0
+ arc_easy,acc,0.6094276094276094,0.01001105911206424,0
+ arc_easy,acc_norm,0.5631313131313131,0.010177672928157695,0
+ boolq,acc,0.5324159021406728,0.008726657178723137,1
+ cb,acc,0.5,0.06741998624632421,1
+ cb,f1,0.4627446995868048,,1
+ copa,acc,0.71,0.04560480215720684,0
+ hellaswag,acc,0.3833897629954192,0.0048521826212742526,0
+ hellaswag,acc_norm,0.47769368651663013,0.00498481339101621,0
+ piqa,acc,0.750816104461371,0.010091882770120216,0
+ piqa,acc_norm,0.7584330794341676,0.009986718001804439,0
+ rte,acc,0.4657039711191336,0.030025579819366426,0
+ sciq,acc,0.84,0.011598902298689004,0
+ sciq,acc_norm,0.795,0.012772554096113118,0
+ storycloze_2016,acc,0.6456440406199893,0.011061031791615487,0
+ winogrande,acc,0.5706393054459353,0.01391153749996917,0
4b284b84bc4v2/evaluation/rankeval/4b284b84bc4v2_1_lm-eval_global_step80108_2023-02-15-14-49-22_1shots_backup.json DELETED
@@ -1,87 +0,0 @@
- {
-   "results": {
-     "anli_r1": {
-       "acc": 0.312,
-       "acc_stderr": 0.014658474370509005
-     },
-     "anli_r2": {
-       "acc": 0.328,
-       "acc_stderr": 0.014853842487270336
-     },
-     "anli_r3": {
-       "acc": 0.32666666666666666,
-       "acc_stderr": 0.013544340907003665
-     },
-     "cb": {
-       "acc": 0.5,
-       "acc_stderr": 0.06741998624632421,
-       "f1": 0.4627446995868048
-     },
-     "copa": {
-       "acc": 0.71,
-       "acc_stderr": 0.04560480215720684
-     },
-     "hellaswag": {
-       "acc": 0.3833897629954192,
-       "acc_stderr": 0.0048521826212742526,
-       "acc_norm": 0.47769368651663013,
-       "acc_norm_stderr": 0.00498481339101621
-     },
-     "rte": {
-       "acc": 0.4657039711191336,
-       "acc_stderr": 0.030025579819366426
-     },
-     "winogrande": {
-       "acc": 0.5706393054459353,
-       "acc_stderr": 0.01391153749996917
-     },
-     "storycloze_2016": {
-       "acc": 0.6456440406199893,
-       "acc_stderr": 0.011061031791615487
-     },
-     "boolq": {
-       "acc": 0.5324159021406728,
-       "acc_stderr": 0.008726657178723137
-     },
-     "arc_easy": {
-       "acc": 0.6094276094276094,
-       "acc_stderr": 0.01001105911206424,
-       "acc_norm": 0.5631313131313131,
-       "acc_norm_stderr": 0.010177672928157695
-     },
-     "arc_challenge": {
-       "acc": 0.2832764505119454,
-       "acc_stderr": 0.013167478735134575,
-       "acc_norm": 0.29436860068259385,
-       "acc_norm_stderr": 0.013318528460539422
-     },
-     "sciq": {
-       "acc": 0.84,
-       "acc_stderr": 0.011598902298689004,
-       "acc_norm": 0.795,
-       "acc_norm_stderr": 0.012772554096113118
-     },
-     "piqa": {
-       "acc": 0.750816104461371,
-       "acc_stderr": 0.010091882770120216,
-       "acc_norm": 0.7584330794341676,
-       "acc_norm_stderr": 0.009986718001804439
-     }
-   },
-   "versions": {
-     "anli_r1": 0,
-     "anli_r2": 0,
-     "anli_r3": 0,
-     "cb": 1,
-     "copa": 0,
-     "hellaswag": 0,
-     "rte": 0,
-     "winogrande": 0,
-     "storycloze_2016": 0,
-     "boolq": 1,
-     "arc_easy": 0,
-     "arc_challenge": 0,
-     "sciq": 0,
-     "piqa": 0
-   }
- }
4b284b84bc4v2/evaluation/rankeval/4b284b84bc4v2_2.csv ADDED
@@ -0,0 +1,21 @@
+ task,metric,value,err,version
+ anli_r1,acc,0.308,0.014606483127342763,0
+ anli_r2,acc,0.323,0.014794927843348639,0
+ anli_r3,acc,0.3441666666666667,0.013720551062295755,0
+ arc_challenge,acc,0.29692832764505117,0.013352025976725223,0
+ arc_challenge,acc_norm,0.32081911262798635,0.013640943091946531,0
+ arc_easy,acc,0.6191077441077442,0.009964428212260372,0
+ arc_easy,acc_norm,0.5858585858585859,0.010107387673002528,0
+ boolq,acc,0.5577981651376147,0.008686430526114496,1
+ cb,acc,0.30357142857142855,0.06199938655510754,1
+ cb,f1,0.262831508114527,,1
+ copa,acc,0.71,0.045604802157206845,0
+ hellaswag,acc,0.3874726150169289,0.0048617741296124945,0
+ hellaswag,acc_norm,0.47759410476000796,0.004984768912326942,0
+ piqa,acc,0.7584330794341676,0.009986718001804463,0
+ piqa,acc_norm,0.766050054406964,0.009877236895137432,0
+ rte,acc,0.51985559566787,0.030072723167317184,0
+ sciq,acc,0.833,0.01180043432464459,0
+ sciq,acc_norm,0.8,0.012655439943366665,0
+ storycloze_2016,acc,0.6515232495991449,0.01101871778478849,0
+ winogrande,acc,0.5564325177584846,0.0139626949076204,0
4b284b84bc4v2/evaluation/rankeval/4b284b84bc4v2_2_lm-eval_global_step80108_2023-02-15-14-49-22_2shots_backup.json DELETED
@@ -1,87 +0,0 @@
- {
-   "results": {
-     "anli_r1": {
-       "acc": 0.308,
-       "acc_stderr": 0.014606483127342763
-     },
-     "anli_r2": {
-       "acc": 0.323,
-       "acc_stderr": 0.014794927843348639
-     },
-     "anli_r3": {
-       "acc": 0.3441666666666667,
-       "acc_stderr": 0.013720551062295755
-     },
-     "cb": {
-       "acc": 0.30357142857142855,
-       "acc_stderr": 0.06199938655510754,
-       "f1": 0.262831508114527
-     },
-     "copa": {
-       "acc": 0.71,
-       "acc_stderr": 0.045604802157206845
-     },
-     "hellaswag": {
-       "acc": 0.3874726150169289,
-       "acc_stderr": 0.0048617741296124945,
-       "acc_norm": 0.47759410476000796,
-       "acc_norm_stderr": 0.004984768912326942
-     },
-     "rte": {
-       "acc": 0.51985559566787,
-       "acc_stderr": 0.030072723167317184
-     },
-     "winogrande": {
-       "acc": 0.5564325177584846,
-       "acc_stderr": 0.0139626949076204
-     },
-     "storycloze_2016": {
-       "acc": 0.6515232495991449,
-       "acc_stderr": 0.01101871778478849
-     },
-     "boolq": {
-       "acc": 0.5577981651376147,
-       "acc_stderr": 0.008686430526114496
-     },
-     "arc_easy": {
-       "acc": 0.6191077441077442,
-       "acc_stderr": 0.009964428212260372,
-       "acc_norm": 0.5858585858585859,
-       "acc_norm_stderr": 0.010107387673002528
-     },
-     "arc_challenge": {
-       "acc": 0.29692832764505117,
-       "acc_stderr": 0.013352025976725223,
-       "acc_norm": 0.32081911262798635,
-       "acc_norm_stderr": 0.013640943091946531
-     },
-     "sciq": {
-       "acc": 0.833,
-       "acc_stderr": 0.01180043432464459,
-       "acc_norm": 0.8,
-       "acc_norm_stderr": 0.012655439943366665
-     },
-     "piqa": {
-       "acc": 0.7584330794341676,
-       "acc_stderr": 0.009986718001804463,
-       "acc_norm": 0.766050054406964,
-       "acc_norm_stderr": 0.009877236895137432
-     }
-   },
-   "versions": {
-     "anli_r1": 0,
-     "anli_r2": 0,
-     "anli_r3": 0,
-     "cb": 1,
-     "copa": 0,
-     "hellaswag": 0,
-     "rte": 0,
-     "winogrande": 0,
-     "storycloze_2016": 0,
-     "boolq": 1,
-     "arc_easy": 0,
-     "arc_challenge": 0,
-     "sciq": 0,
-     "piqa": 0
-   }
- }
4b284b84bc4v2/evaluation/rankeval/4b284b84bc4v2_3.csv ADDED
@@ -0,0 +1,21 @@
+ task,metric,value,err,version
+ anli_r1,acc,0.347,0.015060472031706622,0
+ anli_r2,acc,0.346,0.015050266127564448,0
+ anli_r3,acc,0.335,0.013630871843821476,0
+ arc_challenge,acc,0.2935153583617747,0.013307250444941122,0
+ arc_challenge,acc_norm,0.31143344709897613,0.013532472099850942,0
+ arc_easy,acc,0.6174242424242424,0.009972837790531477,0
+ arc_easy,acc_norm,0.6102693602693603,0.010007169391797055,0
+ boolq,acc,0.5654434250764526,0.008669824006668013,1
+ cb,acc,0.35714285714285715,0.06460957383809221,1
+ cb,f1,0.2986564996368918,,1
+ copa,acc,0.8,0.04020151261036845,0
+ hellaswag,acc,0.38259310894244175,0.004850268986903357,0
+ hellaswag,acc_norm,0.48078072097191793,0.004986093791041665,0
+ piqa,acc,0.7573449401523396,0.010002002569708698,0
+ piqa,acc_norm,0.7665941240478781,0.00986924788952099,0
+ rte,acc,0.5595667870036101,0.029882123363118726,0
+ sciq,acc,0.84,0.011598902298689009,0
+ sciq,acc_norm,0.818,0.012207580637662144,0
+ storycloze_2016,acc,0.6483164083377873,0.011042025772682543,0
+ winogrande,acc,0.580110497237569,0.013870943986310396,0
4b284b84bc4v2/evaluation/rankeval/4b284b84bc4v2_3_lm-eval_global_step80108_2023-02-15-14-49-22_3shots_backup.json DELETED
@@ -1,87 +0,0 @@
- {
-   "results": {
-     "anli_r1": {
-       "acc": 0.347,
-       "acc_stderr": 0.015060472031706622
-     },
-     "anli_r2": {
-       "acc": 0.346,
-       "acc_stderr": 0.015050266127564448
-     },
-     "anli_r3": {
-       "acc": 0.335,
-       "acc_stderr": 0.013630871843821476
-     },
-     "cb": {
-       "acc": 0.35714285714285715,
-       "acc_stderr": 0.06460957383809221,
-       "f1": 0.2986564996368918
-     },
-     "copa": {
-       "acc": 0.8,
-       "acc_stderr": 0.04020151261036845
-     },
-     "hellaswag": {
-       "acc": 0.38259310894244175,
-       "acc_stderr": 0.004850268986903357,
-       "acc_norm": 0.48078072097191793,
-       "acc_norm_stderr": 0.004986093791041665
-     },
-     "rte": {
-       "acc": 0.5595667870036101,
-       "acc_stderr": 0.029882123363118726
-     },
-     "winogrande": {
-       "acc": 0.580110497237569,
-       "acc_stderr": 0.013870943986310396
-     },
-     "storycloze_2016": {
-       "acc": 0.6483164083377873,
-       "acc_stderr": 0.011042025772682543
-     },
-     "boolq": {
-       "acc": 0.5654434250764526,
-       "acc_stderr": 0.008669824006668013
-     },
-     "arc_easy": {
-       "acc": 0.6174242424242424,
-       "acc_stderr": 0.009972837790531477,
-       "acc_norm": 0.6102693602693603,
-       "acc_norm_stderr": 0.010007169391797055
-     },
-     "arc_challenge": {
-       "acc": 0.2935153583617747,
-       "acc_stderr": 0.013307250444941122,
-       "acc_norm": 0.31143344709897613,
-       "acc_norm_stderr": 0.013532472099850942
-     },
-     "sciq": {
-       "acc": 0.84,
-       "acc_stderr": 0.011598902298689009,
-       "acc_norm": 0.818,
-       "acc_norm_stderr": 0.012207580637662144
-     },
-     "piqa": {
-       "acc": 0.7573449401523396,
-       "acc_stderr": 0.010002002569708698,
-       "acc_norm": 0.7665941240478781,
-       "acc_norm_stderr": 0.00986924788952099
-     }
-   },
-   "versions": {
-     "anli_r1": 0,
-     "anli_r2": 0,
-     "anli_r3": 0,
-     "cb": 1,
-     "copa": 0,
-     "hellaswag": 0,
-     "rte": 0,
-     "winogrande": 0,
-     "storycloze_2016": 0,
-     "boolq": 1,
-     "arc_easy": 0,
-     "arc_challenge": 0,
-     "sciq": 0,
-     "piqa": 0
-   }
- }
4b284b84bc4v2/evaluation/rankeval/4b284b84bc4v2_4.csv ADDED
@@ -0,0 +1,21 @@
+ task,metric,value,err,version
+ anli_r1,acc,0.322,0.014782913600996666,0
+ anli_r2,acc,0.357,0.015158521721486774,0
+ anli_r3,acc,0.34833333333333333,0.013759437498874079,0
+ arc_challenge,acc,0.2901023890784983,0.01326157367752077,0
+ arc_challenge,acc_norm,0.3250853242320819,0.013688147309729124,0
+ arc_easy,acc,0.6321548821548821,0.009894923464455193,0
+ arc_easy,acc_norm,0.61489898989899,0.00998521479873725,0
+ boolq,acc,0.563914373088685,0.008673312776324932,1
+ cb,acc,0.32142857142857145,0.06297362289056341,1
+ cb,f1,0.2855772439105772,,1
+ copa,acc,0.77,0.042295258468165044,0
+ hellaswag,acc,0.3828918542123083,0.004850988215167546,0
+ hellaswag,acc_norm,0.4871539533957379,0.004988134303021793,0
+ piqa,acc,0.7600652883569097,0.009963625892809544,0
+ piqa,acc_norm,0.7687704026115343,0.009837063180625334,0
+ rte,acc,0.4729241877256318,0.030052303463143713,0
+ sciq,acc,0.85,0.0112972398234093,0
+ sciq,acc_norm,0.842,0.01153989467755957,0
+ storycloze_2016,acc,0.6702298236237306,0.010871682471395135,0
+ winogrande,acc,0.5722178374112076,0.013905134013839943,0
4b284b84bc4v2/evaluation/rankeval/4b284b84bc4v2_4_lm-eval_global_step80108_2023-02-15-14-49-22_4shots_backup.json DELETED
@@ -1,87 +0,0 @@
- {
-   "results": {
-     "anli_r1": {
-       "acc": 0.322,
-       "acc_stderr": 0.014782913600996666
-     },
-     "anli_r2": {
-       "acc": 0.357,
-       "acc_stderr": 0.015158521721486774
-     },
-     "anli_r3": {
-       "acc": 0.34833333333333333,
-       "acc_stderr": 0.013759437498874079
-     },
-     "cb": {
-       "acc": 0.32142857142857145,
-       "acc_stderr": 0.06297362289056341,
-       "f1": 0.2855772439105772
-     },
-     "copa": {
-       "acc": 0.77,
-       "acc_stderr": 0.042295258468165044
-     },
-     "hellaswag": {
-       "acc": 0.3828918542123083,
-       "acc_stderr": 0.004850988215167546,
-       "acc_norm": 0.4871539533957379,
-       "acc_norm_stderr": 0.004988134303021793
-     },
-     "rte": {
-       "acc": 0.4729241877256318,
-       "acc_stderr": 0.030052303463143713
-     },
-     "winogrande": {
-       "acc": 0.5722178374112076,
-       "acc_stderr": 0.013905134013839943
-     },
-     "storycloze_2016": {
-       "acc": 0.6702298236237306,
-       "acc_stderr": 0.010871682471395135
-     },
-     "boolq": {
-       "acc": 0.563914373088685,
-       "acc_stderr": 0.008673312776324932
-     },
-     "arc_easy": {
-       "acc": 0.6321548821548821,
-       "acc_stderr": 0.009894923464455193,
-       "acc_norm": 0.61489898989899,
-       "acc_norm_stderr": 0.00998521479873725
-     },
-     "arc_challenge": {
-       "acc": 0.2901023890784983,
-       "acc_stderr": 0.01326157367752077,
-       "acc_norm": 0.3250853242320819,
-       "acc_norm_stderr": 0.013688147309729124
-     },
-     "sciq": {
-       "acc": 0.85,
-       "acc_stderr": 0.0112972398234093,
-       "acc_norm": 0.842,
-       "acc_norm_stderr": 0.01153989467755957
-     },
-     "piqa": {
-       "acc": 0.7600652883569097,
-       "acc_stderr": 0.009963625892809544,
-       "acc_norm": 0.7687704026115343,
-       "acc_norm_stderr": 0.009837063180625334
-     }
-   },
-   "versions": {
-     "anli_r1": 0,
-     "anli_r2": 0,
-     "anli_r3": 0,
-     "cb": 1,
-     "copa": 0,
-     "hellaswag": 0,
-     "rte": 0,
-     "winogrande": 0,
-     "storycloze_2016": 0,
-     "boolq": 1,
-     "arc_easy": 0,
-     "arc_challenge": 0,
-     "sciq": 0,
-     "piqa": 0
-   }
- }
4b284b84bc4v2/evaluation/rankeval/4b284b84bc4v2_5.csv ADDED
@@ -0,0 +1,21 @@
+ task,metric,value,err,version
+ anli_r1,acc,0.345,0.015039986742055238,0
+ anli_r2,acc,0.339,0.014976758771620345,0
+ anli_r3,acc,0.3516666666666667,0.013789711695404789,0
+ arc_challenge,acc,0.29692832764505117,0.013352025976725222,0
+ arc_challenge,acc_norm,0.32764505119453924,0.013715847940719346,0
+ arc_easy,acc,0.6393097643097643,0.009853512108416734,0
+ arc_easy,acc_norm,0.6220538720538721,0.009949405744045481,0
+ boolq,acc,0.5700305810397553,0.008658853690729254,1
+ cb,acc,0.2857142857142857,0.060914490387317256,1
+ cb,f1,0.26703155274583845,,1
+ copa,acc,0.75,0.04351941398892446,0
+ hellaswag,acc,0.3828918542123083,0.004850988215167541,0
+ hellaswag,acc_norm,0.48605855407289383,0.00498784136740252,0
+ piqa,acc,0.7546245919477693,0.010039831320422396,0
+ piqa,acc_norm,0.76550598476605,0.00988520314324054,0
+ rte,acc,0.516245487364621,0.030080573208738064,0
+ sciq,acc,0.853,0.011203415395160336,0
+ sciq,acc_norm,0.851,0.01126614068463217,0
+ storycloze_2016,acc,0.6622127204703367,0.010937034991003881,0
+ winogrande,acc,0.55327545382794,0.01397248837161669,0
4b284b84bc4v2/evaluation/rankeval/4b284b84bc4v2_5_lm-eval_global_step80108_2023-02-15-14-49-22_5shots_backup.json DELETED
@@ -1,87 +0,0 @@
- {
-   "results": {
-     "anli_r1": {
-       "acc": 0.345,
-       "acc_stderr": 0.015039986742055238
-     },
-     "anli_r2": {
-       "acc": 0.339,
-       "acc_stderr": 0.014976758771620345
-     },
-     "anli_r3": {
-       "acc": 0.3516666666666667,
-       "acc_stderr": 0.013789711695404789
-     },
-     "cb": {
-       "acc": 0.2857142857142857,
-       "acc_stderr": 0.060914490387317256,
-       "f1": 0.26703155274583845
-     },
-     "copa": {
-       "acc": 0.75,
-       "acc_stderr": 0.04351941398892446
-     },
-     "hellaswag": {
-       "acc": 0.3828918542123083,
-       "acc_stderr": 0.004850988215167541,
-       "acc_norm": 0.48605855407289383,
-       "acc_norm_stderr": 0.00498784136740252
-     },
-     "rte": {
-       "acc": 0.516245487364621,
-       "acc_stderr": 0.030080573208738064
-     },
-     "winogrande": {
-       "acc": 0.55327545382794,
-       "acc_stderr": 0.01397248837161669
-     },
-     "storycloze_2016": {
-       "acc": 0.6622127204703367,
-       "acc_stderr": 0.010937034991003881
-     },
-     "boolq": {
-       "acc": 0.5700305810397553,
-       "acc_stderr": 0.008658853690729254
-     },
-     "arc_easy": {
-       "acc": 0.6393097643097643,
-       "acc_stderr": 0.009853512108416734,
-       "acc_norm": 0.6220538720538721,
-       "acc_norm_stderr": 0.009949405744045481
-     },
-     "arc_challenge": {
-       "acc": 0.29692832764505117,
-       "acc_stderr": 0.013352025976725222,
-       "acc_norm": 0.32764505119453924,
-       "acc_norm_stderr": 0.013715847940719346
-     },
-     "sciq": {
-       "acc": 0.853,
-       "acc_stderr": 0.011203415395160336,
-       "acc_norm": 0.851,
-       "acc_norm_stderr": 0.01126614068463217
-     },
-     "piqa": {
-       "acc": 0.7546245919477693,
-       "acc_stderr": 0.010039831320422396,
-       "acc_norm": 0.76550598476605,
-       "acc_norm_stderr": 0.00988520314324054
-     }
-   },
-   "versions": {
-     "anli_r1": 0,
-     "anli_r2": 0,
-     "anli_r3": 0,
-     "cb": 1,
-     "copa": 0,
-     "hellaswag": 0,
-     "rte": 0,
-     "winogrande": 0,
-     "storycloze_2016": 0,
-     "boolq": 1,
-     "arc_easy": 0,
-     "arc_challenge": 0,
-     "sciq": 0,
-     "piqa": 0
-   }
- }