diff --git "a/4b284b12bc4/eval/merged.json" "b/4b284b12bc4/eval/merged.json" new file mode 100644--- /dev/null +++ "b/4b284b12bc4/eval/merged.json" @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.4070835356827751, "bleu_stderr": 0.03514958095848397, "rouge1_fmeasure": 0.11509298027342854, "rouge1_fmeasure_stderr": 0.002040147114373331, "rouge1_precision": 0.0758536616906455, "rouge1_precision_stderr": 0.0015747064380670645, "rouge1_recall": 0.3264375465319237, "rouge1_recall_stderr": 0.004888854445231445, "rouge2_fmeasure": 0.0532813862747049, "rouge2_fmeasure_stderr": 0.0012579627803205211, "rouge2_precision": 0.03493638633069714, "rouge2_precision_stderr": 0.0009342574915112234, "rouge2_recall": 0.15766160622381195, "rouge2_recall_stderr": 0.0033114573324024405, "rougeL_fmeasure": 0.1105412242108245, "rougeL_fmeasure_stderr": 0.0019072286738988954, "rougeL_precision": 0.07257604824526195, "rougeL_precision_stderr": 0.0014483785678009685, "rougeL_recall": 0.31637706878833355, "rougeL_recall_stderr": 0.004769735504597033, "rougeLsum_fmeasure": 0.10843545057843905, "rougeLsum_fmeasure_stderr": 0.0019083088150967664, "rougeLsum_precision": 0.0714774644843108, "rougeLsum_precision_stderr": 0.0014699104009543759, "rougeLsum_recall": 0.307939556685913, "rougeLsum_recall_stderr": 0.004520814685280998}, "explicit-graph-description2": {"bleu": 0.041197050082785285, "bleu_stderr": 0.001108996485999182, "rouge1_fmeasure": 0.17529221125446331, "rouge1_fmeasure_stderr": 0.0014916739627182348, "rouge1_precision": 0.2320345566104918, "rouge1_precision_stderr": 0.002028663485069812, "rouge1_recall": 0.15806405453574032, "rouge1_recall_stderr": 0.0018950997131969357, "rouge2_fmeasure": 0.012985384177633208, "rouge2_fmeasure_stderr": 0.0005793970465797221, "rouge2_precision": 0.01677596652301549, "rouge2_precision_stderr": 0.000700778712609751, "rouge2_recall": 0.012168726274182532, "rouge2_recall_stderr": 0.0006308549540794899, "rougeL_fmeasure": 0.14159539007628982, "rougeL_fmeasure_stderr": 0.0011656632831698861, "rougeL_precision": 0.1871404213284981, "rougeL_precision_stderr": 0.001604901624172935, "rougeL_recall": 0.12845338233190606, "rougeL_recall_stderr": 0.001541707987416157, "rougeLsum_fmeasure": 0.14922590343653938, "rougeLsum_fmeasure_stderr": 0.0011984205257227288, "rougeLsum_precision": 0.2002817812856504, "rougeLsum_precision_stderr": 0.0018638576492756192, "rougeLsum_recall": 0.13376513099281687, "rougeLsum_recall_stderr": 0.0015142202748567432}, "implicit-graph-description": {"bleu": 0.020794810389244665, "bleu_stderr": 0.008286245115526584, "rouge1_fmeasure": 0.04006653282921872, "rouge1_fmeasure_stderr": 0.0005481740262477606, "rouge1_precision": 0.022922154005776907, "rouge1_precision_stderr": 0.00035921812588188365, "rouge1_recall": 0.21352555655696376, "rouge1_recall_stderr": 0.0022208454109318375, "rouge2_fmeasure": 0.0019179536475281184, "rouge2_fmeasure_stderr": 8.819595205548223e-05, "rouge2_precision": 0.0010806676437478898, "rouge2_precision_stderr": 5.572788223887115e-05, "rouge2_recall": 0.013144891498511075, "rouge2_recall_stderr": 0.0006988181195494956, "rougeL_fmeasure": 0.03999564658771847, "rougeL_fmeasure_stderr": 0.0005417492107414437, "rougeL_precision": 0.02287171287886853, "rougeL_precision_stderr": 0.00035207258394197473, "rougeL_recall": 0.21329473631474377, "rougeL_recall_stderr": 0.0022102531793857646, "rougeLsum_fmeasure": 0.0259194629518608, "rougeLsum_fmeasure_stderr": 0.0003466200097817442, "rougeLsum_precision": 0.014803873787478817, "rougeLsum_precision_stderr": 0.00023955003605054956, "rougeLsum_recall": 0.14458682760689323, "rougeLsum_recall_stderr": 0.0015738069244451358}, "non-explicit-description": {"bleu": 0.011674876863475004, "bleu_stderr": 0.0021009000372254386, "rouge1_fmeasure": 0.04023278195426544, "rouge1_fmeasure_stderr": 0.0005477579043898359, "rouge1_precision": 0.023111606817375823, "rouge1_precision_stderr": 0.00034433259453187245, "rouge1_recall": 0.199898162382514, "rouge1_recall_stderr": 0.0022216437870279087, "rouge2_fmeasure": 0.004150191718708099, "rouge2_fmeasure_stderr": 0.00012739617704077432, "rouge2_precision": 0.002348515359030586, "rouge2_precision_stderr": 7.292209026413432e-05, "rouge2_recall": 0.02366089115265813, "rouge2_recall_stderr": 0.0008284819548104561, "rougeL_fmeasure": 0.04002572630665756, "rougeL_fmeasure_stderr": 0.0005453486432541972, "rougeL_precision": 0.0229905917432032, "rougeL_precision_stderr": 0.00034282566300562767, "rougeL_recall": 0.1990997429912886, "rougeL_recall_stderr": 0.0022213443978533944, "rougeLsum_fmeasure": 0.033238272140701595, "rougeLsum_fmeasure_stderr": 0.00042068255632681374, "rougeLsum_precision": 0.01905255519601842, "rougeLsum_precision_stderr": 0.0002660747474795132, "rougeLsum_recall": 0.16903333590221456, "rougeLsum_recall_stderr": 0.0018194040988673658}, "very-explicit-description": {"bleu": 0.0024661624004243305, "bleu_stderr": 0.00022679513212848655, "rouge1_fmeasure": 0.033848667122813925, "rouge1_fmeasure_stderr": 0.0004785862714798914, "rouge1_precision": 0.019449928935063972, "rouge1_precision_stderr": 0.0002996123768501138, "rouge1_recall": 0.1629409626311148, "rouge1_recall_stderr": 0.0014178344731677658, "rouge2_fmeasure": 6.345797512857661e-05, "rouge2_fmeasure_stderr": 1.5382459284932663e-05, "rouge2_precision": 3.7290701550733404e-05, "rouge2_precision_stderr": 9.057889706362649e-06, "rouge2_recall": 0.000247591268503305, "rouge2_recall_stderr": 6.608651546632237e-05, "rougeL_fmeasure": 0.033848667122813925, "rougeL_fmeasure_stderr": 0.0004785862714798914, "rougeL_precision": 0.019449928935063972, "rougeL_precision_stderr": 0.0002996123768501138, "rougeL_recall": 0.1629409626311148, "rougeL_recall_stderr": 0.0014178344731677658, "rougeLsum_fmeasure": 0.02335480003154784, "rougeLsum_fmeasure_stderr": 0.00031241488270193515, "rougeLsum_precision": 0.013350740331407294, "rougeLsum_precision_stderr": 0.0001939630721901288, "rougeLsum_recall": 0.11827840682573859, "rougeLsum_recall_stderr": 0.0010744315351548378}}, "1": {"PALM_prompt": {"bleu": 0.41914858834195134, "bleu_stderr": 0.030279335876129, "rouge1_fmeasure": 0.11424698089656772, "rouge1_fmeasure_stderr": 0.001973221738343803, "rouge1_precision": 0.07536633674836868, "rouge1_precision_stderr": 0.001620641410096321, "rouge1_recall": 0.3290768382699901, "rouge1_recall_stderr": 0.00481767508183653, "rouge2_fmeasure": 0.05368591058094131, "rouge2_fmeasure_stderr": 0.0012551880063213156, "rouge2_precision": 0.03540467062379218, "rouge2_precision_stderr": 0.001074817084017668, "rouge2_recall": 0.16089821041540717, "rouge2_recall_stderr": 0.0033011630774406127, "rougeL_fmeasure": 0.10991123942051419, "rougeL_fmeasure_stderr": 0.0018557651460448018, "rougeL_precision": 0.07231503158237214, "rougeL_precision_stderr": 0.0015163361416883465, "rougeL_recall": 0.3189205930522712, "rougeL_recall_stderr": 0.004694857387684187, "rougeLsum_fmeasure": 0.1082043807305256, "rougeLsum_fmeasure_stderr": 0.0018480349337665876, "rougeLsum_precision": 0.07148579673935408, "rougeLsum_precision_stderr": 0.0015357817111525064, "rougeLsum_recall": 0.3110112645350247, "rougeLsum_recall_stderr": 0.00441643475943137}, "explicit-graph-description2": {"bleu": 1.8996196002528378, "bleu_stderr": 0.19349493429212591, "rouge1_fmeasure": 0.1478464813817612, "rouge1_fmeasure_stderr": 0.0038320333818157508, "rouge1_precision": 0.1368123990805212, "rouge1_precision_stderr": 0.003859738443155984, "rouge1_recall": 0.21915722251491454, "rouge1_recall_stderr": 0.005434745495444146, "rouge2_fmeasure": 0.04640292562526275, "rouge2_fmeasure_stderr": 0.0020116942927143034, "rouge2_precision": 0.03979808957820905, "rouge2_precision_stderr": 0.0017989632072343414, "rouge2_recall": 0.07332816379948168, "rouge2_recall_stderr": 0.0030403554888855404, "rougeL_fmeasure": 0.1170703502765758, "rougeL_fmeasure_stderr": 0.0029311547998135696, "rougeL_precision": 0.10824512629470052, "rougeL_precision_stderr": 0.003088739020891906, "rougeL_recall": 0.18075899176594987, "rougeL_recall_stderr": 0.0044836417615243115, "rougeLsum_fmeasure": 0.1291123762138599, "rougeLsum_fmeasure_stderr": 0.0034020744059904736, "rougeLsum_precision": 0.12042587524943701, "rougeLsum_precision_stderr": 0.003511890506396, "rougeLsum_recall": 0.19133333683300907, "rougeLsum_recall_stderr": 0.004812564968367625}, "implicit-graph-description": {"bleu": 0.4861728821422337, "bleu_stderr": 0.03584212201959426, "rouge1_fmeasure": 0.08279157012164147, "rouge1_fmeasure_stderr": 0.0018413217717905348, "rouge1_precision": 0.05062962614453291, "rouge1_precision_stderr": 0.0013437525777807101, "rouge1_recall": 0.3417100308532278, "rouge1_recall_stderr": 0.00435704773787356, "rouge2_fmeasure": 0.025426464984268635, "rouge2_fmeasure_stderr": 0.0011153275941210136, "rouge2_precision": 0.015902434132343327, "rouge2_precision_stderr": 0.0007863360597732104, "rouge2_recall": 0.10098499560418767, "rouge2_recall_stderr": 0.0033802047604672408, "rougeL_fmeasure": 0.07649332751100299, "rougeL_fmeasure_stderr": 0.0014727630258473052, "rougeL_precision": 0.046198311688407115, "rougeL_precision_stderr": 0.0010451508171373679, "rougeL_recall": 0.3277232393217278, "rougeL_recall_stderr": 0.0040337705566764045, "rougeLsum_fmeasure": 0.06849359236303296, "rougeLsum_fmeasure_stderr": 0.0016968890488419251, "rougeLsum_precision": 0.04203232751325336, "rougeLsum_precision_stderr": 0.0012323483368875934, "rougeLsum_recall": 0.2835715586404924, "rougeLsum_recall_stderr": 0.004168417537419584}, "non-explicit-description": {"bleu": 0.9921366651933168, "bleu_stderr": 0.06271540865538455, "rouge1_fmeasure": 0.12976799696821192, "rouge1_fmeasure_stderr": 0.0027894949987997152, "rouge1_precision": 0.08967550123251418, "rouge1_precision_stderr": 0.002937976752458282, "rouge1_recall": 0.4773024647388192, "rouge1_recall_stderr": 0.005382160167374159, "rouge2_fmeasure": 0.054485621293343514, "rouge2_fmeasure_stderr": 0.0017012540588488038, "rouge2_precision": 0.03875399838383589, "rouge2_precision_stderr": 0.0017591539560822005, "rouge2_recall": 0.21057258380768887, "rouge2_recall_stderr": 0.004063079045737644, "rougeL_fmeasure": 0.11667620039186845, "rougeL_fmeasure_stderr": 0.002240166240092561, "rougeL_precision": 0.0790538192004279, "rougeL_precision_stderr": 0.0023876721978287347, "rougeL_recall": 0.45016078797153286, "rougeL_recall_stderr": 0.005136353137741063, "rougeLsum_fmeasure": 0.1093056100326863, "rougeLsum_fmeasure_stderr": 0.002463062122648188, "rougeLsum_precision": 0.0758745124201561, "rougeLsum_precision_stderr": 0.002591817404665531, "rougeLsum_recall": 0.40795420296814416, "rougeLsum_recall_stderr": 0.004919957414755}, "very-explicit-description": {"bleu": 0.9141145179175494, "bleu_stderr": 0.06536296915571908, "rouge1_fmeasure": 0.1299893322446928, "rouge1_fmeasure_stderr": 0.0025804647383953576, "rouge1_precision": 0.08707890601958627, "rouge1_precision_stderr": 0.0024828597057122404, "rouge1_recall": 0.4696545657848939, "rouge1_recall_stderr": 0.0050361060172909415, "rouge2_fmeasure": 0.051815361827752766, "rouge2_fmeasure_stderr": 0.001524719270466829, "rouge2_precision": 0.03503697621648224, "rouge2_precision_stderr": 0.0014028562973121522, "rouge2_recall": 0.20358535015420007, "rouge2_recall_stderr": 0.004005463978193232, "rougeL_fmeasure": 0.11759668044891276, "rougeL_fmeasure_stderr": 0.0020751461605538037, "rougeL_precision": 0.0774736609326366, "rougeL_precision_stderr": 0.0019875056608878654, "rougeL_recall": 0.4430174064260006, "rougeL_recall_stderr": 0.004789275467133504, "rougeLsum_fmeasure": 0.11034796536631672, "rougeLsum_fmeasure_stderr": 0.0022796848420741974, "rougeLsum_precision": 0.07410190870582686, "rougeLsum_precision_stderr": 0.002187569905436837, "rougeLsum_recall": 0.40495877524310353, "rougeLsum_recall_stderr": 0.004615318859024589}}, "2": {"PALM_prompt": {"bleu": 0.4241874936612034, "bleu_stderr": 0.03699728854949305, "rouge1_fmeasure": 0.11375522621692136, "rouge1_fmeasure_stderr": 0.0019642936162507533, "rouge1_precision": 0.07469786641233617, "rouge1_precision_stderr": 0.0015771153206732972, "rouge1_recall": 0.32891693541469197, "rouge1_recall_stderr": 0.004751520151482175, "rouge2_fmeasure": 0.05344291957030947, "rouge2_fmeasure_stderr": 0.001233885317072834, "rouge2_precision": 0.03462695918297652, "rouge2_precision_stderr": 0.0009079391487918842, "rouge2_recall": 0.16210166248343671, "rouge2_recall_stderr": 0.003411098262587952, "rougeL_fmeasure": 0.10934081987838024, "rougeL_fmeasure_stderr": 0.0018415550723111455, "rougeL_precision": 0.07161852924412807, "rougeL_precision_stderr": 0.0014761860684115284, "rougeL_recall": 0.31797917629392425, "rougeL_recall_stderr": 0.004598849198704314, "rougeLsum_fmeasure": 0.10823991385374933, "rougeLsum_fmeasure_stderr": 0.0018380668821100924, "rougeLsum_precision": 0.0711214967812572, "rougeLsum_precision_stderr": 0.00149377360051449, "rougeLsum_recall": 0.3130870814045286, "rougeLsum_recall_stderr": 0.004421211065212564}, "explicit-graph-description2": {"bleu": 1.640138834989728, "bleu_stderr": 0.06878839244992584, "rouge1_fmeasure": 0.19633951658374446, "rouge1_fmeasure_stderr": 0.003958953046041264, "rouge1_precision": 0.17608939105537802, "rouge1_precision_stderr": 0.004253817230115363, "rouge1_recall": 0.3431140264090531, "rouge1_recall_stderr": 0.00534101919038467, "rouge2_fmeasure": 0.07757017845101091, "rouge2_fmeasure_stderr": 0.002400954415731867, "rouge2_precision": 0.06908776827206395, "rouge2_precision_stderr": 0.002528705334523847, "rouge2_recall": 0.13897905823799864, "rouge2_recall_stderr": 0.0037630298921426447, "rougeL_fmeasure": 0.1586443874692295, "rougeL_fmeasure_stderr": 0.003057777683650459, "rougeL_precision": 0.14034375265741814, "rougeL_precision_stderr": 0.0033583359725794662, "rougeL_recall": 0.29506101479424657, "rougeL_recall_stderr": 0.004712859932989569, "rougeLsum_fmeasure": 0.1689535186370901, "rougeLsum_fmeasure_stderr": 0.0034887482192121964, "rougeLsum_precision": 0.15250105487675392, "rougeLsum_precision_stderr": 0.0037920002252605505, "rougeLsum_recall": 0.294858148838543, "rougeLsum_recall_stderr": 0.004765884466280505}, "implicit-graph-description": {"bleu": 0.7628040722794971, "bleu_stderr": 0.02211715710461336, "rouge1_fmeasure": 0.09191514123583507, "rouge1_fmeasure_stderr": 0.0015945801148090116, "rouge1_precision": 0.05503857963517623, "rouge1_precision_stderr": 0.0012530235158013246, "rouge1_recall": 0.4237722308516944, "rouge1_recall_stderr": 0.004678442445263298, "rouge2_fmeasure": 0.032591510976486694, "rouge2_fmeasure_stderr": 0.0009225376298957598, "rouge2_precision": 0.019614084687761036, "rouge2_precision_stderr": 0.0006958667373462588, "rouge2_recall": 0.1680913399724686, "rouge2_recall_stderr": 0.003936935332174803, "rougeL_fmeasure": 0.08631782016595303, "rougeL_fmeasure_stderr": 0.001327845800016554, "rougeL_precision": 0.051275890765769695, "rougeL_precision_stderr": 0.001008153954165397, "rougeL_recall": 0.40628782480488934, "rougeL_recall_stderr": 0.0044353577159538015, "rougeLsum_fmeasure": 0.07633989045692885, "rougeLsum_fmeasure_stderr": 0.0014222030587289858, "rougeLsum_precision": 0.04580878529585366, "rougeLsum_precision_stderr": 0.001122992125462262, "rougeLsum_recall": 0.3581492753310056, "rougeLsum_recall_stderr": 0.004381030885989483}, "non-explicit-description": {"bleu": 1.2825526231821482, "bleu_stderr": 0.043492144074173136, "rouge1_fmeasure": 0.14521489817363703, "rouge1_fmeasure_stderr": 0.002414506324594877, "rouge1_precision": 0.09279613971509679, "rouge1_precision_stderr": 0.002133744119404951, "rouge1_recall": 0.5489640435072753, "rouge1_recall_stderr": 0.004743143133672053, "rouge2_fmeasure": 0.061645677874947354, "rouge2_fmeasure_stderr": 0.0013858346723081972, "rouge2_precision": 0.03909773513534565, "rouge2_precision_stderr": 0.0011574465963972958, "rouge2_recall": 0.2581685386804721, "rouge2_recall_stderr": 0.004193436933392039, "rougeL_fmeasure": 0.1276872860164771, "rougeL_fmeasure_stderr": 0.0018563839053403013, "rougeL_precision": 0.08058580758928455, "rougeL_precision_stderr": 0.001676284644699535, "rougeL_recall": 0.5047433145790685, "rougeL_recall_stderr": 0.004508210716996378, "rougeLsum_fmeasure": 0.12134285926676484, "rougeLsum_fmeasure_stderr": 0.0020676862675326735, "rougeLsum_precision": 0.07756422907254948, "rougeLsum_precision_stderr": 0.0018428480505735482, "rougeLsum_recall": 0.46809438833435923, "rougeLsum_recall_stderr": 0.004457544768356877}, "very-explicit-description": {"bleu": 1.2160721235507446, "bleu_stderr": 0.04337158036151172, "rouge1_fmeasure": 0.1274257819886352, "rouge1_fmeasure_stderr": 0.002067479681389862, "rouge1_precision": 0.08092828074865113, "rouge1_precision_stderr": 0.001906490261574064, "rouge1_recall": 0.508339922983901, "rouge1_recall_stderr": 0.004811650108707499, "rouge2_fmeasure": 0.0538074768484528, "rouge2_fmeasure_stderr": 0.0012562150972743444, "rouge2_precision": 0.03434375738172082, "rouge2_precision_stderr": 0.0011440228205307629, "rouge2_recall": 0.23872647813314588, "rouge2_recall_stderr": 0.004162934850661181, "rougeL_fmeasure": 0.11571836456900784, "rougeL_fmeasure_stderr": 0.0016908336646337485, "rougeL_precision": 0.07273561966493279, "rougeL_precision_stderr": 0.001561121717880125, "rougeL_recall": 0.47591647381990865, "rougeL_recall_stderr": 0.0046462730133157, "rougeLsum_fmeasure": 0.10863800930732682, "rougeLsum_fmeasure_stderr": 0.0018630619874616906, "rougeLsum_precision": 0.06920233060909893, "rougeLsum_precision_stderr": 0.0017246238008069412, "rougeLsum_recall": 0.44001472970441646, "rougeLsum_recall_stderr": 0.004502443832581523}}, "3": {"PALM_prompt": {"bleu": 0.3916994292697065, "bleu_stderr": 0.02655023153261868, "rouge1_fmeasure": 0.11443103117633296, "rouge1_fmeasure_stderr": 0.0019845366723218495, "rouge1_precision": 0.07713695315738618, "rouge1_precision_stderr": 0.0018521617901133295, "rouge1_recall": 0.32641437991318506, "rouge1_recall_stderr": 0.004583689746653368, "rouge2_fmeasure": 0.05368996382308088, "rouge2_fmeasure_stderr": 0.0012403567348119643, "rouge2_precision": 0.036319480745632425, "rouge2_precision_stderr": 0.0012079439649413412, "rouge2_recall": 0.15985213856119682, "rouge2_recall_stderr": 0.003223582265695079, "rougeL_fmeasure": 0.10920281437234497, "rougeL_fmeasure_stderr": 0.0018472429946517301, "rougeL_precision": 0.07333561421491072, "rougeL_precision_stderr": 0.0017170202297610163, "rougeL_recall": 0.3129899723170166, "rougeL_recall_stderr": 0.004403504671395443, "rougeLsum_fmeasure": 0.10882089385505, "rougeLsum_fmeasure_stderr": 0.0018674885337082484, "rougeLsum_precision": 0.07332584684616669, "rougeLsum_precision_stderr": 0.0017553031622823821, "rougeLsum_recall": 0.31070372160179327, "rougeLsum_recall_stderr": 0.004314602758278112}, "explicit-graph-description2": {"bleu": 1.4187050896119742, "bleu_stderr": 0.05234246731012316, "rouge1_fmeasure": 0.16851267425125402, "rouge1_fmeasure_stderr": 0.00370915891341574, "rouge1_precision": 0.1480732637246692, "rouge1_precision_stderr": 0.004206315352922732, "rouge1_recall": 0.3705823844540417, "rouge1_recall_stderr": 0.005255405787076392, "rouge2_fmeasure": 0.06808437331115559, "rouge2_fmeasure_stderr": 0.0021900035041626056, "rouge2_precision": 0.05851115547615061, "rouge2_precision_stderr": 0.0023290778302875307, "rouge2_recall": 0.15887512559363004, "rouge2_recall_stderr": 0.0038897602570469245, "rougeL_fmeasure": 0.14033970117873198, "rougeL_fmeasure_stderr": 0.0028896013836388623, "rougeL_precision": 0.12082840824613766, "rougeL_precision_stderr": 0.0033519196264855337, "rougeL_recall": 0.33255889008853323, "rougeL_recall_stderr": 0.004852038022387119, "rougeLsum_fmeasure": 0.14398542256516328, "rougeLsum_fmeasure_stderr": 0.0033027404123591544, "rougeLsum_precision": 0.12767131894850914, "rougeLsum_precision_stderr": 0.0037644778529259577, "rougeLsum_recall": 0.3162971688824102, "rougeLsum_recall_stderr": 0.004747043761926457}, "implicit-graph-description": {"bleu": 0.8888663782890923, "bleu_stderr": 0.031568607067727, "rouge1_fmeasure": 0.09625702326801779, "rouge1_fmeasure_stderr": 0.0015524610950718182, "rouge1_precision": 0.05690926034885604, "rouge1_precision_stderr": 0.0011475096014499507, "rouge1_recall": 0.45426869216384025, "rouge1_recall_stderr": 0.004516571311698728, "rouge2_fmeasure": 0.036709719893509046, "rouge2_fmeasure_stderr": 0.0009246274500291746, "rouge2_precision": 0.021645434712929023, "rouge2_precision_stderr": 0.0006521179846718135, "rouge2_recall": 0.19508795473756288, "rouge2_recall_stderr": 0.0040357210440640995, "rougeL_fmeasure": 0.09064023633752837, "rougeL_fmeasure_stderr": 0.0013324331225587393, "rougeL_precision": 0.05330602414074714, "rougeL_precision_stderr": 0.000960960782770985, "rougeL_recall": 0.4341135138594583, "rougeL_recall_stderr": 0.004317367652444747, "rougeLsum_fmeasure": 0.08003764881477311, "rougeLsum_fmeasure_stderr": 0.0013775813454877939, "rougeLsum_precision": 0.04733129735724454, "rougeLsum_precision_stderr": 0.0010260711095464196, "rougeLsum_recall": 0.3853940847006117, "rougeLsum_recall_stderr": 0.004214948856669242}, "non-explicit-description": {"bleu": 1.408739047525639, "bleu_stderr": 0.037077565091005064, "rouge1_fmeasure": 0.14643243275737228, "rouge1_fmeasure_stderr": 0.0026100965175248907, "rouge1_precision": 0.09618625233754133, "rouge1_precision_stderr": 0.002582113721152913, "rouge1_recall": 0.5457125057626717, "rouge1_recall_stderr": 0.0045898974012682685, "rouge2_fmeasure": 0.06571024213935271, "rouge2_fmeasure_stderr": 0.0016742580356473582, "rouge2_precision": 0.04417002172231704, "rouge2_precision_stderr": 0.0017784684644853198, "rouge2_recall": 0.26721249890626364, "rouge2_recall_stderr": 0.004205423420143083, "rougeL_fmeasure": 0.12785451973740822, "rougeL_fmeasure_stderr": 0.002033342221993515, "rougeL_precision": 0.08292794661598844, "rougeL_precision_stderr": 0.0020960952284175775, "rougeL_recall": 0.49763673601262126, "rougeL_recall_stderr": 0.0043726175282994455, "rougeLsum_fmeasure": 0.1235235922343905, "rougeLsum_fmeasure_stderr": 0.0022715172117069057, "rougeLsum_precision": 0.08127713958483572, "rougeLsum_precision_stderr": 0.002294208086828085, "rougeLsum_recall": 0.4695938897289019, "rougeLsum_recall_stderr": 0.004353276805355962}, "very-explicit-description": {"bleu": 1.2968562517794118, "bleu_stderr": 0.03981886990665034, "rouge1_fmeasure": 0.1318424172773736, "rouge1_fmeasure_stderr": 0.002194221638712534, "rouge1_precision": 0.08332513771516135, "rouge1_precision_stderr": 0.0019868347121842745, "rouge1_recall": 0.5227234573351196, "rouge1_recall_stderr": 0.004548834912859631, "rouge2_fmeasure": 0.0579898457998029, "rouge2_fmeasure_stderr": 0.0013623575809253766, "rouge2_precision": 0.0368091501321238, "rouge2_precision_stderr": 0.0012212135534082967, "rouge2_recall": 0.25680330637418675, "rouge2_recall_stderr": 0.004245573310206607, "rougeL_fmeasure": 0.1196985480814696, "rougeL_fmeasure_stderr": 0.001792364415355123, "rougeL_precision": 0.07491478363930333, "rougeL_precision_stderr": 0.0016259914368083285, "rougeL_recall": 0.4884403183279412, "rougeL_recall_stderr": 0.0044306286785567766, "rougeLsum_fmeasure": 0.11277885328608343, "rougeLsum_fmeasure_stderr": 0.0019446200008606618, "rougeLsum_precision": 0.07137645686823287, "rougeLsum_precision_stderr": 0.0017762733333957057, "rougeLsum_recall": 0.4552935323318038, "rougeLsum_recall_stderr": 0.004331432435421035}}, "4": {"PALM_prompt": {"bleu": 0.37875018794247045, "bleu_stderr": 0.024296780304434905, "rouge1_fmeasure": 0.1102139471634063, "rouge1_fmeasure_stderr": 0.0019620155943445507, "rouge1_precision": 0.07231813177556075, "rouge1_precision_stderr": 0.0015118246585830762, "rouge1_recall": 0.31870699434574523, "rouge1_recall_stderr": 0.0046463072458484975, "rouge2_fmeasure": 0.0515680827205002, "rouge2_fmeasure_stderr": 0.001213141047008391, "rouge2_precision": 0.033695317164630666, "rouge2_precision_stderr": 0.0009163247572691914, "rouge2_recall": 0.15554105747469235, "rouge2_recall_stderr": 0.003236397171744527, "rougeL_fmeasure": 0.1054595308766645, "rougeL_fmeasure_stderr": 0.0018290764447497754, "rougeL_precision": 0.06901574750871842, "rougeL_precision_stderr": 0.0013947803448898716, "rougeL_recall": 0.30638147232578594, "rougeL_recall_stderr": 0.004472401624140904, "rougeLsum_fmeasure": 0.10492767242713451, "rougeLsum_fmeasure_stderr": 0.001836832016012516, "rougeLsum_precision": 0.06885871876103705, "rougeLsum_precision_stderr": 0.001420315845279487, "rougeLsum_recall": 0.30338100654345734, "rougeLsum_recall_stderr": 0.0043561963135752306}, "explicit-graph-description2": {"bleu": 1.245079321161806, "bleu_stderr": 0.02778718321398768, "rouge1_fmeasure": 0.13586161882742837, "rouge1_fmeasure_stderr": 0.003197161076846914, "rouge1_precision": 0.11219617600997056, "rouge1_precision_stderr": 0.003512511634359904, "rouge1_recall": 0.38024888704829407, "rouge1_recall_stderr": 0.005180394391296343, "rouge2_fmeasure": 0.052457179235399276, "rouge2_fmeasure_stderr": 0.0017515669453131311, "rouge2_precision": 0.042085938124191064, "rouge2_precision_stderr": 0.0018230760751544277, "rouge2_recall": 0.1645327089113456, "rouge2_recall_stderr": 0.003997411490344945, "rougeL_fmeasure": 0.11671197771344118, "rougeL_fmeasure_stderr": 0.0024641462705413553, "rougeL_precision": 0.09344570812882523, "rougeL_precision_stderr": 0.0027200533279541257, "rougeL_recall": 0.35242896432602544, "rougeL_recall_stderr": 0.0049265850395284186, "rougeLsum_fmeasure": 0.11463457129527055, "rougeLsum_fmeasure_stderr": 0.002803475929481998, "rougeLsum_precision": 0.09539660575352989, "rougeLsum_precision_stderr": 0.0030856580577620744, "rougeLsum_recall": 0.32391745612838724, "rougeLsum_recall_stderr": 0.004709017013998289}, "implicit-graph-description": {"bleu": 0.945854666461263, "bleu_stderr": 0.0402112381571833, "rouge1_fmeasure": 0.09460944755353914, "rouge1_fmeasure_stderr": 0.0014786963748578273, "rouge1_precision": 0.05604585253845832, "rouge1_precision_stderr": 0.0011427724679781285, "rouge1_recall": 0.4538828578884873, "rouge1_recall_stderr": 0.0044575960643494774, "rouge2_fmeasure": 0.036402813498665906, "rouge2_fmeasure_stderr": 0.0009043444520209133, "rouge2_precision": 0.021574156553869385, "rouge2_precision_stderr": 0.000666980759824044, "rouge2_recall": 0.19771338204023536, "rouge2_recall_stderr": 0.004119461394324271, "rougeL_fmeasure": 0.08940502094865169, "rougeL_fmeasure_stderr": 0.0012882532753422126, "rougeL_precision": 0.05270992641742948, "rougeL_precision_stderr": 0.000977850644744796, "rougeL_recall": 0.4347653663874296, "rougeL_recall_stderr": 0.004302103689446801, "rougeLsum_fmeasure": 0.07885538264707831, "rougeLsum_fmeasure_stderr": 0.0012911242439370375, "rougeLsum_precision": 0.046717493551301156, "rougeLsum_precision_stderr": 0.001010133723082569, "rougeLsum_recall": 0.38717142823870215, "rougeLsum_recall_stderr": 0.004286251024406202}, "non-explicit-description": {"bleu": 1.3615130495425574, "bleu_stderr": 0.03941780905887599, "rouge1_fmeasure": 0.14141133839063644, "rouge1_fmeasure_stderr": 0.0024329396682929907, "rouge1_precision": 0.09479449002481802, "rouge1_precision_stderr": 0.002655577330384621, "rouge1_recall": 0.5432614689390652, "rouge1_recall_stderr": 0.0045982556155288136, "rouge2_fmeasure": 0.06374220282296517, "rouge2_fmeasure_stderr": 0.0014814715876127712, "rouge2_precision": 0.04363505527515881, "rouge2_precision_stderr": 0.001672653936113288, "rouge2_recall": 0.27112334525352516, "rouge2_recall_stderr": 0.00422467374714849, "rougeL_fmeasure": 0.1229318925720696, "rougeL_fmeasure_stderr": 0.0019091259065232668, "rougeL_precision": 0.08141857432754715, "rougeL_precision_stderr": 0.0021601024343818197, "rougeL_recall": 0.4913866663151205, "rougeL_recall_stderr": 0.004409556454898837, "rougeLsum_fmeasure": 0.11894159333044178, "rougeLsum_fmeasure_stderr": 0.0020882148965706148, "rougeLsum_precision": 0.08007354294909853, "rougeLsum_precision_stderr": 0.0023461776768967193, "rougeLsum_recall": 0.46756254108881956, "rougeLsum_recall_stderr": 0.004390247412461436}, "very-explicit-description": {"bleu": 1.3609245244267043, "bleu_stderr": 0.033697352727562094, "rouge1_fmeasure": 0.13136185565213176, "rouge1_fmeasure_stderr": 0.002061912807178742, "rouge1_precision": 0.08350295350572431, "rouge1_precision_stderr": 0.001964565788491711, "rouge1_recall": 0.5269669069519911, "rouge1_recall_stderr": 0.004464761280028578, "rouge2_fmeasure": 0.05793797811823835, "rouge2_fmeasure_stderr": 0.0012534985992537161, "rouge2_precision": 0.036905855206776556, "rouge2_precision_stderr": 0.001145818807419939, "rouge2_recall": 0.26098146346727347, "rouge2_recall_stderr": 0.004211890254662535, "rougeL_fmeasure": 0.12003948650766051, "rougeL_fmeasure_stderr": 0.0017167875473952968, "rougeL_precision": 0.07557252948320996, "rougeL_precision_stderr": 0.0016449289985662312, "rougeL_recall": 0.49370616371308285, "rougeL_recall_stderr": 0.004327952531421013, "rougeLsum_fmeasure": 0.11220846020003944, "rougeLsum_fmeasure_stderr": 0.0018460781766260106, "rougeLsum_precision": 0.07149122309163901, "rougeLsum_precision_stderr": 0.0017785526726022418, "rougeLsum_recall": 0.4585698429370505, "rougeLsum_recall_stderr": 0.004258474803913088}}, "5": {"PALM_prompt": {"bleu": 0.3689406693649318, "bleu_stderr": 0.01833284872989782, "rouge1_fmeasure": 0.10942321553706275, "rouge1_fmeasure_stderr": 0.001960578009336271, "rouge1_precision": 0.0725733890131515, "rouge1_precision_stderr": 0.0016541722828599028, "rouge1_recall": 0.31647681542290346, "rouge1_recall_stderr": 0.004649887369574888, "rouge2_fmeasure": 0.05107734688924233, "rouge2_fmeasure_stderr": 0.00122669666906548, "rouge2_precision": 0.0340511038621137, "rouge2_precision_stderr": 0.001109810266658632, "rouge2_recall": 0.1539259113242296, "rouge2_recall_stderr": 0.003295678214536681, "rougeL_fmeasure": 0.10453686352175766, "rougeL_fmeasure_stderr": 0.0018279045748061345, "rougeL_precision": 0.06920155517233274, "rougeL_precision_stderr": 0.0015510532870385023, "rougeL_recall": 0.30372126675172995, "rougeL_recall_stderr": 0.004472388579309833, "rougeLsum_fmeasure": 0.1036182486745027, "rougeLsum_fmeasure_stderr": 0.0018321897232056268, "rougeLsum_precision": 0.0688242602910231, "rougeLsum_precision_stderr": 0.0015744817424633028, "rougeLsum_recall": 0.3000075619025587, "rougeLsum_recall_stderr": 0.004328528641012373}, "explicit-graph-description2": {"bleu": 1.1514814811735028, "bleu_stderr": 0.041945649669857865, "rouge1_fmeasure": 0.11729721313967732, "rouge1_fmeasure_stderr": 0.0027320054268922134, "rouge1_precision": 0.09393710373652846, "rouge1_precision_stderr": 0.003199269737822915, "rouge1_recall": 0.3745567023525177, "rouge1_recall_stderr": 0.005019704359364239, "rouge2_fmeasure": 0.045075512409701604, "rouge2_fmeasure_stderr": 0.0015508059027240881, "rouge2_precision": 0.03517936482889984, "rouge2_precision_stderr": 0.0016983516245859785, "rouge2_recall": 0.1605210440862932, "rouge2_recall_stderr": 0.0038664434520114016, "rougeL_fmeasure": 0.10332377556822082, "rougeL_fmeasure_stderr": 0.0021608967117124093, "rougeL_precision": 0.08008998231575896, "rougeL_precision_stderr": 0.00256681324622759, "rougeL_recall": 0.35287444686063746, "rougeL_recall_stderr": 0.004875421748566501, "rougeLsum_fmeasure": 0.09887761108872525, "rougeLsum_fmeasure_stderr": 0.002397235033540902, "rougeLsum_precision": 0.08023373310406072, "rougeLsum_precision_stderr": 0.00288649450260082, "rougeLsum_recall": 0.3198596952956621, "rougeLsum_recall_stderr": 0.004543895911760099}, "implicit-graph-description": {"bleu": 0.9354058693381648, "bleu_stderr": 0.02810525494577349, "rouge1_fmeasure": 0.09189669042348231, "rouge1_fmeasure_stderr": 0.0014444352152093109, "rouge1_precision": 0.054228941021206276, "rouge1_precision_stderr": 0.001096252527267709, "rouge1_recall": 0.44052473635855544, "rouge1_recall_stderr": 0.004434086426797913, "rouge2_fmeasure": 0.03482971179628577, "rouge2_fmeasure_stderr": 0.0009022373095447741, "rouge2_precision": 0.020624829786179254, "rouge2_precision_stderr": 0.0006718344531212511, "rouge2_recall": 0.18657022488728872, "rouge2_recall_stderr": 0.003970246553779722, "rougeL_fmeasure": 0.0870670682221663, "rougeL_fmeasure_stderr": 0.0012838750789689254, "rougeL_precision": 0.05120088679459415, "rougeL_precision_stderr": 0.0009633103342396731, "rougeL_recall": 0.4212363318994364, "rougeL_recall_stderr": 0.004211846253373217, "rougeLsum_fmeasure": 0.07691182427135196, "rougeLsum_fmeasure_stderr": 0.0013060342828319171, "rougeLsum_precision": 0.04543807296984024, "rougeLsum_precision_stderr": 0.0010157838089103063, "rougeLsum_recall": 0.37673581998691646, "rougeLsum_recall_stderr": 0.0042662278147285095}, "non-explicit-description": {"bleu": 1.343037351813773, "bleu_stderr": 0.036140936708617705, "rouge1_fmeasure": 0.13901031487174892, "rouge1_fmeasure_stderr": 0.0027114384521542325, "rouge1_precision": 0.09830029665183404, "rouge1_precision_stderr": 0.0032438344323854184, "rouge1_recall": 0.5307155773373864, "rouge1_recall_stderr": 0.004576617366471351, "rouge2_fmeasure": 0.0639493149144395, "rouge2_fmeasure_stderr": 0.0017580675198847523, "rouge2_precision": 0.04702658162841678, "rouge2_precision_stderr": 0.0021412388855118754, "rouge2_recall": 0.2655552533910532, "rouge2_recall_stderr": 0.004201121728916801, "rougeL_fmeasure": 0.12271761011397488, "rougeL_fmeasure_stderr": 0.0022754941590082543, "rougeL_precision": 0.08605550281508571, "rougeL_precision_stderr": 0.002809323179439482, "rougeL_recall": 0.4835198358351258, "rougeL_recall_stderr": 0.004434793924159799, "rougeLsum_fmeasure": 0.11803135941981421, "rougeLsum_fmeasure_stderr": 0.0024007564296031226, "rougeLsum_precision": 0.08393751091304703, "rougeLsum_precision_stderr": 0.0029130881830766323, "rougeLsum_recall": 0.4589032783950323, "rougeLsum_recall_stderr": 0.004394339374968956}, "very-explicit-description": {"bleu": 1.3127171878921704, "bleu_stderr": 0.04798969717597349, "rouge1_fmeasure": 0.12912740120963084, "rouge1_fmeasure_stderr": 0.00200364840452344, "rouge1_precision": 0.08171511695579445, "rouge1_precision_stderr": 0.0018633798688914489, "rouge1_recall": 0.5205014192182988, "rouge1_recall_stderr": 0.00444667928382001, "rouge2_fmeasure": 0.05600461944766409, "rouge2_fmeasure_stderr": 0.0012334741587230486, "rouge2_precision": 0.03547040912515195, "rouge2_precision_stderr": 0.0010877669331809023, "rouge2_recall": 0.2544026510929024, "rouge2_recall_stderr": 0.004229382794688773, "rougeL_fmeasure": 0.11838679216919511, "rougeL_fmeasure_stderr": 0.0016466394081195558, "rougeL_precision": 0.07419114114811591, "rougeL_precision_stderr": 0.0015341927152941845, "rougeL_recall": 0.4903982438780839, "rougeL_recall_stderr": 0.004319214854158963, "rougeLsum_fmeasure": 0.11026059508958445, "rougeLsum_fmeasure_stderr": 0.0018150927429308838, "rougeLsum_precision": 0.07010518510850142, "rougeLsum_precision_stderr": 0.001733506016922793, "rougeLsum_recall": 0.4514049055251742, "rougeLsum_recall_stderr": 0.00422208781632271}}}, "GEM/wiki_lingua_en": {"0": {"article_summary_en": {"bleu": 0.4446304965171368, "bleu_stderr": 0.02101564136831389, "rouge1_fmeasure": 0.09516029675770697, "rouge1_fmeasure_stderr": 0.0014796463176875243, "rouge1_precision": 0.08086960514171361, "rouge1_precision_stderr": 0.0014525200909349487, "rouge1_recall": 0.14073285537633912, "rouge1_recall_stderr": 0.0021365692846394106, "rouge2_fmeasure": 0.009594517812957653, "rouge2_fmeasure_stderr": 0.00045393089210895286, "rouge2_precision": 0.008229815685974943, "rouge2_precision_stderr": 0.0003995476871805908, "rouge2_recall": 0.014689368147720344, "rouge2_recall_stderr": 0.0008211286470989261, "rougeL_fmeasure": 0.08409008430092453, "rougeL_fmeasure_stderr": 0.0011584868305005939, "rougeL_precision": 0.07093712041741049, "rougeL_precision_stderr": 0.0011411235398140263, "rougeL_recall": 0.12604423823582933, "rougeL_recall_stderr": 0.0017629787569386458, "rougeLsum_fmeasure": 0.09152903732984705, "rougeLsum_fmeasure_stderr": 0.0013950176929762666, "rougeLsum_precision": 0.07781962570247349, "rougeLsum_precision_stderr": 0.0013794354759233953, "rougeLsum_recall": 0.13531803934376344, "rougeLsum_recall_stderr": 0.0020069265089291905}, "rephrase_en": {"bleu": 0.09735038671178359, "bleu_stderr": 0.01912399145191819, "rouge1_fmeasure": 0.07882898461240369, "rouge1_fmeasure_stderr": 0.001040524409407945, "rouge1_precision": 0.06660092740337692, "rouge1_precision_stderr": 0.0010310436131496232, "rouge1_recall": 0.11598354826382899, "rouge1_recall_stderr": 0.0014500523897201271, "rouge2_fmeasure": 0.003243321779952968, "rouge2_fmeasure_stderr": 0.00020438291184088177, "rouge2_precision": 0.0028614323156141855, "rouge2_precision_stderr": 0.000180186892026584, "rouge2_recall": 0.004436869555896244, "rouge2_recall_stderr": 0.0002993853890744701, "rougeL_fmeasure": 0.07272695606243273, "rougeL_fmeasure_stderr": 0.0008850843973306463, "rougeL_precision": 0.06084499114194029, "rougeL_precision_stderr": 0.0008567327435936374, "rougeL_recall": 0.10872582938119582, "rougeL_recall_stderr": 0.001328386237011712, "rougeLsum_fmeasure": 0.07152252322090602, "rougeLsum_fmeasure_stderr": 0.0009232099420132897, "rougeLsum_precision": 0.06025198497698202, "rougeLsum_precision_stderr": 0.0009166556048550033, "rougeLsum_recall": 0.10601353160224014, "rougeLsum_recall_stderr": 0.00132199783950748}, "summarize_above_en": {"bleu": 0.02719689123536131, "bleu_stderr": 0.0044363498052874435, "rouge1_fmeasure": 0.09629596799033903, "rouge1_fmeasure_stderr": 0.0011603461268498176, "rouge1_precision": 0.08473975013844745, "rouge1_precision_stderr": 0.0012652837122476013, "rouge1_recall": 0.1329672875133205, "rouge1_recall_stderr": 0.0014302651043992514, "rouge2_fmeasure": 0.0042667329498244436, "rouge2_fmeasure_stderr": 0.00020313195756774255, "rouge2_precision": 0.003906179417613626, "rouge2_precision_stderr": 0.00019031811304692613, "rouge2_recall": 0.0055735013712897765, "rouge2_recall_stderr": 0.00029292378757282304, "rougeL_fmeasure": 0.08883637966635893, "rougeL_fmeasure_stderr": 0.0010087038286843665, "rougeL_precision": 0.07735309410625876, "rougeL_precision_stderr": 0.001069934210773379, "rougeL_recall": 0.1244809192327779, "rougeL_recall_stderr": 0.001333807275878821, "rougeLsum_fmeasure": 0.09211538989861727, "rougeLsum_fmeasure_stderr": 0.0010998765766529236, "rougeLsum_precision": 0.08109267338305781, "rougeLsum_precision_stderr": 0.0012068363440650514, "rougeLsum_recall": 0.12723188277215594, "rougeLsum_recall_stderr": 0.001357597198069678}, "tldr_en": {"bleu": 0.14859459498800928, "bleu_stderr": 0.019538924284114197, "rouge1_fmeasure": 0.05739499438745971, "rouge1_fmeasure_stderr": 0.000959404112224303, "rouge1_precision": 0.0505257980847339, "rouge1_precision_stderr": 0.0009316535384306269, "rouge1_recall": 0.07944574030730557, "rouge1_recall_stderr": 0.0013436416711421135, "rouge2_fmeasure": 0.002874313185982406, "rouge2_fmeasure_stderr": 0.00022791640812011725, "rouge2_precision": 0.0025522430280765624, "rouge2_precision_stderr": 0.00019188137819214202, "rouge2_recall": 0.003889530091650386, "rouge2_recall_stderr": 0.0003502492712853139, "rougeL_fmeasure": 0.053008425140295905, "rougeL_fmeasure_stderr": 0.0008201296289562219, "rougeL_precision": 0.04641940622808299, "rougeL_precision_stderr": 0.0007892141876172595, "rougeL_recall": 0.07403750267734954, "rougeL_recall_stderr": 0.0011941764542527037, "rougeLsum_fmeasure": 0.05476968826480647, "rougeLsum_fmeasure_stderr": 0.0008953065390907387, "rougeLsum_precision": 0.04816018092226108, "rougeLsum_precision_stderr": 0.0008696938078640268, "rougeLsum_recall": 0.07599978162074517, "rougeLsum_recall_stderr": 0.0012632037879506343}, "write_abstract_en": {"bleu": 0.025005452627405448, "bleu_stderr": 0.004798052609004983, "rouge1_fmeasure": 0.05331824068511711, "rouge1_fmeasure_stderr": 0.0009108204142889356, "rouge1_precision": 0.050536252348243965, "rouge1_precision_stderr": 0.0010303713135263982, "rouge1_recall": 0.07249280873981946, "rouge1_recall_stderr": 0.0012874203666130126, "rouge2_fmeasure": 0.0011035986294212138, "rouge2_fmeasure_stderr": 0.00011286020352486199, "rouge2_precision": 0.0011620656110374618, "rouge2_precision_stderr": 0.00014360139445475996, "rouge2_recall": 0.001522213922602407, "rouge2_recall_stderr": 0.0001808244327036453, "rougeL_fmeasure": 0.049896775119845964, "rougeL_fmeasure_stderr": 0.0008243222785585304, "rougeL_precision": 0.046800024712754074, "rougeL_precision_stderr": 0.0009037799228350519, "rougeL_recall": 0.06839046112658723, "rougeL_recall_stderr": 0.0011982762108255986, "rougeLsum_fmeasure": 0.04804101501692195, "rougeLsum_fmeasure_stderr": 0.0008043684532992358, "rougeLsum_precision": 0.04562287089619549, "rougeLsum_precision_stderr": 0.0009343218195610894, "rougeLsum_recall": 0.06572279893423466, "rougeLsum_recall_stderr": 0.0011613172719751475}}, "1": {"article_summary_en": {"bleu": 0.8653788220477945, "bleu_stderr": 0.0447073485750703, "rouge1_fmeasure": 0.14245264302938512, "rouge1_fmeasure_stderr": 0.0016832867172118543, "rouge1_precision": 0.12036832068446836, "rouge1_precision_stderr": 0.001698822978107527, "rouge1_recall": 0.21036263585148327, "rouge1_recall_stderr": 0.002320909333489002, "rouge2_fmeasure": 0.017846850141455827, "rouge2_fmeasure_stderr": 0.0006531290340899643, "rouge2_precision": 0.015115117361918744, "rouge2_precision_stderr": 0.000577145021228364, "rouge2_recall": 0.02662558452098901, "rouge2_recall_stderr": 0.0010503502526558268, "rougeL_fmeasure": 0.1106783481573006, "rougeL_fmeasure_stderr": 0.00113630622144049, "rougeL_precision": 0.09229239279906276, "rougeL_precision_stderr": 0.0011398229641967105, "rougeL_recall": 0.16800779726654005, "rougeL_recall_stderr": 0.0017662341943095999, "rougeLsum_fmeasure": 0.13385718956690607, "rougeLsum_fmeasure_stderr": 0.0015602587391599571, "rougeLsum_precision": 0.11299875946868077, "rougeLsum_precision_stderr": 0.0015782836399768998, "rougeLsum_recall": 0.19806432139905789, "rougeLsum_recall_stderr": 0.002160572168760569}, "rephrase_en": {"bleu": 0.6029781033915351, "bleu_stderr": 0.028174541781078276, "rouge1_fmeasure": 0.10178108004797332, "rouge1_fmeasure_stderr": 0.0015749327774327397, "rouge1_precision": 0.08855732672619102, "rouge1_precision_stderr": 0.0015064472119031337, "rouge1_recall": 0.14544347329432597, "rouge1_recall_stderr": 0.002411071227170801, "rouge2_fmeasure": 0.010181112623842817, "rouge2_fmeasure_stderr": 0.0005119910300898817, "rouge2_precision": 0.008560544006655627, "rouge2_precision_stderr": 0.0004413067196946586, "rouge2_recall": 0.01616782557058256, "rouge2_recall_stderr": 0.000930039738517821, "rougeL_fmeasure": 0.07977466300345593, "rougeL_fmeasure_stderr": 0.001103140871324476, "rougeL_precision": 0.06881259230018068, "rougeL_precision_stderr": 0.0010402326401547975, "rougeL_recall": 0.1164247420147934, "rougeL_recall_stderr": 0.0018577911811822583, "rougeLsum_fmeasure": 0.09536226516262464, "rougeLsum_fmeasure_stderr": 0.0014532751500060744, "rougeLsum_precision": 0.08290521041610925, "rougeLsum_precision_stderr": 0.0013971705074040997, "rougeLsum_recall": 0.13668730020343445, "rougeLsum_recall_stderr": 0.0022409703680440853}, "summarize_above_en": {"bleu": 0.10510472666663716, "bleu_stderr": 0.02342215944592973, "rouge1_fmeasure": 0.0918764247127699, "rouge1_fmeasure_stderr": 0.0011260140101502995, "rouge1_precision": 0.07936181927044636, "rouge1_precision_stderr": 0.0011762568246866275, "rouge1_recall": 0.1308334139584999, "rouge1_recall_stderr": 0.0014823829252056944, "rouge2_fmeasure": 0.00423567615381497, "rouge2_fmeasure_stderr": 0.00021018422233903274, "rouge2_precision": 0.00393290828598269, "rouge2_precision_stderr": 0.0002040513153487916, "rouge2_recall": 0.005507726691318861, "rouge2_recall_stderr": 0.00030125280804493035, "rougeL_fmeasure": 0.08666723799936435, "rougeL_fmeasure_stderr": 0.000999850410378775, "rougeL_precision": 0.07417084074707798, "rougeL_precision_stderr": 0.001018585184337875, "rougeL_recall": 0.12499113633870057, "rougeL_recall_stderr": 0.0013966793071261054, "rougeLsum_fmeasure": 0.08825782099755484, "rougeLsum_fmeasure_stderr": 0.0010565687929762516, "rougeLsum_precision": 0.07610364515575635, "rougeLsum_precision_stderr": 0.0011064264710643904, "rougeLsum_recall": 0.12615810768661415, "rougeLsum_recall_stderr": 0.001412447688376772}, "tldr_en": {"bleu": 1.346869321685321, "bleu_stderr": 0.06607443264134674, "rouge1_fmeasure": 0.15836293018887368, "rouge1_fmeasure_stderr": 0.001891640433790272, "rouge1_precision": 0.13584035826185337, "rouge1_precision_stderr": 0.001883003830967405, "rouge1_recall": 0.2300655738827389, "rouge1_recall_stderr": 0.002771815067524841, "rouge2_fmeasure": 0.028190707681194575, "rouge2_fmeasure_stderr": 0.0008118715649523407, "rouge2_precision": 0.02400583408187123, "rouge2_precision_stderr": 0.0007285343955308211, "rouge2_recall": 0.04289680599794199, "rouge2_recall_stderr": 0.001353605133009988, "rougeL_fmeasure": 0.12130392597137107, "rougeL_fmeasure_stderr": 0.0012906736654645788, "rougeL_precision": 0.10298984208878467, "rougeL_precision_stderr": 0.0012746961770117027, "rougeL_recall": 0.18009351587124983, "rougeL_recall_stderr": 0.002104050870387258, "rougeLsum_fmeasure": 0.14788907195330203, "rougeLsum_fmeasure_stderr": 0.0017520956258023405, "rougeLsum_precision": 0.12668865175777289, "rougeLsum_precision_stderr": 0.0017429813042225584, "rougeLsum_recall": 0.21558088869876474, "rougeLsum_recall_stderr": 0.0026000978058887433}, "write_abstract_en": {"bleu": 0.6148382566867375, "bleu_stderr": 0.020840940915060138, "rouge1_fmeasure": 0.11453232328159935, "rouge1_fmeasure_stderr": 0.0015165645681982799, "rouge1_precision": 0.10336569841892168, "rouge1_precision_stderr": 0.0015572986318203581, "rouge1_recall": 0.15980468261934022, "rouge1_recall_stderr": 0.0021843290096728234, "rouge2_fmeasure": 0.010646298254836605, "rouge2_fmeasure_stderr": 0.0005004715634605305, "rouge2_precision": 0.00937353738620273, "rouge2_precision_stderr": 0.0004621640258235172, "rouge2_recall": 0.01583249449894538, "rouge2_recall_stderr": 0.0008242203780143205, "rougeL_fmeasure": 0.08826342080759857, "rougeL_fmeasure_stderr": 0.0010184729602732176, "rougeL_precision": 0.0790900720241622, "rougeL_precision_stderr": 0.0010673462835914323, "rougeL_recall": 0.126113983071195, "rougeL_recall_stderr": 0.00164608010815976, "rougeLsum_fmeasure": 0.10835203045408379, "rougeLsum_fmeasure_stderr": 0.0014203571658610793, "rougeLsum_precision": 0.09781712366399713, "rougeLsum_precision_stderr": 0.0014684084322258415, "rougeLsum_recall": 0.15149051165585709, "rougeLsum_recall_stderr": 0.0020607697766305863}}, "2": {"article_summary_en": {"bleu": 1.0323682964004037, "bleu_stderr": 0.05728647371845087, "rouge1_fmeasure": 0.15317578509745602, "rouge1_fmeasure_stderr": 0.0017377869775542976, "rouge1_precision": 0.12987828446896582, "rouge1_precision_stderr": 0.0017668539132634244, "rouge1_recall": 0.22449329045901115, "rouge1_recall_stderr": 0.002446761016968778, "rouge2_fmeasure": 0.022535640055881916, "rouge2_fmeasure_stderr": 0.00069644020248688, "rouge2_precision": 0.019205230811756614, "rouge2_precision_stderr": 0.0006259651415811169, "rouge2_recall": 0.034096443155707416, "rouge2_recall_stderr": 0.001170242516537603, "rougeL_fmeasure": 0.11716221211305587, "rougeL_fmeasure_stderr": 0.0011775173863940828, "rougeL_precision": 0.09796065026425818, "rougeL_precision_stderr": 0.001176944853197895, "rougeL_recall": 0.1766738402223476, "rougeL_recall_stderr": 0.0019113957567420726, "rougeLsum_fmeasure": 0.1434435746637532, "rougeLsum_fmeasure_stderr": 0.0016022296713075052, "rougeLsum_precision": 0.12148938565288571, "rougeLsum_precision_stderr": 0.0016327095456720253, "rougeLsum_recall": 0.21077828674873728, "rougeLsum_recall_stderr": 0.0022836578166374076}, "rephrase_en": {"bleu": 1.168914893531667, "bleu_stderr": 0.05316184069367659, "rouge1_fmeasure": 0.12386118481423918, "rouge1_fmeasure_stderr": 0.0019396226221506604, "rouge1_precision": 0.10757274244241427, "rouge1_precision_stderr": 0.001866638618407923, "rouge1_recall": 0.1768593713292845, "rouge1_recall_stderr": 0.0028614155741575843, "rouge2_fmeasure": 0.02117387153026309, "rouge2_fmeasure_stderr": 0.0007530823627085244, "rouge2_precision": 0.01811267415569796, "rouge2_precision_stderr": 0.0006924773828138044, "rouge2_recall": 0.03241684505960385, "rouge2_recall_stderr": 0.001268207503708144, "rougeL_fmeasure": 0.09831165574662265, "rougeL_fmeasure_stderr": 0.0014131960658590383, "rougeL_precision": 0.08462993923914566, "rougeL_precision_stderr": 0.00135467976486583, "rougeL_recall": 0.14336450100833226, "rougeL_recall_stderr": 0.002264556242154818, "rougeLsum_fmeasure": 0.11426827856742555, "rougeLsum_fmeasure_stderr": 0.0017893139149704248, "rougeLsum_precision": 0.09907287995201415, "rougeLsum_precision_stderr": 0.0017198819213882605, "rougeLsum_recall": 0.16367866039452192, "rougeLsum_recall_stderr": 0.0026624060636886954}, "summarize_above_en": {"bleu": 0.19993488449475255, "bleu_stderr": 0.04329139775558018, "rouge1_fmeasure": 0.09256018146781847, "rouge1_fmeasure_stderr": 0.0011555403499640406, "rouge1_precision": 0.07947361253543833, "rouge1_precision_stderr": 0.001193005591889888, "rouge1_recall": 0.1327421000432318, "rouge1_recall_stderr": 0.0015486342539799732, "rouge2_fmeasure": 0.004697153886380661, "rouge2_fmeasure_stderr": 0.00027487360812121926, "rouge2_precision": 0.004199424248706065, "rouge2_precision_stderr": 0.0002403958914677049, "rouge2_recall": 0.006627430748129385, "rouge2_recall_stderr": 0.00047327454697217267, "rougeL_fmeasure": 0.08788495344807927, "rougeL_fmeasure_stderr": 0.001034393917882335, "rougeL_precision": 0.07476922060726728, "rougeL_precision_stderr": 0.001036726053715039, "rougeL_recall": 0.127478433032578, "rougeL_recall_stderr": 0.0014601336227070283, "rougeLsum_fmeasure": 0.08832933606271764, "rougeLsum_fmeasure_stderr": 0.0010830718460803723, "rougeLsum_precision": 0.07575953247297404, "rougeLsum_precision_stderr": 0.0011219051960697426, "rougeLsum_recall": 0.12703786617602278, "rougeLsum_recall_stderr": 0.001464179232269078}, "tldr_en": {"bleu": 2.247794388992107, "bleu_stderr": 0.09928029909737168, "rouge1_fmeasure": 0.1979943409707515, "rouge1_fmeasure_stderr": 0.0019701979235594003, "rouge1_precision": 0.1708173947979992, "rouge1_precision_stderr": 0.002081500885534108, "rouge1_recall": 0.2857546859749413, "rouge1_recall_stderr": 0.002766073063501205, "rouge2_fmeasure": 0.04456119604899187, "rouge2_fmeasure_stderr": 0.0009779372383836055, "rouge2_precision": 0.03867526801746755, "rouge2_precision_stderr": 0.0009249837176721441, "rouge2_recall": 0.06488659439579164, "rouge2_recall_stderr": 0.0015151297174771778, "rougeL_fmeasure": 0.14561448339286645, "rougeL_fmeasure_stderr": 0.0013308339769298708, "rougeL_precision": 0.12436000788073821, "rougeL_precision_stderr": 0.0013936495773448447, "rougeL_recall": 0.21512580022359384, "rougeL_recall_stderr": 0.0021392819398317084, "rougeLsum_fmeasure": 0.18443888049277404, "rougeLsum_fmeasure_stderr": 0.0018323545176278458, "rougeLsum_precision": 0.15896932268065497, "rougeLsum_precision_stderr": 0.0019322450604166179, "rougeLsum_recall": 0.2668532631896298, "rougeLsum_recall_stderr": 0.00261075911429361}, "write_abstract_en": {"bleu": 1.157589240412585, "bleu_stderr": 0.06749886174670726, "rouge1_fmeasure": 0.1389508648632086, "rouge1_fmeasure_stderr": 0.0018875843940261779, "rouge1_precision": 0.1239472716246903, "rouge1_precision_stderr": 0.0018733837629968672, "rouge1_recall": 0.19751962957109306, "rouge1_recall_stderr": 0.0027503417388775164, "rouge2_fmeasure": 0.02129586388647884, "rouge2_fmeasure_stderr": 0.0007438044749129246, "rouge2_precision": 0.018494424650071603, "rouge2_precision_stderr": 0.0006803416768500626, "rouge2_recall": 0.0314952015813472, "rouge2_recall_stderr": 0.0011962370296272414, "rougeL_fmeasure": 0.10552665111055606, "rougeL_fmeasure_stderr": 0.0012791910056744688, "rougeL_precision": 0.09361354942631474, "rougeL_precision_stderr": 0.0012858909938593117, "rougeL_recall": 0.15349834743094354, "rougeL_recall_stderr": 0.0020743788462299056, "rougeLsum_fmeasure": 0.12992721021691855, "rougeLsum_fmeasure_stderr": 0.0017593130424382794, "rougeLsum_precision": 0.11595972550990691, "rougeLsum_precision_stderr": 0.001753308531131635, "rougeLsum_recall": 0.18500426729144692, "rougeLsum_recall_stderr": 0.0025796753919204852}}, "3": {"article_summary_en": {"bleu": 1.1833686716294687, "bleu_stderr": 0.07615919722768849, "rouge1_fmeasure": 0.1336180051181457, "rouge1_fmeasure_stderr": 0.001957815366131097, "rouge1_precision": 0.11777387943190545, "rouge1_precision_stderr": 0.0020113312278699025, "rouge1_recall": 0.19469078652969996, "rouge1_recall_stderr": 0.0027977487029445204, "rouge2_fmeasure": 0.021648290209856712, "rouge2_fmeasure_stderr": 0.0007245383572720933, "rouge2_precision": 0.018842490102852782, "rouge2_precision_stderr": 0.000680153964280099, "rouge2_recall": 0.03249985364518616, "rouge2_recall_stderr": 0.0011921354860818573, "rougeL_fmeasure": 0.10208979953114408, "rougeL_fmeasure_stderr": 0.001397295719317153, "rougeL_precision": 0.08919191590884211, "rougeL_precision_stderr": 0.0014541958760042968, "rougeL_recall": 0.153037543777827, "rougeL_recall_stderr": 0.002210331899287786, "rougeLsum_fmeasure": 0.12445447248036409, "rougeLsum_fmeasure_stderr": 0.0018122356410507367, "rougeLsum_precision": 0.10958122883328779, "rougeLsum_precision_stderr": 0.0018686579393142751, "rougeLsum_recall": 0.1819882090629157, "rougeLsum_recall_stderr": 0.0026135117445184627}, "rephrase_en": {"bleu": 1.550517196878354, "bleu_stderr": 0.06993149956571287, "rouge1_fmeasure": 0.11809803100929343, "rouge1_fmeasure_stderr": 0.0020464135073259295, "rouge1_precision": 0.10720171803251193, "rouge1_precision_stderr": 0.002082996644638251, "rouge1_recall": 0.16612796673148053, "rouge1_recall_stderr": 0.002954548192082436, "rouge2_fmeasure": 0.022197025590925616, "rouge2_fmeasure_stderr": 0.0007547906063236964, "rouge2_precision": 0.02009963124741285, "rouge2_precision_stderr": 0.0008011300876613331, "rouge2_recall": 0.03248360016391421, "rouge2_recall_stderr": 0.0012256498842740874, "rougeL_fmeasure": 0.09416415981113054, "rougeL_fmeasure_stderr": 0.0015409418875020468, "rougeL_precision": 0.08482141911566571, "rougeL_precision_stderr": 0.0015735036854485902, "rougeL_recall": 0.1351988789967234, "rougeL_recall_stderr": 0.002389596247655131, "rougeLsum_fmeasure": 0.10837367169734866, "rougeLsum_fmeasure_stderr": 0.001874380599850097, "rougeLsum_precision": 0.09848261188127233, "rougeLsum_precision_stderr": 0.0019260433293872472, "rougeLsum_recall": 0.15267739552040577, "rougeLsum_recall_stderr": 0.0027171840590306546}, "summarize_above_en": {"bleu": 0.24585325834528793, "bleu_stderr": 0.03702715577574997, "rouge1_fmeasure": 0.07737568279361932, "rouge1_fmeasure_stderr": 0.001265246486643532, "rouge1_precision": 0.07037274511161584, "rouge1_precision_stderr": 0.0014120231404269865, "rouge1_recall": 0.11052779587284725, "rouge1_recall_stderr": 0.0017290812995824128, "rouge2_fmeasure": 0.0040651974203171634, "rouge2_fmeasure_stderr": 0.00025930635468911277, "rouge2_precision": 0.00392300118223772, "rouge2_precision_stderr": 0.0002557581635323497, "rouge2_recall": 0.005589835600869042, "rouge2_recall_stderr": 0.000423176072989138, "rougeL_fmeasure": 0.07258759044318532, "rougeL_fmeasure_stderr": 0.001122423619227686, "rougeL_precision": 0.06512620770805046, "rougeL_precision_stderr": 0.00121951222912035, "rougeL_recall": 0.10533676686618007, "rougeL_recall_stderr": 0.0016204045193858512, "rougeLsum_fmeasure": 0.07343581428013704, "rougeLsum_fmeasure_stderr": 0.0011835280613330765, "rougeLsum_precision": 0.06672709213349835, "rougeLsum_precision_stderr": 0.0013263129049748312, "rougeLsum_recall": 0.10524011607734472, "rougeLsum_recall_stderr": 0.0016343300862625232}, "tldr_en": {"bleu": 2.212268753332442, "bleu_stderr": 0.09749124513916169, "rouge1_fmeasure": 0.17165885454358776, "rouge1_fmeasure_stderr": 0.002266523062316873, "rouge1_precision": 0.15395480397824707, "rouge1_precision_stderr": 0.002402542953049042, "rouge1_recall": 0.24557521515724243, "rouge1_recall_stderr": 0.0032199609208037362, "rouge2_fmeasure": 0.03887583188926559, "rouge2_fmeasure_stderr": 0.0009318869486006191, "rouge2_precision": 0.0345849314932535, "rouge2_precision_stderr": 0.000925086814362582, "rouge2_recall": 0.05685215254790676, "rouge2_recall_stderr": 0.0014741611375388177, "rougeL_fmeasure": 0.12611107179368627, "rougeL_fmeasure_stderr": 0.0015753873989903184, "rougeL_precision": 0.11262382269704937, "rougeL_precision_stderr": 0.001735467609168603, "rougeL_recall": 0.1853949845751863, "rougeL_recall_stderr": 0.002495030067359919, "rougeLsum_fmeasure": 0.15920866146470747, "rougeLsum_fmeasure_stderr": 0.0021025779963518188, "rougeLsum_precision": 0.1426484686620457, "rougeLsum_precision_stderr": 0.0022377962576445335, "rougeLsum_recall": 0.22876101619014125, "rougeLsum_recall_stderr": 0.0030299996388814796}, "write_abstract_en": {"bleu": 1.4447905460689463, "bleu_stderr": 0.0650728846043769, "rouge1_fmeasure": 0.12031046206784149, "rouge1_fmeasure_stderr": 0.002096721222333191, "rouge1_precision": 0.11061508225201387, "rouge1_precision_stderr": 0.00219210163323952, "rouge1_recall": 0.17229542877999413, "rouge1_recall_stderr": 0.0030954808441172386, "rouge2_fmeasure": 0.021617853652187335, "rouge2_fmeasure_stderr": 0.000749070842689932, "rouge2_precision": 0.019285223842382425, "rouge2_precision_stderr": 0.0007386242173115931, "rouge2_recall": 0.032665716658571785, "rouge2_recall_stderr": 0.0012591431714370847, "rougeL_fmeasure": 0.09132576653371158, "rougeL_fmeasure_stderr": 0.0014923093239882957, "rougeL_precision": 0.08351753016640841, "rougeL_precision_stderr": 0.0015962438735235313, "rougeL_recall": 0.13414611484067868, "rougeL_recall_stderr": 0.0024039219745857114, "rougeLsum_fmeasure": 0.11255926347759326, "rougeLsum_fmeasure_stderr": 0.0019516313006938469, "rougeLsum_precision": 0.10363733813080353, "rougeLsum_precision_stderr": 0.0020595302264073945, "rougeLsum_recall": 0.16163856120472586, "rougeLsum_recall_stderr": 0.0029085585556816312}}, "4": {"article_summary_en": {"bleu": 0.28624026182603163, "bleu_stderr": 0.029349438130954253, "rouge1_fmeasure": 0.04366405939122397, "rouge1_fmeasure_stderr": 0.0015831202808765945, "rouge1_precision": 0.03995701958166195, "rouge1_precision_stderr": 0.0016095372311748183, "rouge1_recall": 0.06674356961372938, "rouge1_recall_stderr": 0.002448300622281182, "rouge2_fmeasure": 0.007536808369783641, "rouge2_fmeasure_stderr": 0.0004892115817681631, "rouge2_precision": 0.006552251076514466, "rouge2_precision_stderr": 0.00045575183717078306, "rouge2_recall": 0.012072979341302284, "rouge2_recall_stderr": 0.0008431856424377719, "rougeL_fmeasure": 0.03438650141175432, "rougeL_fmeasure_stderr": 0.0012227168700738242, "rougeL_precision": 0.03139542886892282, "rougeL_precision_stderr": 0.0012706755231911566, "rougeL_recall": 0.054115916065237676, "rougeL_recall_stderr": 0.002006663515515118, "rougeLsum_fmeasure": 0.040753188180590226, "rougeLsum_fmeasure_stderr": 0.0014756445728831532, "rougeLsum_precision": 0.03739170639098872, "rougeLsum_precision_stderr": 0.0015159196007358158, "rougeLsum_recall": 0.062356306725636794, "rougeLsum_recall_stderr": 0.00228581975031369}, "rephrase_en": {"bleu": 0.2906818896887448, "bleu_stderr": 0.032241949265799034, "rouge1_fmeasure": 0.03946196457385396, "rouge1_fmeasure_stderr": 0.001572740226370851, "rouge1_precision": 0.03740904754421892, "rouge1_precision_stderr": 0.001610680414155072, "rouge1_recall": 0.057441900471446455, "rouge1_recall_stderr": 0.0023292227400479332, "rouge2_fmeasure": 0.008171833643724272, "rouge2_fmeasure_stderr": 0.0005061088990249447, "rouge2_precision": 0.007405435782553709, "rouge2_precision_stderr": 0.0004938171762244464, "rouge2_recall": 0.012822151487257908, "rouge2_recall_stderr": 0.000886391837086689, "rougeL_fmeasure": 0.03213695990330082, "rougeL_fmeasure_stderr": 0.0012354431038129371, "rougeL_precision": 0.0302784538841258, "rougeL_precision_stderr": 0.001265349446879213, "rougeL_recall": 0.047977766809024824, "rougeL_recall_stderr": 0.001937859235680591, "rougeLsum_fmeasure": 0.036104092106741856, "rougeLsum_fmeasure_stderr": 0.0014394317299840206, "rougeLsum_precision": 0.034287485555940835, "rougeLsum_precision_stderr": 0.0014785361480277362, "rougeLsum_recall": 0.05268920626830129, "rougeLsum_recall_stderr": 0.0021423352455123288}, "summarize_above_en": {"bleu": 0.0725273475872256, "bleu_stderr": 0.01184587819436072, "rouge1_fmeasure": 0.025475522197387277, "rouge1_fmeasure_stderr": 0.0010381041342986902, "rouge1_precision": 0.02381329967362824, "rouge1_precision_stderr": 0.0011028927436461343, "rouge1_recall": 0.0375355725294583, "rouge1_recall_stderr": 0.0014975837973007148, "rouge2_fmeasure": 0.0018254913452193152, "rouge2_fmeasure_stderr": 0.0002100606076864114, "rouge2_precision": 0.0019534338228814068, "rouge2_precision_stderr": 0.00025424799653585257, "rouge2_recall": 0.002574143785393473, "rouge2_recall_stderr": 0.00033042674187292473, "rougeL_fmeasure": 0.023474073198495527, "rougeL_fmeasure_stderr": 0.0009278481009221991, "rougeL_precision": 0.021656975285866457, "rougeL_precision_stderr": 0.0009662420131766899, "rougeL_recall": 0.035213960582413724, "rougeL_recall_stderr": 0.0013920460584087664, "rougeLsum_fmeasure": 0.0240648893225021, "rougeLsum_fmeasure_stderr": 0.0009742444146235487, "rougeLsum_precision": 0.022475087207169998, "rougeLsum_precision_stderr": 0.0010391682891249803, "rougeLsum_recall": 0.03568293304710748, "rougeLsum_recall_stderr": 0.001419767775872114}, "tldr_en": {"bleu": 0.5159665881377578, "bleu_stderr": 0.0355848354666848, "rouge1_fmeasure": 0.057194516095400834, "rouge1_fmeasure_stderr": 0.0019532780600384314, "rouge1_precision": 0.05323367871163294, "rouge1_precision_stderr": 0.002019764759552398, "rouge1_recall": 0.08477832633294664, "rouge1_recall_stderr": 0.0029256767843087337, "rouge2_fmeasure": 0.013407675922368708, "rouge2_fmeasure_stderr": 0.0006687476239191229, "rouge2_precision": 0.011892613439611567, "rouge2_precision_stderr": 0.0006525291351364055, "rouge2_recall": 0.021014952711012305, "rouge2_recall_stderr": 0.0011420293722952714, "rougeL_fmeasure": 0.0429641799328658, "rougeL_fmeasure_stderr": 0.0014362645716637073, "rougeL_precision": 0.03994947969161068, "rougeL_precision_stderr": 0.0015323136750776513, "rougeL_recall": 0.0654114763710059, "rougeL_recall_stderr": 0.0022890377474177525, "rougeLsum_fmeasure": 0.052823275351703086, "rougeLsum_fmeasure_stderr": 0.0018085153746161442, "rougeLsum_precision": 0.049203521589357486, "rougeLsum_precision_stderr": 0.0018745212624263795, "rougeLsum_recall": 0.07828616324493042, "rougeLsum_recall_stderr": 0.00270998239625019}, "write_abstract_en": {"bleu": 0.10806636279782142, "bleu_stderr": 0.008181526680029229, "rouge1_fmeasure": 0.032690132376649535, "rouge1_fmeasure_stderr": 0.0014923647623479581, "rouge1_precision": 0.029920984782127447, "rouge1_precision_stderr": 0.0014566610845788402, "rouge1_recall": 0.04785896357976774, "rouge1_recall_stderr": 0.002210898388888131, "rouge2_fmeasure": 0.00641884861944969, "rouge2_fmeasure_stderr": 0.0004672678132495265, "rouge2_precision": 0.005659374256624486, "rouge2_precision_stderr": 0.00043051326798354176, "rouge2_recall": 0.009804204059300893, "rouge2_recall_stderr": 0.000793438963373113, "rougeL_fmeasure": 0.024933006612978887, "rougeL_fmeasure_stderr": 0.0011060087069036857, "rougeL_precision": 0.022749908515385872, "rougeL_precision_stderr": 0.0010830329089833192, "rougeL_recall": 0.0373074119688992, "rougeL_recall_stderr": 0.0017190468611345826, "rougeLsum_fmeasure": 0.030339892293146535, "rougeLsum_fmeasure_stderr": 0.0013855806303329002, "rougeLsum_precision": 0.027846049257508616, "rougeLsum_precision_stderr": 0.0013603362197923356, "rougeLsum_recall": 0.04432125424579462, "rougeLsum_recall_stderr": 0.002040921786045664}}, "5": {"article_summary_en": {"bleu": 1.5436993670367395e-07, "bleu_stderr": 2.3789544446448377e-07, "rouge1_fmeasure": 0.006297564938231645, "rouge1_fmeasure_stderr": 0.0006539758063676466, "rouge1_precision": 0.007186612272605283, "rouge1_precision_stderr": 0.0009791397918263772, "rouge1_recall": 0.009626580155215095, "rouge1_recall_stderr": 0.0009843679259005512, "rouge2_fmeasure": 0.0011309269927620334, "rouge2_fmeasure_stderr": 0.0001883904593636651, "rouge2_precision": 0.0009639306536740955, "rouge2_precision_stderr": 0.00016209042335021826, "rouge2_recall": 0.0018204921865520307, "rouge2_recall_stderr": 0.0003123750046265983, "rougeL_fmeasure": 0.005007392411738194, "rougeL_fmeasure_stderr": 0.0004993418818895954, "rougeL_precision": 0.005946725242200601, "rougeL_precision_stderr": 0.0008795298956796886, "rougeL_recall": 0.007926123528146993, "rougeL_recall_stderr": 0.0008039901837259593, "rougeLsum_fmeasure": 0.005783114895049256, "rougeLsum_fmeasure_stderr": 0.000599975916196901, "rougeLsum_precision": 0.006742477351676616, "rougeLsum_precision_stderr": 0.0009493223215183244, "rougeLsum_recall": 0.008831833968476688, "rougeLsum_recall_stderr": 0.0009029356485515189}, "rephrase_en": {"bleu": 2.3695690005957795e-08, "bleu_stderr": 5.162992091823081e-08, "rouge1_fmeasure": 0.005997866632095373, "rouge1_fmeasure_stderr": 0.0006446489437867567, "rouge1_precision": 0.006416762662669792, "rouge1_precision_stderr": 0.0008067399599762491, "rouge1_recall": 0.008596118967962398, "rouge1_recall_stderr": 0.0009759171996390612, "rouge2_fmeasure": 0.0012168332924537228, "rouge2_fmeasure_stderr": 0.00019190718073544674, "rouge2_precision": 0.0012325600300179128, "rouge2_precision_stderr": 0.00023262166513716688, "rouge2_recall": 0.0019877130664642207, "rouge2_recall_stderr": 0.0003765849024360566, "rougeL_fmeasure": 0.004877643128103321, "rougeL_fmeasure_stderr": 0.0005141597079202806, "rougeL_precision": 0.005180574608491138, "rougeL_precision_stderr": 0.0006638732889323238, "rougeL_recall": 0.00723418284441452, "rougeL_recall_stderr": 0.000838041683119617, "rougeLsum_fmeasure": 0.005540816115674005, "rougeLsum_fmeasure_stderr": 0.0005975088748642658, "rougeLsum_precision": 0.00601329806116974, "rougeLsum_precision_stderr": 0.0007735773924533153, "rougeLsum_recall": 0.007945295399230258, "rougeLsum_recall_stderr": 0.0009128968590461367}, "summarize_above_en": {"bleu": 3.4432997292695364e-10, "bleu_stderr": 5.054319054078497e-10, "rouge1_fmeasure": 0.003773178583642327, "rouge1_fmeasure_stderr": 0.0004233214040303885, "rouge1_precision": 0.003599842790127908, "rouge1_precision_stderr": 0.0004357453635530341, "rouge1_recall": 0.005450956260566027, "rouge1_recall_stderr": 0.0006157647454877155, "rouge2_fmeasure": 0.00026468573039586365, "rouge2_fmeasure_stderr": 6.948326335135335e-05, "rouge2_precision": 0.00024100397622410201, "rouge2_precision_stderr": 6.428724333491122e-05, "rouge2_recall": 0.0003662081649364038, "rouge2_recall_stderr": 0.00010026927803212084, "rougeL_fmeasure": 0.0034927853884506757, "rougeL_fmeasure_stderr": 0.0003810695069644319, "rougeL_precision": 0.003261903403975752, "rougeL_precision_stderr": 0.0003814441870483113, "rougeL_recall": 0.005138441375666529, "rougeL_recall_stderr": 0.0005722130168539618, "rougeLsum_fmeasure": 0.00354234805200471, "rougeLsum_fmeasure_stderr": 0.00039437507462346326, "rougeLsum_precision": 0.003394903398841389, "rougeLsum_precision_stderr": 0.00041292623146590925, "rougeLsum_recall": 0.005123453693921271, "rougeLsum_recall_stderr": 0.000573270656957646}, "tldr_en": {"bleu": 5.133528491740168e-07, "bleu_stderr": 9.288876136024227e-07, "rouge1_fmeasure": 0.00885537771521418, "rouge1_fmeasure_stderr": 0.0008474145806800235, "rouge1_precision": 0.008397509125114434, "rouge1_precision_stderr": 0.0008676037544483993, "rouge1_recall": 0.013182384245950918, "rouge1_recall_stderr": 0.0012488579545023588, "rouge2_fmeasure": 0.0020845828252393957, "rouge2_fmeasure_stderr": 0.0002776449859686965, "rouge2_precision": 0.0018519121687661717, "rouge2_precision_stderr": 0.0002652232008037664, "rouge2_recall": 0.003201546871291623, "rouge2_recall_stderr": 0.00042951400653203656, "rougeL_fmeasure": 0.006795329450745382, "rougeL_fmeasure_stderr": 0.0006430172034977336, "rougeL_precision": 0.006312806712827651, "rougeL_precision_stderr": 0.0006339300731280119, "rougeL_recall": 0.010466170637582555, "rougeL_recall_stderr": 0.0010119772330203227, "rougeLsum_fmeasure": 0.008112137370163754, "rougeLsum_fmeasure_stderr": 0.0007719985459243486, "rougeLsum_precision": 0.0076894646083944945, "rougeLsum_precision_stderr": 0.000786099271409283, "rougeLsum_recall": 0.012227793486074013, "rougeLsum_recall_stderr": 0.0011695312324965957}, "write_abstract_en": {"bleu": 1.464939987980638e-14, "bleu_stderr": 4.151354860776468e-14, "rouge1_fmeasure": 0.003428639083443863, "rouge1_fmeasure_stderr": 0.0005233153160450194, "rouge1_precision": 0.0029392502676391357, "rouge1_precision_stderr": 0.0004853225161067237, "rouge1_recall": 0.005172582525022545, "rouge1_recall_stderr": 0.000765814668280875, "rouge2_fmeasure": 0.0006554801175224404, "rouge2_fmeasure_stderr": 0.00015037462344468343, "rouge2_precision": 0.0005834403624431923, "rouge2_precision_stderr": 0.0001396349568301966, "rouge2_recall": 0.0009316463701771211, "rouge2_recall_stderr": 0.00020955827176984525, "rougeL_fmeasure": 0.002504342041321112, "rougeL_fmeasure_stderr": 0.0003638861281311792, "rougeL_precision": 0.002100593784823454, "rougeL_precision_stderr": 0.0003210510857538418, "rougeL_recall": 0.00393696053174691, "rougeL_recall_stderr": 0.000586165963887078, "rougeLsum_fmeasure": 0.0031942471606000973, "rougeLsum_fmeasure_stderr": 0.000483410224652734, "rougeLsum_precision": 0.0027237658351042765, "rougeLsum_precision_stderr": 0.0004415537544986123, "rougeLsum_recall": 0.004866254903763763, "rougeLsum_recall_stderr": 0.0007242312593154887}}}, "anli_r1": {"0": {"GPT-3 style": {"acc": 0.334, "acc_norm": 0.323, "acc_norm_stderr": 0.014794927843348635, "acc_stderr": 0.014922019523732954, "subset": 1}, "MNLI crowdsource": {"acc": 0.334, "acc_norm": 0.35, "acc_norm_stderr": 0.015090650341444235, "acc_stderr": 0.014922019523732954, "subset": 1}, "can we infer": {"acc": 0.336, "acc_norm": 0.333, "acc_norm_stderr": 0.014910846164229857, "acc_stderr": 0.014944140233795023, "subset": 1}, "guaranteed/possible/impossible": {"acc": 0.323, "acc_norm": 0.333, "acc_norm_stderr": 0.014910846164229859, "acc_stderr": 0.014794927843348632, "subset": 1}, "justified in saying": {"acc": 0.329, "acc_norm": 0.333, "acc_norm_stderr": 0.014910846164229857, "acc_stderr": 0.014865395385928354, "subset": 1}}, "1": {"GPT-3 style": {"acc": 0.334, "acc_norm": 0.341, "acc_norm_stderr": 0.014998131348402702, "acc_stderr": 0.014922019523732961, "subset": 1}, "MNLI crowdsource": {"acc": 0.333, "acc_norm": 0.333, "acc_norm_stderr": 0.014910846164229863, "acc_stderr": 0.014910846164229863, "subset": 1}, "can we infer": {"acc": 0.325, "acc_norm": 0.329, "acc_norm_stderr": 0.01486539538592837, "acc_stderr": 0.014818724459095526, "subset": 1}, "guaranteed/possible/impossible": {"acc": 0.33, "acc_norm": 0.333, "acc_norm_stderr": 0.014910846164229863, "acc_stderr": 0.014876872027456734, "subset": 1}, "justified in saying": {"acc": 0.327, "acc_norm": 0.322, "acc_norm_stderr": 0.014782913600996676, "acc_stderr": 0.01484221315341124, "subset": 1}}, "2": {"GPT-3 style": {"acc": 0.349, "acc_norm": 0.342, "acc_norm_stderr": 0.01500870618212173, "acc_stderr": 0.0150806639915631, "subset": 1}, "MNLI crowdsource": {"acc": 0.361, "acc_norm": 0.36, "acc_norm_stderr": 0.015186527932040117, "acc_stderr": 0.015195720118175108, "subset": 1}, "can we infer": {"acc": 0.352, "acc_norm": 0.332, "acc_norm_stderr": 0.01489959724281149, "acc_stderr": 0.015110404505648663, "subset": 1}, "guaranteed/possible/impossible": {"acc": 0.323, "acc_norm": 0.325, "acc_norm_stderr": 0.014818724459095524, "acc_stderr": 0.014794927843348637, "subset": 1}, "justified in saying": {"acc": 0.345, "acc_norm": 0.33, "acc_norm_stderr": 0.01487687202745673, "acc_stderr": 0.015039986742055237, "subset": 1}}, "3": {"GPT-3 style": {"acc": 0.33, "acc_norm": 0.336, "acc_norm_stderr": 0.014944140233795023, "acc_stderr": 0.014876872027456736, "subset": 1}, "MNLI crowdsource": {"acc": 0.335, "acc_norm": 0.333, "acc_norm_stderr": 0.014910846164229873, "acc_stderr": 0.014933117490932577, "subset": 1}, "can we infer": {"acc": 0.345, "acc_norm": 0.341, "acc_norm_stderr": 0.014998131348402706, "acc_stderr": 0.015039986742055233, "subset": 1}, "guaranteed/possible/impossible": {"acc": 0.32, "acc_norm": 0.312, "acc_norm_stderr": 0.014658474370509003, "acc_stderr": 0.01475865230357487, "subset": 1}, "justified in saying": {"acc": 0.349, "acc_norm": 0.34, "acc_norm_stderr": 0.014987482264363937, "acc_stderr": 0.015080663991563102, "subset": 1}}, "4": {"GPT-3 style": {"acc": 0.318, "acc_norm": 0.308, "acc_norm_stderr": 0.01460648312734276, "acc_stderr": 0.014734079309311901, "subset": 1}, "MNLI crowdsource": {"acc": 0.332, "acc_norm": 0.342, "acc_norm_stderr": 0.015008706182121731, "acc_stderr": 0.0148995972428115, "subset": 1}, "can we infer": {"acc": 0.327, "acc_norm": 0.333, "acc_norm_stderr": 0.014910846164229868, "acc_stderr": 0.014842213153411242, "subset": 1}, "guaranteed/possible/impossible": {"acc": 0.309, "acc_norm": 0.304, "acc_norm_stderr": 0.014553205687950438, "acc_stderr": 0.014619600977206491, "subset": 1}, "justified in saying": {"acc": 0.333, "acc_norm": 0.333, "acc_norm_stderr": 0.014910846164229873, "acc_stderr": 0.01491084616422987, "subset": 1}}, "5": {"GPT-3 style": {"acc": 0.321, "acc_norm": 0.32, "acc_norm_stderr": 0.014758652303574881, "acc_stderr": 0.014770821817934656, "subset": 1}, "MNLI crowdsource": {"acc": 0.343, "acc_norm": 0.332, "acc_norm_stderr": 0.01489959724281149, "acc_stderr": 0.015019206922356951, "subset": 1}, "can we infer": {"acc": 0.315, "acc_norm": 0.322, "acc_norm_stderr": 0.014782913600996678, "acc_stderr": 0.0146966319607925, "subset": 1}, "guaranteed/possible/impossible": {"acc": 0.33, "acc_norm": 0.318, "acc_norm_stderr": 0.0147340793093119, "acc_stderr": 0.014876872027456734, "subset": 1}, "justified in saying": {"acc": 0.333, "acc_norm": 0.325, "acc_norm_stderr": 0.014818724459095524, "acc_stderr": 0.014910846164229868, "subset": 1}}}, "anli_r2": {"0": {"GPT-3 style": {"acc": 0.336, "acc_norm": 0.34, "acc_norm_stderr": 0.014987482264363937, "acc_stderr": 0.01494414023379502, "subset": 2}, "MNLI crowdsource": {"acc": 0.334, "acc_norm": 0.33, "acc_norm_stderr": 0.014876872027456725, "acc_stderr": 0.014922019523732958, "subset": 2}, "can we infer": {"acc": 0.336, "acc_norm": 0.333, "acc_norm_stderr": 0.01491084616422987, "acc_stderr": 0.014944140233795027, "subset": 2}, "guaranteed/possible/impossible": {"acc": 0.325, "acc_norm": 0.333, "acc_norm_stderr": 0.01491084616422985, "acc_stderr": 0.014818724459095526, "subset": 2}, "justified in saying": {"acc": 0.319, "acc_norm": 0.333, "acc_norm_stderr": 0.01491084616422987, "acc_stderr": 0.014746404865473484, "subset": 2}}, "1": {"GPT-3 style": {"acc": 0.305, "acc_norm": 0.308, "acc_norm_stderr": 0.01460648312734276, "acc_stderr": 0.014566646394664378, "subset": 2}, "MNLI crowdsource": {"acc": 0.315, "acc_norm": 0.315, "acc_norm_stderr": 0.014696631960792506, "acc_stderr": 0.014696631960792506, "subset": 2}, "can we infer": {"acc": 0.312, "acc_norm": 0.313, "acc_norm_stderr": 0.014671272822977883, "acc_stderr": 0.014658474370509012, "subset": 2}, "guaranteed/possible/impossible": {"acc": 0.313, "acc_norm": 0.315, "acc_norm_stderr": 0.014696631960792506, "acc_stderr": 0.014671272822977885, "subset": 2}, "justified in saying": {"acc": 0.314, "acc_norm": 0.307, "acc_norm_stderr": 0.014593284892852628, "acc_stderr": 0.014683991951087974, "subset": 2}}, "2": {"GPT-3 style": {"acc": 0.305, "acc_norm": 0.338, "acc_norm_stderr": 0.014965960710224475, "acc_stderr": 0.014566646394664382, "subset": 2}, "MNLI crowdsource": {"acc": 0.336, "acc_norm": 0.327, "acc_norm_stderr": 0.014842213153411237, "acc_stderr": 0.014944140233795021, "subset": 2}, "can we infer": {"acc": 0.332, "acc_norm": 0.321, "acc_norm_stderr": 0.014770821817934652, "acc_stderr": 0.014899597242811495, "subset": 2}, "guaranteed/possible/impossible": {"acc": 0.328, "acc_norm": 0.326, "acc_norm_stderr": 0.014830507204541035, "acc_stderr": 0.014853842487270333, "subset": 2}, "justified in saying": {"acc": 0.335, "acc_norm": 0.329, "acc_norm_stderr": 0.014865395385928378, "acc_stderr": 0.01493311749093258, "subset": 2}}, "3": {"GPT-3 style": {"acc": 0.317, "acc_norm": 0.325, "acc_norm_stderr": 0.014818724459095526, "acc_stderr": 0.014721675438880215, "subset": 2}, "MNLI crowdsource": {"acc": 0.311, "acc_norm": 0.314, "acc_norm_stderr": 0.01468399195108795, "acc_stderr": 0.014645596385722695, "subset": 2}, "can we infer": {"acc": 0.333, "acc_norm": 0.337, "acc_norm_stderr": 0.014955087918653603, "acc_stderr": 0.014910846164229863, "subset": 2}, "guaranteed/possible/impossible": {"acc": 0.335, "acc_norm": 0.33, "acc_norm_stderr": 0.014876872027456732, "acc_stderr": 0.014933117490932573, "subset": 2}, "justified in saying": {"acc": 0.339, "acc_norm": 0.33, "acc_norm_stderr": 0.014876872027456734, "acc_stderr": 0.014976758771620345, "subset": 2}}, "4": {"GPT-3 style": {"acc": 0.313, "acc_norm": 0.311, "acc_norm_stderr": 0.014645596385722692, "acc_stderr": 0.014671272822977886, "subset": 2}, "MNLI crowdsource": {"acc": 0.323, "acc_norm": 0.33, "acc_norm_stderr": 0.014876872027456736, "acc_stderr": 0.014794927843348635, "subset": 2}, "can we infer": {"acc": 0.317, "acc_norm": 0.334, "acc_norm_stderr": 0.014922019523732963, "acc_stderr": 0.014721675438880219, "subset": 2}, "guaranteed/possible/impossible": {"acc": 0.34, "acc_norm": 0.337, "acc_norm_stderr": 0.014955087918653602, "acc_stderr": 0.014987482264363935, "subset": 2}, "justified in saying": {"acc": 0.319, "acc_norm": 0.331, "acc_norm_stderr": 0.014888272588203933, "acc_stderr": 0.014746404865473484, "subset": 2}}, "5": {"GPT-3 style": {"acc": 0.324, "acc_norm": 0.312, "acc_norm_stderr": 0.014658474370508998, "acc_stderr": 0.014806864733738863, "subset": 2}, "MNLI crowdsource": {"acc": 0.338, "acc_norm": 0.337, "acc_norm_stderr": 0.014955087918653603, "acc_stderr": 0.01496596071022448, "subset": 2}, "can we infer": {"acc": 0.327, "acc_norm": 0.333, "acc_norm_stderr": 0.01491084616422987, "acc_stderr": 0.014842213153411242, "subset": 2}, "guaranteed/possible/impossible": {"acc": 0.337, "acc_norm": 0.332, "acc_norm_stderr": 0.014899597242811478, "acc_stderr": 0.014955087918653602, "subset": 2}, "justified in saying": {"acc": 0.315, "acc_norm": 0.333, "acc_norm_stderr": 0.014910846164229868, "acc_stderr": 0.0146966319607925, "subset": 2}}}, "anli_r3": {"0": {"GPT-3 style": {"acc": 0.3383333333333333, "acc_norm": 0.34, "acc_norm_stderr": 0.01368049572576779, "acc_stderr": 0.013664144006618268, "subset": 3}, "MNLI crowdsource": {"acc": 0.33666666666666667, "acc_norm": 0.32666666666666666, "acc_norm_stderr": 0.013544340907003663, "acc_stderr": 0.013647602942406389, "subset": 3}, "can we infer": {"acc": 0.33916666666666667, "acc_norm": 0.335, "acc_norm_stderr": 0.013630871843821469, "acc_stderr": 0.013672343491681815, "subset": 3}, "guaranteed/possible/impossible": {"acc": 0.2991666666666667, "acc_norm": 0.33, "acc_norm_stderr": 0.013579531277800918, "acc_stderr": 0.013223742523347383, "subset": 3}, "justified in saying": {"acc": 0.3433333333333333, "acc_norm": 0.33416666666666667, "acc_norm_stderr": 0.013622434813136774, "acc_stderr": 0.013712633830465858, "subset": 3}}, "1": {"GPT-3 style": {"acc": 0.3325, "acc_norm": 0.3283333333333333, "acc_norm_stderr": 0.013562032919529019, "acc_stderr": 0.013605417345710526, "subset": 3}, "MNLI crowdsource": {"acc": 0.3358333333333333, "acc_norm": 0.3358333333333333, "acc_norm_stderr": 0.013639261190932889, "acc_stderr": 0.013639261190932889, "subset": 3}, "can we infer": {"acc": 0.3408333333333333, "acc_norm": 0.3358333333333333, "acc_norm_stderr": 0.013639261190932886, "acc_stderr": 0.013688600793296934, "subset": 3}, "guaranteed/possible/impossible": {"acc": 0.33666666666666667, "acc_norm": 0.3375, "acc_norm_stderr": 0.013655897185463653, "acc_stderr": 0.01364760294240639, "subset": 3}, "justified in saying": {"acc": 0.33916666666666667, "acc_norm": 0.3283333333333333, "acc_norm_stderr": 0.013562032919529019, "acc_stderr": 0.01367234349168182, "subset": 3}}, "2": {"GPT-3 style": {"acc": 0.32416666666666666, "acc_norm": 0.32416666666666666, "acc_norm_stderr": 0.013517438120881643, "acc_stderr": 0.013517438120881629, "subset": 3}, "MNLI crowdsource": {"acc": 0.32, "acc_norm": 0.315, "acc_norm_stderr": 0.013415009084004862, "acc_stderr": 0.013471620929769142, "subset": 3}, "can we infer": {"acc": 0.31166666666666665, "acc_norm": 0.30083333333333334, "acc_norm_stderr": 0.013244749345624925, "acc_stderr": 0.013376268790982108, "subset": 3}, "guaranteed/possible/impossible": {"acc": 0.305, "acc_norm": 0.3016666666666667, "acc_norm_stderr": 0.013255174729956493, "acc_stderr": 0.013296358936471108, "subset": 3}, "justified in saying": {"acc": 0.30416666666666664, "acc_norm": 0.3225, "acc_norm_stderr": 0.013499258621103244, "acc_stderr": 0.013286140243317446, "subset": 3}}, "3": {"GPT-3 style": {"acc": 0.3408333333333333, "acc_norm": 0.3458333333333333, "acc_norm_stderr": 0.013736245342311014, "acc_stderr": 0.013688600793296936, "subset": 3}, "MNLI crowdsource": {"acc": 0.35, "acc_norm": 0.3525, "acc_norm_stderr": 0.013797164918918357, "acc_stderr": 0.013774667009018552, "subset": 3}, "can we infer": {"acc": 0.3333333333333333, "acc_norm": 0.3375, "acc_norm_stderr": 0.013655897185463658, "acc_stderr": 0.013613950010225603, "subset": 3}, "guaranteed/possible/impossible": {"acc": 0.31916666666666665, "acc_norm": 0.3175, "acc_norm_stderr": 0.013443538681348054, "acc_stderr": 0.01346230971200513, "subset": 3}, "justified in saying": {"acc": 0.3441666666666667, "acc_norm": 0.3383333333333333, "acc_norm_stderr": 0.013664144006618268, "acc_stderr": 0.013720551062295756, "subset": 3}}, "4": {"GPT-3 style": {"acc": 0.33166666666666667, "acc_norm": 0.3383333333333333, "acc_norm_stderr": 0.013664144006618268, "acc_stderr": 0.013596836729485168, "subset": 3}, "MNLI crowdsource": {"acc": 0.3275, "acc_norm": 0.33666666666666667, "acc_norm_stderr": 0.01364760294240639, "acc_stderr": 0.013553211167251944, "subset": 3}, "can we infer": {"acc": 0.3383333333333333, "acc_norm": 0.3333333333333333, "acc_norm_stderr": 0.013613950010225603, "acc_stderr": 0.013664144006618266, "subset": 3}, "guaranteed/possible/impossible": {"acc": 0.3375, "acc_norm": 0.3283333333333333, "acc_norm_stderr": 0.01356203291952902, "acc_stderr": 0.01365589718546366, "subset": 3}, "justified in saying": {"acc": 0.3358333333333333, "acc_norm": 0.3283333333333333, "acc_norm_stderr": 0.013562032919529019, "acc_stderr": 0.01363926119093288, "subset": 3}}, "5": {"GPT-3 style": {"acc": 0.32166666666666666, "acc_norm": 0.31833333333333336, "acc_norm_stderr": 0.013452948996996296, "acc_stderr": 0.013490095282989521, "subset": 3}, "MNLI crowdsource": {"acc": 0.32, "acc_norm": 0.32416666666666666, "acc_norm_stderr": 0.013517438120881622, "acc_stderr": 0.01347162092976913, "subset": 3}, "can we infer": {"acc": 0.33666666666666667, "acc_norm": 0.3375, "acc_norm_stderr": 0.013655897185463662, "acc_stderr": 0.013647602942406398, "subset": 3}, "guaranteed/possible/impossible": {"acc": 0.32666666666666666, "acc_norm": 0.31083333333333335, "acc_norm_stderr": 0.013366457845965433, "acc_stderr": 0.013544340907003663, "subset": 3}, "justified in saying": {"acc": 0.32416666666666666, "acc_norm": 0.335, "acc_norm_stderr": 0.013630871843821479, "acc_stderr": 0.01351743812088163, "subset": 3}}}, "arc_easy": {"0": {"heres_a_problem": {"acc": 0.23890784982935154, "acc_norm": 0.23890784982935154, "acc_norm_stderr": 0.012461071376316628, "acc_stderr": 0.012461071376316628}, "i_am_hesitating": {"acc": 0.3042929292929293, "acc_norm": 0.2840909090909091, "acc_norm_stderr": 0.009253921261885763, "acc_stderr": 0.009441202922359185}, "multiple_choice": {"acc": 0.25715488215488214, "acc_norm": 0.26346801346801346, "acc_norm_stderr": 0.009039157374497706, "acc_stderr": 0.00896839476897199}, "pick_the_most_correct_option": {"acc": 0.22866894197952217, "acc_norm": 0.22866894197952217, "acc_norm_stderr": 0.012272853582540807, "acc_stderr": 0.012272853582540807}, "qa_options": {"acc": 0.2525597269624573, "acc_norm": 0.29180887372013653, "acc_norm_stderr": 0.013284525292403501, "acc_stderr": 0.012696728980207702}}, "1": {"heres_a_problem": {"acc": 0.2398989898989899, "acc_norm": 0.2398989898989899, "acc_norm_stderr": 0.008762298774190583, "acc_stderr": 0.008762298774190583}, "i_am_hesitating": {"acc": 0.2627986348122867, "acc_norm": 0.2935153583617747, "acc_norm_stderr": 0.013307250444941129, "acc_stderr": 0.012862523175351335}, "multiple_choice": {"acc": 0.2836700336700337, "acc_norm": 0.2878787878787879, "acc_norm_stderr": 0.00929073316167016, "acc_stderr": 0.00924978169114074}, "pick_the_most_correct_option": {"acc": 0.23122866894197952, "acc_norm": 0.23122866894197952, "acc_norm_stderr": 0.01232085883477228, "acc_stderr": 0.01232085883477228}, "qa_options": {"acc": 0.25426621160409557, "acc_norm": 0.29266211604095566, "acc_norm_stderr": 0.013295916103619413, "acc_stderr": 0.012724999945157729}}, "2": {"heres_a_problem": {"acc": 0.24494949494949494, "acc_norm": 0.24494949494949494, "acc_norm_stderr": 0.008824588611219073, "acc_stderr": 0.008824588611219073}, "i_am_hesitating": {"acc": 0.2946127946127946, "acc_norm": 0.2849326599326599, "acc_norm_stderr": 0.00926217069559066, "acc_stderr": 0.009354224395837097}, "multiple_choice": {"acc": 0.23293515358361774, "acc_norm": 0.2645051194539249, "acc_norm_stderr": 0.012889272949313364, "acc_stderr": 0.012352507042617405}, "pick_the_most_correct_option": {"acc": 0.2354948805460751, "acc_norm": 0.2354948805460751, "acc_norm_stderr": 0.012399451855004753, "acc_stderr": 0.012399451855004753}, "qa_options": {"acc": 0.31523569023569026, "acc_norm": 0.2946127946127946, "acc_norm_stderr": 0.009354224395837102, "acc_stderr": 0.009533589368505863}}, "3": {"heres_a_problem": {"acc": 0.25336700336700335, "acc_norm": 0.25336700336700335, "acc_norm_stderr": 0.008924765424529257, "acc_stderr": 0.008924765424529257}, "i_am_hesitating": {"acc": 0.26791808873720135, "acc_norm": 0.29266211604095566, "acc_norm_stderr": 0.01329591610361941, "acc_stderr": 0.01294203019513642}, "multiple_choice": {"acc": 0.2431740614334471, "acc_norm": 0.2721843003412969, "acc_norm_stderr": 0.013006600406423707, "acc_stderr": 0.012536554144587089}, "pick_the_most_correct_option": {"acc": 0.24061433447098976, "acc_norm": 0.24061433447098976, "acc_norm_stderr": 0.012491468532390578, "acc_stderr": 0.012491468532390578}, "qa_options": {"acc": 0.31734006734006737, "acc_norm": 0.29208754208754206, "acc_norm_stderr": 0.009330705616569073, "acc_stderr": 0.009550648343947771}}, "4": {"heres_a_problem": {"acc": 0.2380546075085324, "acc_norm": 0.2380546075085324, "acc_norm_stderr": 0.012445770028026201, "acc_stderr": 0.012445770028026201}, "i_am_hesitating": {"acc": 0.29713804713804715, "acc_norm": 0.29335016835016836, "acc_norm_stderr": 0.009342508331708561, "acc_stderr": 0.009377397867796849}, "multiple_choice": {"acc": 0.2908249158249158, "acc_norm": 0.2908249158249158, "acc_norm_stderr": 0.009318815921176662, "acc_stderr": 0.009318815921176657}, "pick_the_most_correct_option": {"acc": 0.2361111111111111, "acc_norm": 0.2361111111111111, "acc_norm_stderr": 0.00871448049171129, "acc_stderr": 0.00871448049171129}, "qa_options": {"acc": 0.26791808873720135, "acc_norm": 0.29692832764505117, "acc_norm_stderr": 0.013352025976725225, "acc_stderr": 0.012942030195136432}}, "5": {"heres_a_problem": {"acc": 0.2226962457337884, "acc_norm": 0.2226962457337884, "acc_norm_stderr": 0.012158314774829919, "acc_stderr": 0.012158314774829919}, "i_am_hesitating": {"acc": 0.30303030303030304, "acc_norm": 0.28703703703703703, "acc_norm_stderr": 0.009282621598983076, "acc_stderr": 0.009430140669278955}, "multiple_choice": {"acc": 0.2967171717171717, "acc_norm": 0.2925084175084175, "acc_norm_stderr": 0.009334649503078416, "acc_stderr": 0.009373559492986846}, "pick_the_most_correct_option": {"acc": 0.24957912457912457, "acc_norm": 0.24957912457912457, "acc_norm_stderr": 0.008880241465504344, "acc_stderr": 0.008880241465504344}, "qa_options": {"acc": 0.2619453924914676, "acc_norm": 0.2790102389078498, "acc_norm_stderr": 0.013106784883601352, "acc_stderr": 0.012849054826858114}}}, "boolq": {"0": {"GPT-3 Style": {"acc": 0.6163333333333333, "acc_norm": 0.624, "acc_norm_stderr": 0.008845002997512754, "acc_stderr": 0.008879665985151403}, "after_reading": {"acc": 0.622, "acc_norm": 0.44, "acc_norm_stderr": 0.009064255084676055, "acc_stderr": 0.008854272003440052}, "exercise": {"acc": 0.6236666666666667, "acc_norm": 0.6183333333333333, "acc_norm_stderr": 0.008870849530787626, "acc_stderr": 0.008846558976258922}, "valid_binary": {"acc": 0.565, "acc_norm": 0.4096666666666667, "acc_norm_stderr": 0.008979987547601863, "acc_stderr": 0.009052751926300883}, "yes_no_question": {"acc": 0.5426666666666666, "acc_norm": 0.6236666666666667, "acc_norm_stderr": 0.008846558976258922, "acc_stderr": 0.009096928229880426}}, "1": {"GPT-3 Style": {"acc": 0.596, "acc_norm": 0.633, "acc_norm_stderr": 0.008801296548822387, "acc_stderr": 0.0089603624944537}, "after_reading": {"acc": 0.546, "acc_norm": 0.543, "acc_norm_stderr": 0.00909640486825282, "acc_stderr": 0.009091509877386519}, "exercise": {"acc": 0.5566666666666666, "acc_norm": 0.5473333333333333, "acc_norm_stderr": 0.009089227499483241, "acc_stderr": 0.009071405243621038}, "valid_binary": {"acc": 0.5693333333333334, "acc_norm": 0.5513333333333333, "acc_norm_stderr": 0.009081985306932099, "acc_stderr": 0.00904202497793108}, "yes_no_question": {"acc": 0.5436666666666666, "acc_norm": 0.554, "acc_norm_stderr": 0.009076827433934427, "acc_stderr": 0.009095345834327868}}, "2": {"GPT-3 Style": {"acc": 0.5923333333333334, "acc_norm": 0.617, "acc_norm_stderr": 0.008876744835033232, "acc_stderr": 0.008973202213879655}, "after_reading": {"acc": 0.5926666666666667, "acc_norm": 0.5723333333333334, "acc_norm_stderr": 0.009034185176145654, "acc_stderr": 0.008972056373066369}, "exercise": {"acc": 0.576, "acc_norm": 0.539, "acc_norm_stderr": 0.009102414587191052, "acc_stderr": 0.00902414234419792}, "valid_binary": {"acc": 0.5973333333333334, "acc_norm": 0.576, "acc_norm_stderr": 0.009024142344197916, "acc_stderr": 0.008955564831687456}, "yes_no_question": {"acc": 0.562, "acc_norm": 0.573, "acc_norm_stderr": 0.009032396953831092, "acc_stderr": 0.009059765989615446}}, "3": {"GPT-3 Style": {"acc": 0.6083333333333333, "acc_norm": 0.6213333333333333, "acc_norm_stderr": 0.008857326053368308, "acc_stderr": 0.008913348354532972}, "after_reading": {"acc": 0.58, "acc_norm": 0.5613333333333334, "acc_norm_stderr": 0.009061278956794627, "acc_stderr": 0.009012606487132148}, "exercise": {"acc": 0.5796666666666667, "acc_norm": 0.5516666666666666, "acc_norm_stderr": 0.00908135501204554, "acc_stderr": 0.00901359097963683}, "valid_binary": {"acc": 0.5966666666666667, "acc_norm": 0.5606666666666666, "acc_norm_stderr": 0.009062775319073721, "acc_stderr": 0.008957972256087366}, "yes_no_question": {"acc": 0.5646666666666667, "acc_norm": 0.5933333333333334, "acc_norm_stderr": 0.008969751860881003, "acc_stderr": 0.009053547904033165}}, "4": {"GPT-3 Style": {"acc": 0.6136666666666667, "acc_norm": 0.6226666666666667, "acc_norm_stderr": 0.00885120015653439, "acc_stderr": 0.008891174310695492}, "after_reading": {"acc": 0.5633333333333334, "acc_norm": 0.5403333333333333, "acc_norm_stderr": 0.00910047692710895, "acc_stderr": 0.009056690207178123}, "exercise": {"acc": 0.593, "acc_norm": 0.5486666666666666, "acc_norm_stderr": 0.009086879312708494, "acc_stderr": 0.008970906255948515}, "valid_binary": {"acc": 0.5913333333333334, "acc_norm": 0.556, "acc_norm_stderr": 0.009072785596468859, "acc_stderr": 0.008976614094836195}, "yes_no_question": {"acc": 0.5516666666666666, "acc_norm": 0.5833333333333334, "acc_norm_stderr": 0.009002529294393654, "acc_stderr": 0.009081355012045529}}, "5": {"GPT-3 Style": {"acc": 0.609, "acc_norm": 0.6203333333333333, "acc_norm_stderr": 0.008861873799148993, "acc_stderr": 0.00891063782727302}, "after_reading": {"acc": 0.5546666666666666, "acc_norm": 0.5166666666666667, "acc_norm_stderr": 0.009125157363376123, "acc_stderr": 0.009075496684215473}, "exercise": {"acc": 0.5896666666666667, "acc_norm": 0.5536666666666666, "acc_norm_stderr": 0.009077486613450288, "acc_stderr": 0.008982215188519146}, "valid_binary": {"acc": 0.583, "acc_norm": 0.545, "acc_norm_stderr": 0.009093178503605503, "acc_stderr": 0.009003556038613138}, "yes_no_question": {"acc": 0.5483333333333333, "acc_norm": 0.576, "acc_norm_stderr": 0.009024142344197917, "acc_stderr": 0.009087472531749432}}}, "cb": {"0": {"GPT-3 style": {"acc": 0.4107142857142857, "acc_stderr": 0.06633634150359538, "f1": 0.1940928270042194}, "MNLI crowdsource": {"acc": 0.4107142857142857, "acc_stderr": 0.06633634150359538, "f1": 0.1940928270042194}, "can we infer": {"acc": 0.2857142857142857, "acc_stderr": 0.06091449038731726, "f1": 0.24789746965043147}, "guaranteed/possible/impossible": {"acc": 0.42857142857142855, "acc_stderr": 0.06672848092813058, "f1": 0.20779220779220778}, "justified in saying": {"acc": 0.19642857142857142, "acc_stderr": 0.05357142857142859, "f1": 0.14814814814814814}}, "1": {"GPT-3 style": {"acc": 0.39285714285714285, "acc_stderr": 0.0658538889806635, "f1": 0.2824214792299899}, "MNLI crowdsource": {"acc": 0.39285714285714285, "acc_stderr": 0.0658538889806635, "f1": 0.2842025699168556}, "can we infer": {"acc": 0.39285714285714285, "acc_stderr": 0.0658538889806635, "f1": 0.2794380587484036}, "guaranteed/possible/impossible": {"acc": 0.39285714285714285, "acc_stderr": 0.0658538889806635, "f1": 0.2842025699168556}, "justified in saying": {"acc": 0.44642857142857145, "acc_stderr": 0.06703189227942398, "f1": 0.3172825681224338}}, "2": {"GPT-3 style": {"acc": 0.42857142857142855, "acc_stderr": 0.06672848092813058, "f1": 0.28708133971291866}, "MNLI crowdsource": {"acc": 0.44642857142857145, "acc_stderr": 0.06703189227942398, "f1": 0.30977982590885816}, "can we infer": {"acc": 0.42857142857142855, "acc_stderr": 0.06672848092813058, "f1": 0.3141821946169772}, "guaranteed/possible/impossible": {"acc": 0.44642857142857145, "acc_stderr": 0.06703189227942398, "f1": 0.32608695652173914}, "justified in saying": {"acc": 0.42857142857142855, "acc_stderr": 0.06672848092813058, "f1": 0.310790273556231}}, "3": {"GPT-3 style": {"acc": 0.39285714285714285, "acc_stderr": 0.0658538889806635, "f1": 0.2485426603073662}, "MNLI crowdsource": {"acc": 0.3392857142857143, "acc_stderr": 0.06384226561930825, "f1": 0.20370370370370372}, "can we infer": {"acc": 0.44642857142857145, "acc_stderr": 0.067031892279424, "f1": 0.32702915681639083}, "guaranteed/possible/impossible": {"acc": 0.375, "acc_stderr": 0.06527912098338669, "f1": 0.27226982184142523}, "justified in saying": {"acc": 0.375, "acc_stderr": 0.06527912098338669, "f1": 0.27441920164292133}}, "4": {"GPT-3 style": {"acc": 0.4107142857142857, "acc_stderr": 0.06633634150359538, "f1": 0.25564695129912524}, "MNLI crowdsource": {"acc": 0.39285714285714285, "acc_stderr": 0.0658538889806635, "f1": 0.22305244223052442}, "can we infer": {"acc": 0.42857142857142855, "acc_stderr": 0.06672848092813058, "f1": 0.298989898989899}, "guaranteed/possible/impossible": {"acc": 0.5357142857142857, "acc_stderr": 0.06724777654937658, "f1": 0.37449908925318764}, "justified in saying": {"acc": 0.44642857142857145, "acc_stderr": 0.067031892279424, "f1": 0.3224993701184178}}, "5": {"GPT-3 style": {"acc": 0.48214285714285715, "acc_stderr": 0.0673769750864465, "f1": 0.325}, "MNLI crowdsource": {"acc": 0.4107142857142857, "acc_stderr": 0.06633634150359538, "f1": 0.2295932295932296}, "can we infer": {"acc": 0.375, "acc_stderr": 0.06527912098338669, "f1": 0.26798881261123825}, "guaranteed/possible/impossible": {"acc": 0.375, "acc_stderr": 0.06527912098338669, "f1": 0.270516717325228}, "justified in saying": {"acc": 0.39285714285714285, "acc_stderr": 0.06585388898066351, "f1": 0.28703703703703703}}}, "copa": {"0": {"best_option": {"acc": 0.53, "acc_norm": 0.5, "acc_norm_stderr": 0.050251890762960605, "acc_stderr": 0.05016135580465919}, "cause_effect": {"acc": 0.52, "acc_norm": 0.52, "acc_norm_stderr": 0.050211673156867795, "acc_stderr": 0.050211673156867795}, "choose": {"acc": 0.49, "acc_norm": 0.48, "acc_norm_stderr": 0.05021167315686779, "acc_stderr": 0.05024183937956912}, "i_am_hesitating": {"acc": 0.54, "acc_norm": 0.45, "acc_norm_stderr": 0.04999999999999999, "acc_stderr": 0.05009082659620333}, "plausible_alternatives": {"acc": 0.53, "acc_norm": 0.5, "acc_norm_stderr": 0.050251890762960605, "acc_stderr": 0.050161355804659205}}, "1": {"best_option": {"acc": 0.59, "acc_norm": 0.56, "acc_norm_stderr": 0.04988876515698589, "acc_stderr": 0.04943110704237102}, "cause_effect": {"acc": 0.46, "acc_norm": 0.43, "acc_norm_stderr": 0.04975698519562428, "acc_stderr": 0.05009082659620332}, "choose": {"acc": 0.45, "acc_norm": 0.45, "acc_norm_stderr": 0.04999999999999999, "acc_stderr": 0.04999999999999999}, "i_am_hesitating": {"acc": 0.45, "acc_norm": 0.45, "acc_norm_stderr": 0.04999999999999999, "acc_stderr": 0.049999999999999996}, "plausible_alternatives": {"acc": 0.46, "acc_norm": 0.49, "acc_norm_stderr": 0.05024183937956912, "acc_stderr": 0.05009082659620332}}, "2": {"best_option": {"acc": 0.51, "acc_norm": 0.51, "acc_norm_stderr": 0.050241839379569095, "acc_stderr": 0.05024183937956911}, "cause_effect": {"acc": 0.45, "acc_norm": 0.47, "acc_norm_stderr": 0.05016135580465919, "acc_stderr": 0.049999999999999996}, "choose": {"acc": 0.45, "acc_norm": 0.44, "acc_norm_stderr": 0.04988876515698589, "acc_stderr": 0.04999999999999999}, "i_am_hesitating": {"acc": 0.49, "acc_norm": 0.48, "acc_norm_stderr": 0.050211673156867795, "acc_stderr": 0.05024183937956912}, "plausible_alternatives": {"acc": 0.46, "acc_norm": 0.42, "acc_norm_stderr": 0.049604496374885836, "acc_stderr": 0.05009082659620332}}, "3": {"best_option": {"acc": 0.55, "acc_norm": 0.53, "acc_norm_stderr": 0.050161355804659205, "acc_stderr": 0.05}, "cause_effect": {"acc": 0.47, "acc_norm": 0.47, "acc_norm_stderr": 0.05016135580465919, "acc_stderr": 0.05016135580465919}, "choose": {"acc": 0.49, "acc_norm": 0.46, "acc_norm_stderr": 0.05009082659620332, "acc_stderr": 0.05024183937956912}, "i_am_hesitating": {"acc": 0.48, "acc_norm": 0.48, "acc_norm_stderr": 0.050211673156867795, "acc_stderr": 0.050211673156867795}, "plausible_alternatives": {"acc": 0.49, "acc_norm": 0.46, "acc_norm_stderr": 0.05009082659620332, "acc_stderr": 0.05024183937956913}}, "4": {"best_option": {"acc": 0.49, "acc_norm": 0.5, "acc_norm_stderr": 0.050251890762960605, "acc_stderr": 0.05024183937956912}, "cause_effect": {"acc": 0.48, "acc_norm": 0.46, "acc_norm_stderr": 0.05009082659620332, "acc_stderr": 0.05021167315686779}, "choose": {"acc": 0.51, "acc_norm": 0.49, "acc_norm_stderr": 0.05024183937956913, "acc_stderr": 0.05024183937956911}, "i_am_hesitating": {"acc": 0.51, "acc_norm": 0.5, "acc_norm_stderr": 0.050251890762960605, "acc_stderr": 0.05024183937956911}, "plausible_alternatives": {"acc": 0.48, "acc_norm": 0.46, "acc_norm_stderr": 0.05009082659620332, "acc_stderr": 0.050211673156867795}}, "5": {"best_option": {"acc": 0.54, "acc_norm": 0.53, "acc_norm_stderr": 0.050161355804659205, "acc_stderr": 0.05009082659620333}, "cause_effect": {"acc": 0.51, "acc_norm": 0.51, "acc_norm_stderr": 0.050241839379569095, "acc_stderr": 0.05024183937956911}, "choose": {"acc": 0.46, "acc_norm": 0.47, "acc_norm_stderr": 0.05016135580465919, "acc_stderr": 0.05009082659620332}, "i_am_hesitating": {"acc": 0.51, "acc_norm": 0.5, "acc_norm_stderr": 0.050251890762960605, "acc_stderr": 0.05024183937956911}, "plausible_alternatives": {"acc": 0.5, "acc_norm": 0.51, "acc_norm_stderr": 0.05024183937956911, "acc_stderr": 0.050251890762960605}}}, "e2e_nlg_cleaned": {"0": {"coherent_text": {"bleu": 3.6185138767655642, "bleu_stderr": 0.027545555030891664, "rouge1_fmeasure": 0.37659938602738596, "rouge1_fmeasure_stderr": 0.0025625344135822327, "rouge1_precision": 0.3580184040254419, "rouge1_precision_stderr": 0.002769550276517225, "rouge1_recall": 0.44573433671378293, "rouge1_recall_stderr": 0.0029443445157491567, "rouge2_fmeasure": 0.15645061177192066, "rouge2_fmeasure_stderr": 0.0014050173197206021, "rouge2_precision": 0.14943282342597658, "rouge2_precision_stderr": 0.001681547997613015, "rouge2_recall": 0.18664925231515925, "rouge2_recall_stderr": 0.0017720627212135355, "rougeL_fmeasure": 0.2727904799998107, "rougeL_fmeasure_stderr": 0.001753190259230686, "rougeL_precision": 0.2605556278541228, "rougeL_precision_stderr": 0.0020429532483913155, "rougeL_recall": 0.32788024129204274, "rougeL_recall_stderr": 0.002331773243025902, "rougeLsum_fmeasure": 0.30749802061359216, "rougeLsum_fmeasure_stderr": 0.0020994896088046516, "rougeLsum_precision": 0.2944910613704785, "rougeLsum_precision_stderr": 0.0024027482769953855, "rougeLsum_recall": 0.36501687453094783, "rougeLsum_recall_stderr": 0.002476795135078731}, "create_text_for_me": {"bleu": 3.112918446128787, "bleu_stderr": 0.053810399237995354, "rouge1_fmeasure": 0.20084482886106741, "rouge1_fmeasure_stderr": 0.002261750612159959, "rouge1_precision": 0.15456658592030545, "rouge1_precision_stderr": 0.0018143739370322604, "rouge1_recall": 0.3025102581811532, "rouge1_recall_stderr": 0.0033588873082462944, "rouge2_fmeasure": 0.06347842363431547, "rouge2_fmeasure_stderr": 0.0014480298422468413, "rouge2_precision": 0.04828817820081409, "rouge2_precision_stderr": 0.0011102317342711607, "rouge2_recall": 0.09771928488920142, "rouge2_recall_stderr": 0.0022721293090265083, "rougeL_fmeasure": 0.16303102771266634, "rougeL_fmeasure_stderr": 0.0017676167469174691, "rougeL_precision": 0.12503267788087088, "rougeL_precision_stderr": 0.001400501149320938, "rougeL_recall": 0.24757730148054255, "rougeL_recall_stderr": 0.00273114149938929, "rougeLsum_fmeasure": 0.1791143226266472, "rougeLsum_fmeasure_stderr": 0.0020064066499178943, "rougeLsum_precision": 0.13774800739240092, "rougeLsum_precision_stderr": 0.0016042003429823542, "rougeLsum_recall": 0.2701867863891116, "rougeLsum_recall_stderr": 0.00300144186310966}, "generate_gramatically_correct_text": {"bleu": 1.728040337370279e-139, "bleu_stderr": 5.383410053108398e-114, "rouge1_fmeasure": 0.0015568503799152772, "rouge1_fmeasure_stderr": 0.00024935374592081596, "rouge1_precision": 0.01413888888888889, "rouge1_precision_stderr": 0.0020691161490258166, "rouge1_recall": 0.0008690455993586293, "rouge1_recall_stderr": 0.0001433594447135255, "rouge2_fmeasure": 0.00012067093428409366, "rouge2_fmeasure_stderr": 6.188322773568673e-05, "rouge2_precision": 0.0006693121693121694, "rouge2_precision_stderr": 0.0003915966570899454, "rouge2_recall": 6.965488215488215e-05, "rouge2_recall_stderr": 3.5859820215685834e-05, "rougeL_fmeasure": 0.00136520023174339, "rougeL_fmeasure_stderr": 0.00021513459627798125, "rougeL_precision": 0.013305555555555555, "rougeL_precision_stderr": 0.0019973445104400247, "rougeL_recall": 0.0007596001007290696, "rougeL_recall_stderr": 0.00012365940229831348, "rougeLsum_fmeasure": 0.001454791799595827, "rougeLsum_fmeasure_stderr": 0.00022889798881739265, "rougeLsum_precision": 0.01375, "rougeLsum_precision_stderr": 0.0020331190505576643, "rougeLsum_recall": 0.0008096001007290697, "rougeLsum_recall_stderr": 0.00013118909454719753}, "generate_text_restaurant": {"bleu": 0.02601889547824242, "bleu_stderr": 0.008845990174481217, "rouge1_fmeasure": 0.016342850382717815, "rouge1_fmeasure_stderr": 0.00034912236129064063, "rouge1_precision": 0.013188289488289925, "rouge1_precision_stderr": 0.0003020110162685144, "rouge1_recall": 0.02331912229418711, "rouge1_recall_stderr": 0.0005487407492538043, "rouge2_fmeasure": 0.00024104025657346095, "rouge2_fmeasure_stderr": 5.3010616942847675e-05, "rouge2_precision": 0.00018882696164487857, "rouge2_precision_stderr": 4.0143613745290394e-05, "rouge2_recall": 0.0003925072247489244, "rouge2_recall_stderr": 9.925690772584578e-05, "rougeL_fmeasure": 0.016342850382717815, "rougeL_fmeasure_stderr": 0.00034912236129064063, "rougeL_precision": 0.013188289488289925, "rougeL_precision_stderr": 0.0003020110162685144, "rougeL_recall": 0.02331912229418711, "rougeL_recall_stderr": 0.0005487407492538043, "rougeLsum_fmeasure": 0.015820908294670494, "rougeLsum_fmeasure_stderr": 0.00033227991604236137, "rougeLsum_precision": 0.012772945572946016, "rougeLsum_precision_stderr": 0.000289959796992291, "rougeLsum_recall": 0.022577540598345845, "rougeLsum_recall_stderr": 0.0005237847970514886}, "text": {"bleu": 3.026054687943056, "bleu_stderr": 0.05078028430795098, "rouge1_fmeasure": 0.27711271390608916, "rouge1_fmeasure_stderr": 0.0024014333114825595, "rouge1_precision": 0.22893008098167747, "rouge1_precision_stderr": 0.0023230957141361385, "rouge1_recall": 0.3777465242623155, "rouge1_recall_stderr": 0.0028593004842754027, "rouge2_fmeasure": 0.10910465326076894, "rouge2_fmeasure_stderr": 0.0013890979482078735, "rouge2_precision": 0.09022137807900418, "rouge2_precision_stderr": 0.0012595960763224963, "rouge2_recall": 0.14929791491865801, "rouge2_recall_stderr": 0.001792572395692284, "rougeL_fmeasure": 0.20293981930119875, "rougeL_fmeasure_stderr": 0.001618501844111904, "rougeL_precision": 0.16625817127303685, "rougeL_precision_stderr": 0.0015302425277458882, "rougeL_recall": 0.2807767212855276, "rougeL_recall_stderr": 0.0021268958542967034, "rougeLsum_fmeasure": 0.22644058640190284, "rougeLsum_fmeasure_stderr": 0.002021283150451286, "rougeLsum_precision": 0.18691135016064478, "rougeLsum_precision_stderr": 0.0019403851673907941, "rougeLsum_recall": 0.3092909346574639, "rougeLsum_recall_stderr": 0.002464400666534724}}, "1": {"coherent_text": {"bleu": 5.940650821215176, "bleu_stderr": 0.07251202191064558, "rouge1_fmeasure": 0.43127560616047034, "rouge1_fmeasure_stderr": 0.001978521697967289, "rouge1_precision": 0.3562088549882284, "rouge1_precision_stderr": 0.0020586504966668153, "rouge1_recall": 0.5809146861845771, "rouge1_recall_stderr": 0.002622892280494673, "rouge2_fmeasure": 0.1870937559813721, "rouge2_fmeasure_stderr": 0.0014484391370237738, "rouge2_precision": 0.1532140575666167, "rouge2_precision_stderr": 0.0012898457667003178, "rouge2_recall": 0.25724312942661537, "rouge2_recall_stderr": 0.002124196698748588, "rougeL_fmeasure": 0.3071708177800624, "rougeL_fmeasure_stderr": 0.0014249199674985732, "rougeL_precision": 0.25186821456916564, "rougeL_precision_stderr": 0.0013627204573516862, "rougeL_recall": 0.41938362031941356, "rougeL_recall_stderr": 0.0023184382496243294, "rougeLsum_fmeasure": 0.3552092130581474, "rougeLsum_fmeasure_stderr": 0.0018353427248767173, "rougeLsum_precision": 0.2932254446894908, "rougeLsum_precision_stderr": 0.0018317984365110538, "rougeLsum_recall": 0.47920715808647235, "rougeLsum_recall_stderr": 0.0025056807862094345}, "create_text_for_me": {"bleu": 6.053334944424697, "bleu_stderr": 0.06721594685515214, "rouge1_fmeasure": 0.38936702412346486, "rouge1_fmeasure_stderr": 0.0016662898170015916, "rouge1_precision": 0.30801660806432923, "rouge1_precision_stderr": 0.0016273405349466305, "rouge1_recall": 0.5619641447190051, "rouge1_recall_stderr": 0.0025318813867833377, "rouge2_fmeasure": 0.16511209673657395, "rouge2_fmeasure_stderr": 0.0013352227993337418, "rouge2_precision": 0.12945275179345114, "rouge2_precision_stderr": 0.0011074882101870791, "rouge2_recall": 0.2439184132930377, "rouge2_recall_stderr": 0.0021503693219860074, "rougeL_fmeasure": 0.2659105891113473, "rougeL_fmeasure_stderr": 0.0013002731121155686, "rougeL_precision": 0.2092448958861131, "rougeL_precision_stderr": 0.0011482208527024708, "rougeL_recall": 0.388326334625175, "rougeL_recall_stderr": 0.0023092628247571192, "rougeLsum_fmeasure": 0.32608244869018355, "rougeLsum_fmeasure_stderr": 0.0016028641869031382, "rougeLsum_precision": 0.25794640278330544, "rougeLsum_precision_stderr": 0.0014986373993336347, "rougeLsum_recall": 0.4707609586779557, "rougeLsum_recall_stderr": 0.0024378163211520084}, "generate_gramatically_correct_text": {"bleu": 0.20327154719905477, "bleu_stderr": 0.034768468483106676, "rouge1_fmeasure": 0.06799149288514703, "rouge1_fmeasure_stderr": 0.0027079375902244007, "rouge1_precision": 0.11777046155460656, "rouge1_precision_stderr": 0.004787552105977667, "rouge1_recall": 0.0755797825871043, "rouge1_recall_stderr": 0.003197252899893255, "rouge2_fmeasure": 0.025195913355673966, "rouge2_fmeasure_stderr": 0.0012304784432672295, "rouge2_precision": 0.025207113777389382, "rouge2_precision_stderr": 0.001302556335227448, "rouge2_recall": 0.029482308327976877, "rouge2_recall_stderr": 0.0014690441197124206, "rougeL_fmeasure": 0.048776083503696964, "rougeL_fmeasure_stderr": 0.0019127958949913073, "rougeL_precision": 0.09889828492871724, "rougeL_precision_stderr": 0.004441805820458746, "rougeL_recall": 0.0535759232563342, "rougeL_recall_stderr": 0.0022760149883595216, "rougeLsum_fmeasure": 0.057898257679435405, "rougeLsum_fmeasure_stderr": 0.002300168304929039, "rougeLsum_precision": 0.10804282075733199, "rougeLsum_precision_stderr": 0.004604581172532683, "rougeLsum_recall": 0.06371930488469042, "rougeLsum_recall_stderr": 0.002700349737694304}, "generate_text_restaurant": {"bleu": 10.11194167971178, "bleu_stderr": 0.11325409958385195, "rouge1_fmeasure": 0.396708212066539, "rouge1_fmeasure_stderr": 0.0023271634805327288, "rouge1_precision": 0.44662580344767955, "rouge1_precision_stderr": 0.0029145470980427935, "rouge1_recall": 0.403325138761631, "rouge1_recall_stderr": 0.0029812540605776657, "rouge2_fmeasure": 0.1714205638298909, "rouge2_fmeasure_stderr": 0.0017391694364972787, "rouge2_precision": 0.19849995481108837, "rouge2_precision_stderr": 0.0023301034372897886, "rouge2_recall": 0.17476709268624893, "rouge2_recall_stderr": 0.0019834376005625296, "rougeL_fmeasure": 0.28469304359904984, "rougeL_fmeasure_stderr": 0.0018043411811584805, "rougeL_precision": 0.3259862976774872, "rougeL_precision_stderr": 0.0025491424613272398, "rougeL_recall": 0.28928330619087117, "rougeL_recall_stderr": 0.002329063084463768, "rougeLsum_fmeasure": 0.3323030796627126, "rougeLsum_fmeasure_stderr": 0.002111384569072981, "rougeLsum_precision": 0.37806486217321694, "rougeLsum_precision_stderr": 0.002787004218176944, "rougeLsum_recall": 0.3367501905387613, "rougeLsum_recall_stderr": 0.002641205675752919}, "text": {"bleu": 6.1092098281513625, "bleu_stderr": 0.09127976996292099, "rouge1_fmeasure": 0.4510092881121515, "rouge1_fmeasure_stderr": 0.0017948907933983346, "rouge1_precision": 0.38095564749467725, "rouge1_precision_stderr": 0.0019709425802198264, "rouge1_recall": 0.5860861386660807, "rouge1_recall_stderr": 0.002426219681779596, "rouge2_fmeasure": 0.20219167803744306, "rouge2_fmeasure_stderr": 0.0014562059203023273, "rouge2_precision": 0.1695045775140401, "rouge2_precision_stderr": 0.0013385932546957934, "rouge2_recall": 0.26780052036509366, "rouge2_recall_stderr": 0.0020869517300861743, "rougeL_fmeasure": 0.3135694240329392, "rougeL_fmeasure_stderr": 0.0014545694851811255, "rougeL_precision": 0.26311648134376764, "rougeL_precision_stderr": 0.0014102722543714375, "rougeL_recall": 0.412622808220604, "rougeL_recall_stderr": 0.0023307564982918876, "rougeLsum_fmeasure": 0.37081917705524653, "rougeLsum_fmeasure_stderr": 0.0017754950366532115, "rougeLsum_precision": 0.3132751166335577, "rougeLsum_precision_stderr": 0.0018346757269402277, "rougeLsum_recall": 0.4820736892660319, "rougeLsum_recall_stderr": 0.0023982212141781824}}, "2": {"coherent_text": {"bleu": 6.33821093669954, "bleu_stderr": 0.09379960768199452, "rouge1_fmeasure": 0.4183289686770281, "rouge1_fmeasure_stderr": 0.0020625774095387943, "rouge1_precision": 0.3436726890451921, "rouge1_precision_stderr": 0.002099670338530863, "rouge1_recall": 0.5695885225308102, "rouge1_recall_stderr": 0.002705227771510273, "rouge2_fmeasure": 0.18600518275150685, "rouge2_fmeasure_stderr": 0.001526977649422954, "rouge2_precision": 0.15145968082746675, "rouge2_precision_stderr": 0.0013480346131076102, "rouge2_recall": 0.25931282860324156, "rouge2_recall_stderr": 0.002266505933044581, "rougeL_fmeasure": 0.30465189918657554, "rougeL_fmeasure_stderr": 0.001501929754244832, "rougeL_precision": 0.24856833227974034, "rougeL_precision_stderr": 0.001426163937446131, "rougeL_recall": 0.42055124580774705, "rougeL_recall_stderr": 0.0024091709808996154, "rougeLsum_fmeasure": 0.34695172233964533, "rougeLsum_fmeasure_stderr": 0.0019140373403072556, "rougeLsum_precision": 0.28486827225361294, "rougeLsum_precision_stderr": 0.0018810468253166184, "rougeLsum_recall": 0.47345290666526635, "rougeLsum_recall_stderr": 0.0025997531141343233}, "create_text_for_me": {"bleu": 6.561233872713581, "bleu_stderr": 0.09108477364016405, "rouge1_fmeasure": 0.38740329284198877, "rouge1_fmeasure_stderr": 0.0016694245894016385, "rouge1_precision": 0.30557997424529676, "rouge1_precision_stderr": 0.00163203899887396, "rouge1_recall": 0.5616556592909835, "rouge1_recall_stderr": 0.002469578127127456, "rouge2_fmeasure": 0.17074360575215342, "rouge2_fmeasure_stderr": 0.001379403429380591, "rouge2_precision": 0.13344266402318036, "rouge2_precision_stderr": 0.0011422005728536733, "rouge2_recall": 0.2537552926960219, "rouge2_recall_stderr": 0.002233457163496332, "rougeL_fmeasure": 0.27191789630264573, "rougeL_fmeasure_stderr": 0.0013215420536361491, "rougeL_precision": 0.21330386118745845, "rougeL_precision_stderr": 0.0011724736772492886, "rougeL_recall": 0.3990960192237072, "rougeL_recall_stderr": 0.0023163293106240614, "rougeLsum_fmeasure": 0.32843508910149455, "rougeLsum_fmeasure_stderr": 0.0016104077951613867, "rougeLsum_precision": 0.25891502247240206, "rougeLsum_precision_stderr": 0.0015027118915720513, "rougeLsum_recall": 0.47690020829771507, "rougeLsum_recall_stderr": 0.0024407095587869695}, "generate_gramatically_correct_text": {"bleu": 1.5836328567888602, "bleu_stderr": 0.11778913760335698, "rouge1_fmeasure": 0.11549730510595205, "rouge1_fmeasure_stderr": 0.003274787299760805, "rouge1_precision": 0.16142810684796663, "rouge1_precision_stderr": 0.004922963050007353, "rouge1_recall": 0.12928310552643027, "rouge1_recall_stderr": 0.0038524381489268497, "rouge2_fmeasure": 0.04447784117945149, "rouge2_fmeasure_stderr": 0.0015076383673862696, "rouge2_precision": 0.04496956013107435, "rouge2_precision_stderr": 0.0016446318482885244, "rouge2_recall": 0.051485578046763486, "rouge2_recall_stderr": 0.0017825906801569658, "rougeL_fmeasure": 0.08223474440316005, "rougeL_fmeasure_stderr": 0.0023266800965344265, "rougeL_precision": 0.12870133893762628, "rougeL_precision_stderr": 0.004434980395614415, "rougeL_recall": 0.09129308646670774, "rougeL_recall_stderr": 0.0027323522213050914, "rougeLsum_fmeasure": 0.0974072227519707, "rougeLsum_fmeasure_stderr": 0.002774866384923383, "rougeLsum_precision": 0.1434849944924965, "rougeLsum_precision_stderr": 0.004641427545467661, "rougeLsum_recall": 0.1085877089549346, "rougeLsum_recall_stderr": 0.0032584779260767995}, "generate_text_restaurant": {"bleu": 10.765851233592166, "bleu_stderr": 0.1147637687545087, "rouge1_fmeasure": 0.4194853660757534, "rouge1_fmeasure_stderr": 0.002222220598248002, "rouge1_precision": 0.5029109698404333, "rouge1_precision_stderr": 0.0032326251122724503, "rouge1_recall": 0.399072749631299, "rouge1_recall_stderr": 0.002770754484292409, "rouge2_fmeasure": 0.19259169221915515, "rouge2_fmeasure_stderr": 0.0018423181170468268, "rouge2_precision": 0.23608052397188775, "rouge2_precision_stderr": 0.002567411684370418, "rouge2_recall": 0.18324767332759007, "rouge2_recall_stderr": 0.001989894281113984, "rougeL_fmeasure": 0.3027130154416743, "rougeL_fmeasure_stderr": 0.001900798692043736, "rougeL_precision": 0.3660346353864222, "rougeL_precision_stderr": 0.0028533265797324394, "rougeL_recall": 0.28763635547225785, "rougeL_recall_stderr": 0.002249079446710024, "rougeLsum_fmeasure": 0.34512069878778673, "rougeLsum_fmeasure_stderr": 0.00213278119755698, "rougeLsum_precision": 0.4152150772992765, "rougeLsum_precision_stderr": 0.0030631524445173153, "rougeLsum_recall": 0.32804484742896467, "rougeLsum_recall_stderr": 0.002520109021821381}, "text": {"bleu": 6.46596372404059, "bleu_stderr": 0.10222019609135197, "rouge1_fmeasure": 0.43663869577593034, "rouge1_fmeasure_stderr": 0.0019277436539416571, "rouge1_precision": 0.3651355772456116, "rouge1_precision_stderr": 0.0020414630989659278, "rouge1_recall": 0.576357756634015, "rouge1_recall_stderr": 0.002576181875133805, "rouge2_fmeasure": 0.19722529213201134, "rouge2_fmeasure_stderr": 0.0015363235060857502, "rouge2_precision": 0.16361081730624932, "rouge2_precision_stderr": 0.0013840748749673782, "rouge2_recall": 0.26604599291958914, "rouge2_recall_stderr": 0.002256079964522369, "rougeL_fmeasure": 0.31324149645192695, "rougeL_fmeasure_stderr": 0.0015109155091120891, "rougeL_precision": 0.2602719118470187, "rougeL_precision_stderr": 0.0014547965301518198, "rougeL_recall": 0.41863229087687953, "rougeL_recall_stderr": 0.0024024801780310424, "rougeLsum_fmeasure": 0.36392101007128824, "rougeLsum_fmeasure_stderr": 0.0018682328209929703, "rougeLsum_precision": 0.3042042275101151, "rougeLsum_precision_stderr": 0.0018792439139020645, "rougeLsum_recall": 0.48113522705415146, "rougeLsum_recall_stderr": 0.0025570057038425613}}, "3": {"coherent_text": {"bleu": 6.311617860948694, "bleu_stderr": 0.08548221958370308, "rouge1_fmeasure": 0.4075743912737001, "rouge1_fmeasure_stderr": 0.002146090481914194, "rouge1_precision": 0.3349961502490069, "rouge1_precision_stderr": 0.0021680799575113964, "rouge1_recall": 0.5565544757647279, "rouge1_recall_stderr": 0.0028291574048521497, "rouge2_fmeasure": 0.18307097946148873, "rouge2_fmeasure_stderr": 0.0015714553612069671, "rouge2_precision": 0.1491383011087303, "rouge2_precision_stderr": 0.001393658080239258, "rouge2_recall": 0.25579275901300663, "rouge2_recall_stderr": 0.0023006833356505204, "rougeL_fmeasure": 0.3005272202343599, "rougeL_fmeasure_stderr": 0.0015463466057729666, "rougeL_precision": 0.24541633137395016, "rougeL_precision_stderr": 0.0014787103833905858, "rougeL_recall": 0.4155102693950093, "rougeL_recall_stderr": 0.0024152269725431605, "rougeLsum_fmeasure": 0.3388120379840606, "rougeLsum_fmeasure_stderr": 0.0019607864106101058, "rougeLsum_precision": 0.2783595624546861, "rougeLsum_precision_stderr": 0.001923194297919911, "rougeLsum_recall": 0.46345985055371536, "rougeLsum_recall_stderr": 0.0026579629870791487}, "create_text_for_me": {"bleu": 6.776788277555548, "bleu_stderr": 0.08908888038445728, "rouge1_fmeasure": 0.3840971278102562, "rouge1_fmeasure_stderr": 0.001668160634088681, "rouge1_precision": 0.30214712366118474, "rouge1_precision_stderr": 0.0016165598811510716, "rouge1_recall": 0.5592881145815344, "rouge1_recall_stderr": 0.002519531092257658, "rouge2_fmeasure": 0.17213478001357976, "rouge2_fmeasure_stderr": 0.001408023581847778, "rouge2_precision": 0.13422079490420966, "rouge2_precision_stderr": 0.0011667771588757267, "rouge2_recall": 0.2566073804641227, "rouge2_recall_stderr": 0.0022681846275686155, "rougeL_fmeasure": 0.27303357092908037, "rougeL_fmeasure_stderr": 0.0013329054733735002, "rougeL_precision": 0.21374794181163875, "rougeL_precision_stderr": 0.0011830966338559523, "rougeL_recall": 0.4017997775289984, "rougeL_recall_stderr": 0.0023191692353490933, "rougeLsum_fmeasure": 0.3277571464283619, "rougeLsum_fmeasure_stderr": 0.0016344606337249433, "rougeLsum_precision": 0.25778209737895724, "rougeLsum_precision_stderr": 0.0015185694019430708, "rougeLsum_recall": 0.47756511166340027, "rougeLsum_recall_stderr": 0.00247847378438377}, "generate_gramatically_correct_text": {"bleu": 1.2251973658832964, "bleu_stderr": 0.06840209821605239, "rouge1_fmeasure": 0.10350129867546688, "rouge1_fmeasure_stderr": 0.0030845729308722892, "rouge1_precision": 0.15582501118061967, "rouge1_precision_stderr": 0.0050057405436943385, "rouge1_recall": 0.11693432609433732, "rouge1_recall_stderr": 0.0036830042145809486, "rouge2_fmeasure": 0.038284747118588126, "rouge2_fmeasure_stderr": 0.001383633589370129, "rouge2_precision": 0.037407232865964785, "rouge2_precision_stderr": 0.0014744828664782648, "rouge2_recall": 0.045276097550827864, "rouge2_recall_stderr": 0.001668077617467975, "rougeL_fmeasure": 0.07447530948404624, "rougeL_fmeasure_stderr": 0.002203151007472917, "rougeL_precision": 0.12825581895473886, "rougeL_precision_stderr": 0.0046484472676466704, "rougeL_recall": 0.08317479654292953, "rougeL_recall_stderr": 0.002625824446891428, "rougeLsum_fmeasure": 0.08797882063149828, "rougeLsum_fmeasure_stderr": 0.0026214171075166757, "rougeLsum_precision": 0.14099199993249983, "rougeLsum_precision_stderr": 0.0047989121526212306, "rougeLsum_recall": 0.09896314553373009, "rougeLsum_recall_stderr": 0.003133393959775989}, "generate_text_restaurant": {"bleu": 10.504414399066166, "bleu_stderr": 0.14144404460789148, "rouge1_fmeasure": 0.42179412730593185, "rouge1_fmeasure_stderr": 0.0021527795098671134, "rouge1_precision": 0.509958754752616, "rouge1_precision_stderr": 0.003154730928607417, "rouge1_recall": 0.3954899202585752, "rouge1_recall_stderr": 0.002608601302312822, "rouge2_fmeasure": 0.19636018570824587, "rouge2_fmeasure_stderr": 0.0018014064871279597, "rouge2_precision": 0.24242098353513022, "rouge2_precision_stderr": 0.002484157206893866, "rouge2_recall": 0.18360253637850166, "rouge2_recall_stderr": 0.001904297402323581, "rougeL_fmeasure": 0.30275824983777616, "rougeL_fmeasure_stderr": 0.0018031245552577217, "rougeL_precision": 0.36777401786978037, "rougeL_precision_stderr": 0.002648283289980144, "rougeL_recall": 0.28409076327843025, "rougeL_recall_stderr": 0.002115488452238855, "rougeLsum_fmeasure": 0.3452111361624014, "rougeLsum_fmeasure_stderr": 0.0020448977561221436, "rougeLsum_precision": 0.4185549498251625, "rougeLsum_precision_stderr": 0.0029283489596877298, "rougeLsum_recall": 0.32342169868683124, "rougeLsum_recall_stderr": 0.0023588979812351725}, "text": {"bleu": 6.660859618805935, "bleu_stderr": 0.1172038453417659, "rouge1_fmeasure": 0.43141055859111643, "rouge1_fmeasure_stderr": 0.0019365140286472272, "rouge1_precision": 0.3587661880944985, "rouge1_precision_stderr": 0.002029335942530353, "rouge1_recall": 0.5745232266800461, "rouge1_recall_stderr": 0.0026290745515900977, "rouge2_fmeasure": 0.1964954395976402, "rouge2_fmeasure_stderr": 0.0015922666319257745, "rouge2_precision": 0.16212842829928545, "rouge2_precision_stderr": 0.0014186202913670232, "rouge2_recall": 0.2670249903635504, "rouge2_recall_stderr": 0.0023104076885023827, "rougeL_fmeasure": 0.3090193571542357, "rougeL_fmeasure_stderr": 0.0014996989848492066, "rougeL_precision": 0.2554771790178779, "rougeL_precision_stderr": 0.0014430344926745968, "rougeL_recall": 0.41621281205529004, "rougeL_recall_stderr": 0.002379747308022119, "rougeLsum_fmeasure": 0.3606858214661092, "rougeLsum_fmeasure_stderr": 0.0019006216836532906, "rougeLsum_precision": 0.29982900805480683, "rougeLsum_precision_stderr": 0.0018862992882486876, "rougeLsum_recall": 0.4809216581836253, "rougeLsum_recall_stderr": 0.002619984298616209}}, "4": {"coherent_text": {"bleu": 6.3919491491112925, "bleu_stderr": 0.08289306046133078, "rouge1_fmeasure": 0.4212144473373565, "rouge1_fmeasure_stderr": 0.0020829348398401265, "rouge1_precision": 0.34763102577482935, "rouge1_precision_stderr": 0.002149233305061666, "rouge1_recall": 0.5691735901016647, "rouge1_recall_stderr": 0.0027018434473941233, "rouge2_fmeasure": 0.19134136835621748, "rouge2_fmeasure_stderr": 0.001586785118204661, "rouge2_precision": 0.15665647152252904, "rouge2_precision_stderr": 0.0014198314983417174, "rouge2_recall": 0.2641675222249262, "rouge2_recall_stderr": 0.00230189707841787, "rougeL_fmeasure": 0.3107651494371883, "rougeL_fmeasure_stderr": 0.0015448622097670513, "rougeL_precision": 0.25481057551592495, "rougeL_precision_stderr": 0.0014950883188841327, "rougeL_recall": 0.42503194148437995, "rougeL_recall_stderr": 0.002378950785673738, "rougeLsum_fmeasure": 0.3484043781862422, "rougeLsum_fmeasure_stderr": 0.0019286965322109623, "rougeLsum_precision": 0.2874633204241909, "rougeLsum_precision_stderr": 0.0019236933160864085, "rougeLsum_recall": 0.47133290982559206, "rougeLsum_recall_stderr": 0.0025524403420989594}, "create_text_for_me": {"bleu": 6.799544352473561, "bleu_stderr": 0.08238532707877078, "rouge1_fmeasure": 0.37986106085998556, "rouge1_fmeasure_stderr": 0.0016771413118236993, "rouge1_precision": 0.2983364923590482, "rouge1_precision_stderr": 0.0016137840485591932, "rouge1_recall": 0.5543295832415513, "rouge1_recall_stderr": 0.0025227410815235234, "rouge2_fmeasure": 0.17010384910521295, "rouge2_fmeasure_stderr": 0.0014281699846839927, "rouge2_precision": 0.13254250642240248, "rouge2_precision_stderr": 0.0011814668653123278, "rouge2_recall": 0.2537164843195013, "rouge2_recall_stderr": 0.0022749137857616354, "rougeL_fmeasure": 0.2705591293748383, "rougeL_fmeasure_stderr": 0.001350504706167112, "rougeL_precision": 0.21162550086306248, "rougeL_precision_stderr": 0.001199938107743782, "rougeL_recall": 0.398387439159313, "rougeL_recall_stderr": 0.002291054746321359, "rougeLsum_fmeasure": 0.3252326795969852, "rougeLsum_fmeasure_stderr": 0.0016631041936946884, "rougeLsum_precision": 0.25543730355908567, "rougeLsum_precision_stderr": 0.0015294759522292728, "rougeLsum_recall": 0.474621306847035, "rougeLsum_recall_stderr": 0.002506933541584412}, "generate_gramatically_correct_text": {"bleu": 1.151782007884036, "bleu_stderr": 0.10917316383418346, "rouge1_fmeasure": 0.10057563196631636, "rouge1_fmeasure_stderr": 0.0030484749132734744, "rouge1_precision": 0.153541975103279, "rouge1_precision_stderr": 0.005017944717364684, "rouge1_recall": 0.11318510222695441, "rouge1_recall_stderr": 0.0036272890057710877, "rouge2_fmeasure": 0.037516989850184534, "rouge2_fmeasure_stderr": 0.0013843301579119466, "rouge2_precision": 0.03679979821425693, "rouge2_precision_stderr": 0.0014791535592911546, "rouge2_recall": 0.04434331636886536, "rouge2_recall_stderr": 0.0016832005869182524, "rougeL_fmeasure": 0.07193779825133709, "rougeL_fmeasure_stderr": 0.0021569276149122152, "rougeL_precision": 0.12610380196461451, "rougeL_precision_stderr": 0.004649593887639469, "rougeL_recall": 0.08015880317670764, "rougeL_recall_stderr": 0.002581090415922211, "rougeLsum_fmeasure": 0.0849355679057312, "rougeLsum_fmeasure_stderr": 0.002576383143938577, "rougeLsum_precision": 0.13842028827055886, "rougeLsum_precision_stderr": 0.004798912895625669, "rougeLsum_recall": 0.0951361419739092, "rougeLsum_recall_stderr": 0.003071944475077794}, "generate_text_restaurant": {"bleu": 10.453461006006084, "bleu_stderr": 0.20323399299325623, "rouge1_fmeasure": 0.4184745538314975, "rouge1_fmeasure_stderr": 0.002152424221911221, "rouge1_precision": 0.5152975398825912, "rouge1_precision_stderr": 0.0032876121566522126, "rouge1_recall": 0.3875757012647283, "rouge1_recall_stderr": 0.002563136912882847, "rouge2_fmeasure": 0.19590832872090894, "rouge2_fmeasure_stderr": 0.0017695553874619732, "rouge2_precision": 0.24767323400172267, "rouge2_precision_stderr": 0.002578983235298861, "rouge2_recall": 0.1802765312255684, "rouge2_recall_stderr": 0.0018268424519338505, "rougeL_fmeasure": 0.30013711995116676, "rougeL_fmeasure_stderr": 0.001799774395616833, "rougeL_precision": 0.37129359217610464, "rougeL_precision_stderr": 0.0027272181980460375, "rougeL_recall": 0.2780682094337028, "rougeL_recall_stderr": 0.0020775510109816452, "rougeLsum_fmeasure": 0.34206938848652774, "rougeLsum_fmeasure_stderr": 0.0020120604230570572, "rougeLsum_precision": 0.4223690533102043, "rougeLsum_precision_stderr": 0.003005111083307187, "rougeLsum_recall": 0.3169024351381635, "rougeLsum_recall_stderr": 0.0023241001320442878}, "text": {"bleu": 6.669912151694831, "bleu_stderr": 0.08998324955735416, "rouge1_fmeasure": 0.4288209301236101, "rouge1_fmeasure_stderr": 0.0019594115330927517, "rouge1_precision": 0.35581234182428756, "rouge1_precision_stderr": 0.002046080023554725, "rouge1_recall": 0.5730282795070204, "rouge1_recall_stderr": 0.002647698915295993, "rouge2_fmeasure": 0.19536984000862256, "rouge2_fmeasure_stderr": 0.0016184104384078227, "rouge2_precision": 0.16088582394762457, "rouge2_precision_stderr": 0.0014414061079427547, "rouge2_recall": 0.26635743558551805, "rouge2_recall_stderr": 0.00235484941364112, "rougeL_fmeasure": 0.3082343858102462, "rougeL_fmeasure_stderr": 0.0015342038958960327, "rougeL_precision": 0.25423407055295205, "rougeL_precision_stderr": 0.001471223089557672, "rougeL_recall": 0.4166450671427576, "rougeL_recall_stderr": 0.0024110898290359713, "rougeLsum_fmeasure": 0.35880379298869464, "rougeLsum_fmeasure_stderr": 0.0019316365194543348, "rougeLsum_precision": 0.2978062975462721, "rougeLsum_precision_stderr": 0.0019203204157120337, "rougeLsum_recall": 0.4794333800307403, "rougeLsum_recall_stderr": 0.0026149511981629395}}, "5": {"coherent_text": {"bleu": 6.2572003772902205, "bleu_stderr": 0.08895652378880192, "rouge1_fmeasure": 0.419255158305618, "rouge1_fmeasure_stderr": 0.0020235703061870854, "rouge1_precision": 0.345895095360714, "rouge1_precision_stderr": 0.0021030157057793835, "rouge1_recall": 0.5666156627306035, "rouge1_recall_stderr": 0.002673571521803025, "rouge2_fmeasure": 0.18872128486346074, "rouge2_fmeasure_stderr": 0.0015510519443660666, "rouge2_precision": 0.15435273004708677, "rouge2_precision_stderr": 0.001384871199965745, "rouge2_recall": 0.2608233042022455, "rouge2_recall_stderr": 0.0022656300390166877, "rougeL_fmeasure": 0.3087981367592903, "rougeL_fmeasure_stderr": 0.001478900728238061, "rougeL_precision": 0.25300940768488106, "rougeL_precision_stderr": 0.001425167078952542, "rougeL_recall": 0.42247328220972996, "rougeL_recall_stderr": 0.002353052068466571, "rougeLsum_fmeasure": 0.3451904558791544, "rougeLsum_fmeasure_stderr": 0.0018918033978421436, "rougeLsum_precision": 0.284821827767446, "rougeLsum_precision_stderr": 0.0018887063780582047, "rougeLsum_recall": 0.46645604559897036, "rougeLsum_recall_stderr": 0.002510391137764271}, "create_text_for_me": {"bleu": 6.697212269931585, "bleu_stderr": 0.08297111861263176, "rouge1_fmeasure": 0.3769172098402743, "rouge1_fmeasure_stderr": 0.0016610029156639574, "rouge1_precision": 0.2957939018771136, "rouge1_precision_stderr": 0.0016095004602289511, "rouge1_recall": 0.5510537049287292, "rouge1_recall_stderr": 0.0024827548140078834, "rouge2_fmeasure": 0.1683711858028947, "rouge2_fmeasure_stderr": 0.0014048286913860427, "rouge2_precision": 0.1310898884871186, "rouge2_precision_stderr": 0.0011695549044830754, "rouge2_recall": 0.2514989870527316, "rouge2_recall_stderr": 0.0022100542132105377, "rougeL_fmeasure": 0.26868909350069764, "rougeL_fmeasure_stderr": 0.0013557031410614696, "rougeL_precision": 0.20985604615376746, "rougeL_precision_stderr": 0.0012015087997438354, "rougeL_recall": 0.3967540619875405, "rougeL_recall_stderr": 0.0022977607550473756, "rougeLsum_fmeasure": 0.3220148127414, "rougeLsum_fmeasure_stderr": 0.0016679134262009298, "rougeLsum_precision": 0.2527358286967553, "rougeLsum_precision_stderr": 0.0015401236971888642, "rougeLsum_recall": 0.4706633716608987, "rougeLsum_recall_stderr": 0.0024801531809459493}, "generate_gramatically_correct_text": {"bleu": 1.2641292736786365, "bleu_stderr": 0.097056472591506, "rouge1_fmeasure": 0.10178181807506882, "rouge1_fmeasure_stderr": 0.0030246507145682004, "rouge1_precision": 0.15290909151602036, "rouge1_precision_stderr": 0.0049595466535822955, "rouge1_recall": 0.11499927704768495, "rouge1_recall_stderr": 0.0035939559105208365, "rouge2_fmeasure": 0.038242180726931196, "rouge2_fmeasure_stderr": 0.0013857098786122237, "rouge2_precision": 0.03683534296753499, "rouge2_precision_stderr": 0.0014422357682860986, "rouge2_recall": 0.045310106634696305, "rouge2_recall_stderr": 0.0016718535230670977, "rougeL_fmeasure": 0.07332298026105327, "rougeL_fmeasure_stderr": 0.002162653775276333, "rougeL_precision": 0.12582895602125432, "rougeL_precision_stderr": 0.004611397478517857, "rougeL_recall": 0.0820526177378216, "rougeL_recall_stderr": 0.00257106175276383, "rougeLsum_fmeasure": 0.0858643495813317, "rougeLsum_fmeasure_stderr": 0.0025590711901703174, "rougeLsum_precision": 0.1378091723714546, "rougeLsum_precision_stderr": 0.00475897658619402, "rougeLsum_recall": 0.09649871874135414, "rougeLsum_recall_stderr": 0.003040230110947172}, "generate_text_restaurant": {"bleu": 10.336987597938899, "bleu_stderr": 0.20513507856533955, "rouge1_fmeasure": 0.4170315564856164, "rouge1_fmeasure_stderr": 0.002124259039598598, "rouge1_precision": 0.5143545157562858, "rouge1_precision_stderr": 0.003337934175013788, "rouge1_recall": 0.3857868235592206, "rouge1_recall_stderr": 0.0024923995035469678, "rouge2_fmeasure": 0.19402158147865167, "rouge2_fmeasure_stderr": 0.0017876994973534497, "rouge2_precision": 0.24631251817723201, "rouge2_precision_stderr": 0.002657555991369566, "rouge2_recall": 0.17787346956272435, "rouge2_recall_stderr": 0.001792217462473695, "rougeL_fmeasure": 0.3009842944910936, "rougeL_fmeasure_stderr": 0.0018055725861154817, "rougeL_precision": 0.37299125515052484, "rougeL_precision_stderr": 0.002815154394732632, "rougeL_recall": 0.27850203471081203, "rougeL_recall_stderr": 0.002038628599689064, "rougeLsum_fmeasure": 0.3425854571657328, "rougeLsum_fmeasure_stderr": 0.0020124639191903327, "rougeLsum_precision": 0.42379837225389067, "rougeLsum_precision_stderr": 0.0031034115053880863, "rougeLsum_recall": 0.31679500554979756, "rougeLsum_recall_stderr": 0.002259000761426462}, "text": {"bleu": 6.620027445143791, "bleu_stderr": 0.09375792130378542, "rouge1_fmeasure": 0.4206980580109448, "rouge1_fmeasure_stderr": 0.0019612026897740434, "rouge1_precision": 0.34835218897272274, "rouge1_precision_stderr": 0.0020272801988729847, "rouge1_recall": 0.5642232234850835, "rouge1_recall_stderr": 0.0026596537106805445, "rouge2_fmeasure": 0.19119099944111612, "rouge2_fmeasure_stderr": 0.0016112209219818218, "rouge2_precision": 0.15719137843889044, "rouge2_precision_stderr": 0.0014304675402805396, "rouge2_recall": 0.26134576822206673, "rouge2_recall_stderr": 0.002331449346276716, "rougeL_fmeasure": 0.3042241306518907, "rougeL_fmeasure_stderr": 0.0015396594533973271, "rougeL_precision": 0.2505811728785357, "rougeL_precision_stderr": 0.001468887250836978, "rougeL_recall": 0.4121293895675098, "rougeL_recall_stderr": 0.0023983911022547304, "rougeLsum_fmeasure": 0.35323293802974065, "rougeLsum_fmeasure_stderr": 0.0018984796157355765, "rougeLsum_precision": 0.2925850043674371, "rougeLsum_precision_stderr": 0.001886060348908665, "rougeLsum_recall": 0.4736084330903114, "rougeLsum_recall_stderr": 0.0025599032117973853}}}, "gem_xsum": {"0": {"DOC_boils_down_to_simple_idea_that": {"bleu": 0.5037467911105132, "bleu_stderr": 0.07923156278606476, "rouge1_fmeasure": 0.11088128154225567, "rouge1_fmeasure_stderr": 0.002247020389730472, "rouge1_precision": 0.07843610918389116, "rouge1_precision_stderr": 0.0016304546568532293, "rouge1_recall": 0.19781222980051757, "rouge1_recall_stderr": 0.004008447460929873, "rouge2_fmeasure": 0.014155568509608755, "rouge2_fmeasure_stderr": 0.0008063057663091102, "rouge2_precision": 0.009871857582884632, "rouge2_precision_stderr": 0.0005640034860620715, "rouge2_recall": 0.026368970171206544, "rouge2_recall_stderr": 0.0015701889057261075, "rougeL_fmeasure": 0.09287467603067165, "rougeL_fmeasure_stderr": 0.0017134113448546992, "rougeL_precision": 0.06563283750134696, "rougeL_precision_stderr": 0.0012425342106169393, "rougeL_recall": 0.16622607422512797, "rougeL_recall_stderr": 0.0031008472748429656, "rougeLsum_fmeasure": 0.08976009975925459, "rougeLsum_fmeasure_stderr": 0.0018118631890276285, "rougeLsum_precision": 0.0633625318403226, "rougeLsum_precision_stderr": 0.001306257704460161, "rougeLsum_recall": 0.1611657589875393, "rougeLsum_recall_stderr": 0.0033067458850410307}, "DOC_tldr": {"bleu": 0.261262554550647, "bleu_stderr": 0.05001884278243581, "rouge1_fmeasure": 0.07476921388091318, "rouge1_fmeasure_stderr": 0.001653767538430365, "rouge1_precision": 0.05503216443294283, "rouge1_precision_stderr": 0.0013208178441723422, "rouge1_recall": 0.12247694126975207, "rouge1_recall_stderr": 0.0025016532931361178, "rouge2_fmeasure": 0.005848067139995684, "rouge2_fmeasure_stderr": 0.0005763476196672857, "rouge2_precision": 0.0044345537535997155, "rouge2_precision_stderr": 0.0004672948221132119, "rouge2_recall": 0.009160898916001237, "rouge2_recall_stderr": 0.0008442058248417855, "rougeL_fmeasure": 0.06711829058828381, "rougeL_fmeasure_stderr": 0.0013521377427549437, "rougeL_precision": 0.04930420474602141, "rougeL_precision_stderr": 0.0010839279734555082, "rougeL_recall": 0.11045765316293539, "rougeL_recall_stderr": 0.002067309095667932, "rougeLsum_fmeasure": 0.06282572428962137, "rougeLsum_fmeasure_stderr": 0.0013169186728595223, "rougeLsum_precision": 0.04612657642022773, "rougeLsum_precision_stderr": 0.0010535188227501662, "rougeLsum_recall": 0.1036251864997193, "rougeLsum_recall_stderr": 0.0020318246258696476}, "article_DOC_summary": {"bleu": 0.711911214189282, "bleu_stderr": 0.062271095680873176, "rouge1_fmeasure": 0.11144536072130062, "rouge1_fmeasure_stderr": 0.002449613618907836, "rouge1_precision": 0.08057623396604993, "rouge1_precision_stderr": 0.0018427190672612415, "rouge1_recall": 0.1903611222785915, "rouge1_recall_stderr": 0.004119319495363413, "rouge2_fmeasure": 0.01730052045113504, "rouge2_fmeasure_stderr": 0.0010550283614850532, "rouge2_precision": 0.012439638488265747, "rouge2_precision_stderr": 0.0007674450472725088, "rouge2_recall": 0.029923546174765742, "rouge2_recall_stderr": 0.0018325311907880977, "rougeL_fmeasure": 0.09482469984719238, "rougeL_fmeasure_stderr": 0.001891115330305303, "rougeL_precision": 0.06836163772266587, "rougeL_precision_stderr": 0.001418824283099982, "rougeL_recall": 0.16290273051352785, "rougeL_recall_stderr": 0.0032456160924942976, "rougeLsum_fmeasure": 0.09329416183941738, "rougeLsum_fmeasure_stderr": 0.0019565438808381327, "rougeLsum_precision": 0.06720724706425169, "rougeLsum_precision_stderr": 0.001458820899362241, "rougeLsum_recall": 0.16056752379802697, "rougeLsum_recall_stderr": 0.003380030301216589}, "summarize_DOC": {"bleu": 1.2025114531043062, "bleu_stderr": 0.10655129877623105, "rouge1_fmeasure": 0.16881791798821513, "rouge1_fmeasure_stderr": 0.0025746455343428143, "rouge1_precision": 0.1290022798344525, "rouge1_precision_stderr": 0.002385695780968086, "rouge1_recall": 0.281977709942812, "rouge1_recall_stderr": 0.004346065008819319, "rouge2_fmeasure": 0.031013676801335422, "rouge2_fmeasure_stderr": 0.0013438213374447965, "rouge2_precision": 0.02404571804293375, "rouge2_precision_stderr": 0.0013659743034430886, "rouge2_recall": 0.052449257583153115, "rouge2_recall_stderr": 0.002298683613553151, "rougeL_fmeasure": 0.1293936279197813, "rougeL_fmeasure_stderr": 0.001905181791064322, "rougeL_precision": 0.0991345122176881, "rougeL_precision_stderr": 0.001904974022092299, "rougeL_recall": 0.21710334307827317, "rougeL_recall_stderr": 0.0032407820193998097, "rougeLsum_fmeasure": 0.13530856271788863, "rougeLsum_fmeasure_stderr": 0.002063773976743065, "rougeLsum_precision": 0.10327457231643225, "rougeLsum_precision_stderr": 0.001973038870062356, "rougeLsum_recall": 0.2277601179019926, "rougeLsum_recall_stderr": 0.0035824212720264757}, "summarize_this_DOC_summary": {"bleu": 1.6188434485166334, "bleu_stderr": 0.1109564339936777, "rouge1_fmeasure": 0.1900087776056167, "rouge1_fmeasure_stderr": 0.0026590712025527704, "rouge1_precision": 0.14203799200266776, "rouge1_precision_stderr": 0.002270644021652014, "rouge1_recall": 0.31732904475268425, "rouge1_recall_stderr": 0.004576952267589019, "rouge2_fmeasure": 0.040900489822348056, "rouge2_fmeasure_stderr": 0.0015444912268875982, "rouge2_precision": 0.030454613592282354, "rouge2_precision_stderr": 0.001210755597735052, "rouge2_recall": 0.07004943298066989, "rouge2_recall_stderr": 0.0027142097244891622, "rougeL_fmeasure": 0.14526081266341737, "rougeL_fmeasure_stderr": 0.0020046277742893984, "rougeL_precision": 0.1084493048877211, "rougeL_precision_stderr": 0.0017320556174949715, "rougeL_recall": 0.24390681149682772, "rougeL_recall_stderr": 0.0035304448560530047, "rougeLsum_fmeasure": 0.15054379734755471, "rougeLsum_fmeasure_stderr": 0.0021753294084953185, "rougeLsum_precision": 0.11212112563171714, "rougeLsum_precision_stderr": 0.0018151655419957806, "rougeLsum_recall": 0.25327587259397216, "rougeLsum_recall_stderr": 0.0038632056834617744}}, "1": {"DOC_boils_down_to_simple_idea_that": {"bleu": 0.8932555367691859, "bleu_stderr": 0.08607396085989544, "rouge1_fmeasure": 0.15514646838340018, "rouge1_fmeasure_stderr": 0.002450478109255688, "rouge1_precision": 0.12349083765294645, "rouge1_precision_stderr": 0.0020272360845469154, "rouge1_recall": 0.24222254243586877, "rouge1_recall_stderr": 0.0044424282559846874, "rouge2_fmeasure": 0.020262527556005907, "rouge2_fmeasure_stderr": 0.0011049454084872373, "rouge2_precision": 0.015138644522028519, "rouge2_precision_stderr": 0.0008835351646857368, "rouge2_recall": 0.034641410753845345, "rouge2_recall_stderr": 0.0019099475070161016, "rougeL_fmeasure": 0.11242576970395425, "rougeL_fmeasure_stderr": 0.0017048075155665426, "rougeL_precision": 0.09008731584721652, "rougeL_precision_stderr": 0.0014860613149287796, "rougeL_recall": 0.17489515829679106, "rougeL_recall_stderr": 0.003107522060120389, "rougeLsum_fmeasure": 0.12222801015667094, "rougeLsum_fmeasure_stderr": 0.001959356143537563, "rougeLsum_precision": 0.09704873972752216, "rougeLsum_precision_stderr": 0.0016068769402850923, "rougeLsum_recall": 0.19197702048862725, "rougeLsum_recall_stderr": 0.0036060489138808275}, "DOC_tldr": {"bleu": 0.5215024206550787, "bleu_stderr": 0.03819786214468247, "rouge1_fmeasure": 0.10028369295745565, "rouge1_fmeasure_stderr": 0.0021540097009297766, "rouge1_precision": 0.07325117374129199, "rouge1_precision_stderr": 0.001655255942277255, "rouge1_recall": 0.1712083046666863, "rouge1_recall_stderr": 0.0036789822383315267, "rouge2_fmeasure": 0.012072025290438592, "rouge2_fmeasure_stderr": 0.0008659837789447237, "rouge2_precision": 0.008743555836383905, "rouge2_precision_stderr": 0.0006295019423085548, "rouge2_recall": 0.021042178818604606, "rouge2_recall_stderr": 0.0015442126736674505, "rougeL_fmeasure": 0.08551310402778783, "rougeL_fmeasure_stderr": 0.001624161272115942, "rougeL_precision": 0.06221015702227314, "rougeL_precision_stderr": 0.001239278126124422, "rougeL_recall": 0.1469982161156331, "rougeL_recall_stderr": 0.0028600463911305847, "rougeLsum_fmeasure": 0.08177586301155866, "rougeLsum_fmeasure_stderr": 0.0016972607545623106, "rougeLsum_precision": 0.05955098102853854, "rougeLsum_precision_stderr": 0.0012927157425887115, "rougeLsum_recall": 0.14057793619440717, "rougeLsum_recall_stderr": 0.0029763151410751565}, "article_DOC_summary": {"bleu": 0.7485653629026496, "bleu_stderr": 0.10161315249240252, "rouge1_fmeasure": 0.12151658666281231, "rouge1_fmeasure_stderr": 0.0025542911124592704, "rouge1_precision": 0.08658830544513134, "rouge1_precision_stderr": 0.0018612611645494558, "rouge1_recall": 0.21204874097915127, "rouge1_recall_stderr": 0.004437751832600217, "rouge2_fmeasure": 0.019132118327200527, "rouge2_fmeasure_stderr": 0.0010395559026960023, "rouge2_precision": 0.013510304634606875, "rouge2_precision_stderr": 0.0007312649740318255, "rouge2_recall": 0.03415295300257043, "rouge2_recall_stderr": 0.0019425669982816587, "rougeL_fmeasure": 0.09875044840716671, "rougeL_fmeasure_stderr": 0.0018572603815356456, "rougeL_precision": 0.0702987343925042, "rougeL_precision_stderr": 0.0013468612933160927, "rougeL_recall": 0.17295883718094748, "rougeL_recall_stderr": 0.0033149082764143117, "rougeLsum_fmeasure": 0.10073542075586238, "rougeLsum_fmeasure_stderr": 0.002008424522536892, "rougeLsum_precision": 0.07169769773353042, "rougeLsum_precision_stderr": 0.001454220219767976, "rougeLsum_recall": 0.17642520944692217, "rougeLsum_recall_stderr": 0.0035711244979865823}, "summarize_DOC": {"bleu": 1.7762050506045142, "bleu_stderr": 0.09926295387249034, "rouge1_fmeasure": 0.20187019098994882, "rouge1_fmeasure_stderr": 0.00244120776332977, "rouge1_precision": 0.14417853642749137, "rouge1_precision_stderr": 0.0018611534066647365, "rouge1_recall": 0.3537153376225196, "rouge1_recall_stderr": 0.0042546837674507666, "rouge2_fmeasure": 0.04334620232538617, "rouge2_fmeasure_stderr": 0.0015695593858469068, "rouge2_precision": 0.030641973891149483, "rouge2_precision_stderr": 0.0011156075040613734, "rouge2_recall": 0.07800524098991497, "rouge2_recall_stderr": 0.0029228930462181354, "rougeL_fmeasure": 0.1507291074053435, "rougeL_fmeasure_stderr": 0.0018184345906770076, "rougeL_precision": 0.10744022317808914, "rougeL_precision_stderr": 0.0013648772722477944, "rougeL_recall": 0.26585677142562, "rougeL_recall_stderr": 0.0033591403270810637, "rougeLsum_fmeasure": 0.16062701136729154, "rougeLsum_fmeasure_stderr": 0.0020501392930297666, "rougeLsum_precision": 0.11450326924869379, "rougeLsum_precision_stderr": 0.001534569154151368, "rougeLsum_recall": 0.2828354708969096, "rougeLsum_recall_stderr": 0.003689314769244396}, "summarize_this_DOC_summary": {"bleu": 1.5775644045947161, "bleu_stderr": 0.10747802490972798, "rouge1_fmeasure": 0.18445302591356055, "rouge1_fmeasure_stderr": 0.002358252600127643, "rouge1_precision": 0.1310768758781413, "rouge1_precision_stderr": 0.0017542917965564532, "rouge1_recall": 0.324216765100875, "rouge1_recall_stderr": 0.004101611205461987, "rouge2_fmeasure": 0.038774277981477374, "rouge2_fmeasure_stderr": 0.0014338449153380509, "rouge2_precision": 0.02727413002222952, "rouge2_precision_stderr": 0.0010077697140453462, "rouge2_recall": 0.06992767890558657, "rouge2_recall_stderr": 0.0026846374203769387, "rougeL_fmeasure": 0.14321716258429631, "rougeL_fmeasure_stderr": 0.001812335103355609, "rougeL_precision": 0.10150380205667145, "rougeL_precision_stderr": 0.001323267745441165, "rougeL_recall": 0.2537245165326843, "rougeL_recall_stderr": 0.003368126070672136, "rougeLsum_fmeasure": 0.14987577117769335, "rougeLsum_fmeasure_stderr": 0.001995803271934798, "rougeLsum_precision": 0.10624902236675653, "rougeLsum_precision_stderr": 0.0014584079648567682, "rougeLsum_recall": 0.26509353636861205, "rougeLsum_recall_stderr": 0.003626121575001854}}, "2": {"DOC_boils_down_to_simple_idea_that": {"bleu": 1.1080989461946913, "bleu_stderr": 0.09146307320028801, "rouge1_fmeasure": 0.1672594497220971, "rouge1_fmeasure_stderr": 0.0023821447431805398, "rouge1_precision": 0.12268381637109076, "rouge1_precision_stderr": 0.0018383239534197776, "rouge1_recall": 0.2846467249093633, "rouge1_recall_stderr": 0.00416866407812642, "rouge2_fmeasure": 0.02824595859604695, "rouge2_fmeasure_stderr": 0.0011938469906633742, "rouge2_precision": 0.020208437649204568, "rouge2_precision_stderr": 0.0008690875508991531, "rouge2_recall": 0.04992195461086151, "rouge2_recall_stderr": 0.0021341735292668943, "rougeL_fmeasure": 0.12312979455198288, "rougeL_fmeasure_stderr": 0.0016331251581310881, "rougeL_precision": 0.0902464384634346, "rougeL_precision_stderr": 0.0012744889966160597, "rougeL_recall": 0.21079924235160477, "rougeL_recall_stderr": 0.0029910133249198756, "rougeLsum_fmeasure": 0.1318735139607911, "rougeLsum_fmeasure_stderr": 0.0018667917007005078, "rougeLsum_precision": 0.09655299851334746, "rougeLsum_precision_stderr": 0.001434721313601341, "rougeLsum_recall": 0.22560901395675043, "rougeLsum_recall_stderr": 0.0033495485137766405}, "DOC_tldr": {"bleu": 1.127011706781588, "bleu_stderr": 0.08800278654546138, "rouge1_fmeasure": 0.14215569897506622, "rouge1_fmeasure_stderr": 0.002716440843767019, "rouge1_precision": 0.10254403634811139, "rouge1_precision_stderr": 0.00205888122861127, "rouge1_recall": 0.24531036433761338, "rouge1_recall_stderr": 0.004615810458504819, "rouge2_fmeasure": 0.02751335673945438, "rouge2_fmeasure_stderr": 0.0013158018001875164, "rouge2_precision": 0.019657431817826233, "rouge2_precision_stderr": 0.000956296408061839, "rouge2_recall": 0.048701420734218445, "rouge2_recall_stderr": 0.0023261449730351148, "rougeL_fmeasure": 0.1152398964081201, "rougeL_fmeasure_stderr": 0.0020348120567768926, "rougeL_precision": 0.0829632148581498, "rougeL_precision_stderr": 0.001546655224020721, "rougeL_recall": 0.1999850784660946, "rougeL_recall_stderr": 0.003529529330015122, "rougeLsum_fmeasure": 0.11464083134883725, "rougeLsum_fmeasure_stderr": 0.0021942880329705087, "rougeLsum_precision": 0.08247426765333944, "rougeLsum_precision_stderr": 0.0016467303434733415, "rougeLsum_recall": 0.1991641708522746, "rougeLsum_recall_stderr": 0.003807983414613027}, "article_DOC_summary": {"bleu": 1.0047358326681721, "bleu_stderr": 0.07696741647689843, "rouge1_fmeasure": 0.15372555310846955, "rouge1_fmeasure_stderr": 0.0024917036123975646, "rouge1_precision": 0.10919972863971208, "rouge1_precision_stderr": 0.0018326070695569962, "rouge1_recall": 0.27085077984418787, "rouge1_recall_stderr": 0.004340987074007443, "rouge2_fmeasure": 0.026545543337132424, "rouge2_fmeasure_stderr": 0.001202505861016018, "rouge2_precision": 0.018660104842681158, "rouge2_precision_stderr": 0.0008521301906071727, "rouge2_recall": 0.048121178106264206, "rouge2_recall_stderr": 0.002236763054096929, "rougeL_fmeasure": 0.12124448179130719, "rougeL_fmeasure_stderr": 0.0018406701231741705, "rougeL_precision": 0.08598679476507894, "rougeL_precision_stderr": 0.001347479595633704, "rougeL_recall": 0.21491972476150967, "rougeL_recall_stderr": 0.0033319697703370205, "rougeLsum_fmeasure": 0.12600811759241443, "rougeLsum_fmeasure_stderr": 0.002023272430841333, "rougeLsum_precision": 0.08936454412712036, "rougeLsum_precision_stderr": 0.00147707143314301, "rougeLsum_recall": 0.22313932236999878, "rougeLsum_recall_stderr": 0.003637762226705496}, "summarize_DOC": {"bleu": 1.6727033378910847, "bleu_stderr": 0.09495317261707087, "rouge1_fmeasure": 0.19932155852386563, "rouge1_fmeasure_stderr": 0.002392922806012122, "rouge1_precision": 0.1415629115103802, "rouge1_precision_stderr": 0.0017838032550705376, "rouge1_recall": 0.35091090779004147, "rouge1_recall_stderr": 0.0041843712980091045, "rouge2_fmeasure": 0.04362070001507444, "rouge2_fmeasure_stderr": 0.0015090115501029922, "rouge2_precision": 0.030702156454896785, "rouge2_precision_stderr": 0.0010693945263455453, "rouge2_recall": 0.07873922100816551, "rouge2_recall_stderr": 0.002781171908981175, "rougeL_fmeasure": 0.15012616545978386, "rougeL_fmeasure_stderr": 0.0017748529391718238, "rougeL_precision": 0.10636784579398613, "rougeL_precision_stderr": 0.0013043402766199064, "rougeL_recall": 0.2663335312559493, "rougeL_recall_stderr": 0.0032882907730531676, "rougeLsum_fmeasure": 0.15932750811184002, "rougeLsum_fmeasure_stderr": 0.00199892928658887, "rougeLsum_precision": 0.11293750566369144, "rougeLsum_precision_stderr": 0.0014708469401565362, "rougeLsum_recall": 0.2820820207716993, "rougeLsum_recall_stderr": 0.0036056577642723567}, "summarize_this_DOC_summary": {"bleu": 1.4631699049107234, "bleu_stderr": 0.0818224941441452, "rouge1_fmeasure": 0.18223568358253522, "rouge1_fmeasure_stderr": 0.0023053601425562413, "rouge1_precision": 0.12929139605322415, "rouge1_precision_stderr": 0.0017250930103395053, "rouge1_recall": 0.32127297765523716, "rouge1_recall_stderr": 0.0039219092067371624, "rouge2_fmeasure": 0.03664914264570665, "rouge2_fmeasure_stderr": 0.0013235856869957899, "rouge2_precision": 0.02577874922325936, "rouge2_precision_stderr": 0.0009382634954700045, "rouge2_recall": 0.06615069752192394, "rouge2_recall_stderr": 0.0024396021514475836, "rougeL_fmeasure": 0.1411906750067624, "rougeL_fmeasure_stderr": 0.0017542193090228139, "rougeL_precision": 0.09993852104012416, "rougeL_precision_stderr": 0.001292895388479902, "rougeL_recall": 0.2507415234860173, "rougeL_recall_stderr": 0.00317071813365378, "rougeLsum_fmeasure": 0.14863975004627103, "rougeLsum_fmeasure_stderr": 0.001961623937053356, "rougeLsum_precision": 0.10524886017120061, "rougeLsum_precision_stderr": 0.0014473708528824547, "rougeLsum_recall": 0.2635160093607098, "rougeLsum_recall_stderr": 0.0034669204083299546}}, "3": {"DOC_boils_down_to_simple_idea_that": {"bleu": 1.0644281139673455, "bleu_stderr": 0.05948687397505724, "rouge1_fmeasure": 0.16149874282108523, "rouge1_fmeasure_stderr": 0.002582011824364168, "rouge1_precision": 0.11891262113570582, "rouge1_precision_stderr": 0.002001687206629081, "rouge1_recall": 0.27367870029835756, "rouge1_recall_stderr": 0.004475568220518732, "rouge2_fmeasure": 0.02800561543388405, "rouge2_fmeasure_stderr": 0.0011878939240017126, "rouge2_precision": 0.020146490797565537, "rouge2_precision_stderr": 0.0008642878920700437, "rouge2_recall": 0.048883434139436875, "rouge2_recall_stderr": 0.0021010331707940486, "rougeL_fmeasure": 0.12036250889852927, "rougeL_fmeasure_stderr": 0.001844426823256787, "rougeL_precision": 0.08847153927567653, "rougeL_precision_stderr": 0.0014199450476196438, "rougeL_recall": 0.205201991606198, "rougeL_recall_stderr": 0.003311790274919139, "rougeLsum_fmeasure": 0.1289105526954833, "rougeLsum_fmeasure_stderr": 0.0020915120619956393, "rougeLsum_precision": 0.09480824337780196, "rougeLsum_precision_stderr": 0.0016157369586947718, "rougeLsum_recall": 0.21941969701456296, "rougeLsum_recall_stderr": 0.0036949095354463033}, "DOC_tldr": {"bleu": 1.722308165092786, "bleu_stderr": 0.11202608038530598, "rouge1_fmeasure": 0.1724090676689418, "rouge1_fmeasure_stderr": 0.0028418088279796444, "rouge1_precision": 0.1271432311032113, "rouge1_precision_stderr": 0.0023074163775109917, "rouge1_recall": 0.29372316623222805, "rouge1_recall_stderr": 0.0048620902876557745, "rouge2_fmeasure": 0.0402095932041227, "rouge2_fmeasure_stderr": 0.0015367221854734703, "rouge2_precision": 0.029429750673113986, "rouge2_precision_stderr": 0.0011851736679672697, "rouge2_recall": 0.07034873459931881, "rouge2_recall_stderr": 0.002719393016178684, "rougeL_fmeasure": 0.1365099491658134, "rougeL_fmeasure_stderr": 0.0022085318962482355, "rougeL_precision": 0.10042286402298024, "rougeL_precision_stderr": 0.0017760282981670063, "rougeL_recall": 0.23390892681593342, "rougeL_recall_stderr": 0.003893287643962479, "rougeLsum_fmeasure": 0.1368078332815587, "rougeLsum_fmeasure_stderr": 0.002368380322788649, "rougeLsum_precision": 0.10075735870532844, "rougeLsum_precision_stderr": 0.0018958298832118272, "rougeLsum_recall": 0.2342919714250497, "rougeLsum_recall_stderr": 0.0041467607949485135}, "article_DOC_summary": {"bleu": 1.3876528749760366, "bleu_stderr": 0.09352517366139018, "rouge1_fmeasure": 0.16788870363340208, "rouge1_fmeasure_stderr": 0.0026015869149755492, "rouge1_precision": 0.12156176985300436, "rouge1_precision_stderr": 0.0019916743006548566, "rouge1_recall": 0.2919486745230574, "rouge1_recall_stderr": 0.004594202930152972, "rouge2_fmeasure": 0.03291830334125208, "rouge2_fmeasure_stderr": 0.0013348595001679636, "rouge2_precision": 0.023390561110934703, "rouge2_precision_stderr": 0.0009482124699139898, "rouge2_recall": 0.05931283277905219, "rouge2_recall_stderr": 0.00253961685376344, "rougeL_fmeasure": 0.12998891345659275, "rougeL_fmeasure_stderr": 0.00194502365003499, "rougeL_precision": 0.09410425031338661, "rougeL_precision_stderr": 0.001505615984367887, "rougeL_recall": 0.22729829891507616, "rougeL_recall_stderr": 0.0035793671647219765, "rougeLsum_fmeasure": 0.13536430596241408, "rougeLsum_fmeasure_stderr": 0.0020963457328233175, "rougeLsum_precision": 0.09794367331761664, "rougeLsum_precision_stderr": 0.0016077634502078913, "rougeLsum_recall": 0.23679450093245694, "rougeLsum_recall_stderr": 0.003856459984346274}, "summarize_DOC": {"bleu": 1.742293158445559, "bleu_stderr": 0.12475047173558831, "rouge1_fmeasure": 0.1884376588525654, "rouge1_fmeasure_stderr": 0.0025642932469730627, "rouge1_precision": 0.1366930941690243, "rouge1_precision_stderr": 0.0019671239516403935, "rouge1_recall": 0.32595135506801376, "rouge1_recall_stderr": 0.004477539106095352, "rouge2_fmeasure": 0.0400453211123096, "rouge2_fmeasure_stderr": 0.0014862606183809399, "rouge2_precision": 0.028528787233145586, "rouge2_precision_stderr": 0.0010703763072091093, "rouge2_recall": 0.07151102797558972, "rouge2_recall_stderr": 0.0027297730367125982, "rougeL_fmeasure": 0.14317246926611424, "rougeL_fmeasure_stderr": 0.0019379403187274123, "rougeL_precision": 0.1040074617013233, "rougeL_precision_stderr": 0.0015105673758063752, "rougeL_recall": 0.2486131025320305, "rougeL_recall_stderr": 0.0035056965604027203, "rougeLsum_fmeasure": 0.15181862614650385, "rougeLsum_fmeasure_stderr": 0.0021601749082742227, "rougeLsum_precision": 0.11019224321987507, "rougeLsum_precision_stderr": 0.0016599707159991781, "rougeLsum_recall": 0.2636677319513756, "rougeLsum_recall_stderr": 0.0038844849580001823}, "summarize_this_DOC_summary": {"bleu": 1.6024997408404598, "bleu_stderr": 0.08124844005963922, "rouge1_fmeasure": 0.17573577441569346, "rouge1_fmeasure_stderr": 0.0026465445466935253, "rouge1_precision": 0.1270012394870458, "rouge1_precision_stderr": 0.002091745941086545, "rouge1_recall": 0.30424480950743027, "rouge1_recall_stderr": 0.004523774432666432, "rouge2_fmeasure": 0.03701973106444136, "rouge2_fmeasure_stderr": 0.0014575936915393465, "rouge2_precision": 0.026605921898331948, "rouge2_precision_stderr": 0.0011008712275775547, "rouge2_recall": 0.06560029901447437, "rouge2_recall_stderr": 0.002618665050517068, "rougeL_fmeasure": 0.135384466992445, "rougeL_fmeasure_stderr": 0.0020400308616851347, "rougeL_precision": 0.09771059146617143, "rougeL_precision_stderr": 0.0016171559842817049, "rougeL_recall": 0.2357515770528734, "rougeL_recall_stderr": 0.0035751298993674996, "rougeLsum_fmeasure": 0.1425837105636293, "rougeLsum_fmeasure_stderr": 0.0022180825213510558, "rougeLsum_precision": 0.10278375996682501, "rougeLsum_precision_stderr": 0.001730936323308653, "rougeLsum_recall": 0.24848166333986804, "rougeLsum_recall_stderr": 0.0038955439908868214}}, "4": {"DOC_boils_down_to_simple_idea_that": {"bleu": 0.5155503095519125, "bleu_stderr": 0.12577156331553185, "rouge1_fmeasure": 0.040556158583151695, "rouge1_fmeasure_stderr": 0.002414780159795799, "rouge1_precision": 0.0354039096330574, "rouge1_precision_stderr": 0.0024221017075653785, "rouge1_recall": 0.06348261494972335, "rouge1_recall_stderr": 0.003911537053406857, "rouge2_fmeasure": 0.00666835063292078, "rouge2_fmeasure_stderr": 0.0007560756618639585, "rouge2_precision": 0.005222818308406, "rouge2_precision_stderr": 0.0006123821346103362, "rouge2_recall": 0.011105721412738316, "rouge2_recall_stderr": 0.001297198171401599, "rougeL_fmeasure": 0.03166574075281406, "rougeL_fmeasure_stderr": 0.0018736700983275614, "rougeL_precision": 0.02786040640367174, "rougeL_precision_stderr": 0.001905350850283381, "rougeL_recall": 0.04948373109395915, "rougeL_recall_stderr": 0.003054509322886882, "rougeLsum_fmeasure": 0.032061891009607074, "rougeLsum_fmeasure_stderr": 0.0019225657173298427, "rougeLsum_precision": 0.02824398060982596, "rougeLsum_precision_stderr": 0.0020175634460573052, "rougeLsum_recall": 0.050498368281708256, "rougeLsum_recall_stderr": 0.0031803074224818545}, "DOC_tldr": {"bleu": 0.7851146582603888, "bleu_stderr": 0.14248563320706192, "rouge1_fmeasure": 0.04736351863009491, "rouge1_fmeasure_stderr": 0.002774562583687205, "rouge1_precision": 0.039100730854070335, "rouge1_precision_stderr": 0.0024339409571454023, "rouge1_recall": 0.07309487784533099, "rouge1_recall_stderr": 0.004271503462792883, "rouge2_fmeasure": 0.010845224152235416, "rouge2_fmeasure_stderr": 0.0009811807847928044, "rouge2_precision": 0.008641952844867724, "rouge2_precision_stderr": 0.0008209970490511706, "rouge2_recall": 0.017044130872016457, "rouge2_recall_stderr": 0.0015311516528441142, "rougeL_fmeasure": 0.037416996326406736, "rougeL_fmeasure_stderr": 0.002165687085012485, "rougeL_precision": 0.031181187159859413, "rougeL_precision_stderr": 0.001963515304654915, "rougeL_recall": 0.05798493820733094, "rougeL_recall_stderr": 0.0033712132693060827, "rougeLsum_fmeasure": 0.03817823692527911, "rougeLsum_fmeasure_stderr": 0.0022299777911451387, "rougeLsum_precision": 0.031826221071430495, "rougeLsum_precision_stderr": 0.002021894152923463, "rougeLsum_recall": 0.05922841807653992, "rougeLsum_recall_stderr": 0.0034905888872791228}, "article_DOC_summary": {"bleu": 0.8091606018729823, "bleu_stderr": 0.13365493953263705, "rouge1_fmeasure": 0.04817132624367883, "rouge1_fmeasure_stderr": 0.002710318861619294, "rouge1_precision": 0.04061391536633297, "rouge1_precision_stderr": 0.0025885117246031656, "rouge1_recall": 0.0774019664424454, "rouge1_recall_stderr": 0.004501264779451914, "rouge2_fmeasure": 0.010104068388385765, "rouge2_fmeasure_stderr": 0.0009810253225945517, "rouge2_precision": 0.007571424737600509, "rouge2_precision_stderr": 0.000774569805719568, "rouge2_recall": 0.017306355810365114, "rouge2_recall_stderr": 0.0017078169253319931, "rougeL_fmeasure": 0.03758941214391744, "rougeL_fmeasure_stderr": 0.002107269506833207, "rougeL_precision": 0.03226926320041072, "rougeL_precision_stderr": 0.0021853447839680425, "rougeL_recall": 0.0606544963878778, "rougeL_recall_stderr": 0.0035695265285015203, "rougeLsum_fmeasure": 0.03925168824333697, "rougeLsum_fmeasure_stderr": 0.0022220799806552142, "rougeLsum_precision": 0.03361123521194327, "rougeLsum_precision_stderr": 0.002255713167385482, "rougeLsum_recall": 0.063038545340123, "rougeLsum_recall_stderr": 0.003730201727070476}, "summarize_DOC": {"bleu": 0.8218145273207486, "bleu_stderr": 0.13259402458841246, "rouge1_fmeasure": 0.0510345184963839, "rouge1_fmeasure_stderr": 0.002788963063519097, "rouge1_precision": 0.04410024857440917, "rouge1_precision_stderr": 0.002722944168692114, "rouge1_recall": 0.07974432748836109, "rouge1_recall_stderr": 0.004470156036803333, "rouge2_fmeasure": 0.010522073701869125, "rouge2_fmeasure_stderr": 0.0009826679169524012, "rouge2_precision": 0.008214763901131556, "rouge2_precision_stderr": 0.0008388994526412292, "rouge2_recall": 0.017137458752784145, "rouge2_recall_stderr": 0.0016302525508184434, "rougeL_fmeasure": 0.039029603911429886, "rougeL_fmeasure_stderr": 0.0021514536166429943, "rougeL_precision": 0.03366378122590164, "rougeL_precision_stderr": 0.0020732284962604146, "rougeL_recall": 0.06128142643464567, "rougeL_recall_stderr": 0.003508572574499959, "rougeLsum_fmeasure": 0.0410270223606163, "rougeLsum_fmeasure_stderr": 0.002268750341839602, "rougeLsum_precision": 0.03523949695288221, "rougeLsum_precision_stderr": 0.0021552641230410848, "rougeLsum_recall": 0.0645342916738449, "rougeLsum_recall_stderr": 0.003702351211849086}, "summarize_this_DOC_summary": {"bleu": 0.7057325975464012, "bleu_stderr": 0.1666309026142423, "rouge1_fmeasure": 0.04277432877434612, "rouge1_fmeasure_stderr": 0.0025508570289566007, "rouge1_precision": 0.03513411483539431, "rouge1_precision_stderr": 0.002220905791930144, "rouge1_recall": 0.06807789519735524, "rouge1_recall_stderr": 0.004165774223715686, "rouge2_fmeasure": 0.008786196844590121, "rouge2_fmeasure_stderr": 0.0009097928996719041, "rouge2_precision": 0.007057485487635765, "rouge2_precision_stderr": 0.0009007492112467273, "rouge2_recall": 0.014720343410307037, "rouge2_recall_stderr": 0.0015318015560624846, "rougeL_fmeasure": 0.03335136680279893, "rougeL_fmeasure_stderr": 0.0019990065744291984, "rougeL_precision": 0.02768868248284088, "rougeL_precision_stderr": 0.0018118702472072936, "rougeL_recall": 0.05338300755867666, "rougeL_recall_stderr": 0.0033015217413937302, "rougeLsum_fmeasure": 0.0352809944078966, "rougeLsum_fmeasure_stderr": 0.002126337470382871, "rougeLsum_precision": 0.029185508102300205, "rougeLsum_precision_stderr": 0.00188906175149616, "rougeLsum_recall": 0.05628446465745351, "rougeLsum_recall_stderr": 0.0034924975937847573}}, "5": {"DOC_boils_down_to_simple_idea_that": {"bleu": 0.0, "bleu_stderr": 0.0, "rouge1_fmeasure": 0.0002124229718569341, "rouge1_fmeasure_stderr": 0.00012635147814425192, "rouge1_precision": 0.002572898799313894, "rouge1_precision_stderr": 0.0014841881904327362, "rouge1_recall": 0.0001111004565947103, "rouge1_recall_stderr": 6.627016158645234e-05, "rouge2_fmeasure": 0.0, "rouge2_fmeasure_stderr": 0.0, "rouge2_precision": 0.0, "rouge2_precision_stderr": 0.0, "rouge2_recall": 0.0, "rouge2_recall_stderr": 0.0, "rougeL_fmeasure": 0.0002124229718569341, "rougeL_fmeasure_stderr": 0.00012635147814425192, "rougeL_precision": 0.002572898799313894, "rougeL_precision_stderr": 0.0014841881904327362, "rougeL_recall": 0.0001111004565947103, "rougeL_recall_stderr": 6.627016158645234e-05, "rougeLsum_fmeasure": 0.0002124229718569341, "rougeLsum_fmeasure_stderr": 0.00012635147814425192, "rougeLsum_precision": 0.002572898799313894, "rougeLsum_precision_stderr": 0.0014841881904327362, "rougeLsum_recall": 0.0001111004565947103, "rougeLsum_recall_stderr": 6.627016158645234e-05}, "DOC_tldr": {"bleu": 1.5125237582746943e-42, "bleu_stderr": 1.0979696174822401e-35, "rouge1_fmeasure": 0.0022081044921040066, "rouge1_fmeasure_stderr": 0.0006238927634391291, "rouge1_precision": 0.0024406380399721135, "rouge1_precision_stderr": 0.0006673287893985838, "rouge1_recall": 0.002125069578891824, "rouge1_recall_stderr": 0.0006354178152671431, "rouge2_fmeasure": 0.0003107051777238192, "rouge2_fmeasure_stderr": 0.0001860944713122803, "rouge2_precision": 0.0003296901951174981, "rouge2_precision_stderr": 0.00018804946463358912, "rouge2_recall": 0.0003008969679774049, "rouge2_recall_stderr": 0.00018767733482335345, "rougeL_fmeasure": 0.0017991228311771584, "rougeL_fmeasure_stderr": 0.0004996511492676782, "rougeL_precision": 0.0019949852219552553, "rougeL_precision_stderr": 0.0005366470253705943, "rougeL_recall": 0.0017177557215678593, "rougeL_recall_stderr": 0.00049942930432405, "rougeLsum_fmeasure": 0.0019064317416731434, "rougeLsum_fmeasure_stderr": 0.0005368859371026533, "rougeLsum_precision": 0.002116903629112286, "rougeLsum_precision_stderr": 0.0005752535478594655, "rougeLsum_recall": 0.001816496355313458, "rougeLsum_recall_stderr": 0.0005356117222057922}, "article_DOC_summary": {"bleu": 2.9417748605436574e-39, "bleu_stderr": 1.644365126953672e-33, "rouge1_fmeasure": 0.002218415772902317, "rouge1_fmeasure_stderr": 0.0005977307451849004, "rouge1_precision": 0.0025292500918997793, "rouge1_precision_stderr": 0.0006843948420078455, "rouge1_recall": 0.0020532297175197265, "rouge1_recall_stderr": 0.0005556296741534733, "rouge2_fmeasure": 0.00041371259854665804, "rouge2_fmeasure_stderr": 0.0002415623180229552, "rouge2_precision": 0.0004376650603065697, "rouge2_precision_stderr": 0.00024169751059179606, "rouge2_recall": 0.00040004436924525715, "rouge2_recall_stderr": 0.0002447488200268485, "rougeL_fmeasure": 0.0018828340816919485, "rougeL_fmeasure_stderr": 0.0005121954193145257, "rougeL_precision": 0.0021231909904583543, "rougeL_precision_stderr": 0.0005742362353931501, "rougeL_recall": 0.0017609197535564964, "rougeL_recall_stderr": 0.0004851127145778472, "rougeLsum_fmeasure": 0.0019227239855572795, "rougeLsum_fmeasure_stderr": 0.0005197852399034371, "rougeLsum_precision": 0.002170837264519722, "rougeLsum_precision_stderr": 0.0005838940116621144, "rougeLsum_recall": 0.0017952250708806817, "rougeLsum_recall_stderr": 0.0004910386484249093}, "summarize_DOC": {"bleu": 4.273566994720392e-38, "bleu_stderr": 1.8291126911657586e-32, "rouge1_fmeasure": 0.0026601966494432523, "rouge1_fmeasure_stderr": 0.0007275199319288471, "rouge1_precision": 0.0030006953279340686, "rouge1_precision_stderr": 0.0008260104673619008, "rouge1_recall": 0.002448875117757725, "rouge1_recall_stderr": 0.0006686883097147919, "rouge2_fmeasure": 0.00046275158053195667, "rouge2_fmeasure_stderr": 0.00021785531794482434, "rouge2_precision": 0.0005102355407571833, "rouge2_precision_stderr": 0.00024289107214722606, "rouge2_recall": 0.0004277053874723131, "rouge2_recall_stderr": 0.00019999235661580537, "rougeL_fmeasure": 0.002154351416462934, "rougeL_fmeasure_stderr": 0.0005935536504835335, "rougeL_precision": 0.002398796026490401, "rougeL_precision_stderr": 0.000661743079139523, "rougeL_recall": 0.0020040874122394833, "rougeL_recall_stderr": 0.0005526887153193406, "rougeLsum_fmeasure": 0.002139637125943983, "rougeLsum_fmeasure_stderr": 0.0005838831183619806, "rougeLsum_precision": 0.00239564296423634, "rougeLsum_precision_stderr": 0.0006561431092058282, "rougeLsum_recall": 0.0019841067166653605, "rougeLsum_recall_stderr": 0.0005417182019917762}, "summarize_this_DOC_summary": {"bleu": 0.0, "bleu_stderr": 0.0, "rouge1_fmeasure": 0.0008504247165137163, "rouge1_fmeasure_stderr": 0.00029663122917791953, "rouge1_precision": 0.003716409376786735, "rouge1_precision_stderr": 0.0012903356644372312, "rouge1_recall": 0.0004836451099249516, "rouge1_recall_stderr": 0.00016932836035127237, "rouge2_fmeasure": 6.352836541515787e-05, "rouge2_fmeasure_stderr": 6.352836541515829e-05, "rouge2_precision": 0.0004288164665523156, "rouge2_precision_stderr": 0.00042881646655231734, "rouge2_recall": 3.430531732418525e-05, "rouge2_recall_stderr": 3.430531732418596e-05, "rougeL_fmeasure": 0.0007687453895513705, "rougeL_fmeasure_stderr": 0.000260905903643579, "rougeL_precision": 0.003430531732418525, "rougeL_precision_stderr": 0.0011921960276713975, "rougeL_recall": 0.0004359988358635832, "rougeL_recall_stderr": 0.00014796403981705004, "rougeLsum_fmeasure": 0.0007687453895513705, "rougeLsum_fmeasure_stderr": 0.000260905903643579, "rougeLsum_precision": 0.003430531732418525, "rougeLsum_precision_stderr": 0.0011921960276713975, "rougeLsum_recall": 0.0004359988358635832, "rougeLsum_recall_stderr": 0.00014796403981705004}}}, "piqa": {"0": {"Correct the solution": {"bleu": 4.4726418073213345, "bleu_stderr": 0.16753723606834903, "rouge1_fmeasure": 0.13738094874969342, "rouge1_fmeasure_stderr": 0.0040877314692411865, "rouge1_precision": 0.09990759568043395, "rouge1_precision_stderr": 0.003962742821805288, "rouge1_recall": 0.542668484521527, "rouge1_recall_stderr": 0.008163857356083524, "rouge2_fmeasure": 0.09706102035374112, "rouge2_fmeasure_stderr": 0.003653466658553787, "rouge2_precision": 0.07078028109191174, "rouge2_precision_stderr": 0.0034244977285782584, "rouge2_recall": 0.3897797823742886, "rouge2_recall_stderr": 0.008526350760099893, "rougeL_fmeasure": 0.13447315238265775, "rougeL_fmeasure_stderr": 0.004050384508000915, "rougeL_precision": 0.09777072760562562, "rougeL_precision_stderr": 0.0039192045347804325, "rougeL_recall": 0.532792582986373, "rougeL_recall_stderr": 0.008177843897950418, "rougeLsum_fmeasure": 0.13173120941865174, "rougeLsum_fmeasure_stderr": 0.004084755897159187, "rougeLsum_precision": 0.09637631448880932, "rougeLsum_precision_stderr": 0.003951999111601633, "rougeLsum_recall": 0.5181791996400547, "rougeLsum_recall_stderr": 0.008308241749147611}, "choose the most appropriate solution": {"acc": 0.49510337323177367, "acc_norm": 0.49510337323177367, "acc_norm_stderr": 0.01166526473007815, "acc_stderr": 0.01166526473007815}, "no prompt needed": {"bleu": 0.19326617690121023, "bleu_stderr": 0.00752510845276588, "rouge1_fmeasure": 0.03752514813814976, "rouge1_fmeasure_stderr": 0.0008846335161188192, "rouge1_precision": 0.021769865204723387, "rouge1_precision_stderr": 0.0005817240957449695, "rouge1_recall": 0.23632094446041846, "rouge1_recall_stderr": 0.004083513329156124, "rouge2_fmeasure": 0.005928136888518339, "rouge2_fmeasure_stderr": 0.00027560748473478435, "rouge2_precision": 0.003405753331635085, "rouge2_precision_stderr": 0.00016739880607996655, "rouge2_recall": 0.03960148154546242, "rouge2_recall_stderr": 0.0019531504378148563, "rougeL_fmeasure": 0.0339518071889011, "rougeL_fmeasure_stderr": 0.0007449293744359491, "rougeL_precision": 0.019577427737286695, "rougeL_precision_stderr": 0.00047793881065340873, "rougeL_recall": 0.21917768067787397, "rougeL_recall_stderr": 0.0037979740764717976, "rougeLsum_fmeasure": 0.031229364613330524, "rougeLsum_fmeasure_stderr": 0.0007424902988435048, "rougeLsum_precision": 0.01809711784394688, "rougeLsum_precision_stderr": 0.0004916087758295677, "rougeLsum_recall": 0.20397788862519056, "rougeLsum_recall_stderr": 0.003704689919592339}, "pick_correct_choice_index": {"acc": 0.49510337323177367, "acc_norm": 0.49510337323177367, "acc_norm_stderr": 0.01166526473007815, "acc_stderr": 0.01166526473007815}, "what_is_the_correct_ending": {"acc": 0.5565832426550599, "acc_norm": 0.5418933623503809, "acc_norm_stderr": 0.011624803747232126, "acc_stderr": 0.011590883373666863}}, "1": {"Correct the solution": {"bleu": 6.388997984440001, "bleu_stderr": 0.22030624130684912, "rouge1_fmeasure": 0.23339725988760335, "rouge1_fmeasure_stderr": 0.005577890928986144, "rouge1_precision": 0.1899985239367049, "rouge1_precision_stderr": 0.005524798334171625, "rouge1_recall": 0.6197096800293467, "rouge1_recall_stderr": 0.007218975234253726, "rouge2_fmeasure": 0.16839814753926893, "rouge2_fmeasure_stderr": 0.005178370714369835, "rouge2_precision": 0.13527730400642915, "rouge2_precision_stderr": 0.00492492204254613, "rouge2_recall": 0.45578433847724403, "rouge2_recall_stderr": 0.008317705400555226, "rougeL_fmeasure": 0.2248230330602144, "rougeL_fmeasure_stderr": 0.005533593484296289, "rougeL_precision": 0.18193802149181904, "rougeL_precision_stderr": 0.005417257807881834, "rougeL_recall": 0.6024769760148035, "rougeL_recall_stderr": 0.007410768687993907, "rougeLsum_fmeasure": 0.2260074903815481, "rougeLsum_fmeasure_stderr": 0.005576688947640353, "rougeLsum_precision": 0.18398509430200422, "rougeLsum_precision_stderr": 0.0054807247743772465, "rougeLsum_recall": 0.5984789556418103, "rougeLsum_recall_stderr": 0.007439193497312767}, "choose the most appropriate solution": {"acc": 0.5087051142546246, "acc_norm": 0.5087051142546246, "acc_norm_stderr": 0.011664055982032837, "acc_stderr": 0.011664055982032837}, "no prompt needed": {"bleu": 0.17941583254637414, "bleu_stderr": 0.008838972109532342, "rouge1_fmeasure": 0.036115788468476816, "rouge1_fmeasure_stderr": 0.0008787800610938908, "rouge1_precision": 0.02104978239320399, "rouge1_precision_stderr": 0.0006182804474426174, "rouge1_recall": 0.22374374255453272, "rouge1_recall_stderr": 0.004184853646640156, "rouge2_fmeasure": 0.005682715949708656, "rouge2_fmeasure_stderr": 0.00026165286218016116, "rouge2_precision": 0.003222523734541128, "rouge2_precision_stderr": 0.00015290781225805715, "rouge2_recall": 0.04029314519755627, "rouge2_recall_stderr": 0.0020801960108103577, "rougeL_fmeasure": 0.033515515939669824, "rougeL_fmeasure_stderr": 0.0007571377468473982, "rougeL_precision": 0.01937672404519423, "rougeL_precision_stderr": 0.0004938315065636547, "rougeL_recall": 0.21137283250925631, "rougeL_recall_stderr": 0.003919038757731567, "rougeLsum_fmeasure": 0.02922662157472046, "rougeLsum_fmeasure_stderr": 0.0006947111027433623, "rougeLsum_precision": 0.01691374585639976, "rougeLsum_precision_stderr": 0.0004598604234910933, "rougeLsum_recall": 0.18918805907421266, "rougeLsum_recall_stderr": 0.0037299738776066143}, "pick_correct_choice_index": {"acc": 0.5076169749727966, "acc_norm": 0.5076169749727966, "acc_norm_stderr": 0.011664470424044976, "acc_stderr": 0.011664470424044976}, "what_is_the_correct_ending": {"acc": 0.5685527747551686, "acc_norm": 0.5674646354733406, "acc_norm_stderr": 0.011559142916063143, "acc_stderr": 0.01155565729886461}}, "2": {"Correct the solution": {"bleu": 7.1654235804393975, "bleu_stderr": 0.23664442530069865, "rouge1_fmeasure": 0.28818203262436376, "rouge1_fmeasure_stderr": 0.006789855255273055, "rouge1_precision": 0.2632982490541137, "rouge1_precision_stderr": 0.0072621786329556, "rouge1_recall": 0.6328871785091988, "rouge1_recall_stderr": 0.007011851144328955, "rouge2_fmeasure": 0.21700191007059494, "rouge2_fmeasure_stderr": 0.006362729074282588, "rouge2_precision": 0.19631851485753235, "rouge2_precision_stderr": 0.006522555304684875, "rouge2_recall": 0.47001882425100405, "rouge2_recall_stderr": 0.008218114595065874, "rougeL_fmeasure": 0.27789781089889687, "rougeL_fmeasure_stderr": 0.006777757204322281, "rougeL_precision": 0.2526725103593678, "rougeL_precision_stderr": 0.007156069139226169, "rougeL_recall": 0.611025032648354, "rougeL_recall_stderr": 0.00729448166920107, "rougeLsum_fmeasure": 0.28052899465966125, "rougeLsum_fmeasure_stderr": 0.00677957486590257, "rougeLsum_precision": 0.2555628679056802, "rougeLsum_precision_stderr": 0.007187190219846719, "rougeLsum_recall": 0.6156889934973935, "rougeLsum_recall_stderr": 0.007241268875757391}, "choose the most appropriate solution": {"acc": 0.5223068552774756, "acc_norm": 0.5223068552774756, "acc_norm_stderr": 0.011654208652596476, "acc_stderr": 0.011654208652596476}, "no prompt needed": {"bleu": 0.17745296114678968, "bleu_stderr": 0.008624930392648838, "rouge1_fmeasure": 0.03499069784081937, "rouge1_fmeasure_stderr": 0.0008446359024973105, "rouge1_precision": 0.02112776839925051, "rouge1_precision_stderr": 0.000877980468137592, "rouge1_recall": 0.218206359468982, "rouge1_recall_stderr": 0.004101332930154146, "rouge2_fmeasure": 0.005621916396892083, "rouge2_fmeasure_stderr": 0.0002732592116855481, "rouge2_precision": 0.0032849283759100656, "rouge2_precision_stderr": 0.00018537681002876464, "rouge2_recall": 0.03820330073739487, "rouge2_recall_stderr": 0.0020212152218120854, "rougeL_fmeasure": 0.032907905075395594, "rougeL_fmeasure_stderr": 0.0007674706272733912, "rougeL_precision": 0.01981043631160736, "rougeL_precision_stderr": 0.0007993400917579435, "rougeL_recall": 0.2071458409919012, "rougeL_recall_stderr": 0.003867415965532175, "rougeLsum_fmeasure": 0.028356416929466385, "rougeLsum_fmeasure_stderr": 0.0006881727791999624, "rougeLsum_precision": 0.017213819703019412, "rougeLsum_precision_stderr": 0.0007926417088864289, "rougeLsum_recall": 0.1841964822860343, "rougeLsum_recall_stderr": 0.003656569397088453}, "pick_correct_choice_index": {"acc": 0.5, "acc_norm": 0.5, "acc_norm_stderr": 0.011665824165343952, "acc_stderr": 0.011665824165343952}, "what_is_the_correct_ending": {"acc": 0.5718171926006529, "acc_norm": 0.5723612622415669, "acc_norm_stderr": 0.01154300962328283, "acc_stderr": 0.011544859155318844}}, "3": {"Correct the solution": {"bleu": 7.130010006945033, "bleu_stderr": 0.36614351972675196, "rouge1_fmeasure": 0.28765257778776016, "rouge1_fmeasure_stderr": 0.007122033948855405, "rouge1_precision": 0.27187069677018444, "rouge1_precision_stderr": 0.007724489648815434, "rouge1_recall": 0.6409983231082317, "rouge1_recall_stderr": 0.0071549123595310085, "rouge2_fmeasure": 0.2220313726729203, "rouge2_fmeasure_stderr": 0.006635540925947997, "rouge2_precision": 0.2074993154694547, "rouge2_precision_stderr": 0.006958791199014906, "rouge2_recall": 0.48909595674061107, "rouge2_recall_stderr": 0.008272314038685013, "rougeL_fmeasure": 0.27918830024031255, "rougeL_fmeasure_stderr": 0.007101399798328581, "rougeL_precision": 0.2622707060445547, "rougeL_precision_stderr": 0.007591102476600222, "rougeL_recall": 0.6217153240581279, "rougeL_recall_stderr": 0.007407513056831873, "rougeLsum_fmeasure": 0.2814597792431154, "rougeLsum_fmeasure_stderr": 0.007102157344044307, "rougeLsum_precision": 0.2651656938002711, "rougeLsum_precision_stderr": 0.007644202201817951, "rougeLsum_recall": 0.626997130842089, "rougeLsum_recall_stderr": 0.007343317727382677}, "choose the most appropriate solution": {"acc": 0.5092491838955386, "acc_norm": 0.5092491838955386, "acc_norm_stderr": 0.011663828032649183, "acc_stderr": 0.011663828032649183}, "no prompt needed": {"bleu": 0.18436487057820314, "bleu_stderr": 0.010658373126425667, "rouge1_fmeasure": 0.034174719119795666, "rouge1_fmeasure_stderr": 0.0008567973159331906, "rouge1_precision": 0.02057704871830096, "rouge1_precision_stderr": 0.0007494006749779765, "rouge1_recall": 0.21357292385095153, "rouge1_recall_stderr": 0.004127038723114807, "rouge2_fmeasure": 0.005486989401606149, "rouge2_fmeasure_stderr": 0.000276704602860595, "rouge2_precision": 0.0033630628167333936, "rouge2_precision_stderr": 0.00028514960802026207, "rouge2_recall": 0.03759233299677987, "rouge2_recall_stderr": 0.0020872224529434385, "rougeL_fmeasure": 0.0321009603843197, "rougeL_fmeasure_stderr": 0.000772867121470867, "rougeL_precision": 0.019175227864817394, "rougeL_precision_stderr": 0.0006222647397159332, "rougeL_recall": 0.2029780421179166, "rougeL_recall_stderr": 0.003897679155093511, "rougeLsum_fmeasure": 0.02777350418405329, "rougeLsum_fmeasure_stderr": 0.0007032861760892736, "rougeLsum_precision": 0.016726612502688427, "rougeLsum_precision_stderr": 0.0006097435643352546, "rougeLsum_recall": 0.18052347991125617, "rougeLsum_recall_stderr": 0.003650693342311041}, "pick_correct_choice_index": {"acc": 0.515778019586507, "acc_norm": 0.515778019586507, "acc_norm_stderr": 0.011660014400426185, "acc_stderr": 0.011660014400426185}, "what_is_the_correct_ending": {"acc": 0.5663764961915125, "acc_norm": 0.5723612622415669, "acc_norm_stderr": 0.011543009623282828, "acc_stderr": 0.011562571737707342}}, "4": {"Correct the solution": {"bleu": 6.664095500884982, "bleu_stderr": 0.34838973765349207, "rouge1_fmeasure": 0.2771124149282886, "rouge1_fmeasure_stderr": 0.007080361587320621, "rouge1_precision": 0.2587985355155675, "rouge1_precision_stderr": 0.007619203655004035, "rouge1_recall": 0.6552543491315841, "rouge1_recall_stderr": 0.007002192904681263, "rouge2_fmeasure": 0.21583669822052345, "rouge2_fmeasure_stderr": 0.006557201662967979, "rouge2_precision": 0.19927034297156443, "rouge2_precision_stderr": 0.0068409334865776284, "rouge2_recall": 0.5044582337397387, "rouge2_recall_stderr": 0.008192238508307557, "rougeL_fmeasure": 0.2697811327830668, "rougeL_fmeasure_stderr": 0.007067126106733718, "rougeL_precision": 0.25108837349831326, "rougeL_precision_stderr": 0.007533159919158992, "rougeL_recall": 0.6368030343144712, "rougeL_recall_stderr": 0.007249065063061643, "rougeLsum_fmeasure": 0.2717897500548224, "rougeLsum_fmeasure_stderr": 0.007057987573730667, "rougeLsum_precision": 0.253246129669511, "rougeLsum_precision_stderr": 0.0075494739014644666, "rougeLsum_recall": 0.6429352137151435, "rougeLsum_recall_stderr": 0.007173543257338698}, "choose the most appropriate solution": {"acc": 0.5282916213275299, "acc_norm": 0.5282916213275299, "acc_norm_stderr": 0.011647134172749322, "acc_stderr": 0.011647134172749322}, "no prompt needed": {"bleu": 0.1662496800572863, "bleu_stderr": 0.009003816484519792, "rouge1_fmeasure": 0.033492958359078145, "rouge1_fmeasure_stderr": 0.0008122527510855113, "rouge1_precision": 0.019771160862529565, "rouge1_precision_stderr": 0.0005967888743923156, "rouge1_recall": 0.21013503593016225, "rouge1_recall_stderr": 0.004080380870732406, "rouge2_fmeasure": 0.005250361302057742, "rouge2_fmeasure_stderr": 0.000250744194605345, "rouge2_precision": 0.002967361151580668, "rouge2_precision_stderr": 0.00014505452013749797, "rouge2_recall": 0.03712110581081379, "rouge2_recall_stderr": 0.0020199022271893104, "rougeL_fmeasure": 0.03142149026212096, "rougeL_fmeasure_stderr": 0.0007237635478632389, "rougeL_precision": 0.018431702367967623, "rougeL_precision_stderr": 0.0005086664104353652, "rougeL_recall": 0.20000526433171545, "rougeL_recall_stderr": 0.0038644792190701566, "rougeLsum_fmeasure": 0.02710335704553083, "rougeLsum_fmeasure_stderr": 0.0006613328324069825, "rougeLsum_precision": 0.016047371965998675, "rougeLsum_precision_stderr": 0.00051159998028551, "rougeLsum_recall": 0.1770149433781237, "rougeLsum_recall_stderr": 0.0035827141690138965}, "pick_correct_choice_index": {"acc": 0.5228509249183896, "acc_norm": 0.5228509249183896, "acc_norm_stderr": 0.01165363483240117, "acc_stderr": 0.01165363483240117}, "what_is_the_correct_ending": {"acc": 0.5865070729053319, "acc_norm": 0.5745375408052231, "acc_norm_stderr": 0.01153546884082453, "acc_stderr": 0.011489895831821135}}, "5": {"Correct the solution": {"bleu": 6.446672886311619, "bleu_stderr": 0.27464231120207433, "rouge1_fmeasure": 0.26736214869813785, "rouge1_fmeasure_stderr": 0.0070988275428792045, "rouge1_precision": 0.2528718402860418, "rouge1_precision_stderr": 0.007719845208329688, "rouge1_recall": 0.654561878141819, "rouge1_recall_stderr": 0.00698855116841065, "rouge2_fmeasure": 0.20868674330105244, "rouge2_fmeasure_stderr": 0.006522205599689379, "rouge2_precision": 0.193812416956008, "rouge2_precision_stderr": 0.006844516632300937, "rouge2_recall": 0.5042421877334644, "rouge2_recall_stderr": 0.008241858798974178, "rougeL_fmeasure": 0.2599202410891578, "rougeL_fmeasure_stderr": 0.007057224366690571, "rougeL_precision": 0.24481322454808105, "rougeL_precision_stderr": 0.007592775252281026, "rougeL_recall": 0.6363050209515578, "rougeL_recall_stderr": 0.0072450011340579445, "rougeLsum_fmeasure": 0.2621481175778254, "rougeLsum_fmeasure_stderr": 0.007056258826347817, "rougeLsum_precision": 0.24690879630896684, "rougeLsum_precision_stderr": 0.007607566463992895, "rougeLsum_recall": 0.6431100118567753, "rougeLsum_recall_stderr": 0.007155463175240526}, "choose the most appropriate solution": {"acc": 0.5114254624591947, "acc_norm": 0.5114254624591947, "acc_norm_stderr": 0.011662778026451659, "acc_stderr": 0.011662778026451659}, "no prompt needed": {"bleu": 0.17909655705334981, "bleu_stderr": 0.01001948544264451, "rouge1_fmeasure": 0.0329209856904888, "rouge1_fmeasure_stderr": 0.0008425698682080111, "rouge1_precision": 0.019621328243135686, "rouge1_precision_stderr": 0.0007677978709294303, "rouge1_recall": 0.20744825708534467, "rouge1_recall_stderr": 0.004062728081792751, "rouge2_fmeasure": 0.005515135528910162, "rouge2_fmeasure_stderr": 0.000282385997180886, "rouge2_precision": 0.003330941009610586, "rouge2_precision_stderr": 0.0002884450625712378, "rouge2_recall": 0.03734761717066073, "rouge2_recall_stderr": 0.0019901336224895593, "rougeL_fmeasure": 0.030971756359625915, "rougeL_fmeasure_stderr": 0.0007421166076215399, "rougeL_precision": 0.018273289410454367, "rougeL_precision_stderr": 0.0005985872356689969, "rougeL_recall": 0.19766569572595405, "rougeL_recall_stderr": 0.0038632884293854372, "rougeLsum_fmeasure": 0.02675062591564841, "rougeLsum_fmeasure_stderr": 0.0006935141958106178, "rougeLsum_precision": 0.01596955228121598, "rougeLsum_precision_stderr": 0.0006686144046345224, "rougeLsum_recall": 0.1762042254326998, "rougeLsum_recall_stderr": 0.0036326407496276346}, "pick_correct_choice_index": {"acc": 0.5021762785636561, "acc_norm": 0.5021762785636561, "acc_norm_stderr": 0.011665713661738877, "acc_stderr": 0.011665713661738877}, "what_is_the_correct_ending": {"acc": 0.5848748639825898, "acc_norm": 0.5761697497279652, "acc_norm_stderr": 0.011529663270276293, "acc_stderr": 0.011496520442659124}}}, "sciq": {"0": {"Direct Question": {"acc": 0.83, "acc_norm": 0.741, "acc_norm_stderr": 0.013860415257527911, "acc_stderr": 0.011884495834541665}, "Direct Question (Closed Book)": {"acc": 0.613, "acc_norm": 0.543, "acc_norm_stderr": 0.01576069159013639, "acc_stderr": 0.015410011955493933}, "Multiple Choice": {"acc": 0.342, "acc_norm": 0.346, "acc_norm_stderr": 0.015050266127564445, "acc_stderr": 0.015008706182121728}, "Multiple Choice (Closed Book)": {"acc": 0.287, "acc_norm": 0.315, "acc_norm_stderr": 0.0146966319607925, "acc_stderr": 0.014312087053809965}, "Multiple Choice Question First": {"acc": 0.349, "acc_norm": 0.339, "acc_norm_stderr": 0.014976758771620349, "acc_stderr": 0.015080663991563104}}, "1": {"Direct Question": {"acc": 0.846, "acc_norm": 0.794, "acc_norm_stderr": 0.012795613612786525, "acc_stderr": 0.011419913065098698}, "Direct Question (Closed Book)": {"acc": 0.663, "acc_norm": 0.622, "acc_norm_stderr": 0.015341165254026642, "acc_stderr": 0.014955087918653591}, "Multiple Choice": {"acc": 0.378, "acc_norm": 0.371, "acc_norm_stderr": 0.01528373621182319, "acc_stderr": 0.015341165254026644}, "Multiple Choice (Closed Book)": {"acc": 0.378, "acc_norm": 0.358, "acc_norm_stderr": 0.015167928865407559, "acc_stderr": 0.015341165254026649}, "Multiple Choice Question First": {"acc": 0.392, "acc_norm": 0.38, "acc_norm_stderr": 0.015356947477797579, "acc_stderr": 0.01544585946377129}}, "2": {"Direct Question": {"acc": 0.853, "acc_norm": 0.805, "acc_norm_stderr": 0.012535235623319325, "acc_stderr": 0.011203415395160335}, "Direct Question (Closed Book)": {"acc": 0.673, "acc_norm": 0.637, "acc_norm_stderr": 0.015213890444671287, "acc_stderr": 0.01484221315341124}, "Multiple Choice": {"acc": 0.344, "acc_norm": 0.323, "acc_norm_stderr": 0.014794927843348635, "acc_stderr": 0.015029633724408948}, "Multiple Choice (Closed Book)": {"acc": 0.372, "acc_norm": 0.37, "acc_norm_stderr": 0.015275252316519362, "acc_stderr": 0.015292149942040577}, "Multiple Choice Question First": {"acc": 0.363, "acc_norm": 0.386, "acc_norm_stderr": 0.015402637476784376, "acc_stderr": 0.015213890444671278}}, "3": {"Direct Question": {"acc": 0.856, "acc_norm": 0.804, "acc_norm_stderr": 0.012559527926707345, "acc_stderr": 0.011107987548939149}, "Direct Question (Closed Book)": {"acc": 0.662, "acc_norm": 0.65, "acc_norm_stderr": 0.015090650341444236, "acc_stderr": 0.014965960710224472}, "Multiple Choice": {"acc": 0.329, "acc_norm": 0.362, "acc_norm_stderr": 0.015204840912919501, "acc_stderr": 0.014865395385928369}, "Multiple Choice (Closed Book)": {"acc": 0.349, "acc_norm": 0.364, "acc_norm_stderr": 0.015222868840522024, "acc_stderr": 0.015080663991563102}, "Multiple Choice Question First": {"acc": 0.363, "acc_norm": 0.363, "acc_norm_stderr": 0.015213890444671283, "acc_stderr": 0.015213890444671285}}, "4": {"Direct Question": {"acc": 0.849, "acc_norm": 0.81, "acc_norm_stderr": 0.012411851354816324, "acc_stderr": 0.01132816522334168}, "Direct Question (Closed Book)": {"acc": 0.671, "acc_norm": 0.66, "acc_norm_stderr": 0.014987482264363935, "acc_stderr": 0.014865395385928369}, "Multiple Choice": {"acc": 0.335, "acc_norm": 0.335, "acc_norm_stderr": 0.014933117490932577, "acc_stderr": 0.014933117490932577}, "Multiple Choice (Closed Book)": {"acc": 0.335, "acc_norm": 0.358, "acc_norm_stderr": 0.015167928865407559, "acc_stderr": 0.014933117490932575}, "Multiple Choice Question First": {"acc": 0.319, "acc_norm": 0.345, "acc_norm_stderr": 0.015039986742055237, "acc_stderr": 0.014746404865473487}}, "5": {"Direct Question": {"acc": 0.849, "acc_norm": 0.816, "acc_norm_stderr": 0.012259457340938598, "acc_stderr": 0.011328165223341678}, "Direct Question (Closed Book)": {"acc": 0.682, "acc_norm": 0.678, "acc_norm_stderr": 0.014782913600996683, "acc_stderr": 0.014734079309311901}, "Multiple Choice": {"acc": 0.327, "acc_norm": 0.333, "acc_norm_stderr": 0.014910846164229857, "acc_stderr": 0.014842213153411249}, "Multiple Choice (Closed Book)": {"acc": 0.362, "acc_norm": 0.356, "acc_norm_stderr": 0.015149042659306625, "acc_stderr": 0.015204840912919498}, "Multiple Choice Question First": {"acc": 0.333, "acc_norm": 0.346, "acc_norm_stderr": 0.01505026612756445, "acc_stderr": 0.014910846164229852}}}, "story_cloze_2016": {"0": {"Answer Given options": {"acc": 0.4719401389631213, "acc_norm": 0.5050774986638161, "acc_norm_stderr": 0.011561836054238783, "acc_stderr": 0.011544210396951672}, "Choose Story Ending": {"acc": 0.4906467129877071, "acc_norm": 0.532870122928915, "acc_norm_stderr": 0.011537420054210306, "acc_stderr": 0.011560409019420364}, "Novel Correct Ending": {"acc": 0.4831640833778728, "acc_norm": 0.51309460181721, "acc_norm_stderr": 0.011558466383367178, "acc_stderr": 0.011555875693960773}, "Story Continuation and Options": {"acc": 0.49706039551042225, "acc_norm": 0.5259219668626403, "acc_norm_stderr": 0.011546883081384903, "acc_stderr": 0.01156223242154194}}, "1": {"Answer Given options": {"acc": 0.4521646178514164, "acc_norm": 0.4767504008551577, "acc_norm_stderr": 0.011549925483927456, "acc_stderr": 0.011509395748220104}, "Choose Story Ending": {"acc": 0.4596472474612507, "acc_norm": 0.4917156600748263, "acc_norm_stderr": 0.011560845076525713, "acc_stderr": 0.011524715486240657}, "Novel Correct Ending": {"acc": 0.4494922501336184, "acc_norm": 0.47033671833244256, "acc_norm_stderr": 0.01154206650976701, "acc_stderr": 0.011503288699799176}, "Story Continuation and Options": {"acc": 0.46392303580972744, "acc_norm": 0.49438802779262425, "acc_norm_stderr": 0.011561703928784337, "acc_stderr": 0.01153229486915312}}, "2": {"Answer Given options": {"acc": 0.4510956707642972, "acc_norm": 0.46285408872260825, "acc_norm_stderr": 0.011530479981182628, "acc_stderr": 0.011506993144185188}, "Choose Story Ending": {"acc": 0.4623196151790486, "acc_norm": 0.47995724211651525, "acc_norm_stderr": 0.011553138977961008, "acc_stderr": 0.011529552555884571}, "Novel Correct Ending": {"acc": 0.4478888295029396, "acc_norm": 0.45056119722073756, "acc_norm_stderr": 0.01150577173876986, "acc_stderr": 0.011499463505491369}, "Story Continuation and Options": {"acc": 0.45911277391769106, "acc_norm": 0.4746125066809193, "acc_norm_stderr": 0.011547518083754583, "acc_stderr": 0.011523708060182082}}, "3": {"Answer Given options": {"acc": 0.4665954035275254, "acc_norm": 0.4660609299839658, "acc_norm_stderr": 0.011535764881641411, "acc_stderr": 0.011536599118298178}, "Choose Story Ending": {"acc": 0.45269909139497594, "acc_norm": 0.4756814537680385, "acc_norm_stderr": 0.011548748301487319, "acc_stderr": 0.011510576955232206}, "Novel Correct Ending": {"acc": 0.4494922501336184, "acc_norm": 0.4569748797434527, "acc_norm_stderr": 0.01151954486592806, "acc_stderr": 0.011503288699799176}, "Story Continuation and Options": {"acc": 0.4521646178514164, "acc_norm": 0.4649919828968466, "acc_norm_stderr": 0.011534056494505864, "acc_stderr": 0.011509395748220108}}, "4": {"Answer Given options": {"acc": 0.45537145911277394, "acc_norm": 0.46285408872260825, "acc_norm_stderr": 0.011530479981182626, "acc_stderr": 0.011516282203726655}, "Choose Story Ending": {"acc": 0.46125066809192944, "acc_norm": 0.467129877071085, "acc_norm_stderr": 0.011537420054210297, "acc_stderr": 0.011527657726586461}, "Novel Correct Ending": {"acc": 0.44200962052378406, "acc_norm": 0.45323356493853556, "acc_norm_stderr": 0.011511744771088355, "acc_stderr": 0.011484402719452577}, "Story Continuation and Options": {"acc": 0.4510956707642972, "acc_norm": 0.4633885622661678, "acc_norm_stderr": 0.011531394084549621, "acc_stderr": 0.011506993144185188}}, "5": {"Answer Given options": {"acc": 0.4665954035275254, "acc_norm": 0.47033671833244256, "acc_norm_stderr": 0.011542066509767012, "acc_stderr": 0.011536599118298173}, "Choose Story Ending": {"acc": 0.467129877071085, "acc_norm": 0.46178514163548906, "acc_norm_stderr": 0.011528611805439891, "acc_stderr": 0.011537420054210303}, "Novel Correct Ending": {"acc": 0.45056119722073756, "acc_norm": 0.45430251202565475, "acc_norm_stderr": 0.011514040245583501, "acc_stderr": 0.01150577173876986}, "Story Continuation and Options": {"acc": 0.4665954035275254, "acc_norm": 0.4735435595938001, "acc_norm_stderr": 0.011546234813777397, "acc_stderr": 0.011536599118298173}}}, "superglue_rte": {"0": {"GPT-3 style": {"acc": 0.5270758122743683, "acc_norm": 0.48375451263537905, "acc_norm_stderr": 0.030080573208738064, "acc_stderr": 0.030052303463143706}, "MNLI crowdsource": {"acc": 0.5342960288808665, "acc_norm": 0.5270758122743683, "acc_norm_stderr": 0.030052303463143706, "acc_stderr": 0.030025579819366426}, "does it follow that": {"acc": 0.5270758122743683, "acc_norm": 0.5270758122743683, "acc_norm_stderr": 0.030052303463143706, "acc_stderr": 0.030052303463143706}, "guaranteed true": {"acc": 0.5054151624548736, "acc_norm": 0.5270758122743683, "acc_norm_stderr": 0.030052303463143706, "acc_stderr": 0.03009469812323996}, "should assume": {"acc": 0.5415162454873647, "acc_norm": 0.5270758122743683, "acc_norm_stderr": 0.030052303463143706, "acc_stderr": 0.029992535385373324}}, "1": {"GPT-3 style": {"acc": 0.4729241877256318, "acc_norm": 0.4657039711191336, "acc_norm_stderr": 0.030025579819366426, "acc_stderr": 0.030052303463143713}, "MNLI crowdsource": {"acc": 0.49097472924187724, "acc_norm": 0.49097472924187724, "acc_norm_stderr": 0.030091559826331334, "acc_stderr": 0.030091559826331334}, "does it follow that": {"acc": 0.49097472924187724, "acc_norm": 0.49097472924187724, "acc_norm_stderr": 0.030091559826331327, "acc_stderr": 0.030091559826331334}, "guaranteed true": {"acc": 0.49097472924187724, "acc_norm": 0.4981949458483754, "acc_norm_stderr": 0.030096267148976633, "acc_stderr": 0.030091559826331334}, "should assume": {"acc": 0.49097472924187724, "acc_norm": 0.49097472924187724, "acc_norm_stderr": 0.030091559826331334, "acc_stderr": 0.030091559826331334}}, "2": {"GPT-3 style": {"acc": 0.51985559566787, "acc_norm": 0.4981949458483754, "acc_norm_stderr": 0.030096267148976633, "acc_stderr": 0.030072723167317177}, "MNLI crowdsource": {"acc": 0.51985559566787, "acc_norm": 0.516245487364621, "acc_norm_stderr": 0.030080573208738064, "acc_stderr": 0.030072723167317177}, "does it follow that": {"acc": 0.5090252707581228, "acc_norm": 0.4981949458483754, "acc_norm_stderr": 0.030096267148976626, "acc_stderr": 0.030091559826331334}, "guaranteed true": {"acc": 0.5270758122743683, "acc_norm": 0.5090252707581228, "acc_norm_stderr": 0.030091559826331334, "acc_stderr": 0.0300523034631437}, "should assume": {"acc": 0.5090252707581228, "acc_norm": 0.5270758122743683, "acc_norm_stderr": 0.030052303463143706, "acc_stderr": 0.030091559826331334}}, "3": {"GPT-3 style": {"acc": 0.5090252707581228, "acc_norm": 0.4657039711191336, "acc_norm_stderr": 0.030025579819366426, "acc_stderr": 0.030091559826331334}, "MNLI crowdsource": {"acc": 0.49097472924187724, "acc_norm": 0.5126353790613718, "acc_norm_stderr": 0.030086851767188564, "acc_stderr": 0.030091559826331334}, "does it follow that": {"acc": 0.48375451263537905, "acc_norm": 0.49097472924187724, "acc_norm_stderr": 0.030091559826331334, "acc_stderr": 0.030080573208738064}, "guaranteed true": {"acc": 0.516245487364621, "acc_norm": 0.5126353790613718, "acc_norm_stderr": 0.030086851767188564, "acc_stderr": 0.030080573208738064}, "should assume": {"acc": 0.5018050541516246, "acc_norm": 0.5126353790613718, "acc_norm_stderr": 0.030086851767188564, "acc_stderr": 0.030096267148976633}}, "4": {"GPT-3 style": {"acc": 0.4620938628158845, "acc_norm": 0.49458483754512633, "acc_norm_stderr": 0.030094698123239966, "acc_stderr": 0.030009848912529113}, "MNLI crowdsource": {"acc": 0.48736462093862815, "acc_norm": 0.48736462093862815, "acc_norm_stderr": 0.030086851767188564, "acc_stderr": 0.030086851767188564}, "does it follow that": {"acc": 0.48014440433212996, "acc_norm": 0.516245487364621, "acc_norm_stderr": 0.030080573208738064, "acc_stderr": 0.0300727231673172}, "guaranteed true": {"acc": 0.5090252707581228, "acc_norm": 0.5306859205776173, "acc_norm_stderr": 0.030039730592197812, "acc_stderr": 0.030091559826331334}, "should assume": {"acc": 0.48014440433212996, "acc_norm": 0.5018050541516246, "acc_norm_stderr": 0.030096267148976626, "acc_stderr": 0.0300727231673172}}, "5": {"GPT-3 style": {"acc": 0.4548736462093863, "acc_norm": 0.4620938628158845, "acc_norm_stderr": 0.030009848912529113, "acc_stderr": 0.029973636495415252}, "MNLI crowdsource": {"acc": 0.4693140794223827, "acc_norm": 0.4729241877256318, "acc_norm_stderr": 0.030052303463143706, "acc_stderr": 0.03003973059219781}, "does it follow that": {"acc": 0.4981949458483754, "acc_norm": 0.48736462093862815, "acc_norm_stderr": 0.030086851767188564, "acc_stderr": 0.030096267148976633}, "guaranteed true": {"acc": 0.4693140794223827, "acc_norm": 0.5126353790613718, "acc_norm_stderr": 0.030086851767188564, "acc_stderr": 0.030039730592197812}, "should assume": {"acc": 0.4729241877256318, "acc_norm": 0.49097472924187724, "acc_norm_stderr": 0.030091559826331334, "acc_stderr": 0.030052303463143706}}}, "winogrande": {"0": {"Replace": {"acc": 0.5059194948697711, "acc_norm": 0.4988161010260458, "acc_norm_stderr": 0.014052446290529015, "acc_stderr": 0.014051500838485807}, "True or False": {"acc": 0.494869771112865, "acc_norm": 0.4972375690607735, "acc_norm_stderr": 0.014052271211616441, "acc_stderr": 0.014051745961790516}, "does underscore refer to": {"acc": 0.4964483030781373, "acc_norm": 0.4877663772691397, "acc_norm_stderr": 0.014048278820405612, "acc_stderr": 0.014052131146915867}, "stand for": {"acc": 0.5098658247829518, "acc_norm": 0.49013417521704816, "acc_norm_stderr": 0.014049749833367585, "acc_stderr": 0.014049749833367592}, "underscore refer to": {"acc": 0.5177584846093133, "acc_norm": 0.4964483030781373, "acc_norm_stderr": 0.01405213114691586, "acc_stderr": 0.014043619596174964}}, "1": {"Replace": {"acc": 0.5114443567482242, "acc_norm": 0.4996053670086819, "acc_norm_stderr": 0.014052481306049512, "acc_stderr": 0.01404880419985932}, "True or False": {"acc": 0.494869771112865, "acc_norm": 0.494869771112865, "acc_norm_stderr": 0.014051745961790516, "acc_stderr": 0.014051745961790516}, "does underscore refer to": {"acc": 0.49329123914759276, "acc_norm": 0.47908445146014206, "acc_norm_stderr": 0.014040185494212943, "acc_stderr": 0.014051220692330349}, "stand for": {"acc": 0.5090765588003157, "acc_norm": 0.489344909234412, "acc_norm_stderr": 0.0140492945362904, "acc_stderr": 0.014050170094497704}, "underscore refer to": {"acc": 0.4964483030781373, "acc_norm": 0.500394632991318, "acc_norm_stderr": 0.014052481306049516, "acc_stderr": 0.014052131146915873}}, "2": {"Replace": {"acc": 0.5043409629044988, "acc_norm": 0.5019731649565904, "acc_norm_stderr": 0.014052376259225632, "acc_stderr": 0.014051956064076896}, "True or False": {"acc": 0.49329123914759276, "acc_norm": 0.505130228887135, "acc_norm_stderr": 0.014051745961790513, "acc_stderr": 0.014051220692330349}, "does underscore refer to": {"acc": 0.49171270718232046, "acc_norm": 0.4861878453038674, "acc_norm_stderr": 0.014047122916440419, "acc_stderr": 0.014050555322824197}, "stand for": {"acc": 0.49329123914759276, "acc_norm": 0.505130228887135, "acc_norm_stderr": 0.01405174596179051, "acc_stderr": 0.014051220692330346}, "underscore refer to": {"acc": 0.5019731649565904, "acc_norm": 0.5067087608524072, "acc_norm_stderr": 0.014051220692330349, "acc_stderr": 0.014052376259225629}}, "3": {"Replace": {"acc": 0.5059194948697711, "acc_norm": 0.49013417521704816, "acc_norm_stderr": 0.014049749833367592, "acc_stderr": 0.014051500838485807}, "True or False": {"acc": 0.4988161010260458, "acc_norm": 0.5185477505919495, "acc_norm_stderr": 0.014042813708888378, "acc_stderr": 0.014052446290529024}, "does underscore refer to": {"acc": 0.48855564325177586, "acc_norm": 0.4925019731649566, "acc_norm_stderr": 0.014050905521228587, "acc_stderr": 0.014048804199859316}, "stand for": {"acc": 0.4980268350434096, "acc_norm": 0.5074980268350434, "acc_norm_stderr": 0.014050905521228573, "acc_stderr": 0.01405237625922564}, "underscore refer to": {"acc": 0.5209155485398579, "acc_norm": 0.5067087608524072, "acc_norm_stderr": 0.014051220692330349, "acc_stderr": 0.014040185494212947}}, "4": {"Replace": {"acc": 0.5019731649565904, "acc_norm": 0.4925019731649566, "acc_norm_stderr": 0.01405090552122858, "acc_stderr": 0.01405237625922564}, "True or False": {"acc": 0.5098658247829518, "acc_norm": 0.5169692186266772, "acc_norm_stderr": 0.014044390401612967, "acc_stderr": 0.014049749833367589}, "does underscore refer to": {"acc": 0.4877663772691397, "acc_norm": 0.48303078137332284, "acc_norm_stderr": 0.01404439040161297, "acc_stderr": 0.014048278820405616}, "stand for": {"acc": 0.4980268350434096, "acc_norm": 0.4956590370955012, "acc_norm_stderr": 0.014051956064076892, "acc_stderr": 0.014052376259225632}, "underscore refer to": {"acc": 0.5193370165745856, "acc_norm": 0.4972375690607735, "acc_norm_stderr": 0.014052271211616438, "acc_stderr": 0.014041972733712965}}, "5": {"Replace": {"acc": 0.4956590370955012, "acc_norm": 0.48697711128650356, "acc_norm_stderr": 0.014047718393997663, "acc_stderr": 0.014051956064076887}, "True or False": {"acc": 0.5019731649565904, "acc_norm": 0.5146014206787688, "acc_norm_stderr": 0.014046492383275842, "acc_stderr": 0.014052376259225636}, "does underscore refer to": {"acc": 0.4925019731649566, "acc_norm": 0.49013417521704816, "acc_norm_stderr": 0.014049749833367585, "acc_stderr": 0.014050905521228584}, "stand for": {"acc": 0.489344909234412, "acc_norm": 0.48066298342541436, "acc_norm_stderr": 0.014041972733712976, "acc_stderr": 0.014049294536290403}, "underscore refer to": {"acc": 0.5090765588003157, "acc_norm": 0.5011838989739542, "acc_norm_stderr": 0.014052446290529009, "acc_stderr": 0.014050170094497707}}}} \ No newline at end of file