pminervini commited on
Commit
fd7beec
·
1 Parent(s): 1591f9d
plots/clustermap_all.json CHANGED
@@ -1 +1 @@
1
- {"columns":["TheBloke\/Llama-2-13B-chat-GPTQ","TheBloke\/Llama-2-7B-Chat-GPTQ","TheBloke\/Wizard-Vicuna-13B-Uncensored-GPTQ","teknium\/OpenHermes-2-Mistral-7B","mistralai\/Mistral-7B-Instruct-v0.1","bigscience\/bloom-7b1","bigscience\/bloom-560m","berkeley-nest\/Starling-LM-7B-alpha","EleutherAI\/gpt-neo-125m","EleutherAI\/gpt-neo-2.7B","EleutherAI\/gpt-j-6b","EleutherAI\/gpt-neo-1.3B","Gryphe\/MythoMax-L2-13b","Open-Orca\/Mistral-7B-OpenOrca","pankajmathur\/orca_mini_3b","KoboldAI\/OPT-13B-Erebus","ehartford\/dolphin-2.1-mistral-7b","togethercomputer\/LLaMA-2-7B-32K","togethercomputer\/GPT-JT-6B-v1","togethercomputer\/Llama-2-7B-32K-Instruct","HuggingFaceH4\/zephyr-7b-alpha","HuggingFaceH4\/zephyr-7b-beta","tiiuae\/falcon-7b-instruct","tiiuae\/falcon-7b","ai-forever\/mGPT","NousResearch\/Nous-Hermes-Llama2-13b","DiscoResearch\/mixtral-7b-8expert","meta-llama\/Llama-2-7b-chat-hf","meta-llama\/Llama-2-7b-hf","meta-llama\/Llama-2-13b-chat-hf","meta-llama\/Llama-2-13b-hf"],"index":["TruthfulQA MC1, Accuracy","TriviaQA, EM","HaluEval Dialog, Accuracy","XSum, ROUGE-L","XSum, factKB","XSum, BERT-P","MemoTrap, Accuracy","IFEval, Prompt-Level Accuracy","RACE, Accuracy","NQ, EM","TruthfulQA MC2, Accuracy","HaluEval Summarization, Accuracy","True-False, Accuracy","CNN\/DM, ROUGE-L","CNN\/DM, factKB","CNN\/DM, BERT-P","HaluEval QA, Accuracy","SelfCheckGPT, 
AVG"],"data":[[0.2656058752,0.2900856793,0.358629131,0.3390452876,0.3916768666,0.2239902081,0.2447980416,0.3157894737,0.2582619339,0.2386780906,0.2019583843,0.2313341493,0.364749082,0.3525091799,0.2802937576,0.205630355,0.3904528764,0.2558139535,0.2264381885,0.3096695226,0.4063647491,0.3867809058,0.2876376989,0.2239902081,0.2325581395,0.3476132191,0.247246022,0.3023255814,0.2521419829,0.2802937576,0.2594859241],[0.0947949175,0.0817543469,0.0961324119,0.6805060187,0.5224030317,0.2633749443,0.0570664289,0.6594962104,0.0155483727,0.2011257245,0.3931676326,0.1415514935,0.0974141774,0.6583816317,0.3750557289,0.3940592956,0.6674654481,0.0815871601,0.4172982613,0.0927329469,0.6591618368,0.6443379403,0.3915514935,0.5866584931,0.1061078912,0.0984730272,0.0,0.0883303611,0.0921756576,0.0978600089,0.0948506465],[0.5928,0.6085,0.6471,null,0.6649,0.4625,0.4998,0.6674,0.472,0.4772,0.4984,0.4836,0.7173,0.7699,0.4694,0.3997,0.7963,0.5478,0.4979,0.6043,0.7917,0.7634,0.3878,0.4203,0.0001,0.7326,0.0712,0.6425,0.4997,0.6548,0.7393],[0.0371901778,0.0347891843,0.0428959698,null,null,0.1668276326,0.1356427068,0.0504142569,0.1262087692,0.1205109613,0.2136831021,0.1487592997,0.0384377825,null,0.1854881243,0.2370092437,null,0.0432590646,0.2647281333,0.042540476,null,null,0.1822281713,0.1657837663,0.0141983599,null,null,0.0340250019,0.0445139517,0.0359765055,0.0413856309],[0.0401970892,0.0394161696,0.0428647574,null,null,0.5674834215,0.2298042738,0.0397678158,0.2459031195,0.3441669812,0.4791825971,0.4144174002,0.0384018147,null,0.5666702185,0.4707080535,null,0.0407451505,0.3412336695,0.0400569668,null,null,0.4089209967,0.3192473228,0.1421243598,null,null,0.0379350583,0.0428665575,0.0387781905,0.0411868449],[0.3949103208,0.3837605404,0.4030390077,null,null,0.6528773697,0.6049370656,0.4270770697,0.5845326816,0.4448599865,0.6811507025,0.6106414046,0.3989033275,null,0.6324491786,0.709515113,null,0.4014707047,0.7352085092,0.3991319337,null,null,0.6545591434,0.5731293539,0.4255551427,null,null,0.385
3941111,0.4050690645,0.3917698265,0.401477077],[0.641025641,0.7179487179,0.5886752137,0.6079059829,0.7061965812,0.6826923077,0.860042735,0.6314102564,0.8344017094,0.7756410256,0.7574786325,0.7980769231,0.6463675214,0.6346153846,0.594017094,0.7532051282,0.5897435897,0.561965812,0.6645299145,0.5758547009,0.5352564103,0.5854700855,0.858974359,0.7126068376,0.8643162393,0.5523504274,0.5737179487,0.733974359,0.6143162393,0.6079059829,0.5758547009],[0.2735674677,0.2606284658,0.1626617375,0.2255083179,0.3049907579,0.0868761553,0.0850277264,0.2310536044,0.1293900185,0.1885397412,0.179297597,0.1423290203,0.2402957486,0.033271719,0.0628465804,null,0.2735674677,0.1497227357,0.146025878,0.1700554529,0.0609981516,0.0924214418,0.1534195933,0.1164510166,0.1940850277,0.1423290203,0.1423290203,0.2865064695,0.1866913124,0.314232902,0.1829944547],[0.4574162679,0.4220095694,0.4325358852,0.4392344498,0.4229665072,0.3655502392,0.3023923445,0.4688995215,0.2755980861,0.35215311,0.376076555,0.3406698565,0.433492823,0.4717703349,0.3674641148,0.3827751196,0.4555023923,0.3770334928,0.3645933014,0.4,0.4641148325,0.4583732057,0.3722488038,0.3741626794,0.2937799043,0.4593301435,0.2220095694,0.4373205742,0.395215311,0.4612440191,0.404784689],[0.0263157895,0.0263157895,0.0232686981,0.0335180055,0.0249307479,0.0329639889,0.0102493075,0.0343490305,0.0049861496,0.0542936288,0.0916897507,0.0368421053,0.028531856,0.0293628809,0.0265927978,0.1307479224,0.0293628809,0.0238227147,0.1091412742,0.0271468144,0.0304709141,0.0315789474,0.135734072,0.2207756233,0.0232686981,0.0254847645,0.0,0.0252077562,0.0268698061,0.0268698061,0.0274238227],[0.4167499124,0.4410061226,0.5164091712,0.509255685,0.5592109222,0.3889466583,0.4243149954,0.4731016618,0.4557936883,0.3986263303,0.3595710074,0.3961377938,0.515367679,0.5225657507,0.423037498,0.352115369,0.558968896,0.3841009802,0.3706818154,0.4572778615,0.5602234073,0.5511952533,0.4407667557,0.3426523695,0.3962402525,0.5026754146,0.4947679694,0.4531160226,0.389651281,0.439
4613382,0.3689257684],[0.4645,0.4193,0.4436,0.5464,0.4504,0.4652,0.4653,0.5459,0.4651,0.4668,0.4658,0.4457,0.476,0.5147,0.4701,0.4536,0.448,0.4904,0.5224,0.4696,0.5268,0.5238,0.4402,0.448,0.0,null,0.0774,0.4906,0.4279,0.4772,null],[0.8534100247,0.7733771569,0.8087099425,0.8895645029,0.8348397699,0.5314708299,0.5041906327,0.8917009039,0.4940016434,0.5059983566,0.5413311422,0.507641742,0.8698438784,0.8793755136,0.5860312243,0.5393590797,0.883483977,0.8059161873,0.6271158587,0.8195562859,0.8854560394,0.8323746919,0.6364831553,0.5814297453,0.4926869351,0.8670501233,0.5064913722,0.8023007395,0.7268693509,0.8514379622,0.8315529992],[0.0132261781,0.0098347434,0.012722468,0.0114292948,0.008927303,0.2075387019,0.1227088127,0.0169222534,0.1243996336,0.1958355522,0.2238202254,0.2016505067,0.0135334171,0.0157445278,0.2242330745,0.2130128961,0.0142701427,0.0133793406,0.2408143362,0.012671213,0.0117323751,0.0111061484,0.1882467392,0.1691898288,0.0164863838,0.0131859896,0.0000716868,0.0103604417,0.0145397085,null,0.0135674045],[0.1686613542,0.2305715843,0.1762471835,0.1795055867,0.1750336135,0.9249732766,0.7988642532,0.2070823135,0.7133009049,0.9058066799,0.9396594147,0.8739379747,0.1648057051,0.2233354576,0.8669051787,0.9215527018,0.1965546069,0.1575508605,0.94858605,0.1617593629,0.1759309876,0.1893680342,0.8267075767,0.901661868,0.7995222351,0.1795224981,0.0805047156,0.1580946885,0.1517425501,null,0.155634531],[0.3668251897,0.3243336913,0.3548213557,0.3652777882,0.3544185533,0.6009727172,0.4646713339,0.3964162779,0.4789093344,0.5761275867,0.617234758,0.5888279105,0.3663381487,0.3898203604,0.6159694987,0.6594958169,0.3785245521,0.3663761784,0.7087181979,0.3645764744,0.3628082484,0.3673437638,0.5926767984,0.5112991874,0.4542439283,0.3722801591,0.3306992617,0.3566615503,0.367965746,null,0.3640197324],[0.5454,0.4519,0.5206,0.6605,0.4917,0.5806,0.4995,0.5969,0.4653,0.4376,0.5139,0.4625,0.3992,0.4502,0.4446,0.3526,0.4208,0.4555,0.5093,0.3233,0.6235,0.5185,0.2968,0.4672,0.0549,0.5715,0
.0708,0.5231,0.4566,0.5728,0.6879],[0.0900088111,0.0378151261,0.1105999164,0.0288242793,0.0042016807,0.012605042,0.063491636,0.0504201681,0.0865653082,0.0168067227,0.0042016807,0.0885709807,0.0782453898,0.987394958,0.0336183781,0.0797089047,0.0358374659,0.3529411765,0.012605042,0.1020439002,0.0210084034,0.0758733581,0.0291536913,0.0210084034,0.0084033613,0.1206660487,0.6692579906,0.0413645344,0.1043527865,0.0,null]]}
 
1
+ {"columns":["TheBloke\/Llama-2-13B-chat-GPTQ","TheBloke\/Llama-2-7B-Chat-GPTQ","TheBloke\/Wizard-Vicuna-13B-Uncensored-GPTQ","teknium\/OpenHermes-2-Mistral-7B","mistralai\/Mistral-7B-Instruct-v0.1","bigscience\/bloom-7b1","bigscience\/bloom-560m","berkeley-nest\/Starling-LM-7B-alpha","EleutherAI\/gpt-neo-125m","EleutherAI\/gpt-neo-2.7B","EleutherAI\/gpt-j-6b","EleutherAI\/gpt-neo-1.3B","Gryphe\/MythoMax-L2-13b","Open-Orca\/Mistral-7B-OpenOrca","pankajmathur\/orca_mini_3b","KoboldAI\/OPT-13B-Erebus","ehartford\/dolphin-2.1-mistral-7b","togethercomputer\/LLaMA-2-7B-32K","togethercomputer\/GPT-JT-6B-v1","togethercomputer\/Llama-2-7B-32K-Instruct","HuggingFaceH4\/zephyr-7b-alpha","HuggingFaceH4\/zephyr-7b-beta","tiiuae\/falcon-7b-instruct","tiiuae\/falcon-7b","ai-forever\/mGPT","NousResearch\/Yarn-Mistral-7b-128k","NousResearch\/Nous-Hermes-Llama2-13b","DiscoResearch\/mixtral-7b-8expert","meta-llama\/Llama-2-7b-chat-hf","meta-llama\/Llama-2-7b-hf","meta-llama\/Llama-2-13b-chat-hf","meta-llama\/Llama-2-13b-hf","upstage\/SOLAR-10.7B-Instruct-v1.0"],"index":["TruthfulQA MC1, Accuracy","TriviaQA, EM","HaluEval Dialog, Accuracy","XSum, ROUGE-L","XSum, factKB","XSum, BERT-P","MemoTrap, Accuracy","IFEval, Prompt-Level Accuracy","RACE, Accuracy","NQ, EM","TruthfulQA MC2, Accuracy","HaluEval Summarization, Accuracy","True-False, Accuracy","CNN\/DM, ROUGE-L","CNN\/DM, factKB","CNN\/DM, BERT-P","HaluEval QA, Accuracy","SelfCheckGPT, 
AVG"],"data":[[0.2656058752,0.2900856793,0.358629131,0.3390452876,0.3916768666,0.2239902081,0.2447980416,0.3157894737,0.2582619339,0.2386780906,0.2019583843,0.2313341493,0.364749082,0.3525091799,0.2802937576,0.205630355,0.3904528764,0.2558139535,0.2264381885,0.3096695226,0.4063647491,0.3867809058,0.2876376989,0.2239902081,0.2325581395,0.2741738066,0.3476132191,0.247246022,0.3023255814,0.2521419829,0.2802937576,0.2594859241,null],[0.0947949175,0.0817543469,0.0961324119,0.6805060187,0.5224030317,0.2633749443,0.0570664289,0.6594962104,0.0155483727,0.2011257245,0.3931676326,0.1415514935,0.0974141774,0.6583816317,0.3750557289,0.3940592956,0.6674654481,0.0815871601,0.4172982613,0.0927329469,0.6591618368,0.6443379403,0.3915514935,0.5866584931,0.1061078912,0.7003455194,0.0984730272,0.0,0.0883303611,0.0921756576,0.0978600089,0.0948506465,0.4198060633],[0.5928,0.6085,0.6471,null,0.6649,0.4625,0.4998,0.6674,0.472,0.4772,0.4984,0.4836,0.7173,0.7699,0.4694,0.3997,0.7963,0.5478,0.4979,0.6043,0.7917,0.7634,0.3878,0.4203,0.0001,0.558,0.7326,0.0712,0.6425,0.4997,0.6548,0.7393,null],[0.0371901778,0.0347891843,0.0428959698,null,null,0.1668276326,0.1356427068,0.0504142569,0.1262087692,0.1205109613,0.2136831021,0.1487592997,0.0384377825,null,0.1854881243,0.2370092437,null,0.0432590646,0.2647281333,0.042540476,null,null,0.1822281713,0.1657837663,0.0141983599,null,null,null,0.0340250019,0.0445139517,0.0359765055,0.0413856309,0.0011478244],[0.0401970892,0.0394161696,0.0428647574,null,null,0.5674834215,0.2298042738,0.0397678158,0.2459031195,0.3441669812,0.4791825971,0.4144174002,0.0384018147,null,0.5666702185,0.4707080535,null,0.0407451505,0.3412336695,0.0400569668,null,null,0.4089209967,0.3192473228,0.1421243598,null,null,null,0.0379350583,0.0428665575,0.0387781905,0.0411868449,0.1811100146],[0.3949103208,0.3837605404,0.4030390077,null,null,0.6528773697,0.6049370656,0.4270770697,0.5845326816,0.4448599865,0.6811507025,0.6106414046,0.3989033275,null,0.6324491786,0.709515113,null,0.4014707047
,0.7352085092,0.3991319337,null,null,0.6545591434,0.5731293539,0.4255551427,null,null,null,0.3853941111,0.4050690645,0.3917698265,0.401477077,0.0021477235],[0.641025641,0.7179487179,0.5886752137,0.6079059829,0.7061965812,0.6826923077,0.860042735,0.6314102564,0.8344017094,0.7756410256,0.7574786325,0.7980769231,0.6463675214,0.6346153846,0.594017094,0.7532051282,0.5897435897,0.561965812,0.6645299145,0.5758547009,0.5352564103,0.5854700855,0.858974359,0.7126068376,0.8643162393,0.5641025641,0.5523504274,0.5737179487,0.733974359,0.6143162393,0.6079059829,0.5758547009,null],[0.2735674677,0.2606284658,0.1626617375,0.2255083179,0.3049907579,0.0868761553,0.0850277264,0.2310536044,0.1293900185,0.1885397412,0.179297597,0.1423290203,0.2402957486,0.033271719,0.0628465804,null,0.2735674677,0.1497227357,0.146025878,0.1700554529,0.0609981516,0.0924214418,0.1534195933,0.1164510166,0.1940850277,null,0.1423290203,0.1423290203,0.2865064695,0.1866913124,0.314232902,0.1829944547,null],[0.4574162679,0.4220095694,0.4325358852,0.4392344498,0.4229665072,0.3655502392,0.3023923445,0.4688995215,0.2755980861,0.35215311,0.376076555,0.3406698565,0.433492823,0.4717703349,0.3674641148,0.3827751196,0.4555023923,0.3770334928,0.3645933014,0.4,0.4641148325,0.4583732057,0.3722488038,0.3741626794,0.2937799043,0.4076555024,0.4593301435,0.2220095694,0.4373205742,0.395215311,0.4612440191,0.404784689,0.419138756],[0.0263157895,0.0263157895,0.0232686981,0.0335180055,0.0249307479,0.0329639889,0.0102493075,0.0343490305,0.0049861496,0.0542936288,0.0916897507,0.0368421053,0.028531856,0.0293628809,0.0265927978,0.1307479224,0.0293628809,0.0238227147,0.1091412742,0.0271468144,0.0304709141,0.0315789474,0.135734072,0.2207756233,0.0232686981,null,0.0254847645,0.0,0.0252077562,0.0268698061,0.0268698061,0.0274238227,0.1991689751],[0.4167499124,0.4410061226,0.5164091712,0.509255685,0.5592109222,0.3889466583,0.4243149954,0.4731016618,0.4557936883,0.3986263303,0.3595710074,0.3961377938,0.515367679,0.5225657507,0.423037498,0.35
2115369,0.558968896,0.3841009802,0.3706818154,0.4572778615,0.5602234073,0.5511952533,0.4407667557,0.3426523695,0.3962402525,0.4225232911,0.5026754146,0.4947679694,0.4531160226,0.389651281,0.4394613382,0.3689257684,null],[0.4645,0.4193,0.4436,0.5464,0.4504,0.4652,0.4653,0.5459,0.4651,0.4668,0.4658,0.4457,0.476,0.5147,0.4701,0.4536,0.448,0.4904,0.5224,0.4696,0.5268,0.5238,0.4402,0.448,0.0,0.4588,null,0.0774,0.4906,0.4279,0.4772,null,0.5574],[0.8534100247,0.7733771569,0.8087099425,0.8895645029,0.8348397699,0.5314708299,0.5041906327,0.8917009039,0.4940016434,0.5059983566,0.5413311422,0.507641742,0.8698438784,0.8793755136,0.5860312243,0.5393590797,0.883483977,0.8059161873,0.6271158587,0.8195562859,0.8854560394,0.8323746919,0.6364831553,0.5814297453,0.4926869351,0.8376335251,0.8670501233,0.5064913722,0.8023007395,0.7268693509,0.8514379622,0.8315529992,0.8933442892],[0.0132261781,0.0098347434,0.012722468,0.0114292948,0.008927303,0.2075387019,0.1227088127,0.0169222534,0.1243996336,0.1958355522,0.2238202254,0.2016505067,0.0135334171,0.0157445278,0.2242330745,0.2130128961,0.0142701427,0.0133793406,0.2408143362,0.012671213,0.0117323751,0.0111061484,0.1882467392,0.1691898288,0.0164863838,0.0116271987,0.0131859896,0.0000716868,0.0103604417,0.0145397085,null,0.0135674045,0.0006113776],[0.1686613542,0.2305715843,0.1762471835,0.1795055867,0.1750336135,0.9249732766,0.7988642532,0.2070823135,0.7133009049,0.9058066799,0.9396594147,0.8739379747,0.1648057051,0.2233354576,0.8669051787,0.9215527018,0.1965546069,0.1575508605,0.94858605,0.1617593629,0.1759309876,0.1893680342,0.8267075767,0.901661868,0.7995222351,0.166116994,0.1795224981,0.0805047156,0.1580946885,0.1517425501,null,0.155634531,0.9509124595],[0.3668251897,0.3243336913,0.3548213557,0.3652777882,0.3544185533,0.6009727172,0.4646713339,0.3964162779,0.4789093344,0.5761275867,0.617234758,0.5888279105,0.3663381487,0.3898203604,0.6159694987,0.6594958169,0.3785245521,0.3663761784,0.7087181979,0.3645764744,0.3628082484,0.3673437638,0.59
26767984,0.5112991874,0.4542439283,0.3637139385,0.3722801591,0.3306992617,0.3566615503,0.367965746,null,0.3640197324,0.0013591523],[0.5454,0.4519,0.5206,0.6605,0.4917,0.5806,0.4995,0.5969,0.4653,0.4376,0.5139,0.4625,0.3992,0.4502,0.4446,0.3526,0.4208,0.4555,0.5093,0.3233,0.6235,0.5185,0.2968,0.4672,0.0549,0.5699,0.5715,0.0708,0.5231,0.4566,0.5728,0.6879,null],[0.0900088111,0.0378151261,0.1105999164,0.0288242793,0.0042016807,0.012605042,0.063491636,0.0504201681,0.0865653082,0.0168067227,0.0042016807,0.0885709807,0.0782453898,0.987394958,0.0336183781,0.0797089047,0.0358374659,0.3529411765,0.012605042,0.1020439002,0.0210084034,0.0758733581,0.0291536913,0.0210084034,0.0084033613,null,0.1206660487,0.6692579906,0.0413645344,0.1043527865,0.0,null,null]]}
plots/clustermap_all.pdf CHANGED
Binary files a/plots/clustermap_all.pdf and b/plots/clustermap_all.pdf differ
 
plots/clustermap_all.png CHANGED

Git LFS Details

  • SHA256: 5e54f03d44e832da7ece78c2467087677a18d98d7cae3cacb1aac0b480961c95
  • Pointer size: 132 Bytes
  • Size of remote file: 1.61 MB

Git LFS Details

  • SHA256: 2404aac3305fc38d738d7973fe5aa3b27adaa52738ff772042657992338a4ed7
  • Pointer size: 132 Bytes
  • Size of remote file: 1.68 MB
plots/clustermap_det.json CHANGED
@@ -1 +1 @@
1
- {"columns":["TheBloke\/Llama-2-13B-chat-GPTQ","TheBloke\/Llama-2-7B-Chat-GPTQ","TheBloke\/Wizard-Vicuna-13B-Uncensored-GPTQ","mistralai\/Mistral-7B-Instruct-v0.1","bigscience\/bloom-7b1","bigscience\/bloom-560m","berkeley-nest\/Starling-LM-7B-alpha","EleutherAI\/gpt-neo-125m","EleutherAI\/gpt-neo-2.7B","EleutherAI\/gpt-j-6b","EleutherAI\/gpt-neo-1.3B","Gryphe\/MythoMax-L2-13b","Open-Orca\/Mistral-7B-OpenOrca","pankajmathur\/orca_mini_3b","KoboldAI\/OPT-13B-Erebus","ehartford\/dolphin-2.1-mistral-7b","togethercomputer\/LLaMA-2-7B-32K","togethercomputer\/GPT-JT-6B-v1","togethercomputer\/Llama-2-7B-32K-Instruct","HuggingFaceH4\/zephyr-7b-alpha","HuggingFaceH4\/zephyr-7b-beta","tiiuae\/falcon-7b-instruct","tiiuae\/falcon-7b","ai-forever\/mGPT","NousResearch\/Nous-Hermes-Llama2-13b","DiscoResearch\/mixtral-7b-8expert","meta-llama\/Llama-2-7b-chat-hf","meta-llama\/Llama-2-7b-hf","meta-llama\/Llama-2-13b-chat-hf","meta-llama\/Llama-2-13b-hf","teknium\/OpenHermes-2-Mistral-7B"],"index":["HaluEval Dialog, Accuracy","HaluEval Summarization, Accuracy","HaluEval QA, Accuracy","SelfCheckGPT, 
AVG"],"data":[[0.5928,0.6085,0.6471,0.6649,0.4625,0.4998,0.6674,0.472,0.4772,0.4984,0.4836,0.7173,0.7699,0.4694,0.3997,0.7963,0.5478,0.4979,0.6043,0.7917,0.7634,0.3878,0.4203,0.0001,0.7326,0.0712,0.6425,0.4997,0.6548,0.7393,null],[0.4645,0.4193,0.4436,0.4504,0.4652,0.4653,0.5459,0.4651,0.4668,0.4658,0.4457,0.476,0.5147,0.4701,0.4536,0.448,0.4904,0.5224,0.4696,0.5268,0.5238,0.4402,0.448,0.0,null,0.0774,0.4906,0.4279,0.4772,null,0.5464],[0.5454,0.4519,0.5206,0.4917,0.5806,0.4995,0.5969,0.4653,0.4376,0.5139,0.4625,0.3992,0.4502,0.4446,0.3526,0.4208,0.4555,0.5093,0.3233,0.6235,0.5185,0.2968,0.4672,0.0549,0.5715,0.0708,0.5231,0.4566,0.5728,0.6879,0.6605],[0.0900088111,0.0378151261,0.1105999164,0.0042016807,0.012605042,0.063491636,0.0504201681,0.0865653082,0.0168067227,0.0042016807,0.0885709807,0.0782453898,0.987394958,0.0336183781,0.0797089047,0.0358374659,0.3529411765,0.012605042,0.1020439002,0.0210084034,0.0758733581,0.0291536913,0.0210084034,0.0084033613,0.1206660487,0.6692579906,0.0413645344,0.1043527865,0.0,null,0.0288242793]]}
 
1
+ {"columns":["TheBloke\/Llama-2-13B-chat-GPTQ","TheBloke\/Llama-2-7B-Chat-GPTQ","TheBloke\/Wizard-Vicuna-13B-Uncensored-GPTQ","mistralai\/Mistral-7B-Instruct-v0.1","bigscience\/bloom-7b1","bigscience\/bloom-560m","berkeley-nest\/Starling-LM-7B-alpha","EleutherAI\/gpt-neo-125m","EleutherAI\/gpt-neo-2.7B","EleutherAI\/gpt-j-6b","EleutherAI\/gpt-neo-1.3B","Gryphe\/MythoMax-L2-13b","Open-Orca\/Mistral-7B-OpenOrca","pankajmathur\/orca_mini_3b","KoboldAI\/OPT-13B-Erebus","ehartford\/dolphin-2.1-mistral-7b","togethercomputer\/LLaMA-2-7B-32K","togethercomputer\/GPT-JT-6B-v1","togethercomputer\/Llama-2-7B-32K-Instruct","HuggingFaceH4\/zephyr-7b-alpha","HuggingFaceH4\/zephyr-7b-beta","tiiuae\/falcon-7b-instruct","tiiuae\/falcon-7b","ai-forever\/mGPT","NousResearch\/Yarn-Mistral-7b-128k","NousResearch\/Nous-Hermes-Llama2-13b","DiscoResearch\/mixtral-7b-8expert","meta-llama\/Llama-2-7b-chat-hf","meta-llama\/Llama-2-7b-hf","meta-llama\/Llama-2-13b-chat-hf","meta-llama\/Llama-2-13b-hf","teknium\/OpenHermes-2-Mistral-7B","upstage\/SOLAR-10.7B-Instruct-v1.0"],"index":["HaluEval Dialog, Accuracy","HaluEval Summarization, Accuracy","HaluEval QA, Accuracy","SelfCheckGPT, 
AVG"],"data":[[0.5928,0.6085,0.6471,0.6649,0.4625,0.4998,0.6674,0.472,0.4772,0.4984,0.4836,0.7173,0.7699,0.4694,0.3997,0.7963,0.5478,0.4979,0.6043,0.7917,0.7634,0.3878,0.4203,0.0001,0.558,0.7326,0.0712,0.6425,0.4997,0.6548,0.7393,null,null],[0.4645,0.4193,0.4436,0.4504,0.4652,0.4653,0.5459,0.4651,0.4668,0.4658,0.4457,0.476,0.5147,0.4701,0.4536,0.448,0.4904,0.5224,0.4696,0.5268,0.5238,0.4402,0.448,0.0,0.4588,null,0.0774,0.4906,0.4279,0.4772,null,0.5464,0.5574],[0.5454,0.4519,0.5206,0.4917,0.5806,0.4995,0.5969,0.4653,0.4376,0.5139,0.4625,0.3992,0.4502,0.4446,0.3526,0.4208,0.4555,0.5093,0.3233,0.6235,0.5185,0.2968,0.4672,0.0549,0.5699,0.5715,0.0708,0.5231,0.4566,0.5728,0.6879,0.6605,null],[0.0900088111,0.0378151261,0.1105999164,0.0042016807,0.012605042,0.063491636,0.0504201681,0.0865653082,0.0168067227,0.0042016807,0.0885709807,0.0782453898,0.987394958,0.0336183781,0.0797089047,0.0358374659,0.3529411765,0.012605042,0.1020439002,0.0210084034,0.0758733581,0.0291536913,0.0210084034,0.0084033613,null,0.1206660487,0.6692579906,0.0413645344,0.1043527865,0.0,null,0.0288242793,null]]}
plots/clustermap_det.pdf CHANGED
Binary files a/plots/clustermap_det.pdf and b/plots/clustermap_det.pdf differ
 
plots/clustermap_det.png CHANGED

Git LFS Details

  • SHA256: d6b1985fe86e22b9f8482fc40d95439d99595a90a7eeb11e610618f8f2342490
  • Pointer size: 131 Bytes
  • Size of remote file: 758 kB

Git LFS Details

  • SHA256: 301afe11433364aa3c795318aef476416ba3a1348a10e30bb8ba02a6446d77c2
  • Pointer size: 131 Bytes
  • Size of remote file: 802 kB
plots/clustermap_instr.json CHANGED
@@ -1 +1 @@
1
- {"columns":["TheBloke\/Llama-2-13B-chat-GPTQ","TheBloke\/Llama-2-7B-Chat-GPTQ","TheBloke\/Wizard-Vicuna-13B-Uncensored-GPTQ","teknium\/OpenHermes-2-Mistral-7B","mistralai\/Mistral-7B-Instruct-v0.1","bigscience\/bloom-7b1","bigscience\/bloom-560m","berkeley-nest\/Starling-LM-7B-alpha","EleutherAI\/gpt-neo-125m","EleutherAI\/gpt-neo-2.7B","EleutherAI\/gpt-j-6b","EleutherAI\/gpt-neo-1.3B","Gryphe\/MythoMax-L2-13b","Open-Orca\/Mistral-7B-OpenOrca","pankajmathur\/orca_mini_3b","KoboldAI\/OPT-13B-Erebus","ehartford\/dolphin-2.1-mistral-7b","togethercomputer\/LLaMA-2-7B-32K","togethercomputer\/GPT-JT-6B-v1","togethercomputer\/Llama-2-7B-32K-Instruct","HuggingFaceH4\/zephyr-7b-alpha","HuggingFaceH4\/zephyr-7b-beta","tiiuae\/falcon-7b-instruct","tiiuae\/falcon-7b","ai-forever\/mGPT","NousResearch\/Nous-Hermes-Llama2-13b","DiscoResearch\/mixtral-7b-8expert","meta-llama\/Llama-2-7b-chat-hf","meta-llama\/Llama-2-7b-hf","meta-llama\/Llama-2-13b-chat-hf","meta-llama\/Llama-2-13b-hf"],"index":["MemoTrap, Accuracy","IFEval, Prompt-Level Accuracy"],"data":[[0.641025641,0.7179487179,0.5886752137,0.6079059829,0.7061965812,0.6826923077,0.860042735,0.6314102564,0.8344017094,0.7756410256,0.7574786325,0.7980769231,0.6463675214,0.6346153846,0.594017094,0.7532051282,0.5897435897,0.561965812,0.6645299145,0.5758547009,0.5352564103,0.5854700855,0.858974359,0.7126068376,0.8643162393,0.5523504274,0.5737179487,0.733974359,0.6143162393,0.6079059829,0.5758547009],[0.2735674677,0.2606284658,0.1626617375,0.2255083179,0.3049907579,0.0868761553,0.0850277264,0.2310536044,0.1293900185,0.1885397412,0.179297597,0.1423290203,0.2402957486,0.033271719,0.0628465804,null,0.2735674677,0.1497227357,0.146025878,0.1700554529,0.0609981516,0.0924214418,0.1534195933,0.1164510166,0.1940850277,0.1423290203,0.1423290203,0.2865064695,0.1866913124,0.314232902,0.1829944547]]}
 
1
+ {"columns":["TheBloke\/Llama-2-13B-chat-GPTQ","TheBloke\/Llama-2-7B-Chat-GPTQ","TheBloke\/Wizard-Vicuna-13B-Uncensored-GPTQ","teknium\/OpenHermes-2-Mistral-7B","mistralai\/Mistral-7B-Instruct-v0.1","bigscience\/bloom-7b1","bigscience\/bloom-560m","berkeley-nest\/Starling-LM-7B-alpha","EleutherAI\/gpt-neo-125m","EleutherAI\/gpt-neo-2.7B","EleutherAI\/gpt-j-6b","EleutherAI\/gpt-neo-1.3B","Gryphe\/MythoMax-L2-13b","Open-Orca\/Mistral-7B-OpenOrca","pankajmathur\/orca_mini_3b","KoboldAI\/OPT-13B-Erebus","ehartford\/dolphin-2.1-mistral-7b","togethercomputer\/LLaMA-2-7B-32K","togethercomputer\/GPT-JT-6B-v1","togethercomputer\/Llama-2-7B-32K-Instruct","HuggingFaceH4\/zephyr-7b-alpha","HuggingFaceH4\/zephyr-7b-beta","tiiuae\/falcon-7b-instruct","tiiuae\/falcon-7b","ai-forever\/mGPT","NousResearch\/Yarn-Mistral-7b-128k","NousResearch\/Nous-Hermes-Llama2-13b","DiscoResearch\/mixtral-7b-8expert","meta-llama\/Llama-2-7b-chat-hf","meta-llama\/Llama-2-7b-hf","meta-llama\/Llama-2-13b-chat-hf","meta-llama\/Llama-2-13b-hf"],"index":["MemoTrap, Accuracy","IFEval, Prompt-Level Accuracy"],"data":[[0.641025641,0.7179487179,0.5886752137,0.6079059829,0.7061965812,0.6826923077,0.860042735,0.6314102564,0.8344017094,0.7756410256,0.7574786325,0.7980769231,0.6463675214,0.6346153846,0.594017094,0.7532051282,0.5897435897,0.561965812,0.6645299145,0.5758547009,0.5352564103,0.5854700855,0.858974359,0.7126068376,0.8643162393,0.5641025641,0.5523504274,0.5737179487,0.733974359,0.6143162393,0.6079059829,0.5758547009],[0.2735674677,0.2606284658,0.1626617375,0.2255083179,0.3049907579,0.0868761553,0.0850277264,0.2310536044,0.1293900185,0.1885397412,0.179297597,0.1423290203,0.2402957486,0.033271719,0.0628465804,null,0.2735674677,0.1497227357,0.146025878,0.1700554529,0.0609981516,0.0924214418,0.1534195933,0.1164510166,0.1940850277,null,0.1423290203,0.1423290203,0.2865064695,0.1866913124,0.314232902,0.1829944547]]}
plots/clustermap_instr.pdf CHANGED
Binary files a/plots/clustermap_instr.pdf and b/plots/clustermap_instr.pdf differ
 
plots/clustermap_instr.png CHANGED

Git LFS Details

  • SHA256: b015ab8ffc20966e52ce9930ab669db24e6675ad4b1ebcbf25bceeac9b9694aa
  • Pointer size: 131 Bytes
  • Size of remote file: 629 kB

Git LFS Details

  • SHA256: 6cb1ae218b59a5641df32948125628cb2b1f33fe3de4783359b6368225c74e90
  • Pointer size: 131 Bytes
  • Size of remote file: 651 kB
plots/clustermap_qa.json CHANGED
@@ -1 +1 @@
1
- {"columns":["TheBloke\/Llama-2-13B-chat-GPTQ","TheBloke\/Llama-2-7B-Chat-GPTQ","TheBloke\/Wizard-Vicuna-13B-Uncensored-GPTQ","teknium\/OpenHermes-2-Mistral-7B","mistralai\/Mistral-7B-Instruct-v0.1","bigscience\/bloom-7b1","bigscience\/bloom-560m","berkeley-nest\/Starling-LM-7B-alpha","EleutherAI\/gpt-neo-125m","EleutherAI\/gpt-neo-2.7B","EleutherAI\/gpt-j-6b","EleutherAI\/gpt-neo-1.3B","Gryphe\/MythoMax-L2-13b","Open-Orca\/Mistral-7B-OpenOrca","pankajmathur\/orca_mini_3b","KoboldAI\/OPT-13B-Erebus","ehartford\/dolphin-2.1-mistral-7b","togethercomputer\/LLaMA-2-7B-32K","togethercomputer\/GPT-JT-6B-v1","togethercomputer\/Llama-2-7B-32K-Instruct","HuggingFaceH4\/zephyr-7b-alpha","HuggingFaceH4\/zephyr-7b-beta","tiiuae\/falcon-7b-instruct","tiiuae\/falcon-7b","ai-forever\/mGPT","NousResearch\/Nous-Hermes-Llama2-13b","DiscoResearch\/mixtral-7b-8expert","meta-llama\/Llama-2-7b-chat-hf","meta-llama\/Llama-2-7b-hf","meta-llama\/Llama-2-13b-chat-hf","meta-llama\/Llama-2-13b-hf"],"index":["TruthfulQA MC1, Accuracy","TriviaQA, EM","NQ, EM","TruthfulQA MC2, 
Accuracy"],"data":[[0.2656058752,0.2900856793,0.358629131,0.3390452876,0.3916768666,0.2239902081,0.2447980416,0.3157894737,0.2582619339,0.2386780906,0.2019583843,0.2313341493,0.364749082,0.3525091799,0.2802937576,0.205630355,0.3904528764,0.2558139535,0.2264381885,0.3096695226,0.4063647491,0.3867809058,0.2876376989,0.2239902081,0.2325581395,0.3476132191,0.247246022,0.3023255814,0.2521419829,0.2802937576,0.2594859241],[0.0947949175,0.0817543469,0.0961324119,0.6805060187,0.5224030317,0.2633749443,0.0570664289,0.6594962104,0.0155483727,0.2011257245,0.3931676326,0.1415514935,0.0974141774,0.6583816317,0.3750557289,0.3940592956,0.6674654481,0.0815871601,0.4172982613,0.0927329469,0.6591618368,0.6443379403,0.3915514935,0.5866584931,0.1061078912,0.0984730272,0.0,0.0883303611,0.0921756576,0.0978600089,0.0948506465],[0.0263157895,0.0263157895,0.0232686981,0.0335180055,0.0249307479,0.0329639889,0.0102493075,0.0343490305,0.0049861496,0.0542936288,0.0916897507,0.0368421053,0.028531856,0.0293628809,0.0265927978,0.1307479224,0.0293628809,0.0238227147,0.1091412742,0.0271468144,0.0304709141,0.0315789474,0.135734072,0.2207756233,0.0232686981,0.0254847645,0.0,0.0252077562,0.0268698061,0.0268698061,0.0274238227],[0.4167499124,0.4410061226,0.5164091712,0.509255685,0.5592109222,0.3889466583,0.4243149954,0.4731016618,0.4557936883,0.3986263303,0.3595710074,0.3961377938,0.515367679,0.5225657507,0.423037498,0.352115369,0.558968896,0.3841009802,0.3706818154,0.4572778615,0.5602234073,0.5511952533,0.4407667557,0.3426523695,0.3962402525,0.5026754146,0.4947679694,0.4531160226,0.389651281,0.4394613382,0.3689257684]]}
 
1
+ {"columns":["TheBloke\/Llama-2-13B-chat-GPTQ","TheBloke\/Llama-2-7B-Chat-GPTQ","TheBloke\/Wizard-Vicuna-13B-Uncensored-GPTQ","teknium\/OpenHermes-2-Mistral-7B","mistralai\/Mistral-7B-Instruct-v0.1","bigscience\/bloom-7b1","bigscience\/bloom-560m","berkeley-nest\/Starling-LM-7B-alpha","EleutherAI\/gpt-neo-125m","EleutherAI\/gpt-neo-2.7B","EleutherAI\/gpt-j-6b","EleutherAI\/gpt-neo-1.3B","Gryphe\/MythoMax-L2-13b","Open-Orca\/Mistral-7B-OpenOrca","pankajmathur\/orca_mini_3b","KoboldAI\/OPT-13B-Erebus","ehartford\/dolphin-2.1-mistral-7b","togethercomputer\/LLaMA-2-7B-32K","togethercomputer\/GPT-JT-6B-v1","togethercomputer\/Llama-2-7B-32K-Instruct","HuggingFaceH4\/zephyr-7b-alpha","HuggingFaceH4\/zephyr-7b-beta","tiiuae\/falcon-7b-instruct","tiiuae\/falcon-7b","ai-forever\/mGPT","NousResearch\/Yarn-Mistral-7b-128k","NousResearch\/Nous-Hermes-Llama2-13b","DiscoResearch\/mixtral-7b-8expert","meta-llama\/Llama-2-7b-chat-hf","meta-llama\/Llama-2-7b-hf","meta-llama\/Llama-2-13b-chat-hf","meta-llama\/Llama-2-13b-hf","upstage\/SOLAR-10.7B-Instruct-v1.0"],"index":["TruthfulQA MC1, Accuracy","TriviaQA, EM","NQ, EM","TruthfulQA MC2, 
Accuracy"],"data":[[0.2656058752,0.2900856793,0.358629131,0.3390452876,0.3916768666,0.2239902081,0.2447980416,0.3157894737,0.2582619339,0.2386780906,0.2019583843,0.2313341493,0.364749082,0.3525091799,0.2802937576,0.205630355,0.3904528764,0.2558139535,0.2264381885,0.3096695226,0.4063647491,0.3867809058,0.2876376989,0.2239902081,0.2325581395,0.2741738066,0.3476132191,0.247246022,0.3023255814,0.2521419829,0.2802937576,0.2594859241,null],[0.0947949175,0.0817543469,0.0961324119,0.6805060187,0.5224030317,0.2633749443,0.0570664289,0.6594962104,0.0155483727,0.2011257245,0.3931676326,0.1415514935,0.0974141774,0.6583816317,0.3750557289,0.3940592956,0.6674654481,0.0815871601,0.4172982613,0.0927329469,0.6591618368,0.6443379403,0.3915514935,0.5866584931,0.1061078912,0.7003455194,0.0984730272,0.0,0.0883303611,0.0921756576,0.0978600089,0.0948506465,0.4198060633],[0.0263157895,0.0263157895,0.0232686981,0.0335180055,0.0249307479,0.0329639889,0.0102493075,0.0343490305,0.0049861496,0.0542936288,0.0916897507,0.0368421053,0.028531856,0.0293628809,0.0265927978,0.1307479224,0.0293628809,0.0238227147,0.1091412742,0.0271468144,0.0304709141,0.0315789474,0.135734072,0.2207756233,0.0232686981,null,0.0254847645,0.0,0.0252077562,0.0268698061,0.0268698061,0.0274238227,0.1991689751],[0.4167499124,0.4410061226,0.5164091712,0.509255685,0.5592109222,0.3889466583,0.4243149954,0.4731016618,0.4557936883,0.3986263303,0.3595710074,0.3961377938,0.515367679,0.5225657507,0.423037498,0.352115369,0.558968896,0.3841009802,0.3706818154,0.4572778615,0.5602234073,0.5511952533,0.4407667557,0.3426523695,0.3962402525,0.4225232911,0.5026754146,0.4947679694,0.4531160226,0.389651281,0.4394613382,0.3689257684,null]]}
plots/clustermap_qa.pdf CHANGED
Binary files a/plots/clustermap_qa.pdf and b/plots/clustermap_qa.pdf differ
 
plots/clustermap_qa.png CHANGED

Git LFS Details

  • SHA256: f30511ccb280429306fb2d2ac50270259adc532919bdf7479a01f7e7143a4876
  • Pointer size: 131 Bytes
  • Size of remote file: 731 kB

Git LFS Details

  • SHA256: 55e73a18937b90c3ef546e5cbb6ad6f146e390a0a6b791d623a4d2ad7f31471f
  • Pointer size: 131 Bytes
  • Size of remote file: 775 kB
plots/clustermap_summ.json CHANGED
@@ -1 +1 @@
1
- {"columns":["TheBloke\/Llama-2-13B-chat-GPTQ","TheBloke\/Llama-2-7B-Chat-GPTQ","TheBloke\/Wizard-Vicuna-13B-Uncensored-GPTQ","bigscience\/bloom-7b1","bigscience\/bloom-560m","berkeley-nest\/Starling-LM-7B-alpha","EleutherAI\/gpt-neo-125m","EleutherAI\/gpt-neo-2.7B","EleutherAI\/gpt-j-6b","EleutherAI\/gpt-neo-1.3B","Gryphe\/MythoMax-L2-13b","pankajmathur\/orca_mini_3b","KoboldAI\/OPT-13B-Erebus","togethercomputer\/LLaMA-2-7B-32K","togethercomputer\/GPT-JT-6B-v1","togethercomputer\/Llama-2-7B-32K-Instruct","tiiuae\/falcon-7b-instruct","tiiuae\/falcon-7b","ai-forever\/mGPT","meta-llama\/Llama-2-7b-chat-hf","meta-llama\/Llama-2-7b-hf","meta-llama\/Llama-2-13b-chat-hf","meta-llama\/Llama-2-13b-hf","teknium\/OpenHermes-2-Mistral-7B","mistralai\/Mistral-7B-Instruct-v0.1","Open-Orca\/Mistral-7B-OpenOrca","ehartford\/dolphin-2.1-mistral-7b","HuggingFaceH4\/zephyr-7b-alpha","HuggingFaceH4\/zephyr-7b-beta","NousResearch\/Nous-Hermes-Llama2-13b","DiscoResearch\/mixtral-7b-8expert"],"index":["XSum, ROUGE-L","XSum, factKB","XSum, BERT-P","CNN\/DM, ROUGE-L","CNN\/DM, factKB","CNN\/DM, 
BERT-P"],"data":[[0.0371901778,0.0347891843,0.0428959698,0.1668276326,0.1356427068,0.0504142569,0.1262087692,0.1205109613,0.2136831021,0.1487592997,0.0384377825,0.1854881243,0.2370092437,0.0432590646,0.2647281333,0.042540476,0.1822281713,0.1657837663,0.0141983599,0.0340250019,0.0445139517,0.0359765055,0.0413856309,null,null,null,null,null,null,null,null],[0.0401970892,0.0394161696,0.0428647574,0.5674834215,0.2298042738,0.0397678158,0.2459031195,0.3441669812,0.4791825971,0.4144174002,0.0384018147,0.5666702185,0.4707080535,0.0407451505,0.3412336695,0.0400569668,0.4089209967,0.3192473228,0.1421243598,0.0379350583,0.0428665575,0.0387781905,0.0411868449,null,null,null,null,null,null,null,null],[0.3949103208,0.3837605404,0.4030390077,0.6528773697,0.6049370656,0.4270770697,0.5845326816,0.4448599865,0.6811507025,0.6106414046,0.3989033275,0.6324491786,0.709515113,0.4014707047,0.7352085092,0.3991319337,0.6545591434,0.5731293539,0.4255551427,0.3853941111,0.4050690645,0.3917698265,0.401477077,null,null,null,null,null,null,null,null],[0.0132261781,0.0098347434,0.012722468,0.2075387019,0.1227088127,0.0169222534,0.1243996336,0.1958355522,0.2238202254,0.2016505067,0.0135334171,0.2242330745,0.2130128961,0.0133793406,0.2408143362,0.012671213,0.1882467392,0.1691898288,0.0164863838,0.0103604417,0.0145397085,null,0.0135674045,0.0114292948,0.008927303,0.0157445278,0.0142701427,0.0117323751,0.0111061484,0.0131859896,0.0000716868],[0.1686613542,0.2305715843,0.1762471835,0.9249732766,0.7988642532,0.2070823135,0.7133009049,0.9058066799,0.9396594147,0.8739379747,0.1648057051,0.8669051787,0.9215527018,0.1575508605,0.94858605,0.1617593629,0.8267075767,0.901661868,0.7995222351,0.1580946885,0.1517425501,null,0.155634531,0.1795055867,0.1750336135,0.2233354576,0.1965546069,0.1759309876,0.1893680342,0.1795224981,0.0805047156],[0.3668251897,0.3243336913,0.3548213557,0.6009727172,0.4646713339,0.3964162779,0.4789093344,0.5761275867,0.617234758,0.5888279105,0.3663381487,0.6159694987,0.6594958169,0.36637
61784,0.7087181979,0.3645764744,0.5926767984,0.5112991874,0.4542439283,0.3566615503,0.367965746,null,0.3640197324,0.3652777882,0.3544185533,0.3898203604,0.3785245521,0.3628082484,0.3673437638,0.3722801591,0.3306992617]]}
 
1
+ {"columns":["TheBloke\/Llama-2-13B-chat-GPTQ","TheBloke\/Llama-2-7B-Chat-GPTQ","TheBloke\/Wizard-Vicuna-13B-Uncensored-GPTQ","bigscience\/bloom-7b1","bigscience\/bloom-560m","berkeley-nest\/Starling-LM-7B-alpha","EleutherAI\/gpt-neo-125m","EleutherAI\/gpt-neo-2.7B","EleutherAI\/gpt-j-6b","EleutherAI\/gpt-neo-1.3B","Gryphe\/MythoMax-L2-13b","pankajmathur\/orca_mini_3b","KoboldAI\/OPT-13B-Erebus","togethercomputer\/LLaMA-2-7B-32K","togethercomputer\/GPT-JT-6B-v1","togethercomputer\/Llama-2-7B-32K-Instruct","tiiuae\/falcon-7b-instruct","tiiuae\/falcon-7b","ai-forever\/mGPT","upstage\/SOLAR-10.7B-Instruct-v1.0","meta-llama\/Llama-2-7b-chat-hf","meta-llama\/Llama-2-7b-hf","meta-llama\/Llama-2-13b-chat-hf","meta-llama\/Llama-2-13b-hf","teknium\/OpenHermes-2-Mistral-7B","mistralai\/Mistral-7B-Instruct-v0.1","Open-Orca\/Mistral-7B-OpenOrca","ehartford\/dolphin-2.1-mistral-7b","HuggingFaceH4\/zephyr-7b-alpha","HuggingFaceH4\/zephyr-7b-beta","NousResearch\/Yarn-Mistral-7b-128k","NousResearch\/Nous-Hermes-Llama2-13b","DiscoResearch\/mixtral-7b-8expert"],"index":["XSum, ROUGE-1","XSum, ROUGE-2","XSum, ROUGE-L","XSum, factKB","XSum, BERT-P","CNN\/DM, ROUGE-1","CNN\/DM, ROUGE-2","CNN\/DM, ROUGE-L","CNN\/DM, factKB","CNN\/DM, 
BERT-P"],"data":[[0.0371901778,0.0347891843,0.0428959698,0.2203326049,0.1749600982,0.0504142569,0.1607960902,0.1560967861,0.2724328188,0.1948865046,0.0384377825,0.2457075618,0.298490147,0.0432590646,0.3344535486,0.042540476,0.2352382212,0.2094982527,0.0153408429,0.001309379,0.0340250019,0.0445139517,0.0359765055,0.0413856309,null,null,null,null,null,null,null,null,null],[0.0,0.0,0.0,0.0475386414,0.0262246323,0.0,0.0214328855,0.0377156872,0.0872967218,0.0385742496,0.0,0.0653846178,0.105823501,0.0,0.1240294423,0.0,0.0614764833,0.0613314645,0.0010001753,0.0007904875,0.0,0.0,0.0,0.0,null,null,null,null,null,null,null,null,null],[0.0371901778,0.0347891843,0.0428959698,0.1668276326,0.1356427068,0.0504142569,0.1262087692,0.1205109613,0.2136831021,0.1487592997,0.0384377825,0.1854881243,0.2370092437,0.0432590646,0.2647281333,0.042540476,0.1822281713,0.1657837663,0.0141983599,0.0011478244,0.0340250019,0.0445139517,0.0359765055,0.0413856309,null,null,null,null,null,null,null,null,null],[0.0401970892,0.0394161696,0.0428647574,0.5674834215,0.2298042738,0.0397678158,0.2459031195,0.3441669812,0.4791825971,0.4144174002,0.0384018147,0.5666702185,0.4707080535,0.0407451505,0.3412336695,0.0400569668,0.4089209967,0.3192473228,0.1421243598,0.1811100146,0.0379350583,0.0428665575,0.0387781905,0.0411868449,null,null,null,null,null,null,null,null,null],[0.3949103208,0.3837605404,0.4030390077,0.6528773697,0.6049370656,0.4270770697,0.5845326816,0.4448599865,0.6811507025,0.6106414046,0.3989033275,0.6324491786,0.709515113,0.4014707047,0.7352085092,0.3991319337,0.6545591434,0.5731293539,0.4255551427,0.0021477235,0.3853941111,0.4050690645,0.3917698265,0.401477077,null,null,null,null,null,null,null,null,null],[0.0132261781,0.0098347434,0.012722468,0.2307383556,0.1357875781,0.0169222534,0.1400837608,0.2215130966,0.2555628211,0.2302678427,0.0135334171,0.2558424993,0.2379621599,0.0133793406,0.2670262038,0.012671213,0.2131071975,0.1893372238,0.0165830818,0.0006702693,0.0103604417,0.0145397085,null,0.01
35674045,0.0114292948,0.008927303,0.0157445278,0.0142701427,0.0117323751,0.0111061484,0.0116271987,0.0131859896,0.0000716868],[0.0,0.0,0.000001892,0.089367922,0.0423122873,0.000001892,0.0429218938,0.0925038287,0.1067787126,0.0938882235,0.0,0.0985488725,0.0919108951,0.0,0.1076334391,0.0,0.0774515089,0.0724591632,0.000524905,0.0003540662,0.0,0.0,null,0.0,0.0,0.0,0.000001892,0.000001892,0.0,0.0,0.0,0.0,0.0],[0.0132261781,0.0098347434,0.012722468,0.2075387019,0.1227088127,0.0169222534,0.1243996336,0.1958355522,0.2238202254,0.2016505067,0.0135334171,0.2242330745,0.2130128961,0.0133793406,0.2408143362,0.012671213,0.1882467392,0.1691898288,0.0164863838,0.0006113776,0.0103604417,0.0145397085,null,0.0135674045,0.0114292948,0.008927303,0.0157445278,0.0142701427,0.0117323751,0.0111061484,0.0116271987,0.0131859896,0.0000716868],[0.1686613542,0.2305715843,0.1762471835,0.9249732766,0.7988642532,0.2070823135,0.7133009049,0.9058066799,0.9396594147,0.8739379747,0.1648057051,0.8669051787,0.9215527018,0.1575508605,0.94858605,0.1617593629,0.8267075767,0.901661868,0.7995222351,0.9509124595,0.1580946885,0.1517425501,null,0.155634531,0.1795055867,0.1750336135,0.2233354576,0.1965546069,0.1759309876,0.1893680342,0.166116994,0.1795224981,0.0805047156],[0.3668251897,0.3243336913,0.3548213557,0.6009727172,0.4646713339,0.3964162779,0.4789093344,0.5761275867,0.617234758,0.5888279105,0.3663381487,0.6159694987,0.6594958169,0.3663761784,0.7087181979,0.3645764744,0.5926767984,0.5112991874,0.4542439283,0.0013591523,0.3566615503,0.367965746,null,0.3640197324,0.3652777882,0.3544185533,0.3898203604,0.3785245521,0.3628082484,0.3673437638,0.3637139385,0.3722801591,0.3306992617]]}
plots/clustermap_summ.pdf CHANGED
Binary files a/plots/clustermap_summ.pdf and b/plots/clustermap_summ.pdf differ
 
plots/clustermap_summ.png CHANGED

Git LFS Details

  • SHA256: 8c9555dc15166f0c768e7b5ab67519357eb0ab39026f316cf4e82a4125b8546d
  • Pointer size: 131 Bytes
  • Size of remote file: 790 kB

Git LFS Details

  • SHA256: 0f80554cccdbc8765019771f37c64fd72b4e83856e785790460c8123397e5fca
  • Pointer size: 131 Bytes
  • Size of remote file: 997 kB
src/backend/envs.py CHANGED
@@ -32,6 +32,9 @@ class Tasks(Enum):
32
  task8 = Task("xsum", "rougeL", "XSum", 2)
33
  task9 = Task("cnndm", "rougeL", "CNN/DM", 2)
34
 
 
 
 
35
  task10 = Task("memo-trap", "acc", "memo-trap", 0)
36
  task10_2 = Task("memo-trap_v2", "acc", "memo-trap", 0)
37
 
 
32
  task8 = Task("xsum", "rougeL", "XSum", 2)
33
  task9 = Task("cnndm", "rougeL", "CNN/DM", 2)
34
 
35
+ task8_1 = Task("xsum_v2", "rougeL", "XSum", 0)
36
+ task9_1 = Task("cnndm_v2", "rougeL", "CNN/DM", 0)
37
+
38
  task10 = Task("memo-trap", "acc", "memo-trap", 0)
39
  task10_2 = Task("memo-trap_v2", "acc", "memo-trap", 0)
40
 
src/backend/run_eval_suite.py CHANGED
@@ -4,7 +4,11 @@ from lm_eval.tasks import initialize_tasks, include_task_folder
4
  from src.backend.manage_requests import EvalRequest
5
 
6
  from src.backend.tasks.xsum.task import XSum
 
 
7
  from src.backend.tasks.cnndm.task import CNNDM
 
 
8
  from src.backend.tasks.selfcheckgpt.task import SelfCheckGpt
9
 
10
 
 
4
  from src.backend.manage_requests import EvalRequest
5
 
6
  from src.backend.tasks.xsum.task import XSum
7
+ from src.backend.tasks.xsum.task_v2 import XSumv2
8
+
9
  from src.backend.tasks.cnndm.task import CNNDM
10
+ from src.backend.tasks.cnndm.task_v2 import CNNDMv2
11
+
12
  from src.backend.tasks.selfcheckgpt.task import SelfCheckGpt
13
 
14
 
src/backend/tasks/__init__.py CHANGED
@@ -1,3 +1,7 @@
1
  from src.backend.tasks.xsum.task import XSum
 
 
2
  from src.backend.tasks.cnndm.task import CNNDM
 
 
3
  from src.backend.tasks.selfcheckgpt.task import SelfCheckGpt
 
1
  from src.backend.tasks.xsum.task import XSum
2
+ from src.backend.tasks.xsum.task_v2 import XSumv2
3
+
4
  from src.backend.tasks.cnndm.task import CNNDM
5
+ from src.backend.tasks.cnndm.task_v2 import CNNDMv2
6
+
7
  from src.backend.tasks.selfcheckgpt.task import SelfCheckGpt
src/backend/tasks/cnndm/task_v2.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from lm_eval.api.task import Task
2
+ from lm_eval.api.instance import Instance
3
+ from lm_eval.api.registry import register_task
4
+ from lm_eval.api.metrics import mean
5
+
6
+ import torch
7
+ import sacrebleu
8
+ from rouge_score import rouge_scorer, scoring
9
+
10
+
11
def bleu(refs, preds):
    """
    Compute a `t5` style corpus-level BLEU score. See the related implementation:
    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L41

    :param refs:
        A `list` of `list` of reference `str`s.
    :param preds:
        A `list` of predicted `str`s.
    :returns:
        The `score` attribute of the result returned by `sacrebleu.corpus_bleu`.
    """
    # Scoring options, kept together so the configuration is readable at a glance.
    bleu_options = dict(
        smooth_method="exp",
        smooth_value=0.0,
        force=False,
        lowercase=False,
        tokenize="intl",
        use_effective_order=False,
    )
    bleu_result = sacrebleu.corpus_bleu(preds, refs, **bleu_options)
    return bleu_result.score
32
+
33
+
34
def rouge(refs, preds):
    """
    Returns `t5` style ROUGE scores. See the related implementation:
    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L68

    :param refs:
        A `list` of reference `str`s.
    :param preds:
        A `list` of predicted `str`s.
    :returns:
        A `dict` mapping "rouge1", "rouge2" and "rougeLsum" to the aggregated
        mid-point F-measure multiplied by 100.
    """
    rouge_types = ["rouge1", "rouge2", "rougeLsum"]
    scorer = rouge_scorer.RougeScorer(rouge_types)

    def _prepare_summary(summary):
        # Add newlines between sentences to correctly compute `rougeLsum`.
        return summary.replace(" . ", ".\n")

    # Accumulate confidence intervals.
    aggregator = scoring.BootstrapAggregator()
    # References and predictions are paired positionally.
    for ref, pred in zip(refs, preds):
        aggregator.add_scores(scorer.score(_prepare_summary(ref), _prepare_summary(pred)))
    result = aggregator.aggregate()
    # Named `rouge_type` (not `type`) so the builtin is not shadowed.
    return {rouge_type: result[rouge_type].mid.fmeasure * 100 for rouge_type in rouge_types}
60
+
61
+
62
@register_task("cnndm_v2")
class CNNDMv2(Task):
    """Summarization task over the `cnn_dailymail` dataset (config "3.0.0").

    Every document is rendered as an instruction + article prompt. The
    generated completion is scored with ROUGE and BERTScore against the
    reference `highlights`, and with the FactKB classifier on the
    (completion, article) pair.
    """

    VERSION = 0
    DATASET_PATH = "cnn_dailymail"
    DATASET_NAME = "3.0.0"

    # Keys of the dict returned by `process_results`; shared by `aggregation`
    # and `higher_is_better` so the two cannot drift apart.
    _METRIC_NAMES = (
        "rouge1",
        "rouge2",
        "rougeL",
        "factKB",
        "bertscore_precision",
        "bertscore_recall",
        "bertscore_f1",
    )

    def __init__(self, data_dir=None, cache_dir=None, download_mode=None, config=None):
        """Forward all arguments to `Task` and leave the scoring models unloaded."""
        super().__init__(data_dir=data_dir, cache_dir=cache_dir, download_mode=download_mode, config=config)
        # Loaded lazily by `maybe_init_factkb` / `maybe_init_bertscore`.
        self.factkb_tokenizer = None
        self.factkb_model = None
        self.bert_score = None

    def maybe_init_factkb(self):
        """Load the FactKB tokenizer and classifier unless both are already loaded."""
        if self.factkb_tokenizer is None or self.factkb_model is None:
            # Imported here so `transformers` is only needed once scoring starts.
            from transformers import AutoTokenizer, AutoModelForSequenceClassification
            self.factkb_tokenizer = AutoTokenizer.from_pretrained("roberta-base", padding="max_length", truncation=True)
            self.factkb_model = AutoModelForSequenceClassification.from_pretrained("bunsenfeng/FactKB", num_labels=2, device_map="auto")

    def maybe_init_bertscore(self):
        """Load the `bertscore` metric from `evaluate` unless it is already loaded."""
        if self.bert_score is None:
            from evaluate import load
            self.bert_score = load("bertscore")

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def training_docs(self):
        return self.dataset["train"]

    def validation_docs(self):
        return self.dataset["validation"]

    def test_docs(self):
        return self.dataset["test"]

    def prompt(self):
        """Return the instruction placed at the top of every prompt."""
        res = "Provide a summary of the provided article."
        return res

    def doc_to_text(self, doc):
        """Render the prompt for `doc`: instruction, the article, then a `Summary:` cue."""
        return f'{self.prompt()}\n\nArticle: {doc["article"]}\nSummary:'

    @staticmethod
    def should_decontaminate():
        return True

    def doc_to_decontamination_query(self, doc):
        return doc["article"]

    def doc_to_target(self, doc):
        """Return the reference summary (the `highlights` field)."""
        return doc["highlights"]

    def construct_requests(self, doc, ctx, **kwargs):
        """Build the single generation request sent to the LM for `doc`.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        :returns:
            A one-element list holding a `generate_until` `Instance` whose
            stop sequence is a newline.
        """
        return [
            Instance(
                request_type="generate_until",
                doc=doc,
                arguments=(ctx, {"until": ["\n"]}),
                idx=0,
                **kwargs
            )
        ]

    def process_results(self, doc, results):
        """Score one generated summary.

        :param doc:
            The dataset document; its `article` and `highlights` fields are read.
        :param results:
            The LM outputs for this document; only `results[0]` is used.
        :returns:
            A dict with the keys listed in `_METRIC_NAMES`.
        """
        completion = results[0]

        document = doc["article"]
        gold_summary = doc["highlights"]

        # One reference per document, so a single `rouge` call suffices.
        rouge_scores = rouge([gold_summary], [completion])

        self.maybe_init_factkb()
        input_factkb = [[completion, document]]
        factkb_tokens = self.factkb_tokenizer(
            input_factkb, return_tensors="pt", padding="max_length", truncation=True
        ).to(self.factkb_model.device)
        # Pure inference: disable autograd so no graph is built per scored document.
        with torch.no_grad():
            factkb_logits = self.factkb_model(**factkb_tokens).logits
            factkb_res = torch.softmax(factkb_logits, dim=1)

        self.maybe_init_bertscore()
        bert_score_res = self.bert_score.compute(predictions=[completion], references=[gold_summary], model_type="microsoft/deberta-xlarge-mnli", lang="en")

        res = {
            "rouge1": rouge_scores["rouge1"],
            "rouge2": rouge_scores["rouge2"],
            # The sentence-split "rougeLsum" variant is what gets reported as "rougeL".
            "rougeL": rouge_scores["rougeLsum"],
            # Probability of class index 1 — presumably the "factual" label; confirm against the FactKB model card.
            "factKB": float(factkb_res[0][1]),
            "bertscore_precision": float(bert_score_res["precision"][0]),
            "bertscore_recall": float(bert_score_res["recall"][0]),
            "bertscore_f1": float(bert_score_res["f1"][0])
        }

        return res

    def aggregation(self):
        """
        :returns: {str: [float] -> float}
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
        return {k: mean for k in self._METRIC_NAMES}

    def higher_is_better(self):
        """
        :returns: {str: bool}
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
        return {k: True for k in self._METRIC_NAMES}
198
+
src/backend/tasks/xsum/task_v2.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from lm_eval.api.task import Task
2
+ from lm_eval.api.instance import Instance
3
+ from lm_eval.api.registry import register_task
4
+ from lm_eval.api.metrics import mean
5
+
6
+ import torch
7
+ import sacrebleu
8
+ from rouge_score import rouge_scorer, scoring
9
+
10
+
11
def bleu(refs, preds):
    """
    Compute a `t5` style corpus-level BLEU score. See the related implementation:
    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L41

    :param refs:
        A `list` of `list` of reference `str`s.
    :param preds:
        A `list` of predicted `str`s.
    :returns:
        The `score` attribute of the result returned by `sacrebleu.corpus_bleu`.
    """
    # Scoring options, kept together so the configuration is readable at a glance.
    bleu_options = dict(
        smooth_method="exp",
        smooth_value=0.0,
        force=False,
        lowercase=False,
        tokenize="intl",
        use_effective_order=False,
    )
    bleu_result = sacrebleu.corpus_bleu(preds, refs, **bleu_options)
    return bleu_result.score
24
+
25
+
26
def rouge(refs, preds):
    """
    Returns `t5` style ROUGE scores. See the related implementation:
    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L68

    :param refs:
        A `list` of reference `str`s.
    :param preds:
        A `list` of predicted `str`s.
    :returns:
        A `dict` mapping "rouge1", "rouge2" and "rougeLsum" to the aggregated
        mid-point F-measure multiplied by 100.
    """
    rouge_types = ["rouge1", "rouge2", "rougeLsum"]
    scorer = rouge_scorer.RougeScorer(rouge_types)

    def _prepare_summary(summary):
        # Add newlines between sentences to correctly compute `rougeLsum`.
        return summary.replace(" . ", ".\n")

    # Accumulate confidence intervals.
    aggregator = scoring.BootstrapAggregator()
    # References and predictions are paired positionally.
    for ref, pred in zip(refs, preds):
        aggregator.add_scores(scorer.score(_prepare_summary(ref), _prepare_summary(pred)))
    result = aggregator.aggregate()
    # Named `rouge_type` (not `type`) so the builtin is not shadowed.
    return {rouge_type: result[rouge_type].mid.fmeasure * 100 for rouge_type in rouge_types}
52
+
53
+
54
@register_task("xsum_v2")
class XSumv2(Task):
    """Summarization task over the `EdinburghNLP/xsum` dataset.

    Every document is rendered as an instruction + document prompt. The
    generated completion is scored with ROUGE and BERTScore against the
    reference `summary`, and with the FactKB classifier on the
    (completion, document) pair.
    """

    VERSION = 0
    DATASET_PATH = "EdinburghNLP/xsum"
    DATASET_NAME = None

    # Keys of the dict returned by `process_results`; shared by `aggregation`
    # and `higher_is_better` so the two cannot drift apart.
    _METRIC_NAMES = (
        "rouge1",
        "rouge2",
        "rougeL",
        "factKB",
        "bertscore_precision",
        "bertscore_recall",
        "bertscore_f1",
    )

    def __init__(self, data_dir=None, cache_dir=None, download_mode=None, config=None):
        """Forward all arguments to `Task` and leave the scoring models unloaded."""
        super().__init__(data_dir=data_dir, cache_dir=cache_dir, download_mode=download_mode, config=config)
        # Loaded lazily by `maybe_init_factkb` / `maybe_init_bertscore`.
        self.factkb_tokenizer = None
        self.factkb_model = None
        self.bert_score = None

    def maybe_init_factkb(self):
        """Load the FactKB tokenizer and classifier unless both are already loaded."""
        if self.factkb_tokenizer is None or self.factkb_model is None:
            # Imported here so `transformers` is only needed once scoring starts.
            from transformers import AutoTokenizer, AutoModelForSequenceClassification
            self.factkb_tokenizer = AutoTokenizer.from_pretrained("roberta-base", padding="max_length", truncation=True)
            self.factkb_model = AutoModelForSequenceClassification.from_pretrained("bunsenfeng/FactKB", num_labels=2, device_map="auto")

    def maybe_init_bertscore(self):
        """Load the `bertscore` metric from `evaluate` unless it is already loaded."""
        if self.bert_score is None:
            from evaluate import load
            self.bert_score = load("bertscore")

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def training_docs(self):
        return self.dataset["train"]

    def validation_docs(self):
        return self.dataset["validation"]

    def test_docs(self):
        return self.dataset["test"]

    def prompt(self):
        """Return the instruction placed at the top of every prompt."""
        res = "Provide a summary of the provided document."
        return res

    def doc_to_text(self, doc):
        """Render the prompt for `doc`: instruction, the document, then a `Summary:` cue."""
        return f'{self.prompt()}\n\nDocument: {doc["document"]}\nSummary:'

    @staticmethod
    def should_decontaminate():
        return True

    def doc_to_decontamination_query(self, doc):
        return doc["document"]

    def doc_to_target(self, doc):
        """Return the reference summary (the `summary` field)."""
        return doc["summary"]

    def construct_requests(self, doc, ctx, **kwargs):
        """Build the single generation request sent to the LM for `doc`.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        :returns:
            A one-element list holding a `generate_until` `Instance` whose
            stop sequence is a newline.
        """
        return [
            Instance(
                request_type="generate_until",
                doc=doc,
                arguments=(ctx, {"until": ["\n"]}),
                idx=0,
                **kwargs
            )
        ]

    def process_results(self, doc, results):
        """Score one generated summary.

        :param doc:
            The dataset document; its `document` and `summary` fields are read.
        :param results:
            The LM outputs for this document; only `results[0]` is used.
        :returns:
            A dict with the keys listed in `_METRIC_NAMES`.
        """
        completion = results[0]

        document = doc["document"]
        gold_summary = doc["summary"]

        # One reference per document, so a single `rouge` call suffices.
        rouge_scores = rouge([gold_summary], [completion])

        self.maybe_init_factkb()
        input_factkb = [[completion, document]]
        factkb_tokens = self.factkb_tokenizer(
            input_factkb, return_tensors="pt", padding="max_length", truncation=True
        ).to(self.factkb_model.device)
        # Pure inference: disable autograd so no graph is built per scored document.
        with torch.no_grad():
            factkb_logits = self.factkb_model(**factkb_tokens).logits
            factkb_res = torch.softmax(factkb_logits, dim=1)

        self.maybe_init_bertscore()
        bert_score_res = self.bert_score.compute(predictions=[completion], references=[gold_summary], model_type="microsoft/deberta-xlarge-mnli", lang="en")

        res = {
            "rouge1": rouge_scores["rouge1"],
            "rouge2": rouge_scores["rouge2"],
            # The sentence-split "rougeLsum" variant is what gets reported as "rougeL".
            "rougeL": rouge_scores["rougeLsum"],
            # Probability of class index 1 — presumably the "factual" label; confirm against the FactKB model card.
            "factKB": float(factkb_res[0][1]),
            "bertscore_precision": float(bert_score_res["precision"][0]),
            "bertscore_recall": float(bert_score_res["recall"][0]),
            "bertscore_f1": float(bert_score_res["f1"][0]),
        }

        return res

    def aggregation(self):
        """
        :returns: {str: [float] -> float}
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
        return {k: mean for k in self._METRIC_NAMES}

    def higher_is_better(self):
        """
        :returns: {str: bool}
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
        return {k: True for k in self._METRIC_NAMES}