Spaces:

mrm8488
/

PromptSource

Build error

App Files Files Community

PromptSource / promptsource /seqio_tasks /experiment_D4.csv

mrm8488

First commit

c32ee7d about 3 years ago

raw

history blame

36.3 kB

	HF_name,subset,task_by_convention,format,comment,seed_paper,september_check,do_train,do_eval,train_size,adjusted_train_size,D3_do_train,D3_do_eval,D3_adjusted_train_size,metric,multiple correct answer,Paper link,non_linguistic_knowledge,skip,Imported Task Name,imported category,input_length,_human_skill,Domain,Reference
	crows_pairs,,bias_and_fairness,,test set only; authors themselves acknowledge some problems,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
	jigsaw_toxicity_pred,,bias_and_fairness,,current https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data ; want https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
	super_glue,axg,bias_and_fairness,cls,test set only,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
	winogender,,bias_and_fairness,cls,also as axg in super_glue,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
	wino_bias,type1_anti,bias_and_fairness,cls,,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
	wino_bias,type2_anti,bias_and_fairness,cls,,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
	wino_bias,type1_pro,bias_and_fairness,cls,,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
	wino_bias,type2_pro,bias_and_fairness,cls,,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
	super_glue,wsc.fixed,coreference,cls,,,,,TRUE,554,0,TRUE,TRUE,554,accuracy,,https://arxiv.org/pdf/1905.00537.pdf,,,superglue-wsc,cls/other,single sentence,knowledge-? reading comprehension,,Levesque et al. 2012
	winograd_wsc,wsc273,coreference,ext,,GPT,,,TRUE,0,0,,,0,accuracy,,https://www.aaai.org/ocs/index.php/KR/KR12/paper/download/4492/4924,,,,,,,,Levesque et al. 2012
	winogrande,winogrande_xl,coreference,ext,,GPT,TRUE,,TRUE,40398,0,,,0,accuracy,,https://arxiv.org/pdf/1907.10641.pdf,,,WinoGrande,qa/multiple-choice qa,,knowledge-? reading comprehension,,Sakaguchi et al. 2020
	winogrande,winogrande_debiased,coreference,ext,"""debiased"" = adversarially filtered",GPT,TRUE,,TRUE,9248,0,,,0,accuracy,,https://arxiv.org/pdf/1907.10641.pdf,,,WinoGrande,qa/multiple-choice qa,,knowledge-? reading comprehension,,Sakaguchi et al. 2020
	glue,cola,grammatical_acceptability,cls,includes semantic acceptability too; to be replaced by blimp,,,,TRUE,8551,0,,TRUE,0,accuracy;matthews_corrcoef,,https://arxiv.org/pdf/1805.12471.pdf,,,glue-cola,cls/other,single sentence,,,Warstadt et al. 2019
	super_glue,cb,NLI,cls,"""for multi-class F1 we compute the unweighted average of the F1 per class.""",,TRUE,,TRUE,250,0,,TRUE,0,mean_multiclass_f1;accuracy,,https://semanticsarchive.net/Archive/Tg3ZGI2M/Marneffe.pdf,,,superglue-cb,cls/nli,sentence pair,knowledge-neutral inference,,de Marneffe et al. 2019
	super_glue,rte,NLI,cls,,,TRUE,,TRUE,2490,0,,TRUE,0,accuracy,,https://arxiv.org/pdf/1905.00537.pdf,,,superglue-rte,cls/nli,sentence pair,knowledge modest inference,,Dagan et al. 2005; Bar-Haim et al. 2006 Giampiccolo et al. 2007; Bentivogli et al. 2009
	anli,,NLI,cls,"In addition to accuracy, paper also evaluates on range of relaxed/strict and matched/unmatched settings and reports F scores for different answers",,,,TRUE,162865,0,,TRUE,0,accuracy,,https://arxiv.org/abs/1910.14599,,,anli,cls/nli,sentence pair,knowledge modest inference,,Nie et al. 2020
	hans,,NLI,cls,,,TRUE,,TRUE,0,0,,TRUE,0,accuracy,,https://arxiv.org/pdf/1902.01007.pdf,,,,,sentence pair,syntax?,,McCoy et al. 2019
	super_glue,axb,NLI,cls,test set only,,TRUE,,TRUE,0,0,,,,,,,,,,,,,,
	glue,mrpc,paraphrase,cls,,,,TRUE,TRUE,3668,3668,TRUE,TRUE,3668,accuracy;f1_score,,https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/I05-50025B15D.pdf,,,glue-mrpc,cls/paraphrase,,paraphrase,,Dolan and Brockett 2005
	glue,qqp,paraphrase,cls,,,,TRUE,TRUE,363846,363846,TRUE,,363846,accuracy;f1_score,,https://aclanthology.org/I05-5002.pdf,,,glue-qqp,cls/paraphrase,,,,(link)
	paws,labeled_final,paraphrase,cls,,,,TRUE,,49401,49401,TRUE,,49401,,,,,,paws,cls/paraphrase,,,,Zhang et al. 2019
	ai2_arc,ARC-Challenge,QA_closed_book,cls,,GPT,,,TRUE,1119,0,TRUE,,1119,"accuracy_with_tie : For each question, a system receives 1 point if it
	chooses the correct answer and 1/k if it reports a k-way tie
	(i.e., chooses multiple answers) that includes the correct answer.",,https://arxiv.org/pdf/1803.05457.pdf,mid-intensive,,ARC (chal.),qa/multiple-choice qa,,nontrivial_comprehension,,Clark et al. 2018
	ai2_arc,ARC-Easy,QA_closed_book,cls,,GPT,,,TRUE,2251,0,TRUE,,2251,"accuracy_with_tie: For each question, a system receives 1 point if it
	chooses the correct answer and 1/k if it reports a k-way tie
	(i.e., chooses multiple answers) that includes the correct answer.",,https://arxiv.org/pdf/1803.05457.pdf,mid-intensive,,ARC (easy),Multiple choice,,,,
	nq_open,,QA_closed_book,gen,,GPT,TRUE,,TRUE,87925,0,,TRUE,0,kilt-exact_match;average_accuracy_accross_answers,TRUE,https://direct.mit.edu/tacl/article/doi/10.1162/tacl_a_00276/43518/Natural-Questions-A-Benchmark-for-Question,intensive,,Natural Questions (open domain),,,trivia,,
	kilt_tasks,hotpotqa,QA_closed_book,gen,recast as closed-book due to input length,self,,TRUE,,88869,88869,,,,,,,,,kilt hotpotqa,qa/closed-book qa,,encyclopedia; multi-hop QA,,Yang et al. 2018
	trivia_qa,unfiltered,QA_closed_book,gen,,GPT,TRUE,,TRUE,87622,0,TRUE,,87622,exact_match;f1_over_words => wikipedia aliases are considered valid answers,TRUE,https://arxiv.org/pdf/1705.03551.pdf,intensive,,Trivia QA,,,,,
	web_questions,,QA_closed_book,gen,"""supposed to be answerable by Freebase"" Check corpora deduplication with freebaseqa.",GPT,,,TRUE,3778,0,TRUE,,3778,accuracy : they don't mention how they normalize across multiple correct answers,TRUE,https://aclanthology.org/D13-1160.pdf,intensive,,web questions,qa/closed-book qa,,,,Berant et al. 2013
	wiki_qa,,QA_closed_book,cls,,CrossFit,,TRUE,,20360,20360,,,,,,https://aclanthology.org/D15-1237.pdf,,,wiki qa,cls/other,,,,Yang et al. 2015
	adversarial_qa,dbidaf,QA_extractive,ext,,,TRUE,TRUE,,10000,10000,TRUE,,10000,,,https://aclanthology.org/2020.tacl-1.43/,,,adversarialqa,qa/machine reading comprehension,,,,Bartolo et al. 2020
	adversarial_qa,dbert,QA_extractive,ext,,,TRUE,TRUE,,10000,10000,TRUE,,10000,,,,,,,,,,,
	adversarial_qa,droberta,QA_extractive,ext,,,TRUE,TRUE,,10000,10000,TRUE,,10000,,,,,,,,,,,
	coqa,,QA_extractive,ext,GPT-easy,GPT,,,TRUE,7199,,,,,"macro_average_f1: for computing a model’s performance, each individual prediction is compared
	against n human answers resulting in n F1 scores,
	the maximum of which is chosen as the prediction’s
	F1.For each question, we average out F1 across
	these n sets, both for humans and models. In our
	final evaluation, we use n = 4 human answers for
	every question (the original answer and 3 additionally collected answers). The articles a, an and the
	and punctuations are excluded in evaluation.",from the paper it seems it could contain multiple answers but the datasets has only one answer per question,https://arxiv.org/pdf/1808.07042.pdf,,,,,,,,
	duorc,SelfRC,QA_extractive,ext,,TaskEmbed;CrossFit,,TRUE,,60721,60721,,,,,,https://duorc.github.io/,,,DuoRC,qa/machine reading comprehension,,,Wikipedia/IMDB crowd,Saha et al. 2018
	duorc,ParaphraseRC,QA_extractive,ext,,TaskEmbed;CrossFit,,TRUE,,69524,69524,,,,,,https://arxiv.org/pdf/1804.07927.pdf,,,DuoRC,paraphrased QA,,,,Saha et al. 2018
	ropes,,QA_extractive,ext,,,TRUE,TRUE,,10924,10924,TRUE,,10924,,,,modest,,ropes,Extractive QA,,cause_and_effect;nontrivial_comprehension,,Lin et al. 2019
	squad_v2,,QA_extractive,ext,,GPT,,,TRUE,130319,0,TRUE,,130319,exact_match;f1_score,TRUE,https://arxiv.org/pdf/1806.03822.pdf,,,SQuAD 2.0,Extractive QA,,,,Rajpurkar et al. 2018
	super_glue,record,QA_extractive,ext,,,TRUE,,TRUE,100730,0,TRUE,TRUE,100730,max_token_level_f1;exact_match,TRUE,https://arxiv.org/pdf/1810.12885.pdf,,,superglue-record,qa/machine reading comprehension,,knowledge-? reading comprehension,,Zhang et al. 2018
	qa_srl,,QA_extractive,ext,"need non-naive metric (""If the predicted word is contained inside the annotated answer span it is considered a correct prediction.""); v2 not in HF https://aclanthology.org/P18-1191.pdf",Eval WG,,,TRUE,6414,0,TRUE,TRUE,6414,accuracy,TRUE,https://dada.cs.washington.edu/qasrl/#page-top,neutral,,qa srl,other,,semantic role,,He et al. 2015
	quac,,QA_extractive,ext,,GPT,,,TRUE,11567,,,,,"average_maximum_f1;HEQ-Q;HEQ-D: To make oracle human and system performance comparable,
	given n references, we report the average of the
	maximum F1 computed from each n − 1 subset
	with respect to the heldout reference.",TRUE,https://arxiv.org/pdf/1808.07036.pdf,,,,,,dialogue,,
	quoref,,QA_extractive,ext,,,TRUE,TRUE,,19399,19399,TRUE,,19399,,,https://aclanthology.org/D19-1606.pdf,,,Quoref,Extractive QA,,,,Dasigi et al. 2019
	tydiqa,,QA_extractive,ext,,Eval WG,,TRUE,,9211,9211,,,,,,,,,,,,,,
	drop,,QA_generative,gen,"nontrivial math; try history_690, it's pretty hard even when I have domain knowledge",GPT,TRUE,,TRUE,,,,,,exact_match; macro_average_f1,TRUE,https://aclanthology.org/N19-1246.pdf,,,DROP ,multi-hop quantitative reasoning; Abstractive QA,,numerical,Wikipedia crowd,Dua et al. 2019
	cos_e,v1.11,QA_multiple_choice,cls,"same as commonsense_qa but with (poorly sourced) human explanations; questionable ""commonsense"" lots of world knowledge",Vania,TRUE,TRUE,,9741,9741,TRUE,,9741,,,,,,cos e,other/generate explanation,,,,Rajani et al. 2019
	cosmos_qa,,QA_multiple_choice,cls,,,TRUE,TRUE,,25262,25262,TRUE,,25262,,,,,,cosmos qa,qa/multiple-choice qa,,,,Huang et al. 2019
	dream,,QA_multiple_choice,cls,,,TRUE,TRUE,,6116,6116,TRUE,,6116,,,,,,dream,qa/multiple-choice qa,,,,Sun et al. 2019
	openbookqa,main,QA_multiple_choice,cls,interesting combo of pragmatics + scientific reasoning,GPT,,,TRUE,4957,0,TRUE,TRUE,4957,"accuracy_with_tie : For each question, a system receives 1 point if it
	chooses the correct answer and 1/k if it reports a k-way tie
	(i.e., chooses multiple answers) that includes the correct answer.",,https://aclanthology.org/D18-1260.pdf,modest,,openbookqa,qa/multiple-choice qa,,pragmatics,,Mihaylov et al. 2018
	qasc,,QA_multiple_choice,cls,,,TRUE,TRUE,,8134,8134,TRUE,,8134,,,,given?,,qasc,qa/multiple-choice qa,,,,Khot et al. 2020
	quail,,QA_multiple_choice,cls,,,TRUE,TRUE,,10246,10246,TRUE,,10246,,,,,,quail,qa/multiple-choice qa,,,,Rogers et al. 2020
	quarel,,QA_multiple_choice,cls,,CrossFit,,TRUE,,1941,1941,,,,,,,,,quarel,qa/multiple-choice qa,,logical form,,Tafjord et al. 2019a
	quartz,,QA_multiple_choice,cls,,,TRUE,TRUE,,2696,2696,TRUE,,2696,,,https://aclanthology.org/D19-1608.pdf,given?,,quartz-with knowledge,qa/multiple-choice qa,,,,Tafjord et al. 2019b
	race,high,QA_multiple_choice,cls,GPT-hard,GPT,,,TRUE,62445,0,TRUE,TRUE,62445,accuracy,,https://arxiv.org/pdf/1704.04683.pdff,neutral,,race-high,qa/multiple-choice qa,,knowledge-neutral reading comprehension,,Lai et al. 2017
	race,middle,QA_multiple_choice,cls,"revisit: define as comprehension, paragraph level?",GPT,,,TRUE,25421,0,TRUE,TRUE,25421,accuracy,,https://arxiv.org/pdf/1704.04683.pdf,neutral,,race-middle,qa/multiple-choice qa,,knowledge-neutral reading comprehension,,Lai et al. 2017
	sciq,,QA_multiple_choice,cls,,,TRUE,TRUE,,11679,11679,TRUE,,11679,,,,,,sciq,qa/multiple-choice qa,,,,Welbl et al. 2017
	social_i_qa,,QA_multiple_choice,cls,metric differ by prompt: 4-way classification cast as binary ,,TRUE,TRUE,TRUE,33410,33410,TRUE,TRUE,33410,accuracy,,https://arxiv.org/pdf/1904.09728.pdf,,,SIQA,qa/multiple-choice qa,,cultural knowledge,,Sap et al. 2019
	super_glue,boolq,QA_multiple_choice,cls,,,TRUE,,TRUE,9427,0,TRUE,TRUE,9427,accuracy,,https://arxiv.org/pdf/1905.10044.pdf,neutral?,,superglue-boolq,,,knowledge-? reading comprehension,,
	super_glue,copa,QA_multiple_choice,cls,,,TRUE,,TRUE,400,0,TRUE,TRUE,400,accuracy,,http://commonsensereasoning.org/2011/papers/Roemmele.pdf,modest,,superglue-copa,qa/multiple-choice qa,,causal cognition,,Gordon et al. 2012
	super_glue,multirc,QA_multiple_choice,cls,F1 over all answer options. See paper p. 259 for defintion,,TRUE,,TRUE,27243,0,TRUE,TRUE,27243,f1_over_all_options;exact_match,,https://aclanthology.org/N18-1023.pdf,neutral?,,superglue-multirc,qa/multiple-choice qa,,knowledge-? reading comprehension,,Khashabi et al. 2018
	wiki_hop,original,QA_multiple_choice,cls,,,TRUE,TRUE,,43738,43738,TRUE,,43738,,,https://transacl.org/ojs/index.php/tacl/article/viewFile/1325/299,,,WikiHop (Welbl et al. 2018),multi-hop QA,,,Wikipedia KB,
	wiqa,,QA_multiple_choice,cls,,,TRUE,TRUE,,29808,29808,TRUE,,29808,,,,,,wiqa,qa/multiple-choice qa,,cause_and_effect,,Tandon et al. 2019
	circa,,QA_multiple_choice,cls,revisit: problematic prompts,,,,TRUE,34268,0,,TRUE,0,mean_multiclass_f1;accuracy,,https://arxiv.org/pdf/2010.03450.pdf,,,circa,cls/other,,pragmatics,,Louis et al. 2020
	mc_taco,,QA_multiple_choice,cls,no train set; variable number of answer_chocies; eval in paper is over set of possible candidates;,,,,TRUE,0,0,,TRUE,0,exact_match; f1_score,,https://arxiv.org/pdf/1909.03065.pdf,,,mc taco,qa/binary,,temporal cognition,,Zhou et al. 2019
	piqa,,QA_multiple_choice,cls,revisit: not just other,GPT,,,TRUE,16113,0,TRUE,,16113,accuracy,,https://arxiv.org/pdf/1911.11641.pdf,,,PIQA,Multiple choice,,physical_cognition,,Bisk et al. 2020
	amazon_polarity,,sentiment,cls,,,TRUE,TRUE,,3600000,500000,TRUE,,500000,,,https://cs.stanford.edu/people/jure/pubs/reviews-recsys13.pdf,,,amazon polarity,cls/sentiment analysis,,,,McAuley and Leskovec 2013
	app_reviews,,sentiment,cls,,,TRUE,TRUE,,288065,288065,TRUE,,288065,,,,,,app reviews,other/regression,,,,Missing
	imdb,,sentiment,cls,,,TRUE,TRUE,,25000,25000,TRUE,,25000,,,,,,imdb,cls/sentiment analysis,,no dev set,,Maas et al. 2011
	rotten_tomatoes,,sentiment,cls,,,TRUE,TRUE,,8530,8530,TRUE,,8530,,,,,,rotten tomatoes,cls/sentiment analysis,,,,Pang and Lee 2005
	yelp_review_full,,sentiment,cls,no dev set,,TRUE,TRUE,,650000,500000,TRUE,,500000,,,,,,yelp review full,other/regression,,,,Zhang et al. 2015; (link)
	lambada,,story_completion,gen,revisit: story or cloze or coref? trivial cloze prompt; training set is just unlabeled corpora; GPT task,GPT,,,TRUE,0,0,,TRUE,0,accuracy;perplexity;median_rank,,https://arxiv.org/pdf/1606.06031.pdf,,,,,,,,
	craffel/openai_lambada,,story_completion,gen,revisit: story or cloze or coref? trivial cloze prompt; training set is just unlabeled corpora; GPT task,GPT,,,TRUE,0,0,,TRUE,0,accuracy;perplexity;median_rank,,https://arxiv.org/pdf/1606.06031.pdf,,,,,,,,
	story_cloze,2016,story_completion,cls,todo: custom loading; swag like?,GPT,,,TRUE,,0,,TRUE,0,accuracy,,https://arxiv.org/pdf/1604.01696.pdf,,,,,,,,
	hellaswag,,story_completion,cls,,GPT,,,TRUE,39905,0,TRUE,,39905,accuracy,,https://arxiv.org/pdf/1905.07830.pdf,,,hellaswag,qa/multiple-choice qa,,,,Zellers et al. 2019
	common_gen,,structure_to_text,gen,,,TRUE,TRUE,,67389,67389,TRUE,,67389,,,,,,common gen,other,,,,Lin et al. 2020b
	wiki_bio,,structure_to_text,gen,,,TRUE,TRUE,,582659,500000,TRUE,,500000,,,,,,wiki bio,cg/other,,,,Lebret et al. 2016
	cnn_dailymail,3.0.0,summarization,gen,,,TRUE,TRUE,,287113,287113,TRUE,,287113,,,,,,,,,,,
	gigaword,,summarization,gen,,,TRUE,TRUE,,3803957,500000,TRUE,,500000,,,,,,gigaword,cg/summarization,,,,Napoles et al. 2012
	multi_news,,summarization,gen,,CrossFit,,TRUE,,44972,44972,,,,,,,,,multi news,cg/summarization,,,,Fabbri et al. 2019
	samsum,,summarization,gen,,CrossFit,,TRUE,,14732,14732,,,,,,,,,samsum,cg/summarization,,,,Gliwa et al. 2019
	xsum,,summarization,gen,,,TRUE,TRUE,TRUE,204045,204045,TRUE,TRUE,204045,rouge,,https://arxiv.org/pdf/1808.08745.pdf,,,xsum,cg/summarization,,,,Narayan et al. 2018
	ag_news,,topic_classification,cls,,,TRUE,TRUE,,120000,120000,TRUE,,120000,,,http://groups.di.unipi.it/~gulli/AG_corpus_of_news_articles.html,,,ag news,cls/topic,,,,Gulli (link)
	dbpedia_14,,topic_classification,cls,,,TRUE,TRUE,,560000,500000,TRUE,,500000,,,https://svn.aksw.org/papers/2013/SWJ_DBpedia/public.pdf,,,dbpedia 14,cls/topic,,,,Lehmann et al. 2015
	trec,,topic_classification,cls,,,TRUE,TRUE,,5452,5452,TRUE,,5452,,,https://trec.nist.gov/data/qa.html,,,trec,cls/other,,,,Li and Roth 2002; Hovy et al. 2001
	super_glue,wic,word_sense_disambiguation,cls,,,TRUE,,TRUE,5428,0,TRUE,TRUE,5428,accuracy,,https://arxiv.org/pdf/1808.09121.pdf,,,superglue-wic,cls/other,,lexical_knowledge,,Pilehvar and Camacho-Collados 2019
	Staging Area,,,,,,,,,,,,,,,,,,,,,,,,
	Would Include but not in HF or some other practical limitations,,,,,,,,,,,,,,,,,,,,,,,,
	definite_pronoun_resolution,,coreference,,todo: download error,,,,,,,,,,,,,,,deﬁnite pronoun resolution,other,,,,Rahman and Ng 2012
	jeopardy,,closed-book qa,gen,sporadic download error,CrossFit,,,,,,,,,,,,,promptsource download error,jeopardy,qa/closed-book qa,,,,(link)
	blimp,,,cls,no prompts yet; collapse subsets,,,,,,0,,,0,,,,,,,,,,,
	Hendrycks et al. 2021,,,,https://arxiv.org/abs/2009.03300v3,,,,,,,,,,,,,,,,,,,,
	Multi-Turn Dialogue Reasoning,,,,https://aclanthology.org/2020.acl-main.130.pdf,Vania,,,,7088,,,,,,,,,,,,,,,
	Argument Reasoning Comprehension Task,,,,https://aclanthology.org/N18-1175.pdf,Vania,,,,1211,,,,,,,,,,,,,,,
	MCScript,,,,https://aclanthology.org/L18-1564.pdf,Vania,,,,14191,,,,,,,,,,,,,,,
	narrativeqa,,,,very long input sequence,,,,,,,,,,,,,,skip for experiment D3: very long input sequence,NarQA,Abstractive QA,,,,
	newsqa,,,,download error,TaskEmbed,,,,,,,,,,,,,promptsource download error,NewsQA,Extractive QA,,,,Trischler et al. 2017
	eli5,,,,dataset split error,CrossFit,,,,,,,,,,,https://facebookresearch.github.io/ELI5/explore.html,,skip: HF datasets error the split field is used for subsets,eli5-askh,qa/long-form qa,,possibly knowledge-neutral,,Fan et al. 2019
	Maybe Reconsider,,,,,,,,,,,,,,,,,,,,,,,,
	zest,,,,its original task is quite complex (need to provide a decision function); should be held-out eval only,self,,,,,,,,,,,,,,,,,,,
	swag,,story_completion,cls,revisit whether this should be considered as a variant of NLI,,,,,73546,0,TRUE,,73546,,,,,,swag,qa/multiple-choice qa,,,,Zellers et al. 2018
	codah,codah,story_completion,cls,a variant of swag revisit whether this should be considered as a variant of NLI,,,,,2776,0,TRUE,,2776,,,,,,codah,qa/multiple-choice qa,,,,Chen et al. 2019
	wiki_auto,,,,revisit: lots of duplicate simplified text; novel generative task could be very challenging,CrossFit,,,,,,,,,,,,,no prompt yet,wiki auto,cls/other,,text simplification,,Jiang et al. 2020
	proto_qa,,,gen,"generate prototypical concepts, kinda niche format with multiple correct answers",CrossFit,,,,,,,,,,,,,no prompt yet,proto qa,other,,,,Boratko et al. 2020
	empathetic_dialogues,,,,generation? classification?,CrossFit,,,,,,,,,,,https://arxiv.org/pdf/1811.00207.pdf,,no prompt yet,empathetic dialogues,cg/dialogue,,,,Rashkin et al. 2019
	qed,,,,uses held-out Natural Questions,,,,,,,,,,,,,,,,,,,,
	kilt_tasks,aidayago2,,,,,,,,,,,,,,,,,no prompt yet,kilt ay2,other/entity linking,,encyclopedia,,Hoffart et al. 2011
	kilt_tasks,wow,,,,,,,,,,,,,,,,,no prompt yet,kilt wow,cg/dialogue,,encyclopedia,,Dinan et al. 2019
	lama,conceptnet,,,,,,,,,,,,,,,,,no prompt yet,lama-conceptnet,qa/closed-book qa,,encyclopedia,,Petroni et al. 2019 2020
	lama,google_re,,,,,,,,,,,,,,,,,no prompt yet,lama-google re,qa/closed-book qa,,encyclopedia,,Petroni et al. 2019 2020
	lama,squad,,,,,,,,,,,,,,,,,no prompt yet,lama-squad,qa/closed-book qa,,encyclopedia,,Petroni et al. 2019 2020
	lama,trex,,,,,,,,,,,,,,,,,no prompt yet,lama-trex,qa/closed-book qa,,encyclopedia,,Petroni et al. 2019 2020
	limit,,physical cognition,,,,,,,,,,,,,,https://aclanthology.org/2020.findings-emnlp.88.pdf,,label errors in dataset itself? also no validation set otherwise well motivated by semantic theories,limit,other,,physical semantic repr.,,Manotas et al. 2020
	kilt_tasks,fever,,,revisit whether this should be considered as a variant of NLI,,,,,,,,,,,,,,temporary skip: prompts available in non-benchmark standalone dataset,kilt fever,cls/fact checking,,encyclopedia,,Thorne et al. 2018
	Skipped,,,,,,,,,,,,,,,,,,,,,,,,
	fever,v2.0,closed-book qa/fact checking,,also in KILT,,,,,,,,,,,,,,skip: awkward prompts as closed-book qa,FEVER,,,,,
	hotpot_qa,distractor,,,also in KILT,,,,,,,,,,,,,,skip for experiment D3: very long input sequence,Hotpot QA,,,,,
	hotpot_qa,fullwiki,,,also in KILT,,,,,,,,,,,,,,skip for experiment D3: very long input sequence,Hotpot QA,,,,,
	emo,,sentiment,cls,skip: offensive and ungrammatical text,,merged,,,30160,0,TRUE,TRUE,30160,precision;recall;F1,,https://aclanthology.org/S19-2005.pdf,,skip: offensive and ungrammatical text,emo,cls/emotion,,,,Chatterjee et al. 2019
	freebase_qa,,QA_closed_book,gen,"need to be held out because web_questions is ""supposed to be answerable by Freebase""",,,,,20358,0,TRUE,,20358,,,,intensive,,freebase qa,qa/closed-book qa,,,,Jiang et al. 2019
	aqua_rat,,,,,,,,,,,,,,,,https://arxiv.org/abs/1705.04146,,skip: nontrivial math,aqua rat,qa/multiple-choice qa,,nontrivial math,,Ling et al. 2017
	math_qa,,,,,,,,,,,,,,,,,,skip: nontrivial math,math qa,qa/multiple-choice qa,,nontrivial math,,Amini et al. 2019
	numer_sense,,,,,,,,,,,,,,,,,,skip: closed-book trivia ,numer sense,qa/closed-book qa,,numerical knowledge,,Lin et al. 2020a
	squad_adversarial,,,,,,,,,,,,,,,,,,validation set only,,,,,,
	squadshifts,,,,,,,,,,,,,,,,,,test set only,,,,,,
	sms_spam,,,,,,,,,,,,,,,,,,skip: unclean corpus and likely harmful content,sms spam,cls/other,,,,Almeida et al. 2011
	search_qa,,,,,,,,,,,,,,,,,,skip: seems like a very unclean corpus,search qa,qa/closed-book qa,,,,Dunn et al. 2017
	kilt_tasks,trex,,,,,,,,,,,,,,,,,skip: non-natural language,kilt trex,qa/closed-book qa,,encyclopedia,,Elsahar et al. 2018
	kilt_tasks,structured_zeroshot,,,,,,,,,,,,,,,,,skip: non-natural language,kilt zsre,qa/closed-book qa,,encyclopedia,,Levy et al. 2017
	spider,,,,,,,,,,,,,,,,,,skip: non-natural language,spider,cg/other,,,,Yu et al. 2018
	wikisql,,,,,,,,,,,,,,,,,,skip: non-natural language,wikisql,cg/other,,,,Zhong et al. 2017
	com_qa,,,,,CrossFit,,,,,,,,,,,https://arxiv.org/pdf/1809.09528.pdf,,skip: non-human language: URL,ComQA (Abujabal et al. 2019),factoid QA w/ paraphrases,,,snippets WikiAnswers,
	climate_fever,,,,revisit whether this should be considered as a variant of NLI,,,,,,,,,,,,,,skip: no train set,climate fever,cls/fact checking,,,,Diggelmann et al. 2020
	art,,,,,,,,,,,,,,,,https://arxiv.org/pdf/1908.05739.pdf,,skip: NLI reserved for generalization studies (although this one is not a traditionally defined NLI),art (abductive nli),other,,,,Bhagavatula et al. 2020
	glue,mnli,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,glue-mnli,cls/nli,,,,Williams et al. 2018
	glue,qnli,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,glue-qnli,cls/nli,,,,Rajpurkar et al. 2016
	glue,rte,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,glue-rte,cls/nli,,,,Dagan et al. 2005; Bar-Haim et al. 2006 Giampiccolo et al. 2007; Bentivogli et al. 2009
	glue,wnli,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,glue-wnli,cls/nli,,,,Levesque et al. 2012
	,,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,scitail,cls/nli,,,,Khot et al. 2018
	,,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,sick,cls/nli,,,,Marelli et al. 2014
	,,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,SNLI (Bowman et al. 2015),NLI,,,misc.,
	aeslc,,,,summarization by email subject line,,,,,,,,,,,,https://arxiv.org/abs/1906.03497,,skip: niche task,aeslc,cg/summarization,,generation,,Zhang and Tetreault 2019
	onestop_english,,,,,,,,,,,,,,,,https://aclanthology.org/W18-0535.pdf,,skip: niche task: classify curriculum diffculty,onestop english,cls/other,,,,Vajjala and Luˇci´c 2018
	mocha,,,,,,,,,,,,,,,,,,skip: model generated text,mocha,other/regression,,,,Chen et al. 2020a
	commonsense_qa,,,,duplicate with cos_e,Vania,,,,9741,,,,,,,https://arxiv.org/pdf/1811.00937.pdf,,,Commonsense QA,qa/multiple-choice qa,,,,Talmor et al. 2019
	,,,,,,,,,,,,,,,,,,skip: maybe harmful content from Twitter,emotion,cls/emotion,,,,Saravia et al. 2018
	,,,,the authors themselves seem to have renounced their own work,,,,,,,,,,,,https://github.com/nyu-mll/crows-pairs,,skip: harmful content,crows pairs,other,,,,Nangia et al. 2020
	,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-directed vs generalized,cls/hate speech detection,,,,Mollas et al. 2020
	,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-disability,cls/hate speech detection,,,,Mollas et al. 2020
	,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-gender,cls/hate speech detection,,,,Mollas et al. 2020
	,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-national origin,cls/hate speech detection,,,,Mollas et al. 2020
	,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-race,cls/hate speech detection,,,,Mollas et al. 2020
	,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-religion,cls/hate speech detection,,,,Mollas et al. 2020
	,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-sexual orientation,cls/hate speech detection,,,,Mollas et al. 2020
	,,,,,,,,,,,,,,,,,,skip: harmful content,hate speech offensive,cls/hate speech detection,,,,Davidson et al. 2017
	,,,,,,,,,,,,,,,,,,skip: harmful content,hate speech18,cls/hate speech detection,,,,de Gibert et al. 2018
	,,,,,,,,,,,,,,,,,,skip: harmful content,hatexplain,cls/hate speech detection,,,,Mathew et al. 2020
	,,,,,,,,,,,,,,,,,,skip: harmful content,reddit tifu-title,cg/summarization,,,,Kim et al. 2019
	,,,,,,,,,,,,,,,,,,skip: harmful content,reddit tifu-tldr,cg/summarization,,,,Kim et al. 2019
	,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-emoji,cls/emotion,,,,Barbieri et al. 2020
	,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-emotion,cls/emotion,,,,Barbieri et al. 2020
	,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-hate,cls/emotion,,,,Barbieri et al. 2020
	,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-irony,cls/emotion,,,,Barbieri et al. 2020
	,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-offensive,cls/emotion,,,,Barbieri et al. 2020
	,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-sentiment,cls/emotion,,,,Barbieri et al. 2020
	,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-stance abortion,cls/emotion,,,,Barbieri et al. 2020
	,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-stance atheism,cls/emotion,,,,Barbieri et al. 2020
	,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-stance climate,cls/emotion,,,,Barbieri et al. 2020
	,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-stance feminist,cls/emotion,,,,Barbieri et al. 2020
	,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-stance hillary,cls/emotion,,,,Barbieri et al. 2020
	,,,,,,,,,,,,,,,,,,skip: harmful content,tweet qa,qa/machine reading comprehension,,,,Xiong et al. 2019
	yelp_polarity,,,,,,,,,,,,,,,,,,skip: duplicate with yelp_review_full,yelp polarity,cls/sentiment analysis,,,,Zhang et al. 2015; (link)
	quora,,,,,,,,,,,,,,,,https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs,,skip: duplicate under GLUE,QQP,paraphrase identiﬁcation,,,social QA,Iyer et al. 2017
	squad,,,,,,,,,,,,,,,,,,skip: duplicate under Squad 2.0,SQuAD 1.1,Extractive QA,,,,
	yahoo_answers_topics,,,,,,,,,,,,,,,,,,skip for early experiments: unclean corpus,yahoo answers topics,cls/topic,,,,(link)
	tab_fact,,,,,,,,,,,,,,,,,,skip for early experiments: tabular data,tab fact,cls/fact checking,,,,Chen et al. 2020b
	,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-anaphor gender agreement,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020
	,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-anaphor number agreement,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020
	,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-determiner noun agreement with adj irregular 1,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020
	,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-ellipsis n bar 1,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020
	,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-ellipsis n bar 2,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020
	,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-existential there quantiﬁers 1,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020
	,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-irregular past participle adjectives,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020
	,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-sentential negation npi licensor present,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020
	,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-sentential negation npi scope,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020
	,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-wh questions object gap,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020
	poem_sentiment,,,,,,,,,,,,,,,,,,skip for early experiments: poetry domain,poem sentiment,cls/sentiment analysis,,creativity,,Sheng and Uthus 2020
	acronym_identification,,,,,,,,,,,,,,,,https://arxiv.org/pdf/2010.14678.pdf,,skip for early experiments: niche/hard task,acronym identiﬁcation,other,,,,Pouran Ben Veyseh et al. 2020
	google_wellformed_query,,,,revisit whether to exclude fine-grain regression tasks,,,,,,,,,,,,,,skip for early experiments: niche/hard task,google wellformed query,cls/other,,,,Faruqui and Das 2018
	liar,,,,revisit whether to exclude fine-grain regression tasks,,,,,,,,,,,,,,skip for early experiments: niche/hard task,liar,cls/fact checking,,,,Wang 2017
	,,,,,,,,,,,,,,,,,,skip for early experiments: niche/hard task,break-QDMR-high-level,other,,semantic representation,,Wolfson et al. 2020
	,,,,,,,,,,,,,,,,,,skip for early experiments: niche/hard task,crawl domain,other,,,,Zhang et al. 2020
	discovery,discovery,,,,,,,,,,,,,,,,,skip for early experiments: niche task no cannonical answer,discovery,cls/other,,generative-ish,,Sileo et al. 2019
	wiki_split,,,,,,,,,,,,,,,,,,skip for early experiments: niche task,wiki split,cg/other,,,,Botha et al. 2018
	,,,,,,,,,,,,,,,,,,skip for early experiments: multilingual,aslg pc12,other,,,,Othman and Jemni 2012
	,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,CCG (Hockenmaier and Steedman 2007),CCG supertagging,,syntax,Penn Treebank,
	,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,Chunk (Tjong Kim Sang and Buchholz 2000),syntactic chunking,,syntax,Penn Treebank,
	,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,Conj (Ficler and Goldberg 2016),conjunct identiﬁcation,,syntax,Penn Treebank,
	,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,GED (Yannakoudakis et al. 2011),grammatical error detection,,syntax,misc.,
	,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,GGParent (Liu et al. 2019a),syntactic tagging,,syntax,Penn Treebank,
	,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,GParent (Liu et al. 2019a),syntactic tagging,,syntax,Penn Treebank,
	,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,NER (Tjong Kim Sang and De Meulder 2003),named entity recognition,,,news,
	,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,Parent (Liu et al. 2019a),syntactic tagging,,syntax; constituency,Penn Treebank,
	,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,POS-EWT (Silveira et al. 2014),part-of-speech tagging,,syntax,Web Treebank,
	,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,POS-PTB (Marcus et al. 1993),part-of-speech tagging,,syntax,Penn Treebank,
	,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,ST (Bjerva et al. 2016),semantic tagging,,,Groningen Meaning Bank,
	financial_phrasebank,,,,,,,,,,,,,,,,,,skip for early experiments: financial domain,ﬁnancial phrasebank,cls/sentiment analysis,,,,Malo et al. 2014
	health_fact,,,,,,,,,,,,,,,,,,skip for early experiments: biomedical domain,health fact,cls/fact checking,,,,Kotonya and Toni 2020
	,,,,,,,,,,,,,,,,http://www.sciencedirect.com/science/article/pii/S1532046412000615,,skip for early experiments: biomedical domain,ade corpus v2-classiﬁcation,cls/other,,,,Gurulingappa et al. 2012
	,,,,,,,,,,,,,,,,,,skip for early experiments: biomedical domain,ade corpus v2-dosage,other/slot ﬁlling,,,,Gurulingappa et al. 2012
	,,,,,,,,,,,,,,,,,,skip for early experiments: biomedical domain,ade corpus v2-effect,other/slot ﬁlling,,,,Gurulingappa et al. 2012
	,,,,,,,,,,,,,,,,,,skip for early experiments: biomedical domain,biomrc,qa/machine reading comprehension,,,,Pappas et al. 2020
	,,,,,,,,,,,,,,,,,,skip for early experiments: biomedical domain,medical questions pairs,cls/paraphrase,,,,McCreery et al. 2020
	scicite,,,,,,,,,,,,,,,,,,skip for early experiments: academic domain + niche/hard task,scicite,cls/other,,,,Cohan et al. 2019
	,,,,,,,,,,,,,,,,,,skip for early experiments: abstract semantic representations,break-QDMR,other,,logical form,,Wolfson et al. 2020
	,,,,,,,,,,,,,,,,,,skip for early experiments: abstract semantic representations,e2e nlg cleaned,other,,,,Duˇsek et al. 2020 2019
	glue,sst2,,,,,,,,,,,,,,,,,revisit: very short and often ill-formed movie reviews,glue-sst2,cls/sentiment analysis,,,,Socher et al. 2013
	glue,stsb,fine-grain regression,,,,,,,,,,,,,,,,revisit whether to exclude fine-grain regression tasks,glue-stsb,semantic similarity,,,misc.,
	,,,,,,,,,,,,,,,,,,double check: subset missing from HF datasets,squad-no context,qa/closed-book qa,,,,Rajpurkar et al. 2016
	,,,,,,,,,,,,,,,,,,double check: subset missing from HF datasets,squad-with context,qa/machine reading comprehension,,,,Rajpurkar et al. 2016
	,,,,contrast sets,,,,,,,,,,,,https://arxiv.org/pdf/2004.02709.pdf,,double check: missing from HF datasets,BoolQ-CS,Binary yes/no,,,,
	,,,,,,,,,,,,,,,,https://aclanthology.org/C16-1236.pdf,,double check: missing from HF datasets,CQ (Bao et al. 2016),knowledge-based QA,,,snippets web queries/KB,
	,,,,contrast sets,,,,,,,,,,,,https://arxiv.org/pdf/2004.02709.pdf,,double check: missing from HF datasets,DROP-CS,Abstractive QA,,,,
	,,,,,,,,,,,,,,,,https://aclanthology.org/D13-1020.pdf,,double check: missing from HF datasets,MCTest,Multiple choice,,,,
	,,,,,,,,,,,,,,,,,,double check: missing from HF datasets,MRPC (Dolan and Brockett 2005),paraphrase identiﬁcation,,,news,
	,,,,"""naturally perturbed"" version of BoolQ",,,,,,,,,,,,https://arxiv.org/pdf/2004.04849.pdf,,double check: missing from HF datasets,NP-BoolQ,Binary yes/no,,,,
	,,,,,,,,,,,,,,,,https://aclanthology.org/D19-1608.pdf,,double check: missing from HF datasets,quartz-no knowledge,qa/multiple-choice qa,,,,Tafjord et al. 2019b
	,,,,contrast sets,,,,,,,,,,,,https://arxiv.org/pdf/2004.02709.pdf,,double check: missing from HF datasets,Quoref-CS,Extractive QA,,,,
	,,,,contrast sets,,,,,,,,,,,,https://arxiv.org/pdf/2004.02709.pdf,,double check: missing from HF datasets,ROPES-CS,Extractive QA,,,,